# Compute token length distribution of clinical notes

Generate plots for the thesis document and compute some descriptive statistics.

In [None]:
import sent2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from scipy.spatial import distance
import pickle
from tqdm import tqdm
import numpy as np
import torch
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pltlatexify import latexify, format_axes

In [None]:
# English stop words from the NLTK corpus, stored as a set for membership tests.
stop_words = set(stopwords.words('english'))
# Punctuation characters to filter out when sentence-ending marks are kept:
# the same characters as string.punctuation except '!', '.' and '?'.
punctuation_less = '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'

def preprocess_sentence(text):
    """Normalize a text and return its remaining tokens joined by single spaces.

    The text is padded around '/', '.-', '.' and apostrophes, lower-cased and
    tokenized with NLTK's ``word_tokenize``. Tokens that occur inside
    ``string.punctuation`` or that are English stop words are removed.
    """
    # The replacements run in this order; the later '.' rule also pads the dot
    # of an already padded '.-'.
    padding_rules = (
        ('/', ' / '),
        ('.-', ' .- '),
        ('.', ' . '),
        ('\'', ' \' '),
    )
    for needle, padded in padding_rules:
        text = text.replace(needle, padded)

    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # `punctuation` is a string, so this is a substring test.
        if token in punctuation:
            continue
        if token in stop_words:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

def preprocess_sentence_leave_dot(text):
    """Normalize a text like ``preprocess_sentence`` but keep '.', '!' and '?'.

    The text is padded around '/', '.-', '.' and apostrophes, lower-cased and
    tokenized with NLTK's ``word_tokenize``. Tokens that occur inside
    ``punctuation_less`` or that are English stop words are removed; the
    remaining tokens are joined by single spaces.
    """
    padded = text.replace('/', ' / ').replace('.-', ' .- ')
    padded = padded.replace('.', ' . ').replace('\'', ' \' ')
    lowered = padded.lower()

    def is_kept(token):
        # `punctuation_less` is a string, so the first test is a substring test.
        return token not in punctuation_less and token not in stop_words

    return ' '.join(filter(is_kept, word_tokenize(lowered)))

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load the pre-trained "bert-base-uncased" tokenizer. Only the tokenizer is
# needed (to count tokens); no BERT model is instantiated in this cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [None]:
# Session configuration. `seq_len` limits how many leading characters of each
# note are kept before preprocessing (None keeps the whole note). The USE_*
# flags are only echoed by the print below; this cell does not read them.
seq_len = None # 2000
USE_CHUNKS = True
USE_POOLER = False
USE_MEAN_POOLING = True and not USE_POOLER

print(f"Run this session with the following parameters: {USE_CHUNKS=}, {USE_POOLER=}, {USE_MEAN_POOLING=}.")

data_path = '../data/mimic3/'

datasets = ['train'] #,'val','test']

device = "cuda:0" if torch.cuda.is_available() else "cpu"
# model = model.to(device)
# model.eval()

# Token count of every document. Per dataset, all 'notes' entries are appended
# first, followed by all 'eventsnotes' entries.
tokenlens = []

with torch.no_grad():
    for dataset in datasets:
        # Open the pickle in a context manager so the file handle is closed
        # even if loading fails.
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files from a trusted source.
        with open(f'{data_path}new_{dataset}_data_unique_CNEP.pickle', 'rb') as pickle_file:
            train_data = pickle.load(pickle_file)

        for field in ('notes', 'eventsnotes'):
            documents = train_data[field]
            # Index-based access is kept because the container type of
            # `documents` is not visible here.
            for i in tqdm(range(len(documents))):
                inputs = tokenizer(preprocess_sentence(documents[i][:seq_len]), add_special_tokens=False, return_tensors='pt')
                # input_ids has one row per input text; its second dimension
                # is the number of tokens.
                tokenlens.append(inputs['input_ids'].shape[1])


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Sanity check: number of entries in the first 13181 positions of tokenlens.
# NOTE(review): 13181 is presumably the number of 'notes' entries in the
# training data - confirm against len(train_data['notes']).
len(tokenlens[:13181])

In [None]:
import pandas as pd

# tokenlens holds the token counts of the 'notes' entries first, followed by
# those of the 'eventsnotes' entries, so the split point is the number of
# notes. It is derived from the loaded data instead of the previously
# hard-coded 13181, so the split stays correct when the pickle changes.
# NOTE: this assumes a single entry in `datasets`; with several datasets the
# two kinds of entries would be interleaved in tokenlens.
num_notes = len(train_data['notes'])

# One-column tables of token counts: all documents, notes only, event notes only.
df = pd.DataFrame({"Sequence token length": tokenlens})
df_notes = pd.DataFrame({"Sequence token length": tokenlens[:num_notes]})
df_eventnotes = pd.DataFrame({"Sequence token length": tokenlens[num_notes:]})

In [None]:
# Display the table of token lengths built from all of tokenlens.
df

In [None]:
# Display the table of token lengths built from the tail of tokenlens.
df_eventnotes

In [None]:
# Histogram (bin width 100, with KDE curve) of the token lengths in df_notes
# that fall in the half-open range [0, limitx); saved as a PDF.
latexify()
limitx = 10000
sns.set(rc={'figure.figsize': (8, 6)})
# sns.set(font_scale=1.2)
fig, ax = plt.subplots()

notes_lengths = df_notes['Sequence token length']
notes_in_range = df_notes[notes_lengths.between(0, limitx, inclusive='left')]
sns.histplot(
    data=notes_in_range,
    x="Sequence token length",
    binwidth=100,
    alpha=0.4,
    kde=True,
)
# ax.set_xlim(0, 10000)
# ax.set_xticks(range(0, 10001, 1000))
plt.tight_layout()
format_axes(ax)
plt.savefig("sequence_token_length_distribution_notes.pdf")

In [None]:
# Histogram (bin width 100, with KDE curve) of the token lengths in
# df_eventnotes that fall in the half-open range [0, limitx); saved as a PDF.
latexify()
limitx = 10000
sns.set(rc={'figure.figsize': (8, 6)})
# sns.set(font_scale=1.2)
fig, ax = plt.subplots()

eventnotes_lengths = df_eventnotes['Sequence token length']
eventnotes_in_range = df_eventnotes[eventnotes_lengths.between(0, limitx, inclusive='left')]
sns.histplot(
    data=eventnotes_in_range,
    x="Sequence token length",
    binwidth=100,
    alpha=0.4,
    kde=True,
)
# ax.set_xlim(0, 10000)
# ax.set_xticks(range(0, 10001, 1000))
plt.tight_layout()
format_axes(ax)
plt.savefig("sequence_token_length_distribution_eventnotes.pdf")

In [None]:
# Main figure: histogram (bin width 100, with KDE curve) of all token lengths
# in [0, limitx), with dashed red lines and red tick labels at 512 and 1280.
latexify(fig_width=12, fig_height=8, font_size=10, label_size=16, title_size=24, legend_size=11)
limitx = 10000
sns.set(rc={'figure.figsize': (12, 8)})
sns.set(font_scale=1.6)
sns.set_style("ticks")

fig, ax = plt.subplots()
lengths_in_range = df[df['Sequence token length'].between(0, limitx, inclusive='left')]
histplt = sns.histplot(
    data=lengths_in_range,
    x="Sequence token length",
    binwidth=100,
    alpha=0.4,
    kde=True,
)

histplt.set(title='Distribution of Token Sequence Lengths of Clinical Notes from MIMIC-III.')
histplt.set_xlabel("Sequence Token Length (BERT-Tokenizer).\n Maximum limits for typical pre-trained transformer models are 512 or 1280 tokens.", fontsize = 20)

# ax.set_xlim(0, 10000)
# ax.set_xticks(range(0, 10001, 1000))

plt.tight_layout()

# Full-height dashed red lines at the two sequence-length limits.
for model_limit in (512, 1280):
    plt.axvline(model_limit, 0, 1.0, linewidth=1, color='r', linestyle='--')

# Drop the first and last automatic ticks, then add ticks at the two limits.
# The automatic ticks keep their numeric values as labels; the two appended
# ticks get explicit string labels.
xt = np.append(ax.get_xticks()[1:-1], [512, 1280])
xtl = xt.tolist()
xtl[-2:] = ["512", "1280"]

ax.set_xticks(xt)
ax.set_xticklabels(xtl)

# Automatic tick labels in black, the two limit labels in red.
tick_colors = ['k'] * (len(xtl) - 2) + ['r'] * 2
for tick_color, tick_label in zip(tick_colors, ax.xaxis.get_ticklabels()):
    tick_label.set_color(tick_color)

format_axes(ax)
plt.savefig("sequence_token_length_distribution.pdf")

In [None]:
# Debug output: pair a fixed colour list (six 'k', one 'r') with the current
# x tick labels and print each pair.
debug_colors = ['k'] * 6 + ['r']
for color, tick_label in zip(debug_colors, ax.xaxis.get_ticklabels()):
    print(color, tick_label)

In [None]:
# Histogram (bin width 10, with KDE curve) of all token lengths in the
# half-open range [0, limitx); the limit is part of the output file name.
limitx = 512
latexify()
sns.set(rc={'figure.figsize': (8, 6)})
# sns.set(font_scale=1.2)
fig, ax = plt.subplots()

all_lengths = df['Sequence token length']
short_sequences = df[all_lengths.between(0, limitx, inclusive='left')]
sns.histplot(
    data=short_sequences,
    x="Sequence token length",
    binwidth=10,
    alpha=0.4,
    kde=True,
)
plt.tight_layout()
format_axes(ax)
plt.savefig(f"sequence_token_length_distribution_{limitx}.pdf")

In [None]:
# Histogram (bin width 500, with KDE curve) of all token lengths in the
# half-open range [512, limitx); the limit is part of the output file name.
# NOTE(review): 89377 looks like the maximum observed length; if so, the
# half-open range excludes that longest document - confirm this is intended.
limitx = 89377
latexify()
sns.set(rc={'figure.figsize': (8, 6)})
# sns.set(font_scale=1.2)
fig, ax = plt.subplots()

all_lengths = df['Sequence token length']
long_sequences = df[all_lengths.between(512, limitx, inclusive='left')]
sns.histplot(
    data=long_sequences,
    x="Sequence token length",
    binwidth=500,
    alpha=0.4,
    kde=True,
)
plt.tight_layout()
format_axes(ax)
plt.savefig(f"sequence_token_length_distribution_512-{limitx}.pdf")

In [None]:
# Display pandas' summary statistics for the token-length table.
df.describe()

In [None]:
# Percentage of all sequences whose token length is in [0, 512).
shorter_than_512 = df['Sequence token length'].between(0, 512, inclusive='left')
shorter_than_512.sum() / len(df) * 100

In [None]:
# Percentage of all sequences whose token length is in [0, 1280).
shorter_than_1280 = df['Sequence token length'].between(0, 1280, inclusive='left')
shorter_than_1280.sum() / len(df) * 100

In [None]:
# Percentage of all sequences whose token length is in [1280, 100000).
from_1280_up = df['Sequence token length'].between(1280, 100000, inclusive='left')
from_1280_up.sum() / len(df) * 100

In [None]:
# Percentage of all sequences whose token length is in [512, 900000).
from_512_up = df['Sequence token length'].between(512, 900000, inclusive='left')
from_512_up.sum() / len(df) * 100

In [None]:
# Display pandas' summary statistics for the token-length table again.
df.describe()

In [None]:
# Display the median of the token-length column.
df.median()