# Compute token length distribution of clinical notes

In [None]:
import sent2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from scipy.spatial import distance
import pickle
from tqdm import tqdm
import numpy as np
import torch
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# English stop words from NLTK, stored in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# string.punctuation without the sentence terminators '.', '!' and '?', so
# those three characters survive filtering in preprocess_sentence_leave_dot.
punctuation_less = ''.join(char for char in punctuation if char not in '.!?')

def preprocess_sentence(text):
    """Return `text` as a space-joined string of lower-cased, filtered tokens.

    Steps, in order:
      1. Surround '/', '.-', '.' and "'" with spaces.
      2. Lower-case the text.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Drop tokens matched by ``string.punctuation`` and English stop words.

    Args:
        text: The raw note text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # The rules are applied sequentially, so their order matters.
    padding_rules = (
        ('/', ' / '),
        ('.-', ' .- '),
        ('.', ' . '),
        ('\'', ' \' '),
    )
    for separator, padded in padding_rules:
        text = text.replace(separator, padded)
    text = text.lower()

    kept = []
    for word in word_tokenize(text):
        # `punctuation` is a str, so `in` is a substring test: any token that
        # is a contiguous piece of string.punctuation is dropped as well.
        if word in punctuation or word in stop_words:
            continue
        kept.append(word)

    return ' '.join(kept)

def preprocess_sentence_leave_dot(text):
    """Like ``preprocess_sentence`` but filter with ``punctuation_less``.

    The text gets the same spacing and lower-casing treatment, is tokenized
    with NLTK's ``word_tokenize``, and tokens matched by ``punctuation_less``
    or found in ``stop_words`` are removed.

    Args:
        text: The raw note text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Replacement order is significant ('.-' is handled before the bare '.').
    spaced = (
        text.replace('/', ' / ')
        .replace('.-', ' .- ')
        .replace('.', ' . ')
        .replace('\'', ' \' ')
    )
    words = word_tokenize(spaced.lower())

    # `punctuation_less` is a str, so `in` performs a substring test here.
    survivors = (
        word for word in words
        if not (word in punctuation_less or word in stop_words)
    )

    return ' '.join(survivors)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Only the tokenizer of "bert-base-uncased" is loaded here; no model weights
# are instantiated in this cell. It is used below to count tokens per note.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [None]:
# Maximum number of *characters* (not tokens) kept from each note before
# preprocessing; None keeps the whole text (a previous setting was 2000).
seq_len = None # 2000
# The USE_* flags are only echoed by the print below in the cells visible
# here; they do not influence the token-length computation.
USE_CHUNKS = True
USE_POOLER = False
USE_MEAN_POOLING = True and not USE_POOLER

print(f"Run this session with the following parameters: {USE_CHUNKS=}, {USE_POOLER=}, {USE_MEAN_POOLING=}.")

data_path = '../data/mimic3/'

datasets = ['train'] #,'val','test']

device = "cuda:0" if torch.cuda.is_available() else "cpu"
# model = model.to(device)
# model.eval()

# One entry per text: the number of tokenizer tokens (no special tokens).
tokenlens = []

with torch.no_grad():
    for dataset in datasets:
        # Close the file deterministically instead of leaking the handle.
        # NOTE(review): unpickling can execute arbitrary code -- only load
        # pickle files from a trusted source.
        with open(f'{data_path}new_{dataset}_data_unique_CNEP.pickle', 'rb') as pickle_file:
            train_data = pickle.load(pickle_file)

        # Both text fields are measured the same way; 'notes' is processed
        # before 'eventsnotes', so the order of `tokenlens` is unchanged.
        for field in ('notes', 'eventsnotes'):
            texts = train_data[field]
            # Index-based access is kept on purpose: the container type
            # stored in the pickle is not visible from this notebook.
            for i in tqdm(range(len(texts))):
                inputs = tokenizer(preprocess_sentence(texts[i][:seq_len]), add_special_tokens=False, return_tensors='pt')
                tokenlens.append(inputs['input_ids'].shape[1])


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import pandas as pd

# Single-column frame holding one token count per processed text.
df = pd.DataFrame(tokenlens, columns=["Sequence token length"])

In [None]:
# Global seaborn theme: large figure and slightly enlarged fonts.
sns.set(font_scale=1.2, rc={'figure.figsize': (20, 12)})

# The x-axis is shown from 0 to X_MAX with a tick every X_TICK_STEP tokens.
X_MAX = 10000
X_TICK_STEP = 500

fig, ax = plt.subplots()
# Histogram with 10-token-wide bins and a KDE curve overlaid.
sns.histplot(
    data=df,
    x="Sequence token length",
    binwidth=10,
    alpha=0.4,
    kde=True,
    ax=ax,
)
#sns.kdeplot(data=df, x="Sequence token length", bw_adjust=2)
ax.set_xlim(0, X_MAX)
ax.set_xticks(range(0, X_MAX + 1, X_TICK_STEP))
plt.show()


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the token lengths.
df.describe()

In [None]:
# Number of texts whose token length lies in [0, 51): inclusive='left' keeps
# the lower bound and excludes the upper one, i.e. at most 50 tokens.
# df[df<51].count()
df['Sequence token length'].between(0,51, inclusive='left').sum()

In [None]:
# Summary statistics of the token lengths (same call as in the earlier cell).
df.describe()

In [None]:
# Median token length across all processed texts.
df.median()