In [None]:
# Colab-only setup: mount Google Drive at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
# Work from the project folder so the relative paths used by later cells
# (./data/..., create_pretraining_data.py) resolve against it.
%cd '/content/gdrive/My Drive/Colab Notebooks/DL4H Project' 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
from tqdm import tqdm
import string

In [None]:
# Load the fold1/discharge training CSV; later cells read and rewrite its 'TEXT' column.
df_notes_fold = pd.read_csv('./data/good_datasets/fold1/discharge/train.csv')

In [None]:
# STEP 3: Preprocessing
def preprocess1(x):
    """Clean a single note string before sentence segmentation.

    The patterns are case-sensitive and written in lower case, so the caller
    is expected to lower-case the text first (``preprocessing`` does this).

    Steps, in order:
      1. drop ``[...]`` spans (de-identification placeholders),
      2. drop ``<digits>.`` enumerators such as ``1.`` / ``2.``,
      3. expand ``dr.`` -> ``doctor`` and ``m.d.`` -> ``md`` so their periods
         are not seen as sentence ends,
      4. drop the ``admission date:`` / ``discharge date:`` labels,
      5. drop ``--``, ``__`` and ``==`` separator runs,
      6. drop all remaining ASCII digits,
      7. collapse every run of whitespace to a single space and trim.

    Args:
        x: note text (str).

    Returns:
        The cleaned text as a str (possibly empty).
    """
    # Raw strings: '\.' inside a normal string literal is an invalid escape
    # sequence (a warning today, slated to become an error).
    y = re.sub(r'\[(.*?)\]', '', x)  # remove de-identified brackets (non-greedy)
    y = re.sub(r'[0-9]+\.', '', y)   # remove "1." "2." since the segmenter splits on them
    y = re.sub(r'dr\.', 'doctor', y)
    y = re.sub(r'm\.d\.', 'md', y)
    y = re.sub(r'admission date:', '', y)
    y = re.sub(r'discharge date:', '', y)
    y = re.sub(r'--|__|==', '', y)

    # Remove remaining digits, then normalise whitespace.
    y = y.translate(str.maketrans("", "", string.digits))
    y = " ".join(y.split())
    return y

def preprocessing(df_notes):
    """Normalise the 'TEXT' column of a notes DataFrame.

    Missing values become a single space, newlines and carriage returns are
    replaced by spaces, the text is stripped and lower-cased, and finally
    ``preprocess1`` is applied to every note.

    Note: the 'TEXT' column of ``df_notes`` is overwritten in place; the same
    DataFrame object is returned for convenience.

    Args:
        df_notes: DataFrame with a 'TEXT' column of note strings.

    Returns:
        ``df_notes`` itself, with 'TEXT' cleaned.
    """
    text = df_notes['TEXT'].fillna(' ')
    # regex=False: these are literal characters; being explicit avoids
    # depending on the pandas-version-specific default of `regex`.
    text = text.str.replace('\n', ' ', regex=False)
    text = text.str.replace('\r', ' ', regex=False)
    text = text.str.strip()
    text = text.str.lower()

    # Assign once at the end so a failure above leaves the frame untouched.
    df_notes['TEXT'] = text.apply(preprocess1)

    return df_notes

# Clean the TEXT column. preprocessing() overwrites the column on the frame it
# receives and returns that same frame, so this rebinding keeps the same object.
df_notes_fold = preprocessing(df_notes_fold)

In [None]:
# STEP 4: Notes to Sentences
from spacy.lang.en import English
nlp = English()  # just the language with no model
# Add the rule-based sentence splitter. spaCy v3 changed add_pipe() to take the
# component's string name and rejects a component object, while v2 requires
# the object returned by create_pipe(); support both.
if int(spacy.__version__.split('.')[0]) >= 3:
    nlp.add_pipe('sentencizer')
else:
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

# The nlp parser may not work when a note contains only one token. In these cases we simply drop the note, since a note of length 1 is usually noise.

def toSentence(x):
    """Split one note into a list of sentence strings.

    Uses the module-level ``nlp`` pipeline for segmentation. Segments shorter
    than 20 characters are glued (space-separated) onto the previous sentence,
    because many abbreviations get segmented onto their own line even though
    they describe what came before; a short segment with no predecessor starts
    the list.

    Args:
        x: note text (str).

    Returns:
        List of sentence strings. If iterating the sentences raises, the doc is
        printed and whatever was collected so far is returned (best effort).
    """
    doc = nlp(x)
    text = []
    try:
        for sent in doc.sents:
            st = str(sent).strip()
            if len(st) >= 20 or not text:
                text.append(st)
            else:
                # Short fragment: attach it to the sentence before.
                text[-1] = ' '.join((text[-1], st))
    except Exception:
        # Best effort: report the offending doc but keep going. Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        print(doc)
    return text

# One list of sentence strings per note, as a Series indexed like df_notes_fold.
pretrain_sent=df_notes_fold['TEXT'].apply(lambda x: toSentence(x))

In [None]:
# STEP 5: Create Pretraining File
# Output format: one sentence per line, with a blank line after each note.
# Convert to a plain array once; the guard keeps this cell re-runnable
# (an ndarray has no `.values`, so a second run used to raise AttributeError).
if hasattr(pretrain_sent, 'values'):
    pretrain_sent = pretrain_sent.values
# `with` guarantees the file is flushed and closed before the following cells
# hand the same path to create_pretraining_data.py; the explicit encoding keeps
# the output independent of the platform default.
with open('PRETRAIN_DATA_PATH/clinical_sentences_pretrain_fold11.txt', 'w', encoding='utf-8') as file:
    for note in tqdm(pretrain_sent):
        # Skip notes that produced no sentences.
        if len(note) > 0:
            file.writelines(sent + '\n' for sent in note)
            file.write('\n')

In [None]:
 # STEP 7: Generate Pretraining Tensorflow TF_Records

# Generate datasets for 128 max seq
# Runs create_pretraining_data.py (expected in the current working directory)
# on the sentence file written in STEP 5 and writes tf_examples_128_fold11.tfrecord.
# max_predictions_per_seq=20 is about max_seq_length * masked_lm_prob (128 * 0.15 = 19.2).
# NOTE(review): PRETRAIN_DATA_PATH and INITIAL_MODEL_PATH are used as literal
# directory names here; they look like placeholders - confirm the real paths.
# NOTE(review): %tensorflow_version is a Colab-specific magic; confirm the
# runtime still offers TensorFlow 1.x.
 %tensorflow_version 1.x
!python create_pretraining_data.py \
  --input_file=PRETRAIN_DATA_PATH/clinical_sentences_pretrain_fold11.txt \
  --output_file=PRETRAIN_DATA_PATH/tf_examples_128_fold11.tfrecord \
  --vocab_file=INITIAL_MODEL_PATH/vocab.txt \
  --do_lower_case=True \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --masked_lm_prob=0.15 \
  --random_seed=12345 \
  --dupe_factor=3

In [None]:
# Generate datasets for 512 max seq
# Same invocation as the 128-token run except for the output file,
# max_seq_length=512 and max_predictions_per_seq=76 (512 * 0.15 = 76.8).
# NOTE(review): PRETRAIN_DATA_PATH and INITIAL_MODEL_PATH are used as literal
# directory names here; they look like placeholders - confirm the real paths.
!python create_pretraining_data.py \
  --input_file=PRETRAIN_DATA_PATH/clinical_sentences_pretrain_fold11.txt \
  --output_file=PRETRAIN_DATA_PATH/tf_examples_512_fold11.tfrecord \
  --vocab_file=INITIAL_MODEL_PATH/vocab.txt \
  --do_lower_case=True \
  --max_seq_length=512 \
  --max_predictions_per_seq=76 \
  --masked_lm_prob=0.15 \
  --random_seed=12345 \
  --dupe_factor=3
