In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
from tqdm import tqdm
import string

In [None]:
# STEP 1: Read the raw clinical notes table into a DataFrame
df_notes = pd.read_csv(filepath_or_buffer='NOTE_DATA_PATH/NOTEEVENTS.csv')

In [None]:
# STEP 2: Generate the cohort to pretrain on (here, e.g., we only pretrain on physician and nursing notes)
# NOTE(review): isin() matches CATEGORY values exactly, so spelling, case and the trailing
# space in 'Physician ' all matter. MIMIC-III appears to spell the third category
# 'Nursing/other' -- confirm against df_notes.CATEGORY.unique() before relying on this list.
df_notes = df_notes[df_notes.CATEGORY.isin(['Physician ','Nursing','Nursing/Others'])]

# IMPORTANT: if you fine-tune on the same dataset that you use for pretraining, you need to exclude the fine-tune test admissions
df_test_ids = pd.read_csv('FINETUNE_DATA_PATH/test.csv').HADM_ID.unique()
# .copy() yields an independent frame: the preprocessing step assigns to its TEXT column,
# which on a boolean-mask slice raises SettingWithCopyWarning and may not write through.
df_notes_fold = df_notes[~df_notes.HADM_ID.isin(df_test_ids)].copy()

In [None]:
# STEP 3: Preprocessing
# Translation table that deletes every ASCII digit; built once instead of per note.
_DIGIT_REMOVER = str.maketrans("", "", string.digits)


def preprocess1(x):
    """Clean one (already lower-cased, single-line) note string.

    Steps, in order:
      1. drop bracketed de-identification placeholders ``[...]``;
      2. drop list numbering such as ``1.`` (the sentence segmenter would
         otherwise split on it) -- note this also removes the integer part of
         decimals, e.g. ``98.6`` loses ``98.``;
      3. expand ``dr.`` -> ``doctor`` and ``m.d.`` -> ``md`` so their periods
         are not taken as sentence ends;
      4. drop the ``admission date:`` / ``discharge date:`` labels and the
         ``--``, ``__``, ``==`` separator pairs;
      5. delete all remaining digits and collapse whitespace runs.

    Args:
        x: note text as ``str``.

    Returns:
        The cleaned text with single spaces and no leading/trailing whitespace.
    """
    # Raw strings: '\.' in a normal literal is an invalid escape sequence
    # (a warning today, an error in a future Python).
    y = re.sub(r'\[(.*?)\]', '', x)   # remove de-identified brackets
    y = re.sub(r'[0-9]+\.', '', y)    # remove 1. 2. since the segmenter segments based on this
    y = re.sub(r'dr\.', 'doctor', y)
    y = re.sub(r'm\.d\.', 'md', y)
    y = re.sub('admission date:', '', y)
    y = re.sub('discharge date:', '', y)
    y = re.sub('--|__|==', '', y)

    # remove digits, then normalise all whitespace to single spaces
    y = y.translate(_DIGIT_REMOVER)
    return " ".join(y.split())

def preprocessing(df_notes):
    """Normalise the TEXT column of a notes frame and return the same frame.

    Missing notes become a single space, line breaks become spaces, the text
    is stripped and lower-cased, and every note is then passed through
    ``preprocess1``. The TEXT column of ``df_notes`` is overwritten.
    """
    text = df_notes['TEXT'].fillna(' ')
    # flatten each note onto one line
    for line_break in ('\n', '\r'):
        text = text.str.replace(line_break, ' ')
    text = text.apply(str.strip).str.lower()

    df_notes['TEXT'] = text.apply(preprocess1)
    return df_notes

# Clean the cohort's TEXT column; preprocessing() overwrites TEXT on the frame it is given and returns that same frame.
df_notes_fold = preprocessing(df_notes_fold)

In [None]:
# STEP 4: Notes to Sentences
from spacy.lang.en import English
nlp = English()  # just the language with no model
# Add the rule-based sentence splitter. spaCy v3 changed add_pipe() to take the
# component's registered name; spaCy v2 needs the object built by create_pipe().
if int(spacy.__version__.split('.')[0]) >= 3:
    nlp.add_pipe('sentencizer')
else:
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

# The nlp parser may not work when a note contains only one token. In those cases we simply drop the note, since a note of length 1 is usually noise.

def toSentence(x, min_len=20):
    """Split one preprocessed note into a list of sentence strings.

    Sentence boundaries come from the module-level ``nlp`` pipeline. A
    fragment shorter than ``min_len`` characters is appended (space-separated)
    to the previous sentence, because short fragments are mostly abbreviations
    that belong to what came before. A short fragment at the very start of the
    note has nothing to attach to and becomes the first entry.

    Args:
        x: note text as ``str``.
        min_len: minimum stripped length for a fragment to stand as its own
            sentence. The default of 20 keeps the original behaviour.

    Returns:
        List of sentence strings; empty if the note yields no sentences. If
        iterating the sentences fails, the sentences collected up to that
        point are returned.
    """
    doc = nlp(x)
    text = []
    try:
        for sent in doc.sents:
            st = str(sent).strip()
            if len(st) >= min_len or not text:
                # long enough, or nothing earlier to attach to: new entry
                text.append(st)
            else:
                # short fragment: glue it onto the previous sentence
                text[-1] = ' '.join((text[-1], st))
    except Exception as exc:
        # Best effort: keep what was collected. Report the failure without
        # echoing the clinical note itself into the notebook output.
        print('toSentence: sentence segmentation failed for a note of {} characters ({!r})'.format(len(x), exc))
    return text

# One list of sentence strings per note
pretrain_sent = df_notes_fold['TEXT'].apply(toSentence)

In [None]:
# STEP 5: Create Pretraining File
# Output format: one sentence per line, with a blank line after every note.
# np.asarray (instead of .values) keeps this cell re-runnable once pretrain_sent is already an array.
pretrain_sent = np.asarray(pretrain_sent)
# The context manager flushes and closes the file even if a write fails; the explicit
# encoding keeps the output independent of the platform's default.
with open('PRETRAIN_DATA_PATH/clinical_sentences_pretrain.txt', 'w', encoding='utf-8') as file:
    for note in tqdm(pretrain_sent):
        # skip notes that produced no sentences
        if len(note) > 0:
            for sent in note:
                file.write(sent+'\n')
            file.write('\n')

In [None]:
# STEP 6: Download the Implementations and Initial Checkpoint with the SentencePiece Model and Vocab 
# Github Repo:
# XLNet: git clone https://github.com/zihangdai/xlnet.git
# BERT: git clone https://github.com/google-research/bert.git
# Model: 
# XLNet-Base: https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip
# BERT-Base-Uncased: https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

In [None]:
# STEP 7: Generate Pretraining Tensorflow TF_Records

# For Clinical BERT
# Run these from inside the cloned BERT repository (cd there first).
# Both commands read the sentence file written in STEP 5 and differ only in
# --max_seq_length / --max_predictions_per_seq (128/20 and 512/76); the same pairs are
# passed again to run_pretraining.py in STEP 8.
# NOTE(review): the vocab directory is called INITIAL_MODEL_PATH here, while STEP 8 uses
# INITIAL_DATA_PATH for bert_config.json / bert_model.ckpt -- presumably the same unzipped
# checkpoint directory; confirm.

# Generate datasets for 128 max seq
!python create_pretraining_data.py \
  --input_file=PRETRAIN_DATA_PATH/clinical_sentences_pretrain.txt \
  --output_file=PRETRAIN_DATA_PATH/tf_examples_128.tfrecord \
  --vocab_file=INITIAL_MODEL_PATH/vocab.txt \
  --do_lower_case=True \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --masked_lm_prob=0.15 \
  --random_seed=12345 \
  --dupe_factor=3

# Generate datasets for 512 max seq
!python create_pretraining_data.py \
  --input_file=PRETRAIN_DATA_PATH/clinical_sentences_pretrain.txt \
  --output_file=PRETRAIN_DATA_PATH/tf_examples_512.tfrecord \
  --vocab_file=INITIAL_MODEL_PATH/vocab.txt \
  --do_lower_case=True \
  --max_seq_length=512 \
  --max_predictions_per_seq=76 \
  --masked_lm_prob=0.15 \
  --random_seed=12345 \
  --dupe_factor=3

# For Clinical XLNet
# Run this from inside the cloned XLNet repository.
# Paths use the same placeholders as the rest of the notebook; --save_dir is the parent of
# the tfrecords/ directory that train_gpu.py reads in STEP 8.
# NOTE(review): STEP 5 writes clinical_sentences_pretrain.txt; nothing in this notebook
# creates the *_xlnet.txt input below -- confirm how that file is produced.
# NOTE(review): XLNet's training script appears to locate the record_info files by batch
# size, so --bsz_per_host here (6) probably has to equal --train_batch_size in STEP 8 (8);
# confirm against the XLNet README before running.

!python data_utils.py \
    --bsz_per_host=6 \
    --num_core_per_host=1 \
    --seq_len=512 \
    --reuse_len=256 \
    --input_glob=PRETRAIN_DATA_PATH/clinical_sentences_pretrain_xlnet.txt \
    --save_dir=PRETRAIN_DATA_PATH/xlnet_tfrecord/ \
    --num_passes=5 \
    --bi_data=True \
    --sp_path=INITIAL_DATA_PATH/xlnet_cased_L-12_H-768_A-12/spiece.model \
    --mask_alpha=6 \
    --mask_beta=1 \
    --num_predict=85

In [None]:
# STEP 8: Pretraining Use Original TF implementation

# For Clinical BERT

# First pretrain 100000 steps on the max seq length of 128
# Stage 1 writes to its own directory (pretraining_output_128) so that the checkpoint path
# handed to stage 2 below actually exists, and so that stage 2's output_dir does not
# already contain a 100000-step checkpoint when it starts.
!python run_pretraining.py \
  --input_file=PRETRAIN_DATA_PATH/tf_examples_128.tfrecord \
  --output_dir=PRETRAINED_MODEL_PATH/pretraining_output_128 \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=INITIAL_DATA_PATH/bert_config.json \
  --init_checkpoint=INITIAL_DATA_PATH/bert_model.ckpt \
  --train_batch_size=64 \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --num_train_steps=100000 \
  --num_warmup_steps=10 \
  --learning_rate=2e-5

# Then further pretrain 100000 steps on the max seq length of 512
# NOTE: the init_checkpoint should switch to the 128 pretrained model

!python run_pretraining.py \
  --input_file=PRETRAIN_DATA_PATH/tf_examples_512.tfrecord \
  --output_dir=PRETRAINED_MODEL_PATH/pretraining_output \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=INITIAL_DATA_PATH/bert_config.json \
  --init_checkpoint=PRETRAINED_MODEL_PATH/pretraining_output_128/model.ckpt-100000 \
  --train_batch_size=16 \
  --max_seq_length=512 \
  --max_predictions_per_seq=76 \
  --num_train_steps=100000 \
  --num_warmup_steps=10 \
  --learning_rate=2e-5


# For Clinical XLNet
# Run this from inside the cloned XLNet repository.
# --seq_len, --reuse_len, --mask_alpha, --mask_beta and --num_predict repeat the values
# given to data_utils.py in STEP 7 (512, 256, 6, 1, 85).
# NOTE(review): --train_batch_size=8 and --num_core_per_host=2 differ from the
# --bsz_per_host=6 / --num_core_per_host=1 used when the TFRecords were generated in
# STEP 7 -- confirm the two steps are compatible.
# NOTE(review): the model-size flags (n_layer, d_model, d_embed, n_head, d_head, d_inner)
# presumably mirror the xlnet_cased_L-12_H-768_A-12 checkpoint being loaded -- confirm.

# Pretrain for 200000 steps

!python train_gpu.py \
    --record_info_dir=PRETRAIN_DATA_PATH/xlnet_tfrecord/tfrecords/ \
    --model_dir=PRETRAINED_MODEL_PATH/xlnet_model/ \
    --init_checkpoint=INITIAL_DATA_PATH/xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt \
    --train_batch_size=8 \
    --seq_len=512 \
    --reuse_len=256 \
    --mem_len=384 \
    --perm_size=256 \
    --n_layer=12 \
    --d_model=768 \
    --d_embed=768 \
    --n_head=12 \
    --d_head=64 \
    --d_inner=3072 \
    --untie_r=True \
    --mask_alpha=6 \
    --mask_beta=1 \
    --num_predict=85 \
    --num_hosts=1 \
    --num_core_per_host=2 \
    --train_steps=200000 \
    --iterations=500 \
    --save_steps=5000 