In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
from tqdm import tqdm
import string

In [2]:
# STEP 1: Load the note table (NOTEEVENTS.csv) into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file rather
# than chunk by chunk, which avoids the mixed-type DtypeWarning on load
# (NOTE(review): the truncated warning in this cell's saved output looks like that warning — confirm).
df_notes = pd.read_csv('/scratch/kh2383/MIMIC/1.4/NOTEEVENTS.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# STEP 2: Generate the cohort to pretrain on. The filter below keeps the note categories
# 'Respiratory ', 'Nursing' and 'Nursing/Others', matched verbatim (including the trailing space).
# NOTE(review): isin() silently drops labels that do not match exactly. Check these strings
# against df_notes.CATEGORY.unique() — MIMIC-III may spell the last one 'Nursing/other'.
df_notes = df_notes[df_notes.CATEGORY.isin(['Respiratory ','Nursing','Nursing/Others'])]

# IMPORTANT: if you fine-tune on the same dataset that you pretrain on, you must exclude
# the fine-tune test admissions from the pretraining text.
df_test_ids = pd.read_csv('/scratch/kh2383/MechVent/data/test.csv').HADM_ID.unique()
# .copy() makes the result an independent frame: later cells assign to its TEXT column,
# which on a plain boolean-mask slice raises pandas' SettingWithCopyWarning.
df_notes_fold = df_notes[~df_notes.HADM_ID.isin(df_test_ids)].copy()

In [5]:
# STEP 3: Preprocessing
def preprocess1(x):
    """Clean one note string before sentence segmentation.

    Steps, in order: drop bracketed de-identification placeholders, drop
    "<digits>." enumerators, expand "dr." / "m.d.", drop the
    "admission date:" / "discharge date:" labels and "--", "__", "==" runs,
    delete every remaining digit, and collapse all whitespace to single spaces.
    The literal patterns are lowercase, so the text must already be lowercased.

    Args:
        x: note text as a str.

    Returns:
        The cleaned text as a str.
    """
    # Raw strings keep the regex escapes intact without relying on Python passing
    # unknown escape sequences such as '\.' through (which emits a warning).
    y = re.sub(r'\[(.*?)\]', '', x)  # remove de-identified brackets
    y = re.sub(r'[0-9]+\.', '', y)   # remove "1." "2." since the segmenter splits on them
    y = re.sub(r'dr\.', 'doctor', y)
    y = re.sub(r'm\.d\.', 'md', y)
    y = re.sub('admission date:', '', y)
    y = re.sub('discharge date:', '', y)
    y = re.sub('--|__|==', '', y)

    # Remove digits (punctuation is deliberately kept), then normalise spacing.
    y = y.translate(str.maketrans("", "", string.digits))
    y = " ".join(y.split())
    return y

def preprocessing(df_notes):
    """Normalise the TEXT column of a notes frame in place and return the frame.

    Missing text becomes a single space, newlines and carriage returns become
    spaces, the text is stripped and lowercased, and finally every note is
    passed through preprocess1.

    Args:
        df_notes: DataFrame with a 'TEXT' column; the column is overwritten.

    Returns:
        The same DataFrame object that was passed in.
    """
    filled = df_notes['TEXT'].fillna(' ')
    single_line = filled.str.replace('\n',' ').str.replace('\r',' ')
    lowered = single_line.apply(str.strip).str.lower()
    # preprocess1 runs last because its literal patterns are lowercase.
    df_notes['TEXT'] = lowered.apply(preprocess1)
    return df_notes

# Clean the cohort's TEXT column; preprocessing() overwrites the column on the frame it
# is given and returns that frame, which is kept under the same name.
df_notes_fold = preprocessing(df_notes_fold)

In [6]:
# STEP 4: Notes to Sentences
from spacy.lang.en import English

# Bare English pipeline (language data only, no trained model) whose single
# component is the 'sentencizer', used purely for sentence splitting.
nlp = English()
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)

# The nlp parser may not work when a note contains only one token. Such notes are
# dropped, since a note of length 1 is usually noise.

def toSentence(x):
    """Split one preprocessed note into a list of sentence strings.

    The note is run through the module-level ``nlp`` pipeline and its
    sentences are collected. A sentence shorter than 20 characters is usually
    an abbreviation that was cut off from the sentence it describes, so it is
    appended (space-separated) to the previous sentence when there is one.
    Sentences that are empty after stripping are skipped so they cannot add a
    trailing space or an empty entry.

    Args:
        x: note text as a str.

    Returns:
        List of sentence strings. If iterating the sentences raises, a message
        is printed and whatever was collected so far (possibly an empty list)
        is returned.
    """
    doc = nlp(x)
    text = []
    try:
        for sent in doc.sents:
            st = str(sent).strip()
            if not st:
                continue
            if len(st) < 20 and text:
                # Short fragment: attach it to the sentence before it.
                text[-1] = ' '.join((text[-1], st))
            else:
                # Normal sentence, or a short fragment with nothing before it.
                text.append(st)
    except Exception:
        # Best-effort: skip notes the segmenter cannot handle, but let
        # KeyboardInterrupt / SystemExit propagate so a long run can be stopped.
        print('single word notes ignored')
    return text

# One list of sentence strings per note, aligned with df_notes_fold's index.
pretrain_sent = df_notes_fold['TEXT'].apply(toSentence)

single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word 

In [9]:
# STEP 5: Create Pretraining File
# Layout: one sentence per line, with an empty line after each note.
# NOTE(review): this cell writes ..._xlnet_total.txt, while STEP 7 passes ..._xlnet.txt as
# --input_glob. Confirm which file the TFRecords are meant to be built from.

# Accept either the Series from STEP 4 or the array left by an earlier run of this cell,
# so that re-running the cell does not fail on a missing `.values` attribute.
pretrain_sent = getattr(pretrain_sent, 'values', pretrain_sent)

# The context manager guarantees the file is flushed and closed even if a write fails.
with open('/scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet_total.txt','w', encoding='utf-8') as file:
    for note in pretrain_sent:
        # Notes that produced no sentences (e.g. one-token notes) are skipped.
        if len(note) > 0:
            for sent in note:
                file.write(sent+'\n')
            file.write('\n')

In [None]:
# STEP 6: Download the Implementations and Initial Checkpoint with the SentencePiece Model and Vocab 
# Github Repo:
# XLNet: git clone https://github.com/zihangdai/xlnet.git
# Model: 
# XLNet-Base: https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip

In [2]:
# STEP 7: Generate pretraining TensorFlow TFRecords
# For Clinical XLNet: runs data_utils.py from the cloned xlnet repo on the sentence file.
import os
# NOTE: the path is relative, so re-running this cell without restarting the kernel
# resolves './xlnet/' against the already-changed working directory.
os.chdir('./xlnet/')
# NOTE(review): --input_glob names ..._xlnet.txt, but STEP 5 writes ..._xlnet_total.txt —
# confirm the intended input file.
# seq_len, reuse_len, mask_alpha, mask_beta and num_predict are repeated with the same
# values in the train_gpu.py cell further down.
!python data_utils.py \
    --bsz_per_host=8 \
    --num_core_per_host=1 \
    --seq_len=512 \
    --reuse_len=256 \
    --input_glob=/scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet.txt \
    --save_dir=/scratch/kh2383/MechVent/data/xlnet_tfrecord/ \
    --num_passes=5 \
    --bi_data=True \
    --sp_path=/scratch/kh2383/clibert/xlnet_cased_L-12_H-768_A-12/spiece.model \
    --mask_alpha=6 \
    --mask_beta=1 \
    --num_predict=85

I1102 19:06:04.042432 47659116077696 data_utils.py:211] Use glob: /scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet.txt
I1102 19:06:04.043622 47659116077696 data_utils.py:212] Find 1 files: ['/scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet.txt']
I1102 19:06:04.043815 47659116077696 data_utils.py:220] Task 0 process 1 files: ['/scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet.txt']
I1102 19:06:04.164321 47659116077696 data_utils.py:89] Processing /scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet.txt
I1102 19:06:04.184380 47659116077696 data_utils.py:92] Loading line 0
I1102 19:06:16.002577 47659116077696 data_utils.py:92] Loading line 100000
I1102 19:06:27.438141 47659116077696 data_utils.py:92] Loading line 200000
I1102 19:06:38.681625 47659116077696 data_utils.py:92] Loading line 300000
I1102 19:06:49.714661 47659116077696 data_utils.py:92] Loading line 400000
I1102 19:07:00.752820 47659116077696 data_utils.py:92] Loading line 

I1102 19:26:04.784489 47659116077696 data_utils.py:450] Processing batch 26000
I1102 19:26:19.363143 47659116077696 data_utils.py:450] Processing batch 26500
I1102 19:26:33.986644 47659116077696 data_utils.py:450] Processing batch 27000
I1102 19:26:48.579244 47659116077696 data_utils.py:450] Processing batch 27500
I1102 19:27:03.190010 47659116077696 data_utils.py:450] Processing batch 28000
I1102 19:27:17.755956 47659116077696 data_utils.py:450] Processing batch 28500
I1102 19:27:32.354958 47659116077696 data_utils.py:450] Processing batch 29000
I1102 19:27:46.932151 47659116077696 data_utils.py:450] Processing batch 29500
I1102 19:28:01.520366 47659116077696 data_utils.py:450] Processing batch 30000
I1102 19:28:16.106199 47659116077696 data_utils.py:450] Processing batch 30500
I1102 19:28:30.687078 47659116077696 data_utils.py:450] Processing batch 31000
I1102 19:28:45.305592 47659116077696 data_utils.py:450] Processing batch 31500
I1102 19:28:59.898533 47659116077696 data_utils.py:4

I1102 19:51:23.170225 47659116077696 data_utils.py:450] Processing batch 78000
I1102 19:51:37.759519 47659116077696 data_utils.py:450] Processing batch 78500
I1102 19:51:52.327802 47659116077696 data_utils.py:450] Processing batch 79000
I1102 19:52:06.934043 47659116077696 data_utils.py:450] Processing batch 79500
I1102 19:52:21.534674 47659116077696 data_utils.py:450] Processing batch 80000
I1102 19:52:36.132499 47659116077696 data_utils.py:450] Processing batch 80500
I1102 19:52:50.702902 47659116077696 data_utils.py:450] Processing batch 81000
I1102 19:53:05.284194 47659116077696 data_utils.py:450] Processing batch 81500
I1102 19:53:19.899870 47659116077696 data_utils.py:450] Processing batch 82000
I1102 19:53:34.507218 47659116077696 data_utils.py:450] Processing batch 82500
I1102 19:53:49.163180 47659116077696 data_utils.py:450] Processing batch 83000
I1102 19:54:03.766204 47659116077696 data_utils.py:450] Processing batch 83500
I1102 19:54:18.361120 47659116077696 data_utils.py:4

In [None]:
# Pretrain XLNet on the TFRecords with train_gpu.py from the xlnet repo.
# --record_info_dir is the STEP 7 --save_dir plus 'tfrecords/'; seq_len, reuse_len,
# mask_alpha, mask_beta and num_predict repeat the values given to data_utils.py there.
# NOTE(review): --init_checkpoint points at model.ckpt inside --model_dir, the same path
# the training log reports saving to. Confirm the released checkpoint from STEP 6 was
# placed there before the first run.
# NOTE(review): --num_core_per_host is 2 here but 1 in the data_utils.py call — confirm
# this is intended.
!python train_gpu.py \
    --record_info_dir=/scratch/kh2383/MechVent/data/xlnet_tfrecord/tfrecords/ \
    --model_dir=/scratch/kh2383/MechVent/model/xlnet_model/ \
    --init_checkpoint=/scratch/kh2383/MechVent/model/xlnet_model/model.ckpt \
    --train_batch_size=8 \
    --seq_len=512 \
    --reuse_len=256 \
    --mem_len=384 \
    --perm_size=256 \
    --n_layer=12 \
    --d_model=768 \
    --d_embed=768 \
    --n_head=12 \
    --d_head=64 \
    --d_inner=3072 \
    --untie_r=True \
    --mask_alpha=6 \
    --mask_beta=1 \
    --num_predict=85 \
    --num_hosts=1 \
    --num_core_per_host=2 \
    --train_steps=200000 \
    --iterations=500 \
    --save_steps=5000 

I1102 20:03:23.792002 47329276351104 train_gpu.py:328] n_token 32000
I1102 20:03:23.792614 47329276351104 data_utils.py:795] Use the following tfrecord dirs: ['/scratch/kh2383/MechVent/data/xlnet_tfrecord/tfrecords/']
I1102 20:03:23.792743 47329276351104 data_utils.py:799] [0] Record glob: /scratch/kh2383/MechVent/data/xlnet_tfrecord/tfrecords/record_info-train-*.bsz-8.seqlen-512.reuse-256.bi.alpha-6.beta-1.fnp-85.json
I1102 20:03:23.793778 47329276351104 data_utils.py:809] ['/scratch/kh2383/MechVent/data/xlnet_tfrecord/tfrecords/record_info-train-0-0.bsz-8.seqlen-512.reuse-256.uncased.bi.alpha-6.beta-1.fnp-85.json']
I1102 20:03:23.793876 47329276351104 data_utils.py:812] [0] Num of record info path: 1
I1102 20:03:23.794657 47329276351104 data_utils.py:845] [Dir 0] Number of chosen batches: 98747
I1102 20:03:23.794754 47329276351104 data_utils.py:847] [Dir 0] Number of chosen files: 1
I1102 20:03:23.794821 47329276351104 data_utils.py:848] ['/scratch/kh2383/MechVent/data/xlnet_tfrecord

I1102 20:03:53.512163 47329276351104 model_utils.py:91]   name = model/transformer/layer_1/rel_attn/o/kernel/Adam_1:0, shape = (768, 12, 64)
I1102 20:03:53.512239 47329276351104 model_utils.py:91]   name = model/transformer/layer_1/rel_attn/LayerNorm/beta/Adam:0, shape = (768,)
I1102 20:03:53.512314 47329276351104 model_utils.py:91]   name = model/transformer/layer_1/rel_attn/LayerNorm/beta/Adam_1:0, shape = (768,)
I1102 20:03:53.512386 47329276351104 model_utils.py:91]   name = model/transformer/layer_1/rel_attn/LayerNorm/gamma/Adam:0, shape = (768,)
I1102 20:03:53.512457 47329276351104 model_utils.py:91]   name = model/transformer/layer_1/rel_attn/LayerNorm/gamma/Adam_1:0, shape = (768,)
I1102 20:03:53.512529 47329276351104 model_utils.py:91]   name = model/transformer/layer_1/ff/layer_1/kernel/Adam:0, shape = (768, 3072)
I1102 20:03:53.512602 47329276351104 model_utils.py:91]   name = model/transformer/layer_1/ff/layer_1/kernel/Adam_1:0, shape = (768, 3072)
I1102 20:03:53.512

2019-11-02 20:03:53.773087: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0xcf98410 executing computations on platform CUDA. Devices:
2019-11-02 20:03:53.773126: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
2019-11-02 20:03:53.773138: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (1): Tesla P100-PCIE-16GB, Compute Capability 6.0
2019-11-02 20:03:53.800345: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2600220000 Hz
2019-11-02 20:03:53.800589: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0xd009100 executing computations on platform Host. Devices:
2019-11-02 20:03:53.800614: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): <undefined>, <undefined>
2019-11-02 20:03:53.801383: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties: 
name: Tesla P100-PCIE-16GB majo

I1103 01:07:42.963507 47329276351104 train_gpu.py:308] Model saved in path: /scratch/kh2383/MechVent/model/xlnet_model/model.ckpt
200000
20000
I1103 01:16:52.842212 47329276351104 train_gpu.py:302] [20500] | gnorm 5.85 lr 0.000090 | loss 1.14 | pplx    3.11, bpc  1.6377
I1103 01:26:03.266377 47329276351104 train_gpu.py:302] [21000] | gnorm 6.19 lr 0.000090 | loss 1.14 | pplx    3.14, bpc  1.6492
I1103 01:35:13.472800 47329276351104 train_gpu.py:302] [21500] | gnorm 6.27 lr 0.000089 | loss 1.13 | pplx    3.08, bpc  1.6250
I1103 01:44:22.371781 47329276351104 train_gpu.py:302] [22000] | gnorm 6.33 lr 0.000089 | loss 1.10 | pplx    3.01, bpc  1.5897
I1103 01:53:31.341625 47329276351104 train_gpu.py:302] [22500] | gnorm 6.36 lr 0.000089 | loss 1.07 | pplx    2.92, bpc  1.5476
I1103 02:02:39.925597 47329276351104 train_gpu.py:302] [23000] | gnorm 6.28 lr 0.000089 | loss 1.12 | pplx    3.06, bpc  1.6130
I1103 02:11:49.331837 47329276351104 train_gpu.py:302] [23500] | gnorm 5.83 lr 0.000088 |

I1103 10:08:01.925547 47329276351104 train_gpu.py:302] [49500] | gnorm 5.30 lr 0.000075 | loss 0.97 | pplx    2.64, bpc  1.3997
I1103 10:17:10.813624 47329276351104 train_gpu.py:302] [50000] | gnorm 5.69 lr 0.000075 | loss 0.90 | pplx    2.46, bpc  1.2983
I1103 10:17:14.763897 47329276351104 train_gpu.py:308] Model saved in path: /scratch/kh2383/MechVent/model/xlnet_model/model.ckpt
200000
50000
I1103 10:26:22.588086 47329276351104 train_gpu.py:302] [50500] | gnorm 6.05 lr 0.000075 | loss 0.93 | pplx    2.52, bpc  1.3356
I1103 10:35:30.125064 47329276351104 train_gpu.py:302] [51000] | gnorm 5.75 lr 0.000075 | loss 0.92 | pplx    2.52, bpc  1.3330
I1103 10:44:37.771480 47329276351104 train_gpu.py:302] [51500] | gnorm 5.18 lr 0.000074 | loss 0.94 | pplx    2.56, bpc  1.3581
I1103 10:53:45.779591 47329276351104 train_gpu.py:302] [52000] | gnorm 6.38 lr 0.000074 | loss 0.92 | pplx    2.51, bpc  1.3287
I1103 11:02:54.049780 47329276351104 train_gpu.py:302] [52500] | gnorm 6.30 lr 0.000074 |

I1103 18:58:45.751961 47329276351104 train_gpu.py:302] [78500] | gnorm 5.63 lr 0.000061 | loss 0.93 | pplx    2.52, bpc  1.3351
I1103 19:07:57.169102 47329276351104 train_gpu.py:302] [79000] | gnorm 6.15 lr 0.000061 | loss 0.94 | pplx    2.56, bpc  1.3545
I1103 19:17:08.623243 47329276351104 train_gpu.py:302] [79500] | gnorm 5.65 lr 0.000060 | loss 0.92 | pplx    2.50, bpc  1.3215
I1103 19:26:20.618197 47329276351104 train_gpu.py:302] [80000] | gnorm 5.62 lr 0.000060 | loss 0.91 | pplx    2.50, bpc  1.3192
I1103 19:26:24.568567 47329276351104 train_gpu.py:308] Model saved in path: /scratch/kh2383/MechVent/model/xlnet_model/model.ckpt
200000
80000
I1103 19:35:36.019667 47329276351104 train_gpu.py:302] [80500] | gnorm 6.06 lr 0.000060 | loss 0.93 | pplx    2.54, bpc  1.3470
I1103 19:44:46.459486 47329276351104 train_gpu.py:302] [81000] | gnorm 5.29 lr 0.000060 | loss 0.92 | pplx    2.52, bpc  1.3312
I1103 19:53:57.337118 47329276351104 train_gpu.py:302] [81500] | gnorm 5.89 lr 0.000059 |