In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
from tqdm import tqdm
import string

In [2]:
# STEP 1: load the note events table (MIMIC v1.4 according to the path)
noteevents_csv = '/scratch/kh2383/MIMIC/1.4/NOTEEVENTS.csv'
df_notes = pd.read_csv(noteevents_csv)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# STEP 2: Generate the cohort to pretrain on. The categories selected below are the
# respiratory and nursing note types.
# NOTE(review): 'Nursing/Others' may not match any CATEGORY value in NOTEEVENTS
# (the label is believed to be spelled 'Nursing/other') -- verify with
# df_notes.CATEGORY.unique() before relying on this filter.
pretrain_categories = ['Respiratory ','Nursing','Nursing/Others']
in_cohort = df_notes['CATEGORY'].isin(pretrain_categories)
df_notes = df_notes[in_cohort]

# IMPORTANT: when fine-tuning on the same dataset that is used for pretraining, the
# admissions of the fine-tune test split must be excluded from the pretraining corpus.
df_test_ids = pd.read_csv('/scratch/kh2383/MechVent/data/test.csv')['HADM_ID'].unique()
is_test_admission = df_notes['HADM_ID'].isin(df_test_ids)
df_notes_fold = df_notes[~is_test_admission]

In [5]:
# STEP 3: Preprocessing
# Ordered (pattern, replacement) rewrites applied by preprocess1. Raw strings are used
# so that regex escapes such as ``\.`` are not parsed as (invalid) string escapes.
_PREPROCESS1_SUBS = (
    (r'\[(.*?)\]', ''),        # remove de-identified brackets, e.g. [**...**]
    (r'[0-9]+\.', ''),         # remove "1." / "2." markers since the segmenter splits on them
    (r'dr\.', 'doctor'),       # expand abbreviations whose period would look like a sentence end
    (r'm\.d\.', 'md'),
    (r'admission date:', ''),
    (r'discharge date:', ''),
    (r'--|__|==', ''),         # drop separator runs
)

# Translation table that deletes every ASCII digit; built once instead of per call.
_PREPROCESS1_DIGITS = str.maketrans("", "", string.digits)


def preprocess1(x):
    """Clean one note string before sentence segmentation.

    Applies the substitutions in ``_PREPROCESS1_SUBS`` in order, deletes all
    ASCII digits, and collapses every run of whitespace into a single space
    (which also trims leading/trailing whitespace). Punctuation is kept.

    The patterns are lower-case literals, so callers are expected to pass
    lower-cased text for the "dr." / "m.d." / "... date:" rules to match.

    Parameters
    ----------
    x : str
        Note text.

    Returns
    -------
    str
        The cleaned text.
    """
    y = x
    for pattern, replacement in _PREPROCESS1_SUBS:
        y = re.sub(pattern, replacement, y)

    # remove digits, then normalise whitespace
    y = y.translate(_PREPROCESS1_DIGITS)
    return " ".join(y.split())

def preprocessing(df_notes):
    """Return a copy of ``df_notes`` whose ``TEXT`` column has been cleaned.

    Steps applied to ``TEXT``, in order: missing values become a single space,
    newlines and carriage returns become spaces, surrounding whitespace is
    stripped, the text is lower-cased, and finally ``preprocess1`` is applied
    to every note.

    The input frame is not modified: the cleaned column is built separately
    and attached with ``DataFrame.assign``. This avoids writing into a frame
    that may itself be a slice of another frame (SettingWithCopyWarning).

    Parameters
    ----------
    df_notes : pandas.DataFrame
        Frame containing a ``TEXT`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and the cleaned ``TEXT``.
    """
    cleaned_text = (
        df_notes['TEXT']
        .fillna(' ')
        .str.replace('\n', ' ', regex=False)
        .str.replace('\r', ' ', regex=False)
        .str.strip()
        .str.lower()
        .apply(preprocess1)
    )
    return df_notes.assign(TEXT=cleaned_text)

# Clean the TEXT column of the cohort; df_notes_fold is rebound to the frame returned
# by preprocessing().
df_notes_fold = preprocessing(df_notes_fold)

In [6]:
# STEP 4: Notes to Sentences
# Build a blank English pipeline and add the 'sentencizer' component; toSentence()
# below iterates doc.sents on documents produced by this pipeline.
from spacy.lang.en import English
nlp = English()  # just the language with no model
# NOTE(review): create_pipe(...) followed by add_pipe(component) looks like the spaCy v2
# API; newer spaCy releases presumably expect nlp.add_pipe('sentencizer') -- confirm
# against the installed version.
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# The nlp parser may fail when a note contains only one token. Such notes are simply dropped, since a note of length 1 is usually noise.

def toSentence(x, min_len=20):
    """Split one note into a list of sentence strings.

    The note is run through the module-level ``nlp`` pipeline and each item of
    ``doc.sents`` is stripped. A fragment shorter than ``min_len`` characters
    is appended (space-separated) to the previous sentence, because many
    abbreviations get segmented onto their own line although they belong to
    the preceding text. A short fragment with no predecessor starts the list.

    If iterating the sentences raises an ordinary exception, a message is
    printed and whatever was collected so far (possibly ``[]``) is returned.
    ``KeyboardInterrupt`` / ``SystemExit`` are deliberately not swallowed, so
    a long-running ``apply`` over many notes can still be interrupted.

    Parameters
    ----------
    x : str
        Note text.
    min_len : int, optional
        Minimum length for a fragment to stand as its own sentence
        (default 20, the previously hard-coded threshold).

    Returns
    -------
    list of str
        The sentences of the note.
    """
    doc = nlp(x)
    text = []
    try:
        for sent in doc.sents:
            st = str(sent).strip()
            if len(st) < min_len and text:
                # short fragment: glue it onto the sentence before it
                text[-1] = ' '.join((text[-1], st))
            else:
                text.append(st)
    except Exception:
        # best effort: notes the segmenter cannot handle are skipped
        print('single word notes ignored')
    return text

# Segment every cleaned note; each element of the result is that note's list of sentences.
pretrain_sent = df_notes_fold['TEXT'].apply(toSentence)

single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word notes ignored
single word 

In [9]:
# STEP 5: Create Pretraining File
# Writes one sentence per line and an empty line after each note.
# The file is opened in a `with` block so it is flushed and closed even if a write
# fails, and `pretrain_sent` is iterated directly (not rebound to `.values`) so the
# cell can be re-run safely.
pretrain_path = '/scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet_total.txt'
with open(pretrain_path, 'w') as file:
    for note in pretrain_sent:
        # skip notes that yielded no sentences (e.g. the ignored one-token notes)
        if len(note) > 0:
            file.writelines(sent + '\n' for sent in note)
            file.write('\n')

In [None]:
# STEP 6: Download the Implementations and Initial Checkpoint with the SentencePiece Model and Vocab 
# Github Repo:
# XLNet: git clone https://github.com/zihangdai/xlnet.git
# Model: 
# XLNet-Base: https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip

In [2]:
# STEP 7: Generate Pretraining Tensorflow TF_Records
# For Clinical XLNet
# Runs the XLNet repo's data_utils.py on the sentence file written in STEP 5 and
# saves the records under --save_dir.
# NOTE: os.chdir uses a relative path, so this cell is not idempotent -- running it a
# second time in the same kernel would look for ./xlnet/ inside the xlnet directory.
# NOTE(review): the record-info file name that train_gpu.py globs for appears to encode
# bsz, seq_len, reuse_len, bi_data, mask_alpha, mask_beta and num_predict, so these
# flags presumably have to match the training command -- confirm.
import os
os.chdir('./xlnet/')
!python data_utils.py \
    --bsz_per_host=8 \
    --num_core_per_host=1 \
    --seq_len=512 \
    --reuse_len=256 \
    --input_glob=/scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet_total.txt \
    --save_dir=/scratch/kh2383/MechVent/data/xlnet_tfrecord_total/ \
    --num_passes=5 \
    --bi_data=True \
    --sp_path=/scratch/kh2383/clibert/xlnet_cased_L-12_H-768_A-12/spiece.model \
    --mask_alpha=6 \
    --mask_beta=1 \
    --num_predict=85

I1225 12:41:57.235386 47871649542784 data_utils.py:211] Use glob: /scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet_total.txt
I1225 12:41:57.236729 47871649542784 data_utils.py:212] Find 1 files: ['/scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet_total.txt']
I1225 12:41:57.236943 47871649542784 data_utils.py:220] Task 0 process 1 files: ['/scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet_total.txt']
I1225 12:41:57.405687 47871649542784 data_utils.py:89] Processing /scratch/kh2383/MechVent/data/clinical_sentences_pretrain_xlnet_total.txt
I1225 12:41:57.406338 47871649542784 data_utils.py:92] Loading line 0
I1225 12:42:09.445044 47871649542784 data_utils.py:92] Loading line 100000
I1225 12:42:20.928989 47871649542784 data_utils.py:92] Loading line 200000
I1225 12:42:32.285392 47871649542784 data_utils.py:92] Loading line 300000
I1225 12:42:43.433550 47871649542784 data_utils.py:92] Loading line 400000
I1225 12:42:54.533169 47871649542784 data_ut

I1225 13:01:41.738914 47871649542784 data_utils.py:450] Processing batch 25000
I1225 13:01:56.142193 47871649542784 data_utils.py:450] Processing batch 25500
I1225 13:02:10.607537 47871649542784 data_utils.py:450] Processing batch 26000
I1225 13:02:24.990637 47871649542784 data_utils.py:450] Processing batch 26500
I1225 13:02:39.466825 47871649542784 data_utils.py:450] Processing batch 27000
I1225 13:02:53.869610 47871649542784 data_utils.py:450] Processing batch 27500
I1225 13:03:08.297809 47871649542784 data_utils.py:450] Processing batch 28000
I1225 13:03:22.656568 47871649542784 data_utils.py:450] Processing batch 28500
I1225 13:03:37.083337 47871649542784 data_utils.py:450] Processing batch 29000
I1225 13:03:51.485851 47871649542784 data_utils.py:450] Processing batch 29500
I1225 13:04:05.858018 47871649542784 data_utils.py:450] Processing batch 30000
I1225 13:04:20.144429 47871649542784 data_utils.py:450] Processing batch 30500
I1225 13:04:34.536296 47871649542784 data_utils.py:4

I1225 13:26:39.851696 47871649542784 data_utils.py:450] Processing batch 77000
I1225 13:26:54.247768 47871649542784 data_utils.py:450] Processing batch 77500
I1225 13:27:08.631620 47871649542784 data_utils.py:450] Processing batch 78000
I1225 13:27:23.007143 47871649542784 data_utils.py:450] Processing batch 78500
I1225 13:27:37.400776 47871649542784 data_utils.py:450] Processing batch 79000
I1225 13:27:51.898472 47871649542784 data_utils.py:450] Processing batch 79500
I1225 13:28:06.299418 47871649542784 data_utils.py:450] Processing batch 80000
I1225 13:28:20.730368 47871649542784 data_utils.py:450] Processing batch 80500
I1225 13:28:35.179659 47871649542784 data_utils.py:450] Processing batch 81000
I1225 13:28:49.500759 47871649542784 data_utils.py:450] Processing batch 81500
I1225 13:29:03.852644 47871649542784 data_utils.py:450] Processing batch 82000
I1225 13:29:18.131757 47871649542784 data_utils.py:450] Processing batch 82500
I1225 13:29:32.450736 47871649542784 data_utils.py:4

In [3]:
# Pretrain XLNet with the repo's train_gpu.py on the TFRecords generated above,
# starting from --init_checkpoint and writing checkpoints to --model_dir.
# NOTE(review): the captured output of this run shows the GPU allocator running out of
# memory (a 132 MiB request on a GeForce GTX 1080 with 7.93 GiB) before the run was
# interrupted; a smaller --train_batch_size / --seq_len / --mem_len is presumably
# needed on this hardware -- confirm.
# NOTE(review): --num_core_per_host=2 assumes two GPUs; the truncated log only shows
# device 0 -- verify the number of available GPUs.
!python train_gpu.py \
    --record_info_dir=/scratch/kh2383/MechVent/data/xlnet_tfrecord_total/tfrecords/ \
    --model_dir=/scratch/kh2383/MechVent/model/xlnet_model/ \
    --init_checkpoint=/scratch/kh2383/MechVent/model/xlnet_model/model.ckpt \
    --train_batch_size=8 \
    --seq_len=512 \
    --reuse_len=256 \
    --mem_len=384 \
    --perm_size=256 \
    --n_layer=12 \
    --d_model=768 \
    --d_embed=768 \
    --n_head=12 \
    --d_head=64 \
    --d_inner=3072 \
    --untie_r=True \
    --mask_alpha=6 \
    --mask_beta=1 \
    --num_predict=85 \
    --num_hosts=1 \
    --num_core_per_host=2 \
    --train_steps=400000 \
    --iterations=500 \
    --save_steps=5000 

I1225 13:55:12.645989 47630971126400 train_gpu.py:328] n_token 32000
I1225 13:55:12.651739 47630971126400 data_utils.py:795] Use the following tfrecord dirs: ['/scratch/kh2383/MechVent/data/xlnet_tfrecord_total/tfrecords/']
I1225 13:55:12.652017 47630971126400 data_utils.py:799] [0] Record glob: /scratch/kh2383/MechVent/data/xlnet_tfrecord_total/tfrecords/record_info-train-*.bsz-8.seqlen-512.reuse-256.bi.alpha-6.beta-1.fnp-85.json
I1225 13:55:13.006576 47630971126400 data_utils.py:809] ['/scratch/kh2383/MechVent/data/xlnet_tfrecord/tfrecords/record_info-train-0-0.bsz-8.seqlen-512.reuse-256.uncased.bi.alpha-6.beta-1.fnp-85.json']
I1225 13:55:13.006993 47630971126400 data_utils.py:812] [0] Num of record info path: 1
I1225 13:55:13.035062 47630971126400 data_utils.py:845] [Dir 0] Number of chosen batches: 98747
I1225 13:55:13.035315 47630971126400 data_utils.py:847] [Dir 0] Number of chosen files: 1
I1225 13:55:13.035436 47630971126400 data_utils.py:848] ['/scratch/kh2383/MechVent/data/xl

I1225 13:55:49.420633 47630971126400 model_utils.py:91]   name = model/transformer/layer_2/ff/layer_2/bias:0, shape = (768,), *INIT_FROM_CKPT*
I1225 13:55:49.420739 47630971126400 model_utils.py:91]   name = model/transformer/layer_2/ff/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
I1225 13:55:49.420852 47630971126400 model_utils.py:91]   name = model/transformer/layer_2/ff/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
I1225 13:55:49.420959 47630971126400 model_utils.py:91]   name = model/transformer/layer_3/rel_attn/k/kernel:0, shape = (768, 12, 64), *INIT_FROM_CKPT*
I1225 13:55:49.421071 47630971126400 model_utils.py:91]   name = model/transformer/layer_3/rel_attn/v/kernel:0, shape = (768, 12, 64), *INIT_FROM_CKPT*
I1225 13:55:49.421184 47630971126400 model_utils.py:91]   name = model/transformer/layer_3/rel_attn/r/kernel:0, shape = (768, 12, 64), *INIT_FROM_CKPT*
I1225 13:55:49.421296 47630971126400 model_utils.py:91]   name = model/transformer/layer_3/rel_attn/q/ke

2019-12-25 13:55:49.650781: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0xcb58550 executing computations on platform CUDA. Devices:
2019-12-25 13:55:49.650833: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): GeForce GTX 1080, Compute Capability 6.1
2019-12-25 13:55:49.680619: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2599885000 Hz
2019-12-25 13:55:49.681072: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0xcbc0d50 executing computations on platform Host. Devices:
2019-12-25 13:55:49.681117: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): <undefined>, <undefined>
2019-12-25 13:55:49.682059: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties: 
name: GeForce GTX 1080 major: 6 minor: 1 memoryClockRate(GHz): 1.7335
pciBusID: 0000:05:00.0
totalMemory: 7.93GiB freeMemory: 7.82GiB
2019-12-25 13:55:49.682137: I tensorflow/core/common_

2019-12-25 13:56:41.642538: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b5369000000 of size 16777216
2019-12-25 13:56:41.642550: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b536a000000 of size 9437184
2019-12-25 13:56:41.642562: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b536a900000 of size 9437184
2019-12-25 13:56:41.642573: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b536b200000 of size 14680064
2019-12-25 13:56:41.642585: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b5394000000 of size 2359296
2019-12-25 13:56:41.642597: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b5394240000 of size 2359296
2019-12-25 13:56:41.642609: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b5394480000 of size 2359296
2019-12-25 13:56:41.642621: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b53946c0000 of size 2359296
2019-1

2019-12-25 13:56:51.691996: W tensorflow/core/common_runtime/bfc_allocator.cc:267] Allocator (GPU_0_bfc) ran out of memory trying to allocate 132.00MiB.  Current allocation summary follows.
2019-12-25 13:56:51.692253: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (256): 	Total Chunks: 30, Chunks in use: 28. 7.5KiB allocated for chunks. 7.0KiB in use in bin. 124B client-requested in use in bin.
2019-12-25 13:56:51.692289: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-12-25 13:56:51.692319: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (1024): 	Total Chunks: 4, Chunks in use: 4. 5.8KiB allocated for chunks. 5.8KiB in use in bin. 5.0KiB client-requested in use in bin.
2019-12-25 13:56:51.692349: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (2048): 	Total Chunks: 184, Chunks in use: 184. 551.8KiB allocated for

2019-12-25 13:56:51.714302: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b53fd48f400 of size 2359296
2019-12-25 13:56:51.714317: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b53fd6cf400 of size 2359296
2019-12-25 13:56:51.714331: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b53fd90f400 of size 2359296
2019-12-25 13:56:51.714345: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b53fdb4f400 of size 2359296
2019-12-25 13:56:51.714359: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b53fdd8f400 of size 2359296
2019-12-25 13:56:51.714373: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b53fdfcf400 of size 2359296
2019-12-25 13:56:51.714387: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b53fe20f400 of size 2359296
2019-12-25 13:56:51.714401: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x2b53fe44f400 of size 2359296
2019-12-

caught error
> /home/kh2383/MechVent/pretraining/xlnet/train_gpu.py(285)train()
-> try:
(Pdb) 
--KeyboardInterrupt--
(Pdb) 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/site-packages/ipython-6.2.1-py3.6.egg/IPython/utils/_process_posix.py", line 161, in system
    res_idx = child.expect_list(patterns, self.read_timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/site-packages/pexpect-4.3.1-py3.6.egg/pexpect/spawnbase.py", line 355, in expect_list
    return exp.expect_loop(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/site-packages/pexpect-4.3.1-py3.6.egg/pexpect/expect.py", line 96, in expect_loop
    incoming = spawn.read_nonblocking(spawn.maxread, timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/site-packages/pexpect-4.3.1-py3.6.egg/pexpect/pty_spawn.py", line 456, in read_nonblocking
    r, w, e = select_ignore_interrupts([self.child_fd], [], [], timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/site-packages/pexpect-4.3.1-py3.6.egg/pexpect/utils.py", line 143, in select_ignore_interrupts
    return select

TypeError: must be str, not list