# Contextual Embeddings from Clinical Notes Improves Prediction of Sepsis

In [113]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import torch
import numpy as np
import re
import tqdm
from sklearn.metrics import roc_auc_score, f1_score
from torch import tensor
from torch.nn import Softmax
from transformers import AutoTokenizer, AutoModel
from torch.nn import LSTM

In [2]:
def preprocess(x):
    y = re.sub('\\[(.*?)\\]', '', x)  # remove de-identified brackets
    y = re.sub('[0-9]+\.', '', y)  # remove 1.2. since the segmenter segments based on this
    y = re.sub('dr\.', 'doctor', y)
    y = re.sub('m\.d\.', 'md', y)
    y = re.sub('admission date:', '', y)
    y = re.sub('discharge date:', '', y)
    y = re.sub('--|__|==', '', y)
    return y

In [46]:
def preprocessing(df_less_n):
    df_less_n['TEXT'] = df_less_n['TEXT'].fillna(' ')
    df_less_n['TEXT'] = df_less_n['TEXT'].str.replace('\n', ' ')
    df_less_n['TEXT'] = df_less_n['TEXT'].str.replace('\r', ' ')
    df_less_n['TEXT'] = df_less_n['TEXT'].apply(str.strip)
    df_less_n['TEXT'] = df_less_n['TEXT'].str.lower()

    df_less_n['TEXT'] = df_less_n['TEXT'].apply(lambda x: preprocess(x))
    
    chunk_size = 140

    # to get 512 words chunks for sepsis tasks
    df_len = len(df_less_n)
    want = pd.DataFrame({'ID': [], 'TEXT': [], 'Label': []})
    for i in range(df_len):
        x = df_less_n.TEXT.iloc[i].split()
        n = int(len(x) / chunk_size)
        for j in range(n):
            want = want.append({'TEXT': ' '.join(x[j * chunk_size:(j + 1) * chunk_size]), 'Label': df_less_n.SEPSIS.iloc[i],
                                'ID': df_less_n.SUBJECT_ID.iloc[i]}, ignore_index=True)
        if len(x) % chunk_size > 10:
            want = want.append({'TEXT': ' '.join(x[-(len(x) % chunk_size):]), 'Label': df_less_n.SEPSIS.iloc[i],
                                'ID': df_less_n.HADM_IDleft.iloc[i]}, ignore_index=True)

    return want

In [4]:
clinical_notes = pd.read_csv('NOTEEVENTS.csv')
diagnoses_icd = pd.read_csv('DIAGNOSES_ICD.csv')
diagnoses_icd['SEPSIS'] = diagnoses_icd['ICD9_CODE'].apply(lambda x: True if x == '99591' else False) 
diagnoses_icd[diagnoses_icd.ICD9_CODE == '99591']

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SEPSIS
276,505,64,172056,3.0,99591,True
450,679,85,112077,18.0,99591,True
1084,477,61,189535,8.0,99591,True
1097,490,62,116009,4.0,99591,True
1825,2383,191,136614,4.0,99591,True
...,...,...,...,...,...,...
647075,632154,95895,160501,10.0,99591,True
647523,633127,96145,137544,3.0,99591,True
648242,638716,97229,191765,7.0,99591,True
650106,640418,97598,148929,11.0,99591,True


In [5]:
clinical_notes_mod = clinical_notes.join(diagnoses_icd, lsuffix = 'left', rsuffix = 'right', on = 'SUBJECT_ID', how = 'inner')

In [6]:
new_clinical_notes = clinical_notes_mod[['SUBJECT_ID', 'HADM_IDleft', 'ROW_IDleft', 'ROW_IDright', 'CATEGORY', 'DESCRIPTION', 'TEXT',
                                        'SEPSIS', 'ICD9_CODE']]
text = new_clinical_notes.TEXT
new_clinical_notes[0:10]

Unnamed: 0,SUBJECT_ID,HADM_IDleft,ROW_IDleft,ROW_IDright,CATEGORY,DESCRIPTION,TEXT,SEPSIS,ICD9_CODE
0,22532,167853.0,174,38127,Discharge summary,Report,Admission Date: [**2151-7-16**] Dischar...,False,5559
245,22532,167853.0,170,38127,Discharge summary,Report,Admission Date: [**2151-7-16**] Dischar...,False,5559
60246,22532,167853.0,59795,38127,Echo,Report,PATIENT/TEST INFORMATION:\nIndication: Aortic ...,False,5559
60247,22532,167853.0,59796,38127,Echo,Report,PATIENT/TEST INFORMATION:\nIndication: Endocar...,False,5559
102792,22532,167853.0,106289,38127,ECG,Report,Atrial fibrillation with a slow ventricular re...,False,5559
102793,22532,167853.0,106290,38127,ECG,Report,Atrial fibrillation with a slow ventricular re...,False,5559
102974,22532,167853.0,106291,38127,ECG,Report,Atrial fibrillation with a slow ventricular re...,False,5559
102975,22532,167853.0,106292,38127,ECG,Report,Atrial fibrillation with a rapid ventricular r...,False,5559
756685,22532,167853.0,763295,38127,Radiology,CHEST (PORTABLE AP),[**2151-7-16**] 5:01 AM\n CHEST (PORTABLE AP) ...,False,5559
758388,22532,167853.0,764802,38127,Radiology,CT ABDOMEN W/CONTRAST,[**2151-8-4**] 3:39 PM\n CT ABDOMEN W/CONTRAST...,False,5559


In [88]:
df_sample = new_clinical_notes.sample(n = 100, random_state = 3)
df_sample

Unnamed: 0,SUBJECT_ID,HADM_IDleft,ROW_IDleft,ROW_IDright,CATEGORY,DESCRIPTION,TEXT,SEPSIS,ICD9_CODE
1257842,70393,111964.0,1253154,94121,Radiology,CHEST (PORTABLE AP),[**2172-9-30**] 3:48 AM\n CHEST (PORTABLE AP) ...,False,42832
782159,6863,,785956,5931,Radiology,L TIB/FIB (AP & LAT) LEFT,[**2170-2-17**] 9:54 PM\n TIB/FIB (AP & LAT) L...,False,4019
1595258,27736,196052.0,1625137,19955,Nursing/other,Report,CVICU NPN\nO: ROS\n\nPt was taken to the or at...,False,2851
2067539,26547,162368.0,2040091,31431,Nursing/other,Report,NPN 1500-2300 con't\n\n\n#6 CVR O: Infant note...,True,99591
1294190,2710,152429.0,1297631,1399,Nursing/other,Report,NURSING UPDATE\nCV:\n AFEBRILE. PA CATH IN S...,False,40301
...,...,...,...,...,...,...,...,...,...
1505211,18463,128430.0,1505125,31972,Nursing/other,Report,"Pt on AC 17 x 550 + 8, 50%. Ph is being partia...",False,7318
423213,76529,178352.0,422050,61841,Nursing,Nursing Progress Note,"HPI: 88F s/p AVR, CABG [**10-12**], readmit wi...",False,2182
971771,6990,115527.0,964281,6058,Radiology,RP FEMORAL VASCULAR US RIGHT PORT,[**2128-7-8**] 9:42 AM\n FEMORAL VASCULAR US R...,False,34590
1009283,22954,120339.0,1031289,34504,Radiology,PORTABLE ABDOMEN,[**2137-7-21**] 8:46 AM\n PORTABLE ABDOMEN ...,False,V1046


In [89]:
(df_sample[df_sample.SEPSIS == True])

Unnamed: 0,SUBJECT_ID,HADM_IDleft,ROW_IDleft,ROW_IDright,CATEGORY,DESCRIPTION,TEXT,SEPSIS,ICD9_CODE
2067539,26547,162368.0,2040091,31431,Nursing/other,Report,NPN 1500-2300 con't\n\n\n#6 CVR O: Infant note...,True,99591


In [8]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
conf = model.config

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [91]:
tensor_list = []
for j in range(89):
    output = []
    print(f'The value of j is: {j}')
    text_df = preprocessing(df_sample[j:j + 1])
    for i in range(len(text_df.TEXT)):
        print(f'*******ANALYZING TEXT*******')
        encoding = tokenizer(text_df.TEXT[i], return_tensors = 'pt')
        bert_output = model(**encoding)
        last_hidden_state = bert_output['last_hidden_state']
        print(f'The value of last_hidden_state is: {last_hidden_state}')
        output.append(last_hidden_state)
        print(f'The value of output is: {output}')
    out_tensor = torch.cat(output, dim = 1)
    out_tensor_mean = out_tensor.mean()
    print(f'The value of out_tensor_mean is: {out_tensor_mean}')
    tensor_list.append(out_tensor_mean)
new_tensor = torch.as_tensor(tensor_list)

The value of j is: 0
*******ANALYZING TEXT*******
The value of last_hidden_state is: tensor([[[ 2.8309e-01, -1.9076e-01, -1.5633e-01,  ..., -1.9615e-01,
          -9.5207e-03,  1.1701e-04],
         [ 1.8201e-01, -3.5765e-01, -6.4796e-02,  ...,  2.6135e-01,
           8.4063e-01,  5.3832e-01],
         [ 4.0169e-01, -1.0161e+00, -3.8747e-01,  ...,  2.0839e-02,
          -2.0262e-01, -4.2888e-01],
         ...,
         [ 5.4076e-02, -3.3269e-01,  2.1977e-01,  ..., -2.2167e-01,
          -2.7461e-01, -2.4079e-01],
         [ 1.8790e-01, -5.1146e-01, -9.1895e-02,  ...,  2.2198e-01,
           1.7985e-01, -4.4622e-01],
         [ 1.3302e+00, -6.9869e-01, -2.4012e-01,  ...,  1.1947e-02,
          -9.9954e-02,  3.4472e-01]]], grad_fn=<NativeLayerNormBackward0>)
The value of output is: [tensor([[[ 2.8309e-01, -1.9076e-01, -1.5633e-01,  ..., -1.9615e-01,
          -9.5207e-03,  1.1701e-04],
         [ 1.8201e-01, -3.5765e-01, -6.4796e-02,  ...,  2.6135e-01,
           8.4063e-01,  5.3832e-01]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_less_n['TEXT'] = df_less_n['TEXT'].fillna(' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_less_n['TEXT'] = df_less_n['TEXT'].str.replace('\n', ' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_less_n['TEXT'] = df_less_n['TEXT'].str.replace('\r', ' ')
A value is trying to be set on a 

The value of last_hidden_state is: tensor([[[ 0.4914, -0.1029, -0.0829,  ..., -0.2729,  0.2178, -0.1669],
         [ 0.0498, -0.3919,  0.3352,  ...,  0.6010,  1.0587,  0.1658],
         [ 0.9293, -1.1858, -0.2185,  ...,  0.1179, -0.0585, -0.6884],
         ...,
         [ 0.5938, -0.1281,  0.6650,  ..., -0.3583,  0.3442,  0.0326],
         [ 0.4150, -0.5352,  0.1030,  ...,  0.1867, -0.0361, -0.0912],
         [ 1.6309, -1.2278, -0.2070,  ..., -0.1980,  0.1433,  0.1171]]],
       grad_fn=<NativeLayerNormBackward0>)
The value of output is: [tensor([[[ 0.4914, -0.1029, -0.0829,  ..., -0.2729,  0.2178, -0.1669],
         [ 0.0498, -0.3919,  0.3352,  ...,  0.6010,  1.0587,  0.1658],
         [ 0.9293, -1.1858, -0.2185,  ...,  0.1179, -0.0585, -0.6884],
         ...,
         [ 0.5938, -0.1281,  0.6650,  ..., -0.3583,  0.3442,  0.0326],
         [ 0.4150, -0.5352,  0.1030,  ...,  0.1867, -0.0361, -0.0912],
         [ 1.6309, -1.2278, -0.2070,  ..., -0.1980,  0.1433,  0.1171]]],
       grad_f

In [92]:
new_tensor.shape[0]

89

In [93]:
a = torch.reshape(new_tensor, (1, new_tensor.shape[0]))
a

tensor([[-0.0076, -0.0077, -0.0079, -0.0078, -0.0079, -0.0078, -0.0073, -0.0083,
         -0.0079, -0.0077, -0.0077, -0.0081, -0.0077, -0.0079, -0.0076, -0.0075,
         -0.0077, -0.0079, -0.0079, -0.0078, -0.0078, -0.0082, -0.0076, -0.0077,
         -0.0086, -0.0075, -0.0078, -0.0079, -0.0079, -0.0081, -0.0077, -0.0081,
         -0.0076, -0.0083, -0.0076, -0.0081, -0.0083, -0.0074, -0.0081, -0.0077,
         -0.0083, -0.0081, -0.0077, -0.0080, -0.0080, -0.0079, -0.0077, -0.0080,
         -0.0084, -0.0077, -0.0082, -0.0080, -0.0076, -0.0084, -0.0079, -0.0079,
         -0.0078, -0.0080, -0.0080, -0.0080, -0.0083, -0.0077, -0.0077, -0.0081,
         -0.0078, -0.0081, -0.0077, -0.0078, -0.0077, -0.0082, -0.0080, -0.0079,
         -0.0079, -0.0081, -0.0079, -0.0079, -0.0078, -0.0079, -0.0081, -0.0081,
         -0.0078, -0.0081, -0.0077, -0.0078, -0.0084, -0.0078, -0.0079, -0.0077,
         -0.0078]])

In [105]:
# for i in range(len(tensor_list)):
input_dim = a.shape[1]
hidden_dim = 89
n_layers = 1
lstm_layer = LSTM(input_dim, hidden_dim, n_layers, batch_first = False)
h_t = torch.zeros(1, 89, dtype = torch.float32)
c_t = torch.zeros(1, 89, dtype = torch.float32)
output, (h_t_new, c_t_new) = lstm_layer(a, (h_t, c_t))
softmax_layer = Softmax(dim = 1)
final_out = softmax_layer(output)
predicted_labels = (final_out.flatten() > 0.5)

In [104]:
true_labels = torch.as_tensor(df_sample.SEPSIS.values)

In [108]:
accuracy = (predicted_labels == true_labels[0:89]).sum() / len(predicted_labels)

In [109]:
accuracy

tensor(0.9888)

In [112]:
roc_auc_score(true_labels[0:89], predicted_labels)

0.5