# Contextual Embeddings from Clinical Notes Improves Prediction of Sepsis

## <span style = "color:blue">To Do</span>

* TF-IDF
* ~~Word2Vec~~
* AUC for ClinicalBERT better than TF-IDF
* Preprocessing of data
* Look into using XGBoost as an additional classificaiton method
* AUC and Accuracy metrics
    * ClinicalBERT only
    * ClinicalBERT + LSTM
    * TF-IDF + LSTM

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import torch
import numpy as np
import re
import tqdm
import h5py
from keras import Model
from keras.models import load_model, Sequential
from keras.layers import Dense, LSTM, Softmax, Input
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch import tensor
# from torch.nn import Softmax
from transformers import AutoTokenizer, AutoModel
# from torch.nn import LSTM, Linear, Module
from torch.autograd import Variable 

In [3]:
def preprocess(x):
    y = re.sub('\\[(.*?)\\]', '', x)  # remove de-identified brackets
    y = re.sub('[0-9]+\.', '', y)  # remove 1.2. since the segmenter segments based on this
    y = re.sub('dr\.', 'doctor', y)
    y = re.sub('m\.d\.', 'md', y)
    y = re.sub('admission date:', '', y)
    y = re.sub('discharge date:', '', y)
    y = re.sub('--|__|==', '', y)
    return y

In [4]:
def preprocessing(df_less_n):
    df_less_n['TEXT'] = df_less_n['TEXT'].fillna(' ')
    df_less_n['TEXT'] = df_less_n['TEXT'].str.replace('\n', ' ')
    df_less_n['TEXT'] = df_less_n['TEXT'].str.replace('\r', ' ')
    df_less_n['TEXT'] = df_less_n['TEXT'].apply(str.strip)
    df_less_n['TEXT'] = df_less_n['TEXT'].str.lower()

    df_less_n['TEXT'] = df_less_n['TEXT'].apply(lambda x: preprocess(x))
    
    chunk_size = 140

    # to get 512 words chunks for sepsis tasks
    df_len = len(df_less_n)
    want = pd.DataFrame({'ID': [], 'TEXT': [], 'Label': []})
    for i in range(df_len):
        x = df_less_n.TEXT.iloc[i].split()
        n = int(len(x) / chunk_size)
        for j in range(n):
            want = want.append({'TEXT': ' '.join(x[j * chunk_size:(j + 1) * chunk_size]), 'Label': df_less_n.SEPSIS.iloc[i],
                                'ID': df_less_n.SUBJECT_ID.iloc[i]}, ignore_index=True)
        if len(x) % chunk_size > 10:
            want = want.append({'TEXT': ' '.join(x[-(len(x) % chunk_size):]), 'Label': df_less_n.SEPSIS.iloc[i],
                                'ID': df_less_n.HADM_IDleft.iloc[i]}, ignore_index=True)

    return want

In [5]:
clinical_notes = pd.read_csv('NOTEEVENTS.csv')
diagnoses_icd = pd.read_csv('DIAGNOSES_ICD.csv')
diagnoses_icd['SEPSIS'] = diagnoses_icd['ICD9_CODE'].apply(lambda x: True if x == '99591' else False) 
# diagnoses_icd[diagnoses_icd.ICD9_CODE == '99591']

In [6]:
clinical_notes_mod = clinical_notes.join(diagnoses_icd, lsuffix = 'left', rsuffix = 'right', on = 'SUBJECT_ID', how = 'inner')

In [7]:
new_clinical_notes = clinical_notes_mod[['SUBJECT_ID', 'HADM_IDleft', 'ROW_IDleft', 'ROW_IDright', 'CATEGORY', 'DESCRIPTION', 'TEXT',
                                        'SEPSIS', 'ICD9_CODE']]
text = new_clinical_notes.TEXT

In [8]:
df_sample = new_clinical_notes.sample(n = 200, random_state = 904)

## <span style = "color:blue">TF-IDF Implementation</span>

In [9]:
corpus = df_sample.TEXT.to_list()
X = corpus
labels = df_sample.SEPSIS.tolist()
y = labels # .tolist()
train = ['The sky is blue.','The sun is bright.']

In [10]:
# stop_words = np.array(['\\','[','(','.','*','?',')', '0', '1', '2', '3', '4',
#                         '5', '6', '7', '8', '9', '+', '.', 'dr\.', 'doctor',
#                        'm\.d\.', 'md', 'admission date:', 'discharge date:',
#                        '--','|','__','|','=='])

stop_words = np.array(['\\[(.*?)\\]', '[0-9]+\.', 'dr\.', 'doctor',
                       'm\.d\.', 'md', 'admission date:', 'discharge date:',
                       '--|__|=='])

In [11]:
vectorizer = TfidfVectorizer().fit(X)

tfidf_vector_X = vectorizer.transform(X).toarray()  #//shape - (3,6)
# tfidf_vector_Y = vectorizer.transform(y).toarray() #//shape - (3,6)
# tfidf_vector_X = tfidf_vector_X[:, :, None] #//shape - (3,6,1) 
# tfidf_vector_Y = tfidf_vector_Y[:, :, None] #//shape - (3,6,1)

# X_train, X_test, y_train, y_test = train_test_split(tfidf_vector_X, tfidf_vector_Y, test_size = 0.2, random_state = 1)

In [12]:
X_train_tfidf, X_test_tfidf = tfidf_vector_X[0:160], tfidf_vector_X[160:]

In [13]:
# Split labels into training and test sets
y_train_tfidf, y_test_tfidf = np.array(labels[0:160]), np.array(labels[160:])

In [19]:
X_train_tfidf = np.reshape(X_train_tfidf, (X_train_tfidf.shape[0], 1, X_train_tfidf.shape[1]))
X_test_tfidf = np.reshape(X_test_tfidf, (X_test_tfidf.shape[0], 1, X_test_tfidf.shape[1]))

In [20]:
X_train_tfidf.shape

(160, 1, 6093)

In [50]:
model = Sequential()
model.add(LSTM(6093, input_shape = (None, 160)))
model.add(Softmax())
# model.add(Dense(1))

In [51]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (None, 6093)              152422488 
                                                                 
 softmax_1 (Softmax)         (None, 6093)              0         
                                                                 
Total params: 152,422,488
Trainable params: 152,422,488
Non-trainable params: 0
_________________________________________________________________


In [52]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics = ['accuracy'])
model.fit(X_train_tfidf, y_train_tfidf, epochs = 3, batch_size = 1, verbose = 0)

ValueError: in user code:

    File "C:\Users\josep\anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\josep\anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\josep\anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\josep\anaconda3\lib\site-packages\keras\engine\training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\josep\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\josep\anaconda3\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential_8" is incompatible with the layer: expected shape=(None, None, 160), found shape=(1, 1, 6093)


In [39]:
model.save('tfiddf_sepsis_model.h5')

In [40]:
tfidf_preds = model.predict(X_test_tfidf)

In [41]:
tfidf_preds.shape

(40, 6093)

In [43]:
tfidf_softmax = Softmax()

In [49]:
tf.math.reduce_sum(tfidf_softmax(tfidf_preds), axis = 1)

<tf.Tensor: shape=(40,), dtype=float32, numpy=
array([0.99999994, 0.99999994, 1.0000001 , 0.9999999 , 1.        ,
       1.        , 1.0000001 , 0.9999999 , 1.        , 1.0000001 ,
       0.99999994, 1.        , 1.        , 0.9999999 , 0.9999999 ,
       0.99999994, 0.9999999 , 1.        , 1.        , 0.99999994,
       1.        , 1.0000001 , 1.        , 0.99999994, 1.        ,
       1.        , 0.9999998 , 1.        , 1.        , 0.9999999 ,
       1.        , 1.        , 1.        , 1.        , 0.99999994,
       1.        , 0.99999994, 0.9999999 , 1.0000001 , 1.        ],
      dtype=float32)>

In [15]:
model = Sequential()
model.add(LSTM(1, input_shape = X_train_tfidf.shape))
# model.add(LSTM(units = X_train_tfidf.shape[1], input_shape = X_train_tfidf.shape, return_sequences = True))
# model.add(Softmax())

In [16]:
# model.build(input_shape = (1, 160, 6093))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1)                 24380     
                                                                 
Total params: 24,380
Trainable params: 24,380
Non-trainable params: 0
_________________________________________________________________


In [None]:
y_train_tfidf = y_train_tfidf.reshape((-1, 1))
y_test_tfidf = y_train_tfidf.reshape((-1, 1))

In [None]:
# define model
numberOfLSTMunits = 6093

input_layer = Input(shape = (1, 6093))
hidden_state = LSTM(numberOfLSTMunits) (input_layer)
tfidf_model = Model(inputs = input_layer, outputs = hidden_state)
tfidf_model.summary()

In [None]:
tfidf_model.compile(loss = 'mse', optimizer = 'adam', metrics = ['accuracy'])
tfidf_model.fit(X_train_tfidf, y_train_tfidf, epochs = 3, shuffle = False, verbose = 0)
tfidf_model.save('sepsis_model_tfidf.h5')

In [17]:
model.compile(loss = 'mse', optimizer = 'adam', metrics = ['accuracy'])
model.fit(X_train_tfidf, y_train_tfidf, epochs = 3, shuffle = False, verbose = 0)
model.save('sepsis_model_tfidf.h5')

ValueError: in user code:

    File "C:\Users\josep\anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\josep\anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\josep\anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\josep\anaconda3\lib\site-packages\keras\engine\training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\josep\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\josep\anaconda3\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 160, 6093), found shape=(32, 6093)


In [None]:
idf_model = TfidfVectorizer(#ngram_range=(1, 1), # 3,5
#                         stop_words = stop_words,  
                        max_features = 10000)
#                         token_pattern=r"(?u)\b\w+\b",  
#                         min_df = 1,
#                         max_df = 0.9,
#                         use_idf = 1,  
#                         smooth_idf = 1, 
#                         sublinear_tf = 1)  
matrix = idf_model.fit_transform(corpus)

In [None]:
matrix.

## <span style = "color:blue">Word2Vec Implementation</span>

## <span style = "color:blue">ClinicalBERT Implementation</span>

In [None]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
conf = model.config

In [None]:
not_null_df = preprocessing(df_sample)

In [None]:
not_null_df

In [None]:
# Row and Columns indices where the value is NaN
null_rows, null_columns = np.where(pd.isnull(not_null_df))

In [None]:
null_columns

In [None]:
def BERT(inp):
    '''
    PARAMETERS:
    
    inp - Number of rows from DataFrame
    
    RETURNS:
    
    features - Feature sets corresponding to each note
    '''
    tensor_list = []
    idx_list = []
    for j in range(inp):
        output = []
#         print(f'The value of j is: {j}')
        text_df = preprocessing(df_sample[j:j + 1])
        if not text_df.TEXT.empty:
            # Pass each sentence to the ClinicalBERT model
            for i in range(len(text_df.TEXT)):
    #             print(f'*******ANALYZING TEXT*******')
#                 print(f'The length of TEXT is: {len(text_df.TEXT)}')
                encoding = tokenizer(text_df.TEXT[i], return_tensors = 'pt')
                bert_output = model(**encoding)
                last_hidden_state = bert_output['last_hidden_state']
    #             print(f'The value of last_hidden_state is: {last_hidden_state}')
#                 print(f'The dimensions of last_hidden_state is: {last_hidden_state.size()}')
                output.append(last_hidden_state)
    #             print(f'The value of output is: {output}')
#                 print(f'The length of output is: {len(output)}')
    #             print(f'The length of TEXT is: {len(text_df.TEXT)}')
            out_tensor = torch.cat(output, dim = 1)
#             out_tensor = torch.cat((torch.as_tensor(output), torch.as_tensor(df_sample.SEPSIS.iloc[j])), dim = 1)
    #         print(f'out_tensor: {out_tensor}')
#             print(f'The shape of out_tensor is: {out_tensor.size()}')
            out_tensor_mean = torch.mean(out_tensor, dim = 1)
    #         print(f'The value of out_tensor_mean is: {out_tensor_mean}')
#             print(f'The dimensions of out_tensor_mean is: {out_tensor_mean.size()}')
            tensor_list.append(out_tensor_mean)
#             print(f'The number of features in tensor_list is: {len(tensor_list)}')
            features = torch.stack(tensor_list, dim = 0)
            print(f'The shape of features is: {features.size()}')
        else:
            idx_list.append(j)
    return(features, idx_list)

In [None]:
features = BERT(df_sample.shape[0])

In [None]:
df_sample = df_sample.drop(df_sample.index[173])

In [None]:
df_sample.shape

In [None]:
labels = df_sample.SEPSIS.tolist()

In [None]:
labels = torch.as_tensor(labels)

In [None]:
# Split features into training and test sets
X_train, X_test = features[0][0:160], features[0][160:]

In [None]:
# Split labels into training and test sets
y_train, y_test = labels[0:160], labels[160:]

In [None]:
class LSTM1(Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length):
        super(LSTM1, self).__init__()
        self.num_classes = num_classes #number of classes
        self.num_layers = num_layers #number of layers
        self.input_size = input_size #input size
        self.hidden_size = hidden_size #hidden state
        self.seq_length = seq_length #sequence length

        self.lstm = LSTM(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True) #lstm
        self.fc_1 =  Linear(hidden_size, 128) #fully connected 1
        self.fc = Linear(128, num_classes) #fully connected last layer

        self.softmax = Softmax(dim = 1)
    
    def forward(self,x):
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #hidden state
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #internal state
        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(x, (h_0, c_0)) #lstm with input, hidden, and internal state
        hn = hn.view(-1, self.hidden_size) #reshaping the data for Dense layer next
        out = self.softmax(hn)
        out = self.fc_1(out) #first Dense
        out = self.softmax(out) #relu
        out = self.fc(out) #Final Output
        return out

In [None]:
features[0].size()

In [None]:
y_train.shape

In [None]:
# X_train = X_train.detach().numpy()
# X_train = tf.convert_to_tensor(X_train)
# y_train = y_train.detach().numpy()
# y_train = tf.convert_to_tensor(y_train)
# X_test = X_test.detach().numpy()
# X_test = tf.convert_to_tensor(X_test)
# y_test = y_test.detach().numpy()
# y_test = tf.convert_to_tensor(y_test)

## <span style = "color:blue">Define Model</span>

In [None]:
# define model
numberOfLSTMunits = 160

input = Input(shape = (1, 768))
state_h = LSTM(numberOfLSTMunits) (input)
model1 = Model(inputs = input, outputs = state_h)
model1.summary()

In [None]:
model1.compile(loss = 'mse', optimizer = 'adam')
model1.fit(X_train, y_train, epochs = 3, shuffle = False, verbose = 0)

In [None]:
model1.save('sepsis_model2.h5')

In [None]:
preds = model1.predict(X_test)

In [None]:
preds.flatten('C').shape

In [None]:
s_max = Softmax()

In [None]:
s_max(preds)

In [None]:
model = Sequential()
model.add(LSTM(1))
# model.add(Dense(1, activation='linear'))
# Look into changing dimension which Softmax is taken.
# model.add(Softmax())
model.compile(loss = 'mse', optimizer = 'adam')
model.fit(X_train, y_train, epochs = 3, shuffle = False, verbose = 0)

In [None]:
# save model
model.save('sepsis_model.h5')

In [None]:
preds = model.predict(X_train)

In [None]:
preds

In [None]:
num_epochs = 5 #1000 epochs
learning_rate = 0.001 #0.001 lr

input_size = features[0].size()[2] #number of features
hidden_size = features[0].size()[2] #number of features in hidden state
num_layers = 1 #number of stacked lstm layers

num_classes = 2 #number of output classes 

In [None]:
lstm1 = LSTM1(num_classes, input_size, hidden_size, num_layers, X_train.shape[1]) #our lstm class

In [None]:
criterion = torch.nn.MSELoss()    # mean-squared error for regression
optimizer = torch.optim.Adam(lstm1.parameters(), lr=learning_rate) 

In [None]:
y_train = torch.reshape(y_train, (160, 1))

In [None]:
y_train = torch.zeros((160, 1))
y_train

In [None]:
for epoch in range(num_epochs):
    outputs = lstm1.forward(X_train) #forward pass
    optimizer.zero_grad() #caluclate the gradient, manually setting to 0

    # obtain the loss function
    loss = criterion(outputs, y_train)

    loss.backward(retain_graph = True) #calculates the loss of the loss function

    optimizer.step() #improve from loss, i.e backprop
#     if epoch % 100 == 0:
    print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

In [None]:
train_predict = lstm1(X_test)

In [None]:
for epoch in range(num_epochs):
    outputs = lstm1(X_test) #forward pass
    optimizer.zero_grad() #caluclate the gradient, manually setting to 0

    # obtain the loss function
    loss = criterion(outputs, y_test)

    loss.backward(retain_graph = True) #calculates the loss of the loss function

    optimizer.step() #improve from loss, i.e backprop
#     if epoch % 100 == 0:
    print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

In [None]:
import matplotlib.pyplot as plt

data_predict = train_predict.data.numpy() #numpy conversion
dataY_plot = y_test.data.numpy()

# data_predict = mm.inverse_transform(data_predict) #reverse transformation
# dataY_plot = mm.inverse_transform(dataY_plot)
plt.figure(figsize = (10,6)) #plotting
plt.axvline(x = 40, c = 'r', linestyle = '--') #size of the training set

plt.plot(dataY_plot, label='Actual Data') #actual plot
plt.plot(data_predict[:, 1], label='Predicted Data') #predicted plot
plt.title('Time-Series Prediction')
plt.legend()
plt.show() 

In [None]:
def LSTM(features, input_dim, hidden_dim, n_layers):
    '''
    PARAMETERS:
    
    features(tensor) - Feature sets from BERT model
    input_dim(int) - number of expected features of input data
    hidden_dim(int) - number of features in hidden layer
    n_layer(int) - number of layers
    
    RETURNS:
    
    final_out(tensor) - predicted sepsis probabilities
    '''
    lstm_layer = LSTM(input_dim, hidden_dim, n_layers, batch_first = False)
    h_t = torch.zeros(1, 1, hidden_dim, dtype = torch.float32)
    c_t = torch.zeros(1, 1, hidden_dim, dtype = torch.float32)
    output, (h_t_new, c_t_new) = lstm_layer(features, (h_t, c_t))
    print(f'Output: {output.mean(dim = 2)}')
    softmax_layer = Softmax(dim = 1)
    final_out = softmax_layer(output.mean(dim = 2))
    print(f'Final Output: {final_out}')
    return(final_out)