In [None]:
import tensorflow as tf
import pandas as pd

import nltk
nltk.download('punkt')

from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
url='https://raw.githubusercontent.com/jacob-hansen/NLP_in_EHR_2022/910d9f0fcfeab083dff53ea2e2969c175cc816a0/train.csv'
train_df = pd.read_csv(url)
# train_df = pd.read_csv('data/train_even.csv')

In [None]:
import spacy
#loading the english language small model of spacy
en = spacy.load('en_core_web_sm')
sw_spacy = en.Defaults.stop_words

In [None]:
def custom_tokenizer(sentence):
    # lower case
    sentence = sentence.lower()
    # split by the label/sentence separator
    sents = sentence.split('. val is ')
    # seperate out the label and the next sentence
    # print(sents)
    new_sents = [sents[0][13:],]
    for sent in sents[1:]:
        new_sents.extend(sents[1].split('. ', 1))
    # print(sents)
    # sents = [sents[0]] + [i for i in sents[1].split('. ')]
    # remove any trailing whitespace
    sents = [i.strip() for i in new_sents]
    
    # remove stop words in every 1,3,5... sentence
    # and apply tokenization
    for i in range(0, len(sents), 2):
        # remove stop words
        sents[i] = [word for word in word_tokenize(sents[i]) if word not in sw_spacy]
    # remove last . from the last sentence
    sents[-1] = sents[-1][:-1]
    
    # flatten sents 
    return_sents = []
    for i in range(len(sents)):
        if i % 2 == 0:
            return_sents.extend(sents[i])
        else:
            return_sents.append(sents[i])
    # repeat the label 3 times (loc = 1,3,5..) for every sentence 
    final_sents = []
    for i in range(0, len(return_sents)):
        if i % 2 == 0:
            final_sents.append(return_sents[i])
        else:
            final_sents.append(return_sents[i])
            final_sents.append(return_sents[i])
            final_sents.append(return_sents[i])

    return return_sents


In [None]:
# apply the custom tokenizer to the dataframe
train_df['tokenized'] = train_df['X_train'].apply(custom_tokenizer)

In [None]:
train_df.iloc[90, 0].lower()

"feature is indicator for prior disabilities that the patient had. mark 1 if any disabilities were noted medically.. val is -0.248.  feature is how long a person is in an unresponsive state after an injury. measured in hours.. val is 0.579.  feature is the amount of time between heart beats measured by ms between heartbeats. . val is 0.33.  feature is concentration of the tau protein in cerebrospinal fluid. measured in pg/ml\n. val is -1.001.  feature is a patient's indication 1-10 of their day to day happiness and satisfaction.. val is -0.535.  feature is how body fat is distributed and how much of someone's body is body fat measured by bmi. range [1-3] where each number is associated to the obesity class that based on which range the bmi falls under. val is -0.191.  feature is how long, in hours, that have passed since the initial injury.. val is -0.176.  feature is how severe the patient's acne appears. 1-10 for the number of pimples or rashes across the face and chest.. val is -0.1

In [None]:
custom_tokenizer(train_df.iloc[90, 0])

['dicator',
 'prior',
 'disabilities',
 'patient',
 '.',
 'mark',
 '1',
 'disabilities',
 'noted',
 'medically',
 '.',
 '-0.248',
 'feature',
 'long',
 'person',
 'unresponsive',
 'state',
 'injury',
 '.',
 'measured',
 'hours',
 '.',
 '-0.248',
 'feature',
 'long',
 'person',
 'unresponsive',
 'state',
 'injury',
 '.',
 'measured',
 'hours',
 '.',
 '-0.248',
 'feature',
 'long',
 'person',
 'unresponsive',
 'state',
 'injury',
 '.',
 'measured',
 'hours',
 '.',
 '-0.248',
 'feature',
 'long',
 'person',
 'unresponsive',
 'state',
 'injury',
 '.',
 'measured',
 'hours',
 '.',
 '-0.248',
 'feature',
 'long',
 'person',
 'unresponsive',
 'state',
 'injury',
 '.',
 'measured',
 'hours',
 '.',
 '-0.248',
 'feature',
 'long',
 'person',
 'unresponsive',
 'state',
 'injury',
 '.',
 'measured',
 'hours',
 '.',
 '-0.248',
 'feature',
 'long',
 'person',
 'unresponsive',
 'state',
 'injury',
 '.',
 'measured',
 'hours',
 '.',
 '-0.248',
 'feature',
 'long',
 'person',
 'unresponsive',
 'state',

In [None]:
# rename columns to X_train, y_train, and tokenized
# train_df = train_df.rename(columns={'0': 'X_train', '1': 'y_train', 'tokenized': 'tokenized'})
# train_df.head()

## Create Attention Layer

In [None]:
# Attention Mechanism
from keras.layers import Layer
import keras.backend as K

class attention(Layer):
    def __init__(self,**kwargs):
        super(attention,self).__init__(**kwargs)

    def build(self,input_shape):
        self.W=self.add_weight(name="att_weight",shape=(input_shape[-1],1),initializer="normal")
        self.b=self.add_weight(name="att_bias",shape=(input_shape[1],1),initializer="zeros")        
        super(attention, self).build(input_shape)

    def call(self,x):
        et=K.squeeze(K.tanh(K.dot(x,self.W)+self.b),axis=-1)
        at=K.softmax(et)
        at=K.expand_dims(at,axis=-1)
        output=x*at
        return K.sum(output,axis=1)

    def compute_output_shape(self,input_shape):
        return (input_shape[0],input_shape[-1])

    def get_config(self):
        return super(attention,self).get_config()

# Training RNN

In [None]:
# import packages for training RNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model
import numpy as np

In [None]:
target = train_df['y_train'].values
target = to_categorical(target)
# convert train_df['tokenized'] to a tensor
# and pad the sequences to be the same length
max_len = 200
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['tokenized'].values)
X = tokenizer.texts_to_sequences(train_df['tokenized'].values)
X = pad_sequences(X, maxlen=max_len)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2)

In [None]:
# #create RNN model 
# model = Sequential()
# model.add(Embedding(10000, 128, input_length=X.shape[1]))
# model.add(SpatialDropout1D(0.4))
# model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(2, activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [49]:
#create RNN model with attention
inputs = Input((X.shape[1],))
x = Embedding(10000, 128, input_length=X.shape[1])(inputs)
att_in = LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(x)
att_out=attention()(att_in)
outputs = Dense(2, activation='softmax',trainable=True)(att_out)
model = Model(inputs , outputs)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [50]:
model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 200)]             0         
                                                                 
 embedding_16 (Embedding)    (None, 200, 128)          1280000   
                                                                 
 lstm_15 (LSTM)              (None, 200, 128)          131584    
                                                                 
 attention_9 (attention)     (None, 128)               328       
                                                                 
 dense_22 (Dense)            (None, 2)                 258       
                                                                 
Total params: 1,412,170
Trainable params: 1,412,170
Non-trainable params: 0
_________________________________________________________________


In [51]:
# train model
epochs = 10
batch_size = 64

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [52]:
print(K.eval(model.optimizer.lr))

0.001


In [53]:
# test the model
score, acc = model.evaluate(X_test, y_test, verbose = 2, batch_size = 64)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

32/32 - 3s - loss: 0.9237 - accuracy: 0.7610 - 3s/epoch - 108ms/step
score: 0.92
acc: 0.76


In [54]:
# find predictions for the test set
y_pred = model.predict(X_test)
print(y_pred)

[[8.2434094e-01 1.7565908e-01]
 [9.9999827e-01 1.6363542e-06]
 [9.9999088e-01 9.0722951e-06]
 ...
 [7.2008241e-03 9.9279916e-01]
 [9.9999684e-01 3.1401944e-06]
 [4.4706857e-01 5.5293143e-01]]


In [55]:
# calculate the confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# y_pred = np.argmax(y_pred, axis=1)
# y_test = np.argmax(y_test, axis=1)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
# add labels 
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

ValueError: ignored

In [47]:
# calculate f1 score
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average='macro')


ValueError: ignored

In [None]:
# calculate f1 score on random shuffle of y_pred
from sklearn.utils import shuffle
y_pred_shuffled = shuffle(y_pred)
f1_score(y_test, y_pred_shuffled, average='macro')

0.4891526900754934