# 1.Load data

In [1]:
import pandas as pd
import imblearn
import numpy as np

In [2]:
data = pd.read_csv('./OUTPUT/dataset.csv', encoding= 'unicode_escape')
data

Unnamed: 0,Sentence,Word,Tag
0,Sentence: 1,RECORD,0
1,Sentence: 2,OC,0
2,,AM,0
3,,gallstone,0
4,,pancreatitis,0
...,...,...,...
949802,,M.D.,0
949803,Sentence: 132094,END,0
949804,,OF,0
949805,,DISCHARGE,0


## 2. Extract mappings required for the neural network
To train a neural network, we will use two mappings as given below. The neural network will only take integers as input. So lets convert all the unique tokens in the corpus to its respective index.
- {token} to {token id}: address the row in embeddings matrix for the current token.
- {tag} to {tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.

In [3]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [4]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx) 

In [5]:
data

Unnamed: 0,Sentence,Word,Tag,Word_idx,Tag_idx
0,Sentence: 1,RECORD,0,3322,0
1,Sentence: 2,OC,0,23718,0
2,,AM,0,11324,0
3,,gallstone,0,327,0
4,,pancreatitis,0,11296,0
...,...,...,...,...,...
949802,,M.D.,0,14432,0
949803,Sentence: 132094,END,0,32614,0
949804,,OF,0,40271,0
949805,,DISCHARGE,0,6213,0


In [7]:
data.to_csv('./OUTPUT/view.csv')

## 3. Transform columns to extract sequential data
Next, lets fill NaN in 'sentence #' column using method ffill in fillna. Thereafter groupby on the sentence column to get a list of tokens and tags for each sentence.

In [6]:
data_fillna = data.fillna(method='ffill', axis=0)
data_group = data_fillna.groupby(['Sentence'],as_index=False
                                )['Word', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

#data_fillna
data_group

  data_group = data_fillna.groupby(['Sentence'],as_index=False


Unnamed: 0,Sentence,Word,Tag,Word_idx,Tag_idx
0,Sentence: 1,[RECORD],[0],[3322],[0]
1,Sentence: 10,"[WILL, D/C, ORDER, BE, USED, AS, THE, D/C, SUM...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[33533, 5502, 31556, 3833, 24497, 31037, 5537,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Sentence: 100,"[prandial, N/V/severe, upper, abdominal, pain....","[0, 1, 0, 1, 1, 0, 0, 0, 0]","[34043, 14095, 8237, 13007, 38468, 1729, 11672...","[0, 1, 0, 1, 1, 0, 0, 0, 0]"
3,Sentence: 1000,"[normal, limits., Cardiac, catheterization, da...","[0, 0, 0, 0, 0, 0, 0, 0]","[9810, 13280, 15881, 34130, 18661, 2125, 33378...","[0, 0, 0, 0, 0, 0, 0, 0]"
4,Sentence: 10000,"[year, old, Black, female, with, significant, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22990, 4600, 20380, 13125, 9608, 27507, 10529...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...
132089,Sentence: 99995,"[Height, foot, inch, and, weight, kg., Tempera...","[0, 0, 0, 0, 0, 0, 0]","[22469, 7532, 40233, 15685, 26763, 2407, 18714]","[0, 0, 0, 0, 0, 0, 0]"
132090,Sentence: 99996,"[degrees, heart, rate, and, sinus, blood, pres...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[9731, 25006, 35610, 15685, 27085, 37761, 8893...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
132091,Sentence: 99997,"[blood, pressure, left, arm, and, oxygen, satu...","[0, 0, 0, 0, 0, 0, 0]","[37761, 8893, 29464, 30408, 15685, 6508, 40516]","[0, 0, 0, 0, 0, 0, 0]"
132092,Sentence: 99998,"[No, carotid, bruits, regular, rate, and, rhyt...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[25422, 41014, 5214, 24181, 35610, 15685, 5256...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [152]:
data_group.to_csv('./OUTPUT/datagroup.csv')

## 4. Pad sequences and split the dataset into train, test
Padding: The LSTM layers accept sequences of same length only. Therefore we will want to transform our list of token_sequences ('Word_idx') which is lists of integers into a matrix of shape (token_sequences, max_len). We can use any length as max_len. In this project we will be using length of the longest sequence as max_len. The sequences that are shorter than max_len are padded with a specified value at the end.
Import required packages

In [7]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [8]:
def get_padTokens_padTags(data_group, data):
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= 0)
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= 0)

    return pad_tokens, pad_tags

pad_tokens, pad_tags = get_padTokens_padTags(data_group, data)

In [9]:
print('length of tokens ' + str(len(pad_tokens)))
print('length of tags ' + str(len(pad_tags)))

length of tokens 132094
length of tags 132094


## StratifiedKFold Cross Validation

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from keras import backend as K
import keras as keras
from keras.datasets import mnist
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from keras import metrics
from sklearn.utils import class_weight

In [11]:
from numpy.random import seed
seed(1)
tf.random.set_seed(2)

In [12]:
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 32
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

input_dim:  42057 
output_dim:  32 
input_length:  49 
n_tags:  2


In [13]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [14]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    
    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(1, activation="sigmoid")))
    
    #Optimiser 
    adam = tf.keras.optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999)
    
    # Compile model
    model.compile(loss='binary_crossentropy', sample_weight_mode="temporal", optimizer='adam', metrics=['acc', precision_m, recall_m, f1_m])
    model.summary()
    
    return model

In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

skf = StratifiedKFold(n_splits = 5, shuffle = True)

fold_number = 1
f1_per_fold = []
recall_per_fold = []
precision_per_fold = []
acc_per_fold = []
loss_per_fold = []

for i, (train_index, val_index) in enumerate(skf.split(pad_tokens, np.sum(pad_tags, axis = 1))):
    
    print("Training on fold " + str(i+1) + "/5..........")
    
    #Split training set and validation set
    x_train, x_val = pad_tokens[train_index], pad_tokens[val_index]
    y_train, y_val = pad_tags[train_index], pad_tags[val_index]
    
    #Oversample minority class in training set
    print(str(fold_number) + ": started oversampling")
    index = 0
    for token, tag in zip(x_train, y_train):
        if np.sum(tag) >= 1:
            token_arr = np.tile(token, 20).reshape((-1, len(token)))
            tag_arr = np.tile(tag, 20).reshape((-1, len(tag)))
            x_train = np.append(token_arr, x_train, axis=0)
            y_train = np.append(tag_arr, y_train, axis=0)
        index = index + 1
    print(str(fold_number) + ": finished assigning sample weights")
    
    #Assigning sample weights in training set
    print(str(fold_number) + ": started oversampling")
    weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(np.ravel(y_train,order='C')),
                                                 np.ravel(y_train,order='C'))
    
    train_tags2 = np.copy(y_train)
    train_tokens2 = np.copy(x_train)
    train_tags2 = train_tags2.astype(float)
    
    indexTotal = 0
    for tags in train_tags2:
        indexTags = 0
        for symptom in tags:
            if symptom == 1:
                train_tags2[indexTotal][indexTags] = float(weights[1])
            else:
                train_tags2[indexTotal][indexTags] = float(weights[0])
            indexTags = indexTags+1
        indexTotal = indexTotal + 1
   
    weights = train_tags2.reshape((-1, 49, 1))
    print(str(fold_number) + ": finished assigning sample weights")
    
    #Getting Model Architecture
    model = get_bilstm_lstm_model()
    
    #Running Model
    history = model.fit(x_train, y_train, sample_weight = weights, batch_size=128, verbose=1, epochs=20)
    
    #Evaluate model
    scores = model.evaluate(x_val, y_val, verbose = 0)
    print(f'Score for fold {fold_number}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]}; {model.metrics_names[2]} of {scores[2]}; {model.metrics_names[3]} of {scores[3]}; {model.metrics_names[4]} of {scores[4]} %')
    f1_per_fold.append(scores[4])
    recall_per_fold.append(scores[3])
    precision_per_fold.append(scores[2])
    acc_per_fold.append(scores[1])
    loss_per_fold.append(scores[0])
    
    #Increase fold number
    fold_number = fold_number + 1



Training on fold 1/5..........
1: started oversampling
1: finished assigning sample weights
1: started oversampling




1: finished assigning sample weights
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 32)            1345824   
                                                                 
 bidirectional (Bidirectiona  (None, 49, 64)           16640     
 l)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 49, 32)            12416     
                                                                 
 time_distributed (TimeDistr  (None, 49, 1)            33        
 ibuted)                                                         
                                                                 
Total params: 1,374,913
Trainable params: 1,374,913
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2

Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 2: loss of 0.01665506139397621; acc of 0.9977908730506897; precision_m of 0.6134408116340637; recall_m of 0.6505417823791504; f1_m of 0.6020514965057373 %
Training on fold 3/5..........
3: started oversampling
3: finished assigning sample weights
3: started oversampling
3: finished assigning sample weights
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 49, 32)            1345824   
                                                                 
 bidirectional_2 (Bidirectio  (None, 49, 64)           16640     
 nal)                                                            
                                                                 
 lstm_5 (LSTM)               (None, 49, 32)            12416     
                                                                 
 time_distrib

Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 4: loss of 0.018074942752718925; acc of 0.9976130127906799; precision_m of 0.5596749186515808; recall_m of 0.6129063367843628; f1_m of 0.5587538480758667 %
Training on fold 5/5..........
5: started oversampling
5: finished assigning sample weights
5: started oversampling




5: finished assigning sample weights
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 49, 32)            1345824   
                                                                 
 bidirectional_4 (Bidirectio  (None, 49, 64)           16640     
 nal)                                                            
                                                                 
 lstm_9 (LSTM)               (None, 49, 32)            12416     
                                                                 
 time_distributed_4 (TimeDis  (None, 49, 1)            33        
 tributed)                                                       
                                                                 
Total params: 1,374,913
Trainable params: 1,374,913
Non-trainable params: 0
_________________________________________________________________
Epoch 1

In [17]:
print('Score per fold')
for i in range(0, len(acc_per_fold)):
    print("-----------")
    print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]} - Precision: {precision_per_fold[i]} - Recall: {recall_per_fold[i]} - F1: {f1_per_fold[i]}%')
print('------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Precision: {np.mean(precision_per_fold)} (+- {np.std(precision_per_fold)})')
print(f'> Recall: {np.mean(recall_per_fold)} (+- {np.std(recall_per_fold)})')
print(f'> F1: {np.mean(f1_per_fold)} (+- {np.std(f1_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------')

Score per fold
-----------
> Fold 1 - Loss: 0.017037667334079742 - Accuracy: 0.9976851940155029 - Precision: 0.5922558903694153 - Recall: 0.6298075914382935 - F1: 0.5819063782691956%
-----------
> Fold 2 - Loss: 0.01665506139397621 - Accuracy: 0.9977908730506897 - Precision: 0.6134408116340637 - Recall: 0.6505417823791504 - F1: 0.6020514965057373%
-----------
> Fold 3 - Loss: 0.017834406346082687 - Accuracy: 0.9976081848144531 - Precision: 0.568251371383667 - Recall: 0.6278537511825562 - F1: 0.5686147809028625%
-----------
> Fold 4 - Loss: 0.018074942752718925 - Accuracy: 0.9976130127906799 - Precision: 0.5596749186515808 - Recall: 0.6129063367843628 - F1: 0.5587538480758667%
-----------
> Fold 5 - Loss: 0.0173063725233078 - Accuracy: 0.9976599216461182 - Precision: 0.5783995985984802 - Recall: 0.6454018950462341 - F1: 0.583759605884552%
------------
Average scores for all folds:
> Accuracy: 0.9976714372634887 (+- 6.633408816846438e-05)
> Precision: 0.5824045181274414 (+- 0.01893825063