# 1.Load data

In [96]:
import pandas as pd
import imblearn
import numpy as np

In [97]:
data = pd.read_csv('./OUTPUT/dataset.csv', encoding= 'unicode_escape')
data

Unnamed: 0,Sentence,Word,Tag
0,Sentence: 1,RECORD,0
1,Sentence: 2,OC,0
2,,AM,0
3,,gallstone,0
4,,pancreatitis,0
...,...,...,...
949802,,M.D.,0
949803,Sentence: 132094,END,0
949804,,OF,0
949805,,DISCHARGE,0


## 2. Extract mappings required for the neural network
To train a neural network, we will use two mappings as given below. The neural network will only take integers as input. So lets convert all the unique tokens in the corpus to its respective index.
- {token} to {token id}: address the row in embeddings matrix for the current token.
- {tag} to {tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.

In [180]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [181]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx) 

In [182]:
data

Unnamed: 0,Sentence,Word,Tag,Word_idx,Tag_idx
0,Sentence: 1,RECORD,0,10459,0
1,Sentence: 2,OC,0,11869,0
2,,AM,0,40483,0
3,,gallstone,0,12531,0
4,,pancreatitis,0,15851,0
...,...,...,...,...,...
949802,,M.D.,0,12476,0
949803,Sentence: 132094,END,0,28105,0
949804,,OF,0,11294,0
949805,,DISCHARGE,0,39804,0


## Oversampling minority class

In [101]:
from collections import Counter

counter = Counter(data['Tag_idx'])
token = data['Word_idx'].to_numpy()
tag = data['Tag_idx'].to_numpy()
counter

Counter({0: 925942, 1: 23865})

In [102]:
from imblearn.over_sampling import SMOTE
token = token.reshape(-1,1)
tag = tag.reshape(-1,1)

In [103]:
oversample = SMOTE()
token, tag = oversample.fit_resample(token,tag)

In [104]:
counter = Counter(tag)
counter

Counter({0: 925942, 1: 925942})

In [105]:
token = pd.Series(map(lambda x: x[0], token))
tag = pd.Series(tag)

In [112]:
data1 = pd.DataFrame()
data1['Token_idx'] = token
data1['Tag_idx'] = tag

In [113]:
data1

Unnamed: 0,Token_idx,Tag_idx
0,10459,0
1,11869,0
2,40483,0
3,12531,0
4,15851,0
...,...,...
1851879,2837,1
1851880,1503,1
1851881,31116,1
1851882,35316,1


In [95]:
data1.to_csv('./OUTPUT/smote.csv')

## 3. Transform columns to extract sequential data
Next, lets fill NaN in 'sentence #' column using method ffill in fillna. Thereafter groupby on the sentence column to get a list of tokens and tags for each sentence.

In [133]:
group = []
number = 0
count = 1
for tag in data1['Tag_idx']:
    if number == 466:
        number = 0
        count = count + 1
    group.append(count)
    number = number + 1

In [134]:
data1['Group'] = pd.Series(group)

In [135]:
data_group = data1.groupby(['Group'],as_index=False
                                )['Token_idx', 'Tag_idx'].agg(lambda x: list(x))
data_group

  data_group = data1.groupby(['Group'],as_index=False


Unnamed: 0,Group,Token_idx,Tag_idx
0,1,"[10459, 11869, 40483, 12531, 15851, 13736, 301...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,"[5882, 17540, 23396, 26105, 37648, 29557, 8403...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,"[30253, 1124, 27100, 14777, 5047, 6123, 35723,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,"[23283, 18962, 18200, 20313, 28063, 23283, 386...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,"[15719, 7797, 7581, 5146, 11443, 36501, 26215,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
3969,3970,"[12864, 32872, 26132, 34592, 11678, 17706, 347...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3970,3971,"[20400, 20076, 20400, 20400, 9774, 12818, 2007...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3971,3972,"[25838, 27896, 22641, 12818, 8689, 33273, 1770...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3972,3973,"[8199, 13720, 29428, 2361, 8199, 39373, 15275,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## 4. Pad sequences and split the dataset into train, test
Padding: The LSTM layers accept sequences of same length only. Therefore we will want to transform our list of token_sequences ('Word_idx') which is lists of integers into a matrix of shape (token_sequences, max_len). We can use any length as max_len. In this project we will be using length of the longest sequence as max_len. The sequences that are shorter than max_len are padded with a specified value at the end.
Import required packages

In [139]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [140]:
def get_padTokens_padTags(data_group, data1):
    tokens = data_group['Token_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= 0)
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= 0)

    return pad_tokens, pad_tags

pad_tokens, pad_tags = get_padTokens_padTags(data_group, data1)

In [144]:
print('length of tokens ' + str(len(pad_tokens)))
print('length of tags ' + str(len(pad_tags)))

length of tokens 3974
length of tags 3974


## StratifiedKFold Cross Validation

In [158]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from keras import backend as K
import keras as keras
from keras.datasets import mnist
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from keras import metrics
from sklearn.utils import class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

In [159]:
from numpy.random import seed
seed(1)
tf.random.set_seed(2)

In [163]:
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 32
input_length = max([len(s) for s in data_group['Token_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

input_dim:  42057 
output_dim:  32 
input_length:  466 
n_tags:  2


In [164]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [167]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    
    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(1, activation="sigmoid")))
    
    #Optimiser 
    adam = tf.keras.optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999)
    
    # Compile model
    model.compile(loss='binary_crossentropy', sample_weight_mode="temporal", optimizer='adam', metrics=['acc', precision_m, recall_m, f1_m])
    model.summary()
    
    return model

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

skf = StratifiedKFold(n_splits = 5, shuffle = True)

fold_number = 1
f1_per_fold = []
recall_per_fold = []
precision_per_fold = []
acc_per_fold = []
loss_per_fold = []

for i, (train_index, val_index) in enumerate(skf.split(pad_tokens, np.sum(pad_tags, axis = 1))):
    
    print("Training on fold " + str(i+1) + "/5..........")
    
    #Split training set and validation set
    x_train, x_val = pad_tokens[train_index], pad_tokens[val_index]
    y_train, y_val = pad_tags[train_index], pad_tags[val_index]
    
    #Getting Model Architecture
    model = get_bilstm_lstm_model()
    
    #Running Model
    history = model.fit(x_train, y_train, batch_size=128, verbose=1, epochs=20)
    
    #Evaluate model
    scores = model.evaluate(x_val, y_val)
    print(f'Score for fold {fold_number}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]}; {model.metrics_names[2]} of {scores[2]}; {model.metrics_names[3]} of {scores[3]}; {model.metrics_names[4]} of {scores[4]} %')
    f1_per_fold.append(scores[4])
    recall_per_fold.append(scores[3])
    precision_per_fold.append(scores[2])
    acc_per_fold.append(scores[1])
    loss_per_fold.append(scores[0])
    
    #Increase fold number
    fold_number = fold_number + 1

In [None]:
print('Score per fold')
for i in range(0, len(acc_per_fold)):
    print("-----------")
    print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]} - Precision: {precision_per_fold[i]} - Recall: {recall_per_fold[i]} - F1: {f1_per_fold[i]}%')
print('------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Precision: {np.mean(precision_per_fold)} (+- {np.std(precision_per_fold)})')
print(f'> Recall: {np.mean(recall_per_fold)} (+- {np.std(recall_per_fold)})')
print(f'> F1: {np.mean(f1_per_fold)} (+- {np.std(f1_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------')