# 1.Load data

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
import tensorflow as tf
import keras
import tensorflow.keras.backend as K
from tensorflow.python.keras.backend import get_session

In [3]:
from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10395536577486349605
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 9148379955
locality {
  bus_id: 1
  links {
  }
}
incarnation: 4671794795857291019
physical_device_desc: "device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:02:00.0, compute capability: 6.1"
xla_global_id: 416903419
]


In [21]:
#import tensorflow as tf
#from tensorflow.python.keras import backend as K
#config = tf.compat.v1.ConfigProto(device_count = {'GPU': 1, 'CPU' : 49} )
#sess = tf.compat.v1.Session(config=config) 
#K.set_session(sess)


In [22]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
import pandas as pd
import imblearn
import numpy as np

In [4]:
data = pd.read_csv('./OUTPUT/dataset.csv', encoding= 'unicode_escape')
data

Unnamed: 0,Sentence,Word,Tag
0,Sentence: 1,RECORD,0
1,Sentence: 2,OC,0
2,,AM,0
3,,gallstone,0
4,,pancreatitis,0
...,...,...,...
949802,,M.D.,0
949803,Sentence: 132094,END,0
949804,,OF,0
949805,,DISCHARGE,0


## Groupby

In [5]:

data_fillna = data.fillna(method='ffill', axis=0)
data_group = data_fillna.groupby(['Sentence'],as_index=False
                                )['Word', 'Tag'].agg(lambda x: list(x))

#data_fillna
data_group

  data_group = data_fillna.groupby(['Sentence'],as_index=False


Unnamed: 0,Sentence,Word,Tag
0,Sentence: 1,[RECORD],[0]
1,Sentence: 10,"[WILL, D/C, ORDER, BE, USED, AS, THE, D/C, SUM...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Sentence: 100,"[prandial, N/V/severe, upper, abdominal, pain....","[0, 1, 0, 1, 1, 0, 0, 0, 0]"
3,Sentence: 1000,"[normal, limits., Cardiac, catheterization, da...","[0, 0, 0, 0, 0, 0, 0, 0]"
4,Sentence: 10000,"[year, old, Black, female, with, significant, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...
132089,Sentence: 99995,"[Height, foot, inch, and, weight, kg., Tempera...","[0, 0, 0, 0, 0, 0, 0]"
132090,Sentence: 99996,"[degrees, heart, rate, and, sinus, blood, pres...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
132091,Sentence: 99997,"[blood, pressure, left, arm, and, oxygen, satu...","[0, 0, 0, 0, 0, 0, 0]"
132092,Sentence: 99998,"[No, carotid, bruits, regular, rate, and, rhyt...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"


## Padding

In [6]:
texts = data_group['Word'].tolist()  
labels = data_group['Tag'].tolist()  

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

pad_tokens = pad_sequences(sequences, maxlen=49, dtype='int32', padding='post', value= 0)
print(pad_tokens)
pad_tags = pad_sequences(labels, maxlen=49, dtype='int32', padding='post', value= 0)
train_tokens, test_tokens, train_tags, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.3, train_size=0.7, random_state=2020)
print(pad_tags)
print('Shape of data tensor:', pad_tokens.shape)
print('Shape of label tensor:', pad_tags.shape)

Found 34275 unique tokens.
[[  115     0     0 ...     0     0     0]
 [   44   145   106 ...     0     0     0]
 [ 6315 15212   259 ...     0     0     0]
 ...
 [   42    70    33 ...     0     0     0]
 [   13   421  1398 ...     0     0     0]
 [  327   561  1373 ...     0     0     0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Shape of data tensor: (132094, 49)
Shape of label tensor: (132094, 49)


## GLOVE Embedding Layer

In [8]:
import os

embeddings_index = {}
f = open(os.path.join(r'./GLOVE', 'glove.6B.300d.txt'), encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [9]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [10]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=49,
                            trainable=True)

## Model

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from keras import backend as K
import keras as keras
from keras.datasets import mnist
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from keras import metrics
from sklearn.utils import class_weight
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

In [12]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [13]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(embedding_layer)
    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=32, return_sequences=True, dropout=0.5, recurrent_dropout=0.5), merge_mode = 'concat'))

    
    # Add LSTM
    model.add(LSTM(units=32, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    model.add(LSTM(units=32, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(1, activation="sigmoid")))
    
    #Optimiser 
    adam = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
    
    # Compile model
    model.compile(loss='binary_crossentropy', sample_weight_mode="temporal", optimizer='adam', metrics=['acc', precision_m, recall_m, f1_m])
    model.summary()
    
    return model

In [14]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.utils import compute_class_weight

skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 999)

fold_number = 1
f1_per_fold = []
recall_per_fold = []
precision_per_fold = []
acc_per_fold = []
loss_per_fold = []

for i, (train_index, test_index) in enumerate(skf.split(pad_tokens, np.sum(pad_tags, axis = 1))):
    
    print("Training on fold " + str(i+1) + "/10..........")
    
    #Split training set and validation set
    x_train, x_test = pad_tokens[train_index], pad_tokens[test_index]
    y_train, y_test = pad_tags[train_index], pad_tags[test_index]
    
    #Assigning sample weights in training set
    print(str(fold_number) + ": started assigning sample weights")
    weights = class_weight.compute_class_weight(
                                                class_weight ='balanced', 
                                                classes = np.unique(np.ravel(y_train,order='C')), 
                                                y = np.ravel(y_train,order='C')
                                                )
    
    train_tags2 = np.copy(y_train)
    train_tokens2 = np.copy(x_train)
    train_tags2 = train_tags2.astype(float)
    
    indexTotal = 0
    for tags in train_tags2:
        indexTags = 0
        for symptom in tags:
            if symptom == 1:
                train_tags2[indexTotal][indexTags] = float(weights[1]+10.00)
            else:
                train_tags2[indexTotal][indexTags] = float(weights[0])
            indexTags = indexTags+1
        indexTotal = indexTotal + 1
    
    print(str(fold_number) + ": finished assigning sample weights - " + str(weights[0]) + ', ' + str(weights[1] + 10.00))
    weights = train_tags2.reshape((-1, 49, 1))
    
    #Getting Model Architecture
    model = None 
    model = get_bilstm_lstm_model()
    
    #Running Model
    history = model.fit(x_train, y_train, sample_weight = weights, batch_size=64, verbose=1, epochs=20)
    
    #Evaluate model
    scores = model.evaluate(x_test, y_test, verbose = 0)
    print(f'Score for fold {fold_number}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]}; {model.metrics_names[2]} of {scores[2]}; {model.metrics_names[3]} of {scores[3]}; {model.metrics_names[4]} of {scores[4]} %')
    
    print('-----------------------------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------------------------')

    f1_per_fold.append(scores[4])
    recall_per_fold.append(scores[3])
    precision_per_fold.append(scores[2])
    acc_per_fold.append(scores[1])
    loss_per_fold.append(scores[0])
    
    #Increase fold number
    fold_number = fold_number + 1



Training on fold 1/10..........
1: started assigning sample weights
1: finished assigning sample weights - 0.5018498085062245, 145.64912444113264
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 300)           10282800  
                                                                 
 bidirectional (Bidirectiona  (None, 49, 64)           85248     
 l)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 49, 32)            12416     
                                                                 
 lstm_2 (LSTM)               (None, 49, 32)            8320      
                                                                 
 time_distributed (TimeDistr  (None, 49, 1)            33        
 ibuted)                                  

Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 2: loss of 0.005014336202293634; acc of 0.9988277554512024; precision_m of 0.7734789252281189; recall_m of 0.7273657917976379; f1_m of 0.7322619557380676 %
-----------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------
Training on fold 3/10..........
3: started assigning sample weights
3: finished assigning sample weights - 0.5018499814433267, 145.63649063984354
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 300)           10282800  
                                                                 
 bidirectional_2 (Bidirectio  (None, 49, 64)           85248     
 nal)                                      

Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 4: loss of 0.003217004705220461; acc of 0.9992229342460632; precision_m of 0.833143413066864; recall_m of 0.7933250069618225; f1_m of 0.7981456518173218 %
-----------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------
Training on fold 5/10..........
5: started assigning sample weights
5: finished assigning sample weights - 0.5018508305046226, 145.57449730031652
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 300)           10282800  
                                                   

6: finished assigning sample weights - 0.5018508305046226, 145.57449730031652
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 300)           10282800  
                                                                 
 bidirectional_5 (Bidirectio  (None, 49, 64)           85248     
 nal)                                                            
                                                                 
 lstm_16 (LSTM)              (None, 49, 32)            12416     
                                                                 
 lstm_17 (LSTM)              (None, 49, 32)            8320      
                                                                 
 time_distributed_5 (TimeDis  (None, 49, 1)            33        
 tributed)                                                       
                                          

Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 7: loss of 0.0021900811698287725; acc of 0.9994118809700012; precision_m of 0.8319073915481567; recall_m of 0.8100249171257019; f1_m of 0.811424195766449 %
-----------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------
Training on fold 8/10..........
8: started assigning sample weights
8: finished assigning sample weights - 0.50185065756839, 145.5871194488409
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 300)           10282800  
                                                                 
 bidirectional_7 (Bidirectio  (None, 49, 64)           85248     
 nal)                                         

Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 9: loss of 0.0016703057335689664; acc of 0.9995524883270264; precision_m of 0.8785880208015442; recall_m of 0.8651152849197388; f1_m of 0.8638412356376648 %
-----------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------
Training on fold 10/10..........
10: started assigning sample weights
10: finished assigning sample weights - 0.50185065756839, 145.5871194488409
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 300)           10282800  
                                                 

In [15]:
print('Score per fold')
for i in range(0, len(acc_per_fold)):
    print("-----------")
    print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]} - Precision: {precision_per_fold[i]} - Recall: {recall_per_fold[i]} - F1: {f1_per_fold[i]}%')
print('------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Precision: {np.mean(precision_per_fold)} (+- {np.std(precision_per_fold)})')
print(f'> Recall: {np.mean(recall_per_fold)} (+- {np.std(recall_per_fold)})')
print(f'> F1: {np.mean(f1_per_fold)} (+- {np.std(f1_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------')

Score per fold
-----------
> Fold 1 - Loss: 0.00838264636695385 - Accuracy: 0.9981868267059326 - Precision: 0.7102780938148499 - Recall: 0.6563717126846313 - F1: 0.6535045504570007%
-----------
> Fold 2 - Loss: 0.005014336202293634 - Accuracy: 0.9988277554512024 - Precision: 0.7734789252281189 - Recall: 0.7273657917976379 - F1: 0.7322619557380676%
-----------
> Fold 3 - Loss: 0.0034683311823755503 - Accuracy: 0.9990918040275574 - Precision: 0.784105658531189 - Recall: 0.7578145861625671 - F1: 0.7548285722732544%
-----------
> Fold 4 - Loss: 0.003217004705220461 - Accuracy: 0.9992229342460632 - Precision: 0.833143413066864 - Recall: 0.7933250069618225 - F1: 0.7981456518173218%
-----------
> Fold 5 - Loss: 0.0028099745977669954 - Accuracy: 0.9993078708648682 - Precision: 0.8411281704902649 - Recall: 0.8073114156723022 - F1: 0.8121386170387268%
-----------
> Fold 6 - Loss: 0.002574215643107891 - Accuracy: 0.9993371367454529 - Precision: 0.8413464426994324 - Recall: 0.8127831816673279 - F1

## Self Check

In [None]:
predict_tags = model.predict(test_tokens)

In [None]:
truePositive = 0 
falsePositive = 0
trueNegative = 0 
falseNegative = 0
index = 0
for tag, predTag in zip(test_tags, predict_tags):
    for symptomTag, symptomPred in zip (tag, predTag):
        if symptomPred >= 0.50 and symptomTag == 1:
            truePositive = truePositive + 1
        elif symptomPred >= 0.50 and symptomTag == 0:
            falsePositive = falsePositive + 1
        elif symptomPred < 0.50 and symptomTag == 0:
            trueNegative = trueNegative + 1
        elif symptomPred < 0.50 and symptomTag == 1:
            falseNegative = falseNegative + 1

In [None]:
print('True postitive: ' + str(truePositive))
print('False postitive: ' + str(falsePositive))
print('True negative: ' + str(trueNegative))
print('False negative: ' + str(falseNegative))

In [None]:
precision = (truePositive)/(truePositive+falsePositive)
recall = (truePositive)/(truePositive+falseNegative)

In [None]:
print('Accuracy: ' + str((truePositive+trueNegative)/(truePositive+trueNegative+falsePositive+falseNegative)))
print('Precision: ' + str((truePositive)/(truePositive+falsePositive)))
print('Recall: ' + str((truePositive)/(truePositive+falseNegative)))
print('F1: ' + str((2*precision*recall)/(precision+recall)))

In [None]:
model.evaluate(pad_tokens, pad_tags)