In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Activation, TimeDistributed, BatchNormalization, Conv1D, MaxPooling1D, Flatten, Bidirectional
from keras.layers.merge import concatenate
from keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

import os, sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from util.dataprep import get_vectors, get_data

import tensorflow as tf
import matplotlib.pyplot as plt 

import warnings
warnings.filterwarnings('ignore')

In [2]:
# To calculate on GPU
# https://github.com/tensorflow/tensorflow/issues/33721
TF_FORCE_GPU_ALLOW_GROWTH=1
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"
# CONSTANTS
seed = 42
X_train, y_train, X_test, y_test, X_val, y_val, vocab_size, emdedding_size, vectors = get_data()
train_max = X_train.max()
test_max = X_test.max()
val_max = X_val.max()
max_all = max([train_max, test_max, val_max])
input_dim = max_all+1
output_dim = 32
# check if the data is evenly split
import collections
print('Train labels: ', collections.Counter(y_train))
print('Test labels: ', collections.Counter(y_test))
print('Val labels: ', collections.Counter(y_val))

Train labels:  Counter({1: 3056, 0: 3055})
Test labels:  Counter({1: 873, 0: 873})
Val labels:  Counter({0: 437, 1: 436})


In [3]:
# Early stopping setup
early_stopping = EarlyStopping(monitor='val_loss', patience=4)

In [4]:
def plot_history(history_arrs, train, val, xlabel, ylabel, plot_title):
    if len(history_arrs) == 1:
        history = history_arrs[0]
        plt.plot(history['accuracy'])
        plt.plot(history['val_accuracy'])
        plt.title(plot_title)
        plt.ylabel(ylabel)
        plt.xlabel(xlabel)
        plt.legend(['Train', 'Validation'], loc='upper left')
        plt.show()
    else:
        f, a = plt.subplots(1, len(history_arrs), figsize=(10,5))
        for idx, history in enumerate(history_arrs):
            # For Sine Function
            a[idx].plot(history[train])
            a[idx].plot(history[val])
            title = plot_title + ' ' + str(idx)
            a[idx].set_title(title)
            a[idx].set_xlabel(xlabel)
            a[idx].set_ylabel(ylabel)
            a[idx].legend(['Train', 'Validation'], loc='upper left')
        f.tight_layout()
        plt.show()

def calculate_metrics(model, X_test, y_test):
    ypred_class = model.predict_classes(X_test, verbose=0)
    ypred_class = ypred_class[:, 0]
    accuracy = accuracy_score(y_test, ypred_class)
    precision = precision_score(y_test, ypred_class)
    recall = recall_score(y_test, ypred_class)
    f1 = f1_score(y_test, ypred_class)
    conf_matrix = confusion_matrix(y_test, ypred_class)
    return accuracy, precision, recall, f1, conf_matrix

def print_conf_matrix(conf_matrix):
    cm = pd.DataFrame(
    conf_matrix, 
    index=['true:positive', 'true:negative'], 
    columns=['pred:positive', 'pred:negative']
    )
    print(cm)

In [5]:
def model_lstm(embedding_layer, lr):
    model = Sequential()
    model.add(embedding_layer)
    model.add(LSTM(128))
    model.add(Dense(32))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer=adamOptimizer, metrics = ['accuracy'])
    return model

In [6]:
def model_bilstm(embedding_layer, lr):
    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer=adamOptimizer, metrics = ['accuracy'])
    return model

In [20]:
def model_cnnlstm(embedding_layer, lr):
    model = Sequential()
    model.add(embedding_layer)
    model.add(Conv1D(128, 2))
    model.add(Conv1D(64, 2))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(64))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer=Adam(lr=lr), metrics = ['accuracy'])
    return model

In [22]:
def model_cnnbilstm(embedding_layer, lr):
    model = Sequential()
    model.add(embedding_layer)
    model.add(Conv1D(128, 2))
    model.add(Conv1D(64, 2))
    model.add(MaxPooling1D(2))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer=Adam(lr=lr), metrics = ['accuracy'])
    return model

In [9]:
#Gridsearch parameters
epochs = 20
lr = [0.1, 0.01, 0.001, 0.0001]
batch = [16, 32, 64]

#### Girdsearch for LSTM model

In [10]:
best_result = []
best_acc = 0
for lrate in lr:
    for b in batch:
        embedding = Embedding(input_dim, output_dim, trainable = True)
        model = model_lstm(embedding, lrate)
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, batch_size=b, verbose = 0, callbacks=[early_stopping])
        loss, acc = model.evaluate(X_test, y_test, verbose = 0, batch_size = b)
        print('Running test for learning rate: ', lrate, ', batch size: ', b, ', accuracy: ', acc)
        if acc > best_acc:
            best_acc = acc
            best_result = [lrate, b]
print('Best performing paramters:')
print('Learning rate: %s, batch size: %s' % (best_result[0], best_result[1]))

Running test for learning rate:  0.1 , batch size:  16 , accuracy:  0.7119129300117493
Running test for learning rate:  0.1 , batch size:  32 , accuracy:  0.6958763003349304
Running test for learning rate:  0.1 , batch size:  64 , accuracy:  0.5
Running test for learning rate:  0.01 , batch size:  16 , accuracy:  0.8613975048065186
Running test for learning rate:  0.01 , batch size:  32 , accuracy:  0.8711340427398682
Running test for learning rate:  0.01 , batch size:  64 , accuracy:  0.8722795248031616
Running test for learning rate:  0.001 , batch size:  16 , accuracy:  0.8688430786132812
Running test for learning rate:  0.001 , batch size:  32 , accuracy:  0.8642611503601074
Running test for learning rate:  0.001 , batch size:  64 , accuracy:  0.876288652420044
Running test for learning rate:  0.0001 , batch size:  16 , accuracy:  0.882588803768158
Running test for learning rate:  0.0001 , batch size:  32 , accuracy:  0.8774341344833374
Running test for learning rate:  0.0001 , bat

#### Gridsearch for BiLSTM model

In [12]:
best_result = []
best_acc = 0
for lrate in lr:
    for b in batch:
        embedding = Embedding(input_dim, output_dim, trainable = True)
        model = model_bilstm(embedding, lrate)
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, batch_size=b, verbose = 0, callbacks=[early_stopping])
        loss, acc = model.evaluate(X_test, y_test, verbose = 0, batch_size = b)
        print('Running test for learning rate: ', lrate, ', batch size: ', b, ', accuracy: ', acc)
        if acc > best_acc:
            best_acc = acc
            best_result = [lrate, b]
print('Best performing paramters:')
print('Learning rate: %s, batch size: %s' % (best_result[0], best_result[1]))

Running test for learning rate:  0.1 , batch size:  16 , accuracy:  0.5
Running test for learning rate:  0.1 , batch size:  32 , accuracy:  0.7623138427734375
Running test for learning rate:  0.1 , batch size:  64 , accuracy:  0.7972508668899536
Running test for learning rate:  0.01 , batch size:  16 , accuracy:  0.8705613017082214
Running test for learning rate:  0.01 , batch size:  32 , accuracy:  0.869415819644928
Running test for learning rate:  0.01 , batch size:  64 , accuracy:  0.8722795248031616
Running test for learning rate:  0.001 , batch size:  16 , accuracy:  0.8310424089431763
Running test for learning rate:  0.001 , batch size:  32 , accuracy:  0.8676975965499878
Running test for learning rate:  0.001 , batch size:  64 , accuracy:  0.8797250986099243
Running test for learning rate:  0.0001 , batch size:  16 , accuracy:  0.8877434134483337
Running test for learning rate:  0.0001 , batch size:  32 , accuracy:  0.8865979313850403
Running test for learning rate:  0.0001 , ba

#### Gridsearch for CNN-LSTM model

In [21]:
best_result = []
best_acc = 0
for lrate in lr:
    for b in batch:
        embedding = Embedding(input_dim, output_dim, trainable = True)
        model = model_cnnlstm(embedding, lrate)
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, batch_size=b, verbose = 0, callbacks=[early_stopping])
        loss, acc = model.evaluate(X_test, y_test, verbose = 0, batch_size = b)
        print('Running test for learning rate: ', lrate, ', batch size: ', b, ', accuracy: ', acc)
        if acc > best_acc:
            best_acc = acc
            best_result = [lrate, b]
print('Best performing paramters:')
print('Learning rate: %s, batch size: %s' % (best_result[0], best_result[1]))

Running test for learning rate:  0.1 , batch size:  16 , accuracy:  0.5383734107017517
Running test for learning rate:  0.1 , batch size:  32 , accuracy:  0.5034364461898804
Running test for learning rate:  0.1 , batch size:  64 , accuracy:  0.5234822630882263
Running test for learning rate:  0.01 , batch size:  16 , accuracy:  0.6443299055099487
Running test for learning rate:  0.01 , batch size:  32 , accuracy:  0.7754868268966675
Running test for learning rate:  0.01 , batch size:  64 , accuracy:  0.8459335565567017
Running test for learning rate:  0.001 , batch size:  16 , accuracy:  0.8373425006866455
Running test for learning rate:  0.001 , batch size:  32 , accuracy:  0.8608247637748718
Running test for learning rate:  0.001 , batch size:  64 , accuracy:  0.8436425924301147
Running test for learning rate:  0.0001 , batch size:  16 , accuracy:  0.8676975965499878
Running test for learning rate:  0.0001 , batch size:  32 , accuracy:  0.8659793734550476
Running test for learning ra

#### Gridsearch for CNN-BiLSTM model

In [23]:
best_result = []
best_acc = 0
for lrate in lr:
    for b in batch:
        embedding = Embedding(input_dim, output_dim, trainable = True)
        model = model_cnnbilstm(embedding, lrate)
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, batch_size=b, verbose = 0, callbacks=[early_stopping])
        loss, acc = model.evaluate(X_test, y_test, verbose = 0, batch_size = b)
        print('Running test for learning rate: ', lrate, ', batch size: ', b, ', accuracy: ', acc)
        if acc > best_acc:
            best_acc = acc
            best_result = [lrate, b]
print('Best performing paramters:')
print('Learning rate: %s, batch size: %s' % (best_result[0], best_result[1]))

Running test for learning rate:  0.1 , batch size:  16 , accuracy:  0.5171821117401123
Running test for learning rate:  0.1 , batch size:  32 , accuracy:  0.5229095220565796
Running test for learning rate:  0.1 , batch size:  64 , accuracy:  0.4977090358734131
Running test for learning rate:  0.01 , batch size:  16 , accuracy:  0.6706758141517639
Running test for learning rate:  0.01 , batch size:  32 , accuracy:  0.699312686920166
Running test for learning rate:  0.01 , batch size:  64 , accuracy:  0.8344787955284119
Running test for learning rate:  0.001 , batch size:  16 , accuracy:  0.8522336483001709
Running test for learning rate:  0.001 , batch size:  32 , accuracy:  0.832187831401825
Running test for learning rate:  0.001 , batch size:  64 , accuracy:  0.8442153334617615
Running test for learning rate:  0.0001 , batch size:  16 , accuracy:  0.863115668296814
Running test for learning rate:  0.0001 , batch size:  32 , accuracy:  0.8556700944900513
Running test for learning rate: