In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Activation, TimeDistributed, BatchNormalization, Conv1D, MaxPooling1D, Flatten, Bidirectional
from keras.layers.merge import concatenate
from keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from dataprep import get_vectors, get_data

import tensorflow as tf
import matplotlib.pyplot as plt 

import warnings
warnings.filterwarnings('ignore')

In [3]:
# To calculate on GPU
# https://github.com/tensorflow/tensorflow/issues/33721
TF_FORCE_GPU_ALLOW_GROWTH=1
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"
# CONSTANTS
seed = 42
cmt_pos = pd.read_csv('../../data/prepared/no_trans_stem_pos.csv')
cmt_neg = pd.read_csv('../../data/prepared/no_trans_stem_neg.csv')
X, Y, vocab_size, emdedding_size, vectors = get_data(cmt_pos, cmt_neg, 3)
input_dim = X.max()+1
output_dim = 32
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=seed, test_size=0.3, stratify=Y, shuffle=True)
# check if the data is evenly split
import collections
print('Train labels: ', collections.Counter(y_train))
print('Test labels: ', collections.Counter(y_test))

Train labels:  Counter({1: 2206, 0: 2206})
Test labels:  Counter({1: 946, 0: 946})


In [4]:
# Cross validation setup
sk = StratifiedKFold(n_splits = 3, random_state = seed, shuffle = True) 
# Early stopping setup
early_stopping = EarlyStopping(monitor='val_loss', patience=4)

In [5]:
def plot_history(history_arrs, train, val, xlabel, ylabel, plot_title):
    if len(history_arrs) == 1:
        history = history_arrs[0]
        plt.plot(history['accuracy'])
        plt.plot(history['val_accuracy'])
        plt.title(plot_title)
        plt.ylabel(ylabel)
        plt.xlabel(xlabel)
        plt.legend(['Train', 'Validation'], loc='upper left')
        plt.show()
    else:
        f, a = plt.subplots(1, len(history_arrs), figsize=(10,5))
        for idx, history in enumerate(history_arrs):
            # For Sine Function
            a[idx].plot(history[train])
            a[idx].plot(history[val])
            title = plot_title + ' ' + str(idx)
            a[idx].set_title(title)
            a[idx].set_xlabel(xlabel)
            a[idx].set_ylabel(ylabel)
            a[idx].legend(['Train', 'Validation'], loc='upper left')
        f.tight_layout()
        plt.show()

def run_test(model_func, Xtrain, ytrain, X_test, y_test, embedding_layer, batch, epoch, lr):
    results = []
    histories = []
    best_model = None
    best_acc = 0
    for train_index, val_index in sk.split(X_train, y_train):
        Xt, Xv = Xtrain[train_index], Xtrain[val_index]
        yt, yv = ytrain[train_index], ytrain[val_index]
        model = model_func(embedding_layer, lr)
        history = model.fit(Xt, yt, validation_data=(Xv, yv), epochs=epoch, batch_size=batch, verbose = 0, callbacks=[early_stopping])
        histories.append(history.history)
        loss, acc = model.evaluate(X_test, y_test, verbose = 0, batch_size = batch)
        if best_acc < acc:
            best_model = model
        results.append([loss, acc])
    results = np.array(results)
    return results, histories, model

def calculate_metrics(model, X_test, y_test):
    ypred_class = model.predict_classes(X_test, verbose=0)
    ypred_class = ypred_class[:, 0]
    accuracy = accuracy_score(y_test, ypred_class)
    precision = precision_score(y_test, ypred_class)
    recall = recall_score(y_test, ypred_class)
    f1 = f1_score(y_test, ypred_class)
    conf_matrix = confusion_matrix(y_test, ypred_class)
    return accuracy, precision, recall, f1, conf_matrix

def print_conf_matrix(conf_matrix):
    cm = pd.DataFrame(
    conf_matrix, 
    index=['true:positive', 'true:negative'], 
    columns=['pred:positive', 'pred:negative']
    )
    print(cm)

In [6]:
def model_lstm(embedding_layer, lr):
    model = Sequential()
    model.add(embedding_layer)
    model.add(LSTM(128))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer=Adam(lr=lr), metrics = ['accuracy'])
    return model

In [7]:
def model_bilstm(embedding_layer, lr):
    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(64)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer=Adam(lr=lr), metrics = ['accuracy'])
    return model

In [8]:
def model_cnnlstm(embedding_layer, lr):
    model = Sequential()
    model.add(embedding_layer)
    model.add(Conv1D(128, 2))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(64))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer=Adam(lr=lr), metrics = ['accuracy'])
    return model

In [9]:
def model_cnnbilstm(embedding_layer, lr):
    model = Sequential()
    model.add(embedding_layer)
    model.add(Conv1D(128, 2))
    model.add(MaxPooling1D(2))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer=Adam(lr=lr), metrics = ['accuracy'])
    return model

In [10]:
#Gridsearch parameters
epochs = 20
lr = [0.1, 0.01, 0.001, 0.0001]
batch = [16, 32, 64]

In [12]:
Xtrain, Xval, ytrain, yval = train_test_split(X_train, y_train, random_state=seed, test_size=0.1, stratify=y_train, shuffle=True)

#### Girdsearch for LSTM model

In [16]:
best_result = []
best_acc = 0
for lrate in lr:
    for b in batch:
        embedding = Embedding(input_dim, output_dim, input_length=X.shape[1], trainable = True)
        model = model_lstm(embedding, lrate)
        history = model.fit(Xtrain, ytrain, validation_data=(Xval, yval), epochs=30, batch_size=b, verbose = 0, callbacks=[early_stopping])
        loss, acc = model.evaluate(X_test, y_test, verbose = 0, batch_size = b)
        print('Running test for learning rate: ', lrate, ', batch size: ', b, ', accuracy: ', acc)
        if acc > best_acc:
            best_acc = acc
            best_result = [lrate, b]
print('Best performing paramters:')
print('Learning rate: %s, batch size: %s' % (best_result[0], best_result[1]))

Running test for learning rate:  0.1 , batch size:  16 , accuracy:  0.7669132947921753
Running test for learning rate:  0.1 , batch size:  32 , accuracy:  0.7394291758537292
Running test for learning rate:  0.1 , batch size:  64 , accuracy:  0.5
Running test for learning rate:  0.01 , batch size:  16 , accuracy:  0.858350932598114
Running test for learning rate:  0.01 , batch size:  32 , accuracy:  0.8646934628486633
Running test for learning rate:  0.01 , batch size:  64 , accuracy:  0.8668076395988464
Running test for learning rate:  0.001 , batch size:  16 , accuracy:  0.8599365949630737
Running test for learning rate:  0.001 , batch size:  32 , accuracy:  0.8646934628486633
Running test for learning rate:  0.001 , batch size:  64 , accuracy:  0.8668076395988464
Running test for learning rate:  0.0001 , batch size:  16 , accuracy:  0.8736786246299744
Running test for learning rate:  0.0001 , batch size:  32 , accuracy:  0.876849889755249
Running test for learning rate:  0.0001 , bat

#### Gridsearch for BiLSTM model

In [17]:
best_result = []
best_acc = 0
for lrate in lr:
    for b in batch:
        embedding = Embedding(input_dim, output_dim, input_length=X.shape[1], trainable = True)
        model = model_bilstm(embedding, lrate)
        history = model.fit(Xtrain, ytrain, validation_data=(Xval, yval), epochs=30, batch_size=b, verbose = 0, callbacks=[early_stopping])
        loss, acc = model.evaluate(X_test, y_test, verbose = 0, batch_size = b)
        print('Running test for learning rate: ', lrate, ', batch size: ', b, ', accuracy: ', acc)
        if acc > best_acc:
            best_acc = acc
            best_result = [lrate, b]
print('Best performing paramters:')
print('Learning rate: %s, batch size: %s' % (best_result[0], best_result[1]))

Running test for learning rate:  0.1 , batch size:  16 , accuracy:  0.7521141767501831
Running test for learning rate:  0.1 , batch size:  32 , accuracy:  0.7975687384605408
Running test for learning rate:  0.1 , batch size:  64 , accuracy:  0.8224101662635803
Running test for learning rate:  0.01 , batch size:  16 , accuracy:  0.8705074191093445
Running test for learning rate:  0.01 , batch size:  32 , accuracy:  0.8673361539840698
Running test for learning rate:  0.01 , batch size:  64 , accuracy:  0.8652219772338867
Running test for learning rate:  0.001 , batch size:  16 , accuracy:  0.8615221977233887
Running test for learning rate:  0.001 , batch size:  32 , accuracy:  0.8631078004837036
Running test for learning rate:  0.001 , batch size:  64 , accuracy:  0.8779069781303406
Running test for learning rate:  0.0001 , batch size:  16 , accuracy:  0.8742071986198425
Running test for learning rate:  0.0001 , batch size:  32 , accuracy:  0.8742071986198425
Running test for learning ra

#### Gridsearch for CNN-LSTM model

In [18]:
best_result = []
best_acc = 0
for lrate in lr:
    for b in batch:
        embedding = Embedding(input_dim, output_dim, input_length=X.shape[1], trainable = True)
        model = model_cnnlstm(embedding, lrate)
        history = model.fit(Xtrain, ytrain, validation_data=(Xval, yval), epochs=30, batch_size=b, verbose = 0, callbacks=[early_stopping])
        loss, acc = model.evaluate(X_test, y_test, verbose = 0, batch_size = b)
        print('Running test for learning rate: ', lrate, ', batch size: ', b, ', accuracy: ', acc)
        if acc > best_acc:
            best_acc = acc
            best_result = [lrate, b]
print('Best performing paramters:')
print('Learning rate: %s, batch size: %s' % (best_result[0], best_result[1]))

Running test for learning rate:  0.1 , batch size:  16 , accuracy:  0.5396406054496765
Running test for learning rate:  0.1 , batch size:  32 , accuracy:  0.5702959895133972
Running test for learning rate:  0.1 , batch size:  64 , accuracy:  0.645348846912384
Running test for learning rate:  0.01 , batch size:  16 , accuracy:  0.854651153087616
Running test for learning rate:  0.01 , batch size:  32 , accuracy:  0.8620507121086121
Running test for learning rate:  0.01 , batch size:  64 , accuracy:  0.8604651093482971
Running test for learning rate:  0.001 , batch size:  16 , accuracy:  0.8636363744735718
Running test for learning rate:  0.001 , batch size:  32 , accuracy:  0.873150110244751
Running test for learning rate:  0.001 , batch size:  64 , accuracy:  0.8683932423591614
Running test for learning rate:  0.0001 , batch size:  16 , accuracy:  0.873150110244751
Running test for learning rate:  0.0001 , batch size:  32 , accuracy:  0.8710359334945679
Running test for learning rate: 

#### Gridsearch for CNN-BiLSTM model

In [19]:
best_result = []
best_acc = 0
for lrate in lr:
    for b in batch:
        embedding = Embedding(input_dim, output_dim, input_length=X.shape[1], trainable = True)
        model = model_cnnbilstm(embedding, lrate)
        history = model.fit(Xtrain, ytrain, validation_data=(Xval, yval), epochs=30, batch_size=b, verbose = 0, callbacks=[early_stopping])
        loss, acc = model.evaluate(X_test, y_test, verbose = 0, batch_size = b)
        print('Running test for learning rate: ', lrate, ', batch size: ', b, ', accuracy: ', acc)
        if acc > best_acc:
            best_acc = acc
            best_result = [lrate, b]
print('Best performing paramters:')
print('Learning rate: %s, batch size: %s' % (best_result[0], best_result[1]))

Running test for learning rate:  0.1 , batch size:  16 , accuracy:  0.5005285143852234
Running test for learning rate:  0.1 , batch size:  32 , accuracy:  0.5121564269065857
Running test for learning rate:  0.1 , batch size:  64 , accuracy:  0.5639534592628479
Running test for learning rate:  0.01 , batch size:  16 , accuracy:  0.8578224182128906
Running test for learning rate:  0.01 , batch size:  32 , accuracy:  0.8641648888587952
Running test for learning rate:  0.01 , batch size:  64 , accuracy:  0.8689217567443848
Running test for learning rate:  0.001 , batch size:  16 , accuracy:  0.8625792860984802
Running test for learning rate:  0.001 , batch size:  32 , accuracy:  0.8662790656089783
Running test for learning rate:  0.001 , batch size:  64 , accuracy:  0.8599365949630737
Running test for learning rate:  0.0001 , batch size:  16 , accuracy:  0.8689217567443848
Running test for learning rate:  0.0001 , batch size:  32 , accuracy:  0.8736786246299744
Running test for learning ra