# 2. Suggestion Classification
- MSc Computer Science - Final Major Project

Instructions: Run all cells, then navigate to the bottom to view the results from the suggestion mining experiments conducted for my suggestion mining project.

Estimated run time: 45mins

In [200]:
import pickle
import pandas as pd
pd.set_option('max_colwidth',40)
pd.set_option('display.max_rows',833)

import numpy as np
import os, gc, time, warnings

from scipy import sparse
import scipy.stats as ss
from scipy.sparse import csr_matrix, hstack, vstack

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

import nltk
from nltk import pos_tag, word_tokenize
import gensim

import tensorflow as tf
import keras.backend as K
from keras import layers
from keras.models import Model, Sequential
from keras.utils import plot_model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Concatenate, BatchNormalization, Reshape
from keras.layers import GRU, LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D, TimeDistributed
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from keras.callbacks import Callback, EarlyStopping
from keras import regularizers

import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Import Methods
- Importing the data for training using pickle and storing them in pandas dataframes. 

In [201]:
# Import training data
def import_train():
    train = pickle.load( open( "train_clean.pkl", "rb" ) )
    return train

# Import training data (Lemmatized, and stop words removed)
def import_train_swl():
    train = pickle.load( open( "train_clean2.pkl", "rb" ) )
    return train

# Import training data (Augmented data)
def import_train_aug(): 
    train = import_train()
    train_aug1 = pickle.load( open( "train_aug1.pkl", "rb"))
    train_stack = pd.concat([train, train_aug1], axis=0)
    return train_stack

In [202]:
# Import Dataset A
def import_dataset_A():
    train = import_train()
    val = pickle.load( open( "test_clean.pkl", "rb" ) )
    test = pickle.load( open( "eval_clean.pkl", "rb" ) )
    corpus = pickle.load( open( "clean_corpus.pkl", "rb"))
    return train, val, test, corpus

In [203]:
# Import Dataset A (Lemmatized, and stop words removed)
def import_dataset_A_swl():
    train = import_train_swl()
    val = pickle.load( open( "test_clean2.pkl", "rb" ) )
    test = pickle.load( open( "eval_clean2.pkl", "rb" ) )
    corpus = pickle.load( open( "clean_corpus.pkl", "rb"))
    return train, val, test, corpus

In [204]:
# Import Dataset A (Augmented training data)
def import_dataset_A_aug():
    train = import_train_aug()
    val = pickle.load( open( "test_clean.pkl", "rb" ) )
    test = pickle.load( open( "eval_clean.pkl", "rb" ) )
    corpus = pickle.load( open( "clean_corpus.pkl", "rb"))
    return train, val, test, corpus

In [205]:
# Import Dataset B
def import_dataset_B():
    train = import_train()
    val = pickle.load( open( "hotel_test_clean.pkl", "rb" ) )
    test = pickle.load( open( "hotel_eval_clean.pkl", "rb" ) )
    corpus = pickle.load( open( "clean_hotel_corpus.pkl", "rb"))
    return train, val, test, corpus

In [206]:
# Import Dataset B (Lemmatized, and stop words removed)
def import_dataset_B_swl():
    train = import_train_swl()
    val = pickle.load( open( "hotel_test_clean.pkl", "rb" ) )
    test = pickle.load( open( "hotel_eval_clean.pkl", "rb" ) )
    corpus = pickle.load( open( "clean_hotel_corpus.pkl", "rb"))
    return train, val, test, corpus

In [207]:
# Import Dataset B (Augmented training data)
def import_dataset_B_aug():
    train = import_train_aug()
    val = pickle.load( open( "hotel_test_clean.pkl", "rb" ) )
    test = pickle.load( open( "hotel_eval_clean.pkl", "rb" ) )
    corpus = pickle.load( open( "clean_hotel_corpus.pkl", "rb"))
    return train, val, test, corpus

In [208]:
# A method for evaluating predictions and returning specific evaluation metrics
def print_results(name, test_predictions, test_y):
    print()
    print('Confusion matrix:')
    matrix = confusion_matrix(test_y, test_predictions)
    print(matrix)

    accuracy = accuracy_score(test_y, test_predictions)
    precision = precision_score(test_y, test_predictions)
    recall = recall_score(test_y, test_predictions)
    f1 = f1_score(test_y, test_predictions)
 
    results = pd.DataFrame([[name, accuracy, precision, recall, f1]], columns=('Classifier', 'Accuracy', 'Precision', 'Recall', 'F-Score'))
    print(results)
    
    return results, matrix

In [209]:
# A method for plotting the accuracy and the loss of each model
#  This method was taken and adapted from:
# (https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/)
def plot_history(trainingMem):
    acc = trainingMem.history['accuracy']
    eval_acc = trainingMem.history['val_accuracy']
    loss = trainingMem.history['loss']
    eval_loss = trainingMem.history['val_loss']
    x = range(1, len(acc) + 1) 
    
    plt.figure(figsize=(12,5))
    
    plt.subplot(1,2,1)
    plt.plot(x,acc,'b',label='Training accuracy')
    plt.plot(x,eval_acc,'r',label='Evaluation accuracy')
    plt.title('Training and evaluation accuracy')
    plt.legend()
    
    plt.subplot(1,2,2)
    plt.plot(x,loss,'b',label='Training loss')
    plt.plot(x,eval_loss,'r',label='Evaluation loss')
    plt.title('Training and evaluation loss')
    plt.legend()

# Vectorization
- Classification models require numerical inputs in the form of vectors

In [210]:
# A method for spliting a pandas dataframe into an x and y set of sentences and labels
def x_y_split(train, val, test):
    
    train_x = train['review'].values
    train_y = train['label'].values

    val_x = val['review'].values
    val_y = val['label'].values

    test_x = test['review'].values
    test_y = test['label'].values 
    return train_x, train_y, val_x, val_y, test_x, test_y

def y_split(train, val, test):
        
    train_x = train['review'].values
    train_y = train['label'].values

    val_x = val['review'].values
    val_y = val['label'].values

    test_x = test['review'].values
    test_y = test['label'].values 
    return train_y, val_y, test_y

In [211]:
# A method for calling and returning 3 seperate vectorizers with different ngram ranges 
def fetch_ngram_vectorizer(corpus):
    UGVectorizer = CountVectorizer(min_df=0,lowercase=False, ngram_range=(1,1))
    BGVectorizer = CountVectorizer(min_df=0,lowercase=False, ngram_range=(2,3))
    CGVectorizer = CountVectorizer(min_df=0,lowercase=False, ngram_range=(1,5))
    UGVectorizer.fit(corpus)
    BGVectorizer.fit(corpus)
    CGVectorizer.fit(corpus)
    return UGVectorizer, BGVectorizer, CGVectorizer

# A method for calling and returning 3 seperate vectorizers with different ngram ranges, and TF-IDF weighting
def fetch_ngram_vectorizer_tfidf(corpus):
    UG_tfidf = TfidfVectorizer(min_df=0,lowercase=False, ngram_range=(1,1))
    BG_tfidf = TfidfVectorizer(min_df=0,lowercase=False, ngram_range=(2,3))
    CG_tfidf = TfidfVectorizer(min_df=0,lowercase=False, ngram_range=(1,5))
    UG_tfidf.fit(corpus)
    BG_tfidf.fit(corpus)
    CG_tfidf.fit(corpus)
    return UG_tfidf, BG_tfidf, CG_tfidf

In [212]:
# A method for vectorizing my data with different ngram ranges, returns the different vectors stacked together
def transform_ngram_vec(train,val,test,corpus):
    UniVec, BiVec, CharVec = fetch_ngram_vectorizer(corpus)
    
    U_train = UniVec.transform(train)
    U_val = UniVec.transform(val)
    U_test = UniVec.transform(test)
    
    B_train = BiVec.transform(train)
    B_val = BiVec.transform(val)
    B_test = BiVec.transform(test)
    
    C_train = CharVec.transform(train)
    C_val = CharVec.transform(val)
    C_test = CharVec.transform(test)
    
    train_X = hstack((U_train, B_train, C_train)) 
    val_X = hstack((U_val, B_val, C_val)) 
    test_X = hstack((U_test, B_test, C_test))
    
    return train_X, val_X, test_X

# A method for vectorizing my data with different ngram ranges, and tf-idf weighting.
# Returns the different vectors stacked together
def transform_ngram_vec_tfidf(train,val,test,corpus):
    UniVec, BiVec, CharVec = fetch_ngram_vectorizer_tfidf(corpus)
    
    U_train = UniVec.transform(train)
    U_val = UniVec.transform(val)
    U_test = UniVec.transform(test)
    
    B_train = BiVec.transform(train)
    B_val = BiVec.transform(val)
    B_test = BiVec.transform(test)
    
    C_train = CharVec.transform(train)
    C_val = CharVec.transform(val)
    C_test = CharVec.transform(test)
    
    train_X = hstack((U_train, B_train, C_train)) 
    val_X = hstack((U_val, B_val, C_val)) 
    test_X = hstack((U_test, B_test, C_test))
    
    return train_X, val_X, test_X

# Logistic Regression

In [213]:
# This method returns the logistic regression classifier
def fetch_logr(train_X, train_y):
    logReg = LogisticRegression(solver='lbfgs',max_iter=10000)
    logReg.fit(train_X, train_y)
    return logReg

# This method applys logistic regression to the data, it returns predicted classes
def logr_predict(train_X, train_y, val_X, val_y, test_X, test_y):
    logReg = fetch_logr(train_X, train_y)
    lr_Accuracy = logReg.score(train_X, train_y)
    lr_Accuracy_eval = logReg.score(val_X, val_y)

    test_predictions = logReg.predict(test_X)

    print("Logistic Regression...")
    print("Training-Accuracy:" + str(lr_Accuracy))
    print("Validation-Accuracy:" + str(lr_Accuracy_eval))
    return test_predictions

In [214]:
# This method evaluates the logistic regression classifier,
# It takes in a split dataset but returns evaluation metrics from the test data.
def logistic_regression(train, val, test, corpus):
    train_x, train_y, val_x, val_y, test_x, test_y = x_y_split(train, val, test)
    train_X, val_X, test_X = transform_ngram_vec(train_x,val_x,test_x,corpus)
    logr_predictions = logr_predict(train_X, train_y, val_X, val_y, test_X, test_y)
    logr_results, logr_matrix = print_results("Logistic Regression", logr_predictions, test_y)
    return logr_results, logr_matrix 

# This method evaluates the logistic regression classifier with tf-idf weighting,
# It takes in a split dataset but returns evaluation metrics from the test data.
def logistic_regression_tfidf(train, val, test, corpus):
    train_x, train_y, val_x, val_y, test_x, test_y = x_y_split(train, val, test)
    train_X, val_X, test_X = transform_ngram_vec_tfidf(train_x,val_x,test_x,corpus)
    logr_predictions = logr_predict(train_X, train_y, val_X, val_y, test_X, test_y)
    logr_results, logr_matrix = print_results("Logistic Regression (TF-IDF)", logr_predictions, test_y)
    return logr_results, logr_matrix 

# Support Vector Machines

In [215]:
# This method performs truncated singular value decomposition on the input
# This particlar method was taken and adapted from:
# (https://www.kaggle.com/sanketrai/suggestion-mining)
def truncated_svd(x_train, x_val, x_test):
    svd = TruncatedSVD(n_components = 15)
    svd.fit(vstack((x_train, x_val, x_test)).tocsr())
    x_train_svd = svd.transform(x_train)
    x_val_svd = svd.transform(x_val)
    x_test_svd = svd.transform(x_test)

    scaler = StandardScaler()
    scaler.fit(np.concatenate((x_train_svd, x_val_svd, x_test_svd)))
    x_train_svd = scaler.transform(x_train_svd)
    x_val_svd = scaler.transform(x_val_svd)
    x_test_svd = scaler.transform(x_test_svd)
    return x_train_svd, x_val_svd, x_test_svd

# This method returns the suport vector machines
def fetch_SVM(x_train_svd, train_y):
    SVM = SVC(C = 0.1, probability = True)
    SVM.fit(x_train_svd, train_y)
    return SVM

# This method performs predictions using the support vector machines
def SVM_predict(train_X, train_y, val_X, val_y, test_X, test_y):
    x_train_svd, x_val_svd, x_test_svd = truncated_svd(train_X, val_X, test_X)
    SVM = fetch_SVM(x_train_svd, train_y)
    
    svm_Accuracy = SVM.score(x_train_svd, train_y)
    svm_Accuracy_val = SVM.score(x_val_svd, val_y)
    svm_test_pred = SVM.predict(x_test_svd)

    print("Support Vector Machine...")
    print("Training-Accuracy:" + str(svm_Accuracy))
    print("Validation-Accuracy:" + str(svm_Accuracy_val))
    return svm_test_pred

In [216]:
# This method evaluates the support vector machines classifier,
# It takes in a split dataset but returns evaluation metrics from the test data.
def support_vector_machine(train, val, test, corpus):
    train_x, train_y, val_x, val_y, test_x, test_y = x_y_split(train, val, test)
    train_X, val_X, test_X = transform_ngram_vec(train_x,val_x,test_x,corpus)
    predictions = SVM_predict(train_X, train_y, val_X, val_y, test_X, test_y)
    SVM_results, SVM_matrix = print_results("Support Vector Machine", predictions, test_y)
    return SVM_results, SVM_matrix 

# This method evaluates the support vector machines classifier with tf-idf,
# It takes in a split dataset but returns evaluation metrics from the test data.
def support_vector_machine_tfidf(train, val, test, corpus):
    train_x, train_y, val_x, val_y, test_x, test_y = x_y_split(train, val, test)
    train_X, val_X, test_X = transform_ngram_vec_tfidf(train_x,val_x,test_x,corpus)
    predictions = SVM_predict(train_X, train_y, val_X, val_y, test_X, test_y)
    SVM_results, SVM_matrix = print_results("Support Vector Machine (TF-IDF)", predictions, test_y)
    return SVM_results, SVM_matrix

# Word Embeddings

In [217]:
# Create and return word_embeddings, also return vocab_length
def train_word_embeddings(train, val, test, corpus, max_length):
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(corpus)
    
    x_train = train['review'].values 
    x_val = val['review'].values 
    x_test = test['review'].values 
    
    x_train = tokenizer.texts_to_sequences(x_train)
    x_val = tokenizer.texts_to_sequences(x_val)
    x_test = tokenizer.texts_to_sequences(x_test)
    
    vocab_len = len(tokenizer.word_index) +1
    
    train_X = pad_sequences(x_train, padding='post', maxlen=max_length)
    eval_X = pad_sequences(x_val, padding='post', maxlen=max_length)
    test_X = pad_sequences(x_test,padding='post', maxlen=max_length)
    
    return vocab_len, train_X, eval_X, test_X

In [218]:
# Create and return word_embeddings, also return vocab_length, max_length and labels
def vectorize_embeddings(train, val, test, corpus):
    max_length = 100

    train_y = train['label'].values
    val_y = val['label'].values
    test_y = test['label'].values 
    
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(corpus)
    
    x_train = train['review'].values
    x_val = val['review'].values
    x_test = test['review'].values
    
    train_X = tokenizer.texts_to_sequences(x_train)
    val_X = tokenizer.texts_to_sequences(x_val)
    test_X = tokenizer.texts_to_sequences(x_test)

    vocab_len = len(tokenizer.word_index) +1
    
    train_X = pad_sequences(train_X, padding='post', maxlen=max_length)
    val_X = pad_sequences(val_X, padding='post', maxlen=max_length)
    test_X = pad_sequences(test_X,padding='post', maxlen=max_length)
    return vocab_len, max_length, train_X, train_y, val_X, val_y, test_X, test_y

In [219]:
# Create and return embedding matrix
# This method was taken from:
# (https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1 
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [220]:
# Create and return glove embedding matrix
# This method was taken from:
# (https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)
def create_glove_matrix(corpus):
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(corpus)
    embedding_dim = 300
    embedding_matrix = create_embedding_matrix('GloVe/glove.42B.300d.txt', tokenizer.word_index, embedding_dim)
    return embedding_dim, embedding_matrix

In [221]:
# create and return part of speech tags, return word embeddings
def pos_tokenize(train, val, test, corpus, max_length):

    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(corpus)

    pos_tags_train = train['review'].apply(lambda x : " ".join(item[1] for item in pos_tag(word_tokenize(x)))).values
    pos_tags_val = val['review'].apply(lambda x : " ".join(item[1] for item in pos_tag(word_tokenize(x)))).values
    pos_tags_test = test['review'].apply(lambda x : " ".join(item[1] for item in pos_tag(word_tokenize(x)))).values
  
    train_POS = tokenizer.texts_to_sequences(pos_tags_train)
    val_POS = tokenizer.texts_to_sequences(pos_tags_val)
    test_POS = tokenizer.texts_to_sequences(pos_tags_test)
    
    train_POS = pad_sequences(train_POS, padding='post', maxlen=max_length)
    val_POS = pad_sequences(val_POS, padding='post', maxlen=max_length)
    test_POS = pad_sequences(test_POS,padding='post', maxlen=max_length)
    
    return train_POS, val_POS, test_POS

# Keras
- Methods to present results

In [222]:
# Method to fit on training data, as well as to make predictions for a test dataset
def NN_predict(model, train_X, train_y, val_X, val_y, test_X, test_y):
    trainingMem = model.fit(train_X, train_y, epochs=10, 
                            verbose=False, validation_data=(val_X, val_y), 
                            batch_size=128, callbacks=early_stopping())
    
#     Uncomment the following line for graphs plotting the accuracy and loss over each epoch
#     plot_history(trainingMem)
    
    predictions = model.predict(test_X, batch_size = 128, verbose = 1)

    classes = (predictions[:, 0] >= 0.5).astype(int)

    loss, accuracy = model.evaluate(train_X, train_y, verbose = False)
    print('training data...')
    print('Accuracy: ' + str(accuracy))
    print('Loss:' + str(loss))

    loss, accuracy = model.evaluate(val_X, val_y, verbose = False)
    print('validation data...')
    print('Accuracy: ' + str(accuracy))
    print('Loss:' + str(loss))
    
    return classes


In [223]:
# Method to fit on training data, as well as to make predictions for a test dataset
def NN_predict2(train_X, train_y, val_X, val_y, test_X, test_y, model):
    
    trainingMem = model.fit(x = [train_X], y=train_y, epochs=5, 
                            verbose=False, validation_data=([val_X], val_y), 
                            batch_size=128, callbacks=early_stopping())

#     Uncomment the following line for graphs plotting the accuracy and loss over each epoch
#     plot_history(trainingMem)
    
    predictions = model.predict([test_X], batch_size = 128, verbose = 1)

    classes = (predictions[:, 0] >= 0.5).astype(int)

    loss, accuracy = model.evaluate([train_X], train_y, verbose = False)
    print('training data...')
    print('Accuracy: ' + str(accuracy))
    print('Loss:' + str(loss))

    loss, accuracy = model.evaluate([val_X], val_y, verbose = False)
    print('validation data...')
    print('Accuracy: ' + str(accuracy))
    print('Loss:' + str(loss))
    
    return classes

In [224]:
# Method to fit on training data, as well as to make predictions for a test dataset with multiple inputs
def NN_predict_pos(train_X, train_POS, train_y, val_X, val_POS, val_y, test_X, test_POS, test_y, model):
    trainingMem = model.fit(x = [train_X, train_POS], y=train_y, epochs=5, 
                        verbose=False, validation_data=([val_X, val_POS], val_y), 
                        batch_size=128, callbacks=early_stopping())
    
#     Uncomment the following line for graphs plotting the accuracy and loss over each epoch
#     plot_history(trainingMem)
    
    Com_POS_test_pred_p = model.predict([test_X, test_POS], batch_size = 128, verbose = 1)

    Com_POS_test_pred_c = (Com_POS_test_pred_p[:, 0] >= 0.5).astype(int)

    loss, accuracy = model.evaluate([train_X, train_POS], train_y, verbose = False)
    print('training data...')
    print('Accuracy: ' + str(accuracy))
    print('Loss:' + str(loss))


    loss, accuracy = model.evaluate([val_X, val_POS], val_y, verbose = False)
    print('validation data...')
    print('Accuracy: ' + str(accuracy))
    print('Loss:' + str(loss)) 
    return Com_POS_test_pred_c 




In [225]:
# Callback method early_stopping
def early_stopping():
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=1, verbose=1)
    callbacks_list = [early_stopping]
    return callbacks_list

# LSTM

In [226]:
# Long Short Term Memory Model
def LSTM_model(vocab_length,embedding_dim,max_length):
    LSTM_model = Sequential()
    LSTM_model.add(layers.Embedding(input_dim=vocab_length, 
                               output_dim=embedding_dim, 
                               input_length=max_length,
                                 trainable=True))

    LSTM_model.add(LSTM(units = 150, return_sequences = True))
    LSTM_model.add(layers.MaxPooling1D())
    LSTM_model.add(LSTM(units = 150, return_sequences = True))
    LSTM_model.add(layers.GlobalMaxPooling1D())
    LSTM_model.add(layers.Dense(1, activation='sigmoid'))

    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    LSTM_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     LSTM_model.summary()
    return LSTM_model

# Long Short Term Memory Model - Glove
def PTWE_LSTM_model(embedding_matrix, vocab_len, embedding_dim, max_length):
    LSTM_model = Sequential()
    LSTM_model.add(layers.Embedding(vocab_len, embedding_dim, 
                              weights=[embedding_matrix], input_length=max_length, trainable=False))

    LSTM_model.add(LSTM(units = 150, return_sequences = True))
    LSTM_model.add(layers.MaxPooling1D())
    LSTM_model.add(LSTM(units = 150, return_sequences = True))
    LSTM_model.add(layers.GlobalMaxPooling1D())
    LSTM_model.add(layers.Dense(1, activation='sigmoid'))

    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    LSTM_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     LSTM_model.summary()
    return LSTM_model

# Long Short Term Memory Model - POS tags
def LSTM_model_pos(vocab_len,embedding_dim,max_length):
    inp = Input(shape=(100,))
    pos = Input(shape=(100,))

    emb_word = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=False)(inp)
    pos_in = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=True)(pos)
    x = Concatenate()([emb_word,pos_in])
    
    x = layers.LSTM(units = 150, return_sequences = True)(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.LSTM(units = 50, return_sequences = True)(x)
    x = layers.GlobalMaxPooling1D()(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[inp,pos], outputs=out)
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

# Long Short Term Memory Model - Glove & POS tags
def PTWE_LSTM_model_pos(embedding_matrix, vocab_len, embedding_dim, max_length):
    inp = Input(shape=(100,))
    pos = Input(shape=(100,))

    emb_word = layers.Embedding(vocab_len, embedding_dim, 
                              weights=[embedding_matrix], input_length=max_length, trainable=False)(inp)
    pos_in = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=True)(pos)
    x = Concatenate()([emb_word,pos_in])
    
    x = layers.LSTM(units = 150, return_sequences = True)(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.LSTM(units = 50, return_sequences = True)(x)
    x = layers.GlobalMaxPooling1D()(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[inp,pos], outputs=out)
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

In [227]:
# This method creates an embedding matrix and returns the LSTM POS tag model
def fetch_glove_lstm_model_pos(corpus, max_length, vocab_len):
    embedding_dim, embedding_matrix = create_glove_matrix(corpus)
    model = HYB_model_glove_pos(embedding_matrix, max_length, embedding_dim, vocab_len)
    return model

# This method fetches the LSTM POS tag model
def fetch_lstm_model_pos(embedding_dim, max_length, vocab_len):
    model = HYB_model_pos(max_length, embedding_dim, vocab_len)
    return model

In [228]:
# This method evaluates the LSTM classifier,
# It takes in a split dataset but returns evaluation metrics from the test data.
def lstm(train, val, test, corpus):
    embedding_dim = 50
    max_length = 100
    train_y, val_y, test_y = y_split(train, val, test)
    vocab_len, train_X, val_X, test_X = train_word_embeddings(train, val, test, corpus, max_length)
    model =  LSTM_model(vocab_len,embedding_dim,max_length)
    predictions = NN_predict(model, train_X, train_y, val_X, val_y, test_X, test_y)
    return print_results("LSTM", predictions, test_y)
   
# This method evaluates the LSTM classifier, using glove word embeddings
# It takes in a split dataset but returns evaluation metrics from the test data.    
def lstm_glove(train, val, test, corpus):
    embedding_dim, embedding_matrix = create_glove_matrix(corpus)
    max_length = 100
    train_y, val_y, test_y = y_split(train, val, test)
    vocab_len, train_X, val_X, test_X = train_word_embeddings(train, val, test, corpus, max_length)
    model =  PTWE_LSTM_model(embedding_matrix, vocab_len, embedding_dim, max_length)
    predictions = NN_predict(model, train_X, train_y, val_X, val_y, test_X, test_y)
    results, matrix = print_results("LSTM - Glove", predictions, test_y)  
    return results, matrix

# This method evaluates the LSTM classifier, using part of speech tags
# It takes in a split dataset but returns evaluation metrics from the test data.
def lstm_pos(train, val, test, corpus):
    embedding_dim = 50
    max_length = 100
    train_y, val_y, test_y = y_split(train, val, test)
    vocab_len, train_X, val_X, test_X = train_word_embeddings(train, val, test, corpus, max_length)
    train_POS, val_POS, test_POS = pos_tokenize(train, val, test, corpus, max_length)
    model =  fetch_lstm_model_pos(embedding_dim,max_length, vocab_len)
    predictions = NN_predict_pos(train_X, train_POS, train_y, val_X, val_POS, val_y, test_X, test_POS, test_y, model)
    return print_results("LSTM - POS", predictions, test_y)
   
# This method evaluates the LSTM classifier, using glove word embeddings and part of speech tags
# It takes in a split dataset but returns evaluation metrics from the test data.    
def lstm_glove_pos(train, val, test, corpus):
    embedding_dim, embedding_matrix = create_glove_matrix(corpus)
    max_length = 100
    train_y, val_y, test_y = y_split(train, val, test)
    vocab_len, train_X, val_X, test_X = train_word_embeddings(train, val, test, corpus, max_length)
    train_POS, val_POS, test_POS = pos_tokenize(train, val, test, corpus, max_length)
    model =  fetch_glove_lstm_model_pos(corpus, max_length, vocab_len)
    predictions = NN_predict_pos(train_X, train_POS, train_y, val_X, val_POS, val_y, test_X, test_POS, test_y, model)
    results, matrix = print_results("LSTM Glove & POS", predictions, test_y)  
    return results, matrix

# CNN

In [229]:
# Convolutional Neural Network Model
def CNN_model(vocab_length,embedding_dim,max_length):
    CNN_model = Sequential()
    CNN_model.add(layers.Embedding(input_dim=vocab_length, 
                               output_dim=embedding_dim, 
                               input_length=max_length,
                                 trainable=True))

    CNN_model.add(layers.Conv1D(64, 7, activation='relu'))
    CNN_model.add(layers.MaxPooling1D(2))
    CNN_model.add(layers.Conv1D(64, 7, activation='relu'))
    CNN_model.add(layers.GlobalMaxPooling1D())
    CNN_model.add(layers.Dropout(0.5))
    CNN_model.add(layers.Dense(32, activation='relu'))
    CNN_model.add(layers.Dense(1, activation='sigmoid'))

    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    CNN_model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
#     CNN_model.summary()
    return CNN_model

# Convolutional Neural Network Model - Glove
def PTWE_CNN_model(embedding_matrix, vocab_len, embedding_dim, max_length):
    CNN_model = Sequential()
    CNN_model.add(layers.Embedding(vocab_len, embedding_dim, 
                              weights=[embedding_matrix], input_length=max_length, trainable=False))

    CNN_model.add(layers.Conv1D(64, 7, activation='relu'))
    CNN_model.add(layers.MaxPooling1D(2))
    CNN_model.add(layers.Conv1D(64, 7, activation='relu'))
    CNN_model.add(layers.GlobalMaxPooling1D())
    CNN_model.add(layers.Dropout(0.5))
    CNN_model.add(layers.Dense(32, activation='relu'))
    CNN_model.add(layers.Dense(1, activation='sigmoid'))

    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    CNN_model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
#     CNN_model.summary()
    return CNN_model

# Convolutional Neural Network Model - POS tags
def CNN_model_pos(vocab_length,embedding_dim,max_length):
    inp = Input(shape=(100,))
    pos = Input(shape=(100,))

    emb_word = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=False)(inp)
    pos_in = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=True)(pos)
    x = Concatenate()([emb_word,pos_in])

    x = layers.Conv1D(64, 7, activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(64, 7, activation='relu')(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[inp,pos], outputs=out)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

# Convolutional Neural Network Model - Glove & POS tags
def PTWE_CNN_model_pos(embedding_matrix, vocab_len, embedding_dim, max_length):
    inp = Input(shape=(100,))
    pos = Input(shape=(100,))

    emb_word = layers.Embedding(vocab_len, embedding_dim, 
                              weights=[embedding_matrix], input_length=max_length, trainable=False)(inp)
    pos_in = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=True)(pos)
    x = Concatenate()([emb_word,pos_in])

    x = layers.Conv1D(64, 7, activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(64, 7, activation='relu')(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[inp,pos], outputs=out)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

In [230]:
# fetch CNN model and create glove matrix - POS tags model
def fetch_glove_CNN_model_pos(corpus, max_length, vocab_len):
    embedding_dim, embedding_matrix = create_glove_matrix(corpus)
    model = HYB_model_glove_pos(embedding_matrix, max_length, embedding_dim, vocab_len)
    return model

# fetch CNN model - POS tags model
def fetch_CNN_model_pos(embedding_dim, max_length, vocab_len):
    model = HYB_model_pos(max_length, embedding_dim, vocab_len)
    return model

In [231]:
# This method evaluates the CNN classifier,
# It takes in a split dataset but returns evaluation metrics from the test data.
def cnn(train, val, test, corpus):
    embedding_dim = 50
    max_length = 100
    train_y, val_y, test_y = y_split(train, val, test)
    vocab_len, train_X, val_X, test_X = train_word_embeddings(train, val, test, corpus, max_length)
    model =  CNN_model(vocab_len,embedding_dim,max_length)
    predictions = NN_predict(model, train_X, train_y, val_X, val_y, test_X, test_y)
    return print_results("CNN", predictions, test_y)
   
# This method evaluates the CNN classifier, with Glove pretrained word embeddings
# It takes in a split dataset but returns evaluation metrics from the test data.    
def cnn_glove(train, val, test, corpus):
    embedding_dim, embedding_matrix = create_glove_matrix(corpus)
    max_length = 100
    train_y, val_y, test_y = y_split(train, val, test)
    vocab_len, train_X, val_X, test_X = train_word_embeddings(train, val, test, corpus, max_length)
    model =  PTWE_CNN_model(embedding_matrix, vocab_len, embedding_dim, max_length)
    predictions = NN_predict(model, train_X, train_y, val_X, val_y, test_X, test_y)
    results, matrix = print_results("CNN - Glove", predictions, test_y)  
    return results, matrix

# This method evaluates the CNN classifier, with POS tags
# It takes in a split dataset but returns evaluation metrics from the test data.
def cnn_pos(train, val, test, corpus):
    embedding_dim = 50
    max_length = 100
    train_y, val_y, test_y = y_split(train, val, test)
    vocab_len, train_X, val_X, test_X = train_word_embeddings(train, val, test, corpus, max_length)
    train_POS, val_POS, test_POS = pos_tokenize(train, val, test, corpus, max_length)
    model =  fetch_CNN_model_pos(embedding_dim,max_length, vocab_len)
    predictions = NN_predict_pos(train_X, train_POS, train_y, val_X, val_POS, val_y, test_X, test_POS, test_y, model)
    return print_results("CNN - POS", predictions, test_y)
   
# This method evaluates the CNN classifier, with Glove and POS tags
# It takes in a split dataset but returns evaluation metrics from the test data.    
def cnn_glove_pos(train, val, test, corpus):
    embedding_dim, embedding_matrix = create_glove_matrix(corpus)
    max_length = 100
    train_y, val_y, test_y = y_split(train, val, test)
    vocab_len, train_X, val_X, test_X = train_word_embeddings(train, val, test, corpus, max_length)
    train_POS, val_POS, test_POS = pos_tokenize(train, val, test, corpus, max_length)
    model =  fetch_glove_CNN_model_pos(corpus, max_length, vocab_len)
    predictions = NN_predict_pos(train_X, train_POS, train_y, val_X, val_POS, val_y, test_X, test_POS, test_y, model)
    results, matrix = print_results("CNN - Glove & POS", predictions, test_y)  
    return results, matrix

# C-LSTM

In [232]:
# Convolutional - Long Short Term Memory model - Glove
def HYB_model_glove(embedding_matrix, max_length, embedding_dim, vocab_len):
    inp = Input(shape=(100,))
    emb_word = layers.Embedding(vocab_len, embedding_dim, 
                              weights=[embedding_matrix], input_length=max_length, trainable=False)(inp)
    x = emb_word

    x = layers.Conv1D(120, 9, activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(80, 9, activation='relu')(x)
    
    x = layers.LSTM(units = 150, return_sequences = True)(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.LSTM(units = 150, return_sequences = True)(x)
    x = layers.GlobalMaxPooling1D()(x)
    
    x = layers.Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[inp], outputs=out)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

# Convolutional - Long Short Term Memory model
def HYB_model(max_length, embedding_dim, vocab_len):
    inp = Input(shape=(100,))
    emb_word = layers.Embedding(input_dim=vocab_len, 
                               output_dim=embedding_dim, 
                               input_length=max_length,
                                 trainable=True)(inp)
    x = emb_word

    x = layers.Conv1D(120, 9, activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(80, 9, activation='relu')(x)
    
    x = layers.LSTM(units = 150, return_sequences = True)(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.LSTM(units = 150, return_sequences = True)(x)
    x = layers.GlobalMaxPooling1D()(x)
    
    x = layers.Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[inp], outputs=out)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

# Convolutional - Long Short Term Memory model - Glove & POS tags
def HYB_model_glove_pos(embedding_matrix, max_length, embedding_dim, vocab_len):
    inp = Input(shape=(100,))
    pos = Input(shape=(100,))

    emb_word = layers.Embedding(vocab_len, embedding_dim, 
                              weights=[embedding_matrix], input_length=max_length, trainable=False)(inp)
    
    pos_in = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=True)(pos)
    x = Concatenate()([emb_word,pos_in])

    x = layers.Conv1D(120, 9, activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(80, 9, activation='relu')(x)
    
    x = layers.LSTM(units = 150, return_sequences = True)(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.LSTM(units = 150, return_sequences = True)(x)
    x = layers.GlobalMaxPooling1D()(x)
    
    x = layers.Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[inp,pos], outputs=out)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

# Convolutional - Long Short Term Memory model - POS tags
def HYB_model_pos(max_length, embedding_dim, vocab_len):
    inp = Input(shape=(100,))
    pos = Input(shape=(100,))

    emb_word = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=False)(inp)
    
    pos_in = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=True)(pos)
    x = Concatenate()([emb_word,pos_in])

    x = layers.Conv1D(120, 9, activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(80, 9, activation='relu')(x)
    
    x = layers.LSTM(units = 150, return_sequences = True)(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.LSTM(units = 150, return_sequences = True)(x)
    x = layers.GlobalMaxPooling1D()(x)
    
    x = layers.Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[inp,pos], outputs=out)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

In [233]:
# Creates the glove embedding matrix and returns the model
def hyb_glove_model(corpus, max_length, vocab_len):
    embedding_dim, embedding_matrix = create_glove_matrix(corpus)
    model = HYB_model_glove(embedding_matrix, max_length, embedding_dim, vocab_len)
    return model

# fetches the model
def hyb_model(embedding_dim, max_length, vocab_len):
    model = HYB_model(max_length, embedding_dim, vocab_len)
    return model

# creates the glove embedding matrix and fetches the model
def hyb_glove_model_pos(corpus, max_length, vocab_len):
    embedding_dim, embedding_matrix = create_glove_matrix(corpus)
    model = HYB_model_glove_pos(embedding_matrix, max_length, embedding_dim, vocab_len)
    return model

# fetches the model
def hyb_model_pos(embedding_dim, max_length, vocab_len):
    model = HYB_model_pos(max_length, embedding_dim, vocab_len)
    return model

In [234]:
#  This method evaluates the C-LSTM classifier,
# It takes in a split dataset but returns evaluation metrics from the test data. 
def clstm(train, val, test, corpus):
    embedding_dim = 300
    vocab_len, max_length, train_X, train_y, val_X, val_y, test_X, test_y = vectorize_embeddings(train, val, test, corpus)
    model = hyb_model(embedding_dim, max_length, vocab_len)
    predictions = NN_predict2(train_X, train_y, val_X, val_y, test_X, test_y, model)
    results, matrix = print_results("C-LSTM", predictions, test_y)  
    return results, matrix

#  This method evaluates the C-LSTM classifier, with Glove 
# It takes in a split dataset but returns evaluation metrics from the test data. 
def clstm_glove(train, val, test, corpus):
    vocab_len, max_length, train_X, train_y, val_X, val_y, test_X, test_y = vectorize_embeddings(train, val, test, corpus)
    model = hyb_glove_model(corpus, max_length, vocab_len)
    predictions = NN_predict2(train_X, train_y, val_X, val_y, test_X, test_y, model)
    results, matrix = print_results("C-LSTM - GloVe", predictions, test_y)  
    return results, matrix

#  This method evaluates the C-LSTM classifier, with POS tags
# It takes in a split dataset but returns evaluation metrics from the test data. 
def clstm_pos(train, val, test, corpus):
    embedding_dim = 300
    vocab_len, max_length, train_X, train_y, val_X, val_y, test_X, test_y = vectorize_embeddings(train, val, test, corpus)
    train_POS, val_POS, test_POS = pos_tokenize(train, val, test, corpus, max_length)
    model = hyb_model_pos(embedding_dim, max_length, vocab_len)
    predictions = NN_predict_pos(train_X, train_POS, train_y, val_X, val_POS, val_y, test_X, test_POS, test_y, model)
    results, matrix = print_results("C-LSTM - POS", predictions, test_y)  
    return results, matrix

#  This method evaluates the C-LSTM classifier, with Glove and POS tags
# It takes in a split dataset but returns evaluation metrics from the test data. 
def clstm_glove_pos(train, val, test, corpus):
    vocab_len, max_length, train_X, train_y, val_X, val_y, test_X, test_y = vectorize_embeddings(train, val, test, corpus)
    train_POS, val_POS, test_POS = pos_tokenize(train, val, test, corpus, max_length)
    model = hyb_glove_model_pos(corpus, max_length, vocab_len)
    predictions = NN_predict_pos(train_X, train_POS, train_y, val_X, val_POS, val_y, test_X, test_POS, test_y, model)
    results, matrix = print_results("C-LSTM - GloVe & POS", predictions, test_y)  
    return results, matrix

# Ensemble Classifier
- (LSTM + CNN)

In [235]:
# Ensemble Classifier - LSTM + CNN
def combined_model(max_length, embedding_dim, vocab_len):
    inp = Input(shape=(100,))

    emb_word = layers.Embedding(input_dim=vocab_len, 
                               output_dim=embedding_dim, 
                               input_length=max_length,
                                 trainable=True)(inp)
    
    x = emb_word
    y = x
    
    x = layers.Conv1D(120, 9, activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(80, 9, activation='relu')(x)
    x = layers.GlobalMaxPooling1D()(x)
    
    y = layers.LSTM(units = 150, return_sequences = True)(y)
    y = layers.MaxPooling1D(2)(y)
    y = layers.LSTM(units = 150, return_sequences = True)(y)
    y = layers.GlobalMaxPooling1D()(y)
    
    z = Concatenate()([x,y])
    out = Dense(1, activation='sigmoid')(z)

    model = Model(inputs=[inp], outputs=out)
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

# Ensemble Classifier - LSTM + CNN & Glove
def combined_glove_model(embedding_matrix, max_length, embedding_dim, vocab_len):
    inp = Input(shape=(100,))

    emb_word = layers.Embedding(vocab_len, embedding_dim, 
                              weights=[embedding_matrix], input_length=max_length, trainable=False)(inp)
    
    x = emb_word
    y = x
    
    x = layers.Conv1D(120, 9, activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(80, 9, activation='relu')(x)
    x = layers.GlobalMaxPooling1D()(x)
    
    y = layers.LSTM(units = 150, return_sequences = True)(y)
    y = layers.MaxPooling1D(2)(y)
    y = layers.LSTM(units = 150, return_sequences = True)(y)
    y = layers.GlobalMaxPooling1D()(y)
    
    z = Concatenate()([x,y])
    out = Dense(1, activation='sigmoid')(z)

    model = Model(inputs=[inp], outputs=out)
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

# Ensemble Classifier - LSTM + CNN & POS tags
def combined_pos_model(max_length, embedding_dim, vocab_len):
    inp = Input(shape=(100,))
    pos = Input(shape=(100,))

    emb_word = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=False)(inp)
    
    pos_in = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=True)(pos)
    x = Concatenate()([emb_word,pos_in])
    y = x
    
    x = layers.Conv1D(120, 9, activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(80, 9, activation='relu')(x)
    x = layers.GlobalMaxPooling1D()(x)
    
    y = layers.LSTM(units = 150, return_sequences = True)(y)
    y = layers.MaxPooling1D(2)(y)
    y = layers.LSTM(units = 150, return_sequences = True)(y)
    y = layers.GlobalMaxPooling1D()(y)
    
    z = Concatenate()([x,y])
    out = Dense(1, activation='sigmoid')(z)

    model = Model(inputs=[inp, pos], outputs=out)
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

# Ensemble Classifier - LSTM + CNN & Glove + POS tags
def combined_glove_pos_model(embedding_matrix, max_length, embedding_dim, vocab_len):
    inp = Input(shape=(100,))
    pos = Input(shape=(100,))

    emb_word = layers.Embedding(vocab_len, embedding_dim, 
                              weights=[embedding_matrix], input_length=max_length, trainable=False)(inp)
    
    pos_in = layers.Embedding(vocab_len, embedding_dim, input_length=max_length, trainable=True)(pos)
    x = Concatenate()([emb_word,pos_in])
    y = x
    
    x = layers.Conv1D(120, 9, activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(80, 9, activation='relu')(x)
    x = layers.GlobalMaxPooling1D()(x)
    
    y = layers.LSTM(units = 150, return_sequences = True)(y)
    y = layers.MaxPooling1D(2)(y)
    y = layers.LSTM(units = 150, return_sequences = True)(y)
    y = layers.GlobalMaxPooling1D()(y)
    
    z = Concatenate()([x,y])
    out = Dense(1, activation='sigmoid')(z)

    model = Model(inputs=[inp, pos], outputs=out)
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
#     model.summary()
    return model

In [236]:
# Creates glove embedding matrix & returns the model
def com_glove_model(corpus, max_length, vocab_len):
    embedding_dim, embedding_matrix = create_glove_matrix(corpus)
    model = combined_glove_model(embedding_matrix, max_length, embedding_dim, vocab_len)
    return model

# fetches the model
def com_model(embedding_dim, max_length, vocab_len):
    model = combined_model(max_length, embedding_dim, vocab_len)
    return model

# creates the glove embedding matrix & returns the model (POS)
def com_model_pos(embedding_dim, max_length, vocab_len):
    model = combined_pos_model(max_length, embedding_dim, vocab_len)
    return model

# fetches the model (POS)
def com_glove_model_pos(corpus, max_length, vocab_len):
    embedding_dim, embedding_matrix = create_glove_matrix(corpus)
    model = combined_glove_pos_model(embedding_matrix, max_length, embedding_dim, vocab_len)
    return model

In [237]:
#  This method evaluates the Ensemble classifier, 
# It takes in a split dataset but returns evaluation metrics from the test data. 
def ensemble_classifier(train, val, test, corpus):
    embedding_dim = 300
    vocab_len, max_length, train_X, train_y, val_X, val_y, test_X, test_y = vectorize_embeddings(train, val, test, corpus)
    model = com_model(embedding_dim, max_length, vocab_len)
    predictions = NN_predict2(train_X, train_y, val_X, val_y, test_X, test_y, model)
    results, matrix = print_results("Ensemble Classifier", predictions, test_y)  
    return results, matrix

#  This method evaluates the Ensemble classifier, with Glove 
# It takes in a split dataset but returns evaluation metrics from the test data. 
def ensemble_classifier_glove(train, val, test, corpus):
    vocab_len, max_length, train_X, train_y, val_X, val_y, test_X, test_y = vectorize_embeddings(train, val, test, corpus)
    model = com_glove_model(corpus, max_length, vocab_len)
    predictions = NN_predict2(train_X, train_y, val_X, val_y, test_X, test_y, model)
    results, matrix = print_results("Ensemble Classifier - GloVe", predictions, test_y)  
    return results, matrix

#  This method evaluates the Ensemble classifier, with POS tags
# It takes in a split dataset but returns evaluation metrics from the test data. 
def ensemble_classifier_pos(train, val, test, corpus):
    embedding_dim = 300
    vocab_len, max_length, train_X, train_y, val_X, val_y, test_X, test_y = vectorize_embeddings(train, val, test, corpus)
    train_POS, val_POS, test_POS = pos_tokenize(train, val, test, corpus, max_length)
    model = com_model_pos(embedding_dim, max_length, vocab_len)
    predictions = NN_predict_pos(train_X, train_POS, train_y, val_X, val_POS, val_y, test_X, test_POS, test_y, model)
    results, matrix = print_results("Ensemble Classifier - POS tags", predictions, test_y)  
    return results, matrix

#  This method evaluates the Ensemble classifier, with Glove and POS tags
# It takes in a split dataset but returns evaluation metrics from the test data. 
def ensemble_classifier_glove_pos(train, val, test, corpus):
    embedding_dim = 300
    vocab_len, max_length, train_X, train_y, val_X, val_y, test_X, test_y = vectorize_embeddings(train, val, test, corpus)
    train_POS, val_POS, test_POS = pos_tokenize(train, val, test, corpus, max_length)
    model = com_glove_model_pos(corpus, max_length, vocab_len)
    predictions = NN_predict_pos(train_X, train_POS, train_y, val_X, val_POS, val_y, test_X, test_POS, test_y, model)
    results, matrix = print_results("Ensemble Classifier - Glove & POS", predictions, test_y)  
    return results, matrix

# Experiments

Experiment01: TF-IDF weighting experiment
- Logistic Regression
- Support Vector Machines

Compares the use of ngram vectors to similar vectors that have tf-idf weighting applied on both dataset A and dataset B

In [238]:
# Experiment part 1: dataset A
def tf_idf_experiment_A():
    train, val, test, corpus = import_dataset_A()
    lr_results, lr_matrix = logistic_regression(train, val, test, corpus)
    print()
    lr_results2, lr_matrix2 = logistic_regression_tfidf(train, val, test, corpus)
    print()
    svm_results, svm_matrix = support_vector_machine(train, val, test, corpus)
    print()
    svm_results2, svm_matrix2 = support_vector_machine_tfidf(train, val, test, corpus)
    print()
    results = pd.concat([lr_results, lr_results2, svm_results, svm_results2])
    matrix = [lr_matrix,lr_matrix2,svm_matrix,svm_matrix2]
    return results, matrix

# Experiment part 2: dataset B
def tf_idf_experiment_B():
    train, val, test, corpus = import_dataset_B()
    lr_results, lr_matrix = logistic_regression(train, val, test, corpus)
    print()
    lr_results2, lr_matrix2 = logistic_regression_tfidf(train, val, test, corpus)
    print()
    svm_results, svm_matrix = support_vector_machine(train, val, test, corpus)
    print()
    svm_results2, svm_matrix2 = support_vector_machine_tfidf(train, val, test, corpus)
    print()
    results = pd.concat([lr_results, lr_results2, svm_results, svm_results2])
    matrix = [lr_matrix,lr_matrix2,svm_matrix,svm_matrix2]
    return results, matrix

# This method performs both parts of the experiment and returns results.
def tf_idf_experiment():
    print("TF-IDF Weighting Experiment...")
    print()
    print("Datatset A:")
    print()
    results_A, matrix_A = tf_idf_experiment_A()
    print("--------------------------------------------")
    print()
    print("Dataset B:")
    print()
    results_B, matrix_B = tf_idf_experiment_B()
    print()
    return results_A, matrix_A, results_B, matrix_B

In [239]:
# Run this cell to run the experiment
# This particular cell has a runtime of around 1 min.
results_A, matrix_A, results_B, matrix_B = tf_idf_experiment()
results_A.to_pickle("results_A.pkl")
results_B.to_pickle("results_B.pkl")

TF-IDF Weighting Experiment...

Datatset A:

Logistic Regression...
Training-Accuracy:0.9955572876071707
Validation-Accuracy:0.7584459459459459

Confusion matrix:
[[717  29]
 [ 32  55]]
            Classifier  Accuracy  Precision    Recall   F-Score
0  Logistic Regression  0.926771   0.654762  0.632184  0.643275

Logistic Regression...
Training-Accuracy:0.9829306313328137
Validation-Accuracy:0.7618243243243243

Confusion matrix:
[[705  41]
 [ 32  55]]
                     Classifier  Accuracy  Precision    Recall   F-Score
0  Logistic Regression (TF-IDF)  0.912365   0.572917  0.632184  0.601093

Support Vector Machine...
Training-Accuracy:0.7247856586126267
Validation-Accuracy:0.6672297297297297

Confusion matrix:
[[590 156]
 [ 32  55]]
               Classifier  Accuracy  Precision    Recall   F-Score
0  Support Vector Machine   0.77431   0.260664  0.632184  0.369128

Support Vector Machine...
Training-Accuracy:0.8214341387373344
Validation-Accuracy:0.7449324324324325

Confusion matri

----------------------------------------------------------------------------------------------------------------------

----------------------------------------------------------------------------------------------------------------------

Experiment02:
Pre-processing techniques experiment
- LSTM
- CNN
- C-LSTM
- Ensemble (CNN + LSTM)

Compares the use of stop word removal and lemmatization on neural network classifiers.

In [240]:
# This particular method runs all of the neural network classifiers in order
def nn_classify(train, val, test, corpus):
    results1, matrix1 = lstm(train, val, test, corpus)
    print()
    results2, matrix2 = lstm_glove(train, val, test, corpus)
    print()
    results3, matrix3 = cnn(train, val, test, corpus)
    print()
    results4, matrix4 = cnn_glove(train, val, test, corpus)
    print()
    results5, matrix5 = clstm(train, val, test, corpus)
    print()
    results6, matrix6 = clstm_glove(train, val, test, corpus)
    print()
    results7, matrix7 = ensemble_classifier(train, val, test, corpus)
    print()
    results8, matrix8 = ensemble_classifier_glove(train, val, test, corpus)
    print()
    results = pd.concat([results1, results2, results3, results4, results5, results6, results7, results8])
    matrix = [matrix1, matrix2, matrix3, matrix4, matrix5, matrix6, matrix7, matrix8]
    return results, matrix

# This particular method runs all of the neural network classifiers with pos tags as an input 
def nn_classify_pos(train, val, test, corpus):
    results1, matrix1 = lstm_pos(train, val, test, corpus)
    print()
    results2, matrix2 = lstm_glove_pos(train, val, test, corpus)
    print()
    results3, matrix3 = cnn_pos(train, val, test, corpus)
    print()
    results4, matrix4 = cnn_glove_pos(train, val, test, corpus)   
    print()
    results5, matrix5 = clstm_pos(train, val, test, corpus)
    print()
    results6, matrix6 = clstm_glove_pos(train, val, test, corpus)
    print()
    results7, matrix7 = ensemble_classifier_pos(train, val, test, corpus)
    print()
    results8, matrix8 = ensemble_classifier_glove_pos(train, val, test, corpus)
    print()
    results = pd.concat([results1, results2, results3, results4, results5, results6, results7, results8])
    matrix = [matrix1, matrix2, matrix3, matrix4, matrix5, matrix6, matrix7, matrix8]
    return results, matrix

In [241]:
# Experiment part 1: dataset A
def pp_experiment_A():
    train, val, test, corpus = import_dataset_A()
    results1, matrix1 = nn_classify(train, val, test, corpus)
    train, val, test, corpus = import_dataset_A_swl()
    results2, matrix2 = nn_classify(train, val, test, corpus)
    return results1, results2, matrix1, matrix2

# Experiment part 2: dataset B
def pp_experiment_B():
    train, val, test, corpus = import_dataset_B()
    results1, matrix1 = nn_classify(train, val, test, corpus)
    train, val, test, corpus = import_dataset_B_swl()
    results2, matrix2 = nn_classify(train, val, test, corpus)
    return results1, results2, matrix1, matrix2

# This method performs both parts of the experiment and returns results.
def pp_experiment():
    print("Pre-processing techniques experiment...")
    print()
    print("Dataset A:")
    print()
    results1, results2, matrix1, matrix2 = pp_experiment_A()
    print("-----------------------------------------------")
    print()
    print("Dataset B:")
    print()
    results3, results4, matrix3, matrix4 = pp_experiment_B()
    return results1, results2, matrix1, matrix2, results3, results4, matrix3, matrix4

In [242]:
# Run this cell to run the experiment
# This particular cell has a runtime of around 20-25 min.
results1, results2, matrix1, matrix2, results3, results4, matrix3, matrix4 = pp_experiment()
results1.to_pickle("results_1.pkl")
results2.to_pickle("results_2.pkl")
results3.to_pickle("results_3.pkl")
results4.to_pickle("results_4.pkl")

Pre-processing techniques experiment...

Dataset A:

Epoch 00002: early stopping
training data...
Accuracy: 0.9272798299789429
Loss:0.2389167696237564
validation data...
Accuracy: 0.7668918967247009
Loss:0.5746933221817017

Confusion matrix:
[[695  51]
 [ 29  58]]
  Classifier  Accuracy  Precision    Recall   F-Score
0       LSTM  0.903962    0.53211  0.666667  0.591837

Epoch 00003: early stopping
training data...
Accuracy: 0.7868277430534363
Loss:0.5217134356498718
validation data...
Accuracy: 0.7297297120094299
Loss:0.6893956661224365

Confusion matrix:
[[674  72]
 [ 36  51]]
     Classifier  Accuracy  Precision    Recall   F-Score
0  LSTM - Glove  0.870348   0.414634  0.586207  0.485714

Epoch 00002: early stopping
training data...
Accuracy: 0.9525331258773804
Loss:0.15320146083831787
validation data...
Accuracy: 0.7347972989082336
Loss:0.6285487413406372

Confusion matrix:
[[656  90]
 [ 25  62]]
  Classifier  Accuracy  Precision    Recall   F-Score
0        CNN  0.861945   0.40789

validation data...
Accuracy: 0.6881188154220581
Loss:0.7246740460395813

Confusion matrix:
[[399  77]
 [180 168]]
       Classifier  Accuracy  Precision    Recall  F-Score
0  C-LSTM - GloVe  0.688107   0.685714  0.482759  0.56661

Epoch 00002: early stopping
training data...
Accuracy: 0.9353078603744507
Loss:0.1833030879497528
validation data...
Accuracy: 0.594059407711029
Loss:1.1916375160217285

Confusion matrix:
[[456  20]
 [283  65]]
            Classifier  Accuracy  Precision    Recall   F-Score
0  Ensemble Classifier  0.632282   0.764706  0.186782  0.300231

Epoch 00002: early stopping
training data...
Accuracy: 0.9236944913864136
Loss:0.2020927369594574
validation data...
Accuracy: 0.6076732873916626
Loss:1.0626261234283447

Confusion matrix:
[[465  11]
 [280  68]]
                    Classifier  Accuracy  Precision    Recall   F-Score
0  Ensemble Classifier - GloVe  0.646845   0.860759  0.195402  0.318501

Epoch 00002: early stopping
training data...
Accuracy: 0.853390514850616

----------------------------------------------------------------------------------------------------------------------

----------------------------------------------------------------------------------------------------------------------

Experiment03:
Data Augmentation experiment
- LSTM
- CNN
- C-LSTM
- Ensemble (CNN + LSTM)

Compares the use of augmented training data on neural network classifiers.

In [243]:
# Experiment part 1: dataset A
def aug_experiment_A(): 
    train, val, test, corpus = import_dataset_A_aug()
    results, matrix = nn_classify(train, val, test, corpus)
    return results, matrix

# Experiment part 2: dataset B
def aug_experiment_B():
    train, val, test, corpus = import_dataset_B_aug()
    results, matrix = nn_classify(train, val, test, corpus)
    return results, matrix

# This method performs both parts of the experiment and returns results.
def aug_experiment():
    print("Data Augmentation experiment...")
    print()
    print("Dataset A:")
    print()
    results1, matrix1 = aug_experiment_A()
    print("------------------------------------------------")
    print()
    print("Dataset B:")
    print()
    results2, matrix2 = aug_experiment_B()
    return results1, results2, matrix1, matrix2

In [244]:
# Run this cell to run the experiment
# This particular cell has a runtime of around 10-15 min.
results5, results6, matrix5, matrix6 = aug_experiment()
results5.to_pickle("results_5.pkl")
results6.to_pickle("results_6.pkl")

Data Augmentation experiment...

Dataset A:

Epoch 00002: early stopping
training data...
Accuracy: 0.8665627241134644
Loss:0.34305328130722046
validation data...
Accuracy: 0.7432432174682617
Loss:0.5933619141578674

Confusion matrix:
[[697  49]
 [ 35  52]]
  Classifier  Accuracy  Precision    Recall   F-Score
0       LSTM   0.89916   0.514851  0.597701  0.553191

Epoch 00003: early stopping
training data...
Accuracy: 0.8825019598007202
Loss:0.28956174850463867
validation data...
Accuracy: 0.8091216087341309
Loss:0.4254849851131439

Confusion matrix:
[[675  71]
 [ 20  67]]
     Classifier  Accuracy  Precision    Recall   F-Score
0  LSTM - Glove  0.890756   0.485507  0.770115  0.595556

Epoch 00002: early stopping
training data...
Accuracy: 0.9438425302505493
Loss:0.1699533611536026
validation data...
Accuracy: 0.75
Loss:0.6121662855148315

Confusion matrix:
[[638 108]
 [ 23  64]]
  Classifier  Accuracy  Precision    Recall   F-Score
0        CNN  0.842737   0.372093  0.735632  0.494208

----------------------------------------------------------------------------------------------------------------------

----------------------------------------------------------------------------------------------------------------------

Experiment04:
Part of Speech tags experiment
- LSTM
- CNN
- C-LSTM
- Ensemble (CNN + LSTM)

Compares the use of POS tags when used with neural network classifiers.

In [245]:
# Experiment part 1: dataset A
def pos_experiment_A():
    train, val, test, corpus = import_dataset_A()
    results, matrix = nn_classify_pos(train, val, test, corpus)
    return results, matrix

# Experiment part 2: dataset B
def pos_experiment_B():
    train, val, test, corpus = import_dataset_B()
    results, matrix = nn_classify_pos(train, val, test, corpus)
    return results, matrix

# This method performs both parts of the experiment and returns results.
def pos_experiment():
    print("Part of speech tags experiment...")
    print()
    print("Dataset A:")
    print()
    results1, matrix1, = pos_experiment_A()
    print("------------------------------------------------")
    print()
    print("Dataset B:")
    print()
    results2, matrix2  = pos_experiment_B()
    return results1, results2, matrix1, matrix2

In [246]:
# Run this cell to run the experiment
# This particular cell has a runtime of around 10-15 min.
results7, results8, matrix7, matrix8 = pos_experiment()
results7.to_pickle("results_7.pkl")
results8.to_pickle("results_8.pkl")

Part of speech tags experiment...

Dataset A:

Epoch 00003: early stopping
training data...
Accuracy: 0.8710833787918091
Loss:0.355301171541214
validation data...
Accuracy: 0.7060810923576355
Loss:0.6345945596694946

Confusion matrix:
[[656  90]
 [ 41  46]]
   Classifier  Accuracy  Precision    Recall   F-Score
0  LSTM - POS  0.842737   0.338235  0.528736  0.412556

Epoch 00002: early stopping
training data...
Accuracy: 0.9526110887527466
Loss:0.1420421451330185
validation data...
Accuracy: 0.712837815284729
Loss:0.8599256873130798

Confusion matrix:
[[724  22]
 [ 46  41]]
         Classifier  Accuracy  Precision    Recall   F-Score
0  LSTM Glove & POS  0.918367   0.650794  0.471264  0.546667

Epoch 00003: early stopping
training data...
Accuracy: 0.8444271087646484
Loss:0.39387041330337524
validation data...
Accuracy: 0.6587837934494019
Loss:0.6594576239585876

Confusion matrix:
[[693  53]
 [ 51  36]]
  Classifier  Accuracy  Precision    Recall   F-Score
0  CNN - POS   0.87515   0.404