In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Define Loading Functions

In [None]:
def loadNLP(folder, name):
    """Load one saved NumPy array from the scenario-2 NLP data directory.

    Parameters
    ----------
    folder : str
        Sub-directory of ``nlp/scenario_2`` to read from.
    name : str
        File stem inside that sub-directory; ``.npy`` is appended.

    Returns
    -------
    The object returned by ``np.load`` for
    ``nlp/scenario_2/<folder>/<name>.npy`` (path is relative to the
    current working directory).
    """
    return np.load('nlp/scenario_2/%s/%s.npy' % (folder, name))

In [None]:
def loadData(folder):
    """Load the train and test splits of one feature representation.

    For each split (``'train'`` first, then ``'test'``) the feature array
    is read from ``folder`` and the matching label array from the
    ``'labels'`` folder, both through ``loadNLP``.

    Parameters
    ----------
    folder : str
        Name of the feature folder passed on to ``loadNLP``.

    Returns
    -------
    tuple
        ``(train, test)`` where each element is a two-item list
        ``[features, labels]``.
    """
    pairs = []
    for split in ('train', 'test'):
        features = loadNLP(folder, split)
        labels = loadNLP('labels', split)
        pairs.append([features, labels])

    train, test = pairs
    return train, test

# Load Data

In [None]:
# Each loadData call returns (train, test), both [features, labels] lists.
# Features come from nlp/scenario_2/<folder>/{train,test}.npy; the label
# arrays are re-read from nlp/scenario_2/labels/ on every call.
train_uni, test_uni = loadData('unigram')

In [None]:
train_tfu, test_tfu = loadData('unigram_tfidf')

In [None]:
train_big, test_big = loadData('bigram')

In [None]:
train_tfb, test_tfb = loadData('bigram_tfidf')

In [None]:
# The reduced_* folders are loaded exactly like the ones above.
train_runi, test_runi = loadData('reduced_unigram')

In [None]:
train_rtfu, test_rtfu = loadData('reduced_unigram_tfidf')

In [None]:
train_rbig, test_rbig = loadData('reduced_bigram')

In [None]:
train_rtfb, test_rtfb = loadData('reduced_bigram_tfidf')

# Define Machine Learning Classifier Functions

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
def genClassifiers():
    """Create fresh, default-configured classifiers with display names.

    Returns
    -------
    tuple
        ``(classifiers, names)``: two lists aligned by index, holding a
        new ``LogisticRegression`` and a new ``RandomForestClassifier``
        and the labels used when printing their results.
    """
    roster = [
        ('Logistic Regression', LogisticRegression),
        ('Random Forest', RandomForestClassifier),
    ]

    # Instantiate in roster order so every call hands back unfitted models.
    classifiers = [factory() for _, factory in roster]
    names = [label for label, _ in roster]

    return classifiers, names

In [None]:
def evalClassifiers(train, test):
    """Fit every classifier from ``genClassifiers`` and print test metrics.

    Parameters
    ----------
    train : sequence
        ``train[0]`` is the training feature matrix, ``train[1]`` the labels.
    test : sequence
        ``test[0]`` is the test feature matrix, ``test[1]`` the labels.

    Returns
    -------
    None
        Fit time, accuracy, ROC AUC, F score, precision and recall are
        printed for each classifier; nothing is returned.

    Notes
    -----
    ROC AUC is computed from ``predict_proba`` scores rather than the hard
    ``predict`` labels: feeding 0/1 predictions to ``roc_auc_score``
    collapses the ROC curve to a single operating point and does not
    measure ranking quality.
    """
    clf_array, clf_names = genClassifiers()

    X_train = train[0]
    y_train = train[1]

    X_test = test[0]
    y_test = test[1]

    for clf, clf_name in zip(clf_array, clf_names):
        # Only the fit is timed; prediction and scoring are excluded.
        start = time.time()
        clf.fit(X_train, y_train)
        elapsed = time.time() - start

        y_pred = clf.predict(X_test)
        # Column 1 is the probability of the second entry of clf.classes_.
        # NOTE(review): assumes binary labels whose larger value is the
        # positive class - confirm against the label files.
        y_score = clf.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_score)
        f_score = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        print(clf_name + ': Completed in ' + str(elapsed) + ' seconds')
        print(clf_name + ' Accuracy: ' + str(accuracy))
        print(clf_name + ' ROC AUC Score: ' + str(roc_auc))
        print(clf_name + ' F Score: ' + str(f_score))
        print(clf_name + ' Precision: ' + str(precision))
        print(clf_name + ' Recall: ' + str(recall))
        print(" ")

# Evaluate Classifiers

## Unigram Classifiers

In [None]:
# Every call below fits a fresh set of classifiers from genClassifiers on
# the given training pair and prints their metrics for the matching test
# pair; evalClassifiers returns nothing, so no cell output is expected
# beyond the printed lines.
evalClassifiers(train_uni, test_uni)

## Unigram-Tfidf Classifiers

In [None]:
evalClassifiers(train_tfu, test_tfu)

## Bigram Classifiers

In [None]:
evalClassifiers(train_big, test_big)

## Bigram-Tfidf Classifiers

In [None]:
evalClassifiers(train_tfb, test_tfb)

## Reduced Unigram Classifiers

In [None]:
evalClassifiers(train_runi, test_runi)

## Reduced Unigram-Tfidf Classifiers

In [None]:
evalClassifiers(train_rtfu, test_rtfu)

## Reduced Bigram Classifiers

In [None]:
evalClassifiers(train_rbig, test_rbig)

## Reduced Bigram-Tfidf Classifiers

In [None]:
evalClassifiers(train_rtfb, test_rtfb)

# Define Neural Network Functions

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
def _hiddenLayerSizes(dim):
    """Return the hidden-layer widths DNN uses for an input of size ``dim``."""
    if dim >= 40000:
        return [500, 128, 32, 4]
    if dim >= 9000:
        return [200, 64, 4]
    if dim >= 1000:
        return [100, 16]
    return [16, 4]


def DNN(train, test, iterations = 50, early_stop = False, info = 0):
    """Build, train and score a dense binary classifier.

    The network is a stack of L2-regularised ReLU ``Dense`` layers whose
    widths depend on the input dimensionality (see ``_hiddenLayerSizes``),
    with ``Dropout(0.5)`` between consecutive hidden layers and a single
    sigmoid output unit. It is compiled with binary cross-entropy, the
    Adam optimiser and the accuracy metric.

    Parameters
    ----------
    train : sequence
        ``train[0]`` is the training feature matrix, ``train[1]`` the labels.
    test : sequence
        ``test[0]`` is the test feature matrix, ``test[1]`` the labels.
    iterations : int, optional
        Maximum number of training epochs (default 50).
    early_stop : bool, optional
        When True, train with an ``EarlyStopping`` callback monitoring
        ``val_loss`` with ``min_delta = .0001``.
    info : int, optional
        Passed as ``verbose`` to ``model.fit``.

    Returns
    -------
    tuple
        ``(model, history)``: the trained model and the object returned
        by ``model.fit``.

    Notes
    -----
    The test split is passed to ``fit`` as ``validation_data``, so with
    ``early_stop = True`` the stopping decision is made on the same data
    that is later reported as test performance.
    NOTE(review): consider holding out a separate validation split.
    """
    X_train = train[0]
    y_train = train[1]

    X_test = test[0]
    y_test = test[1]

    # Input width is taken from the first training row.
    dim = len(X_train[0])
    widths = _hiddenLayerSizes(dim)
    last = len(widths) - 1

    model = models.Sequential()
    for position, width in enumerate(widths):
        dense_args = {'kernel_regularizer': l2(.001), 'activation': 'relu'}
        if position == 0:
            # Only the first layer of a Sequential model declares the
            # input shape; later layers infer it from their predecessor.
            dense_args['input_shape'] = (dim,)
        model.add(layers.Dense(width, **dense_args))
        # Dropout sits between hidden layers, not before the output unit.
        if position < last:
            model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation = 'sigmoid'))

    model.summary()
    model.compile(loss = 'binary_crossentropy',
                  optimizer = 'adam',
                  metrics = ['accuracy'])

    # callbacks = None is fit's own default, so a single fit call covers
    # both the early-stopping and the fixed-epoch case.
    callbacks = None
    if early_stop:
        callbacks = [EarlyStopping(monitor = 'val_loss', min_delta = .0001)]

    start = time.time()
    history = model.fit(X_train, y_train,
                        epochs = iterations,
                        verbose = info,
                        batch_size = 128,
                        callbacks = callbacks,
                        validation_data = (X_test, y_test))
    elapsed = time.time() - start

    # With metrics = ['accuracy'], evaluate returns [loss, accuracy].
    test_vals = model.evaluate(X_test, y_test)

    print("Training Time:  ", elapsed)
    print("Model Loss:     ", test_vals[0])
    print("Model Accuracy: ", test_vals[1])

    return model, history

In [None]:
def plotFigures(hist, epoch_range):
    """Plot training vs. validation loss and accuracy curves.

    Parameters
    ----------
    hist : object
        Training history whose ``history`` dict holds the ``'loss'``,
        ``'accuracy'``, ``'val_loss'`` and ``'val_accuracy'`` series.
    epoch_range : int
        Number of epochs to put on the x axis (1 .. ``epoch_range``).

    Draws the loss curves on pyplot figure 1 and the accuracy curves on
    pyplot figure 2, then calls ``plt.show()``.
    """
    # Pull every series up front so a missing key fails before any drawing.
    record = hist.history
    train_loss, train_acc = record['loss'], record['accuracy']
    val_loss, val_acc = record['val_loss'], record['val_accuracy']

    epochs = range(1, epoch_range + 1)

    # (figure number, title, y-axis label, training curve, validation curve)
    panels = (
        (1, "Training and Validation Loss", 'Loss',
         (train_loss, 'Training Loss'), (val_loss, 'Validation Loss')),
        (2, "Training and Validation Accuracy", 'Accuracy',
         (train_acc, 'Training Acc'), (val_acc, 'Validation Acc')),
    )

    for number, heading, y_label, train_curve, val_curve in panels:
        plt.figure(number)
        plt.title(heading)
        plt.plot(epochs, train_curve[0], 'r', label = train_curve[1])
        plt.plot(epochs, val_curve[0], 'b', label = val_curve[1])
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()

    plt.show()

In [None]:
def evaluate(model, history, test, name):
    """Print classification metrics for a trained network and plot its curves.

    Parameters
    ----------
    model : object
        Trained model; ``model.predict(X_test)`` must return one score per
        test row (for the networks built by ``DNN`` this is the output of
        the single sigmoid unit).
    history : object
        Training history; ``history.history['loss']`` determines how many
        epochs are plotted.
    test : sequence
        ``test[0]`` is the test feature matrix, ``test[1]`` the labels.
    name : str
        Label printed in the report header.

    Returns
    -------
    None
        Metrics are printed and the curves are drawn via ``plotFigures``.

    Notes
    -----
    Accuracy, F1, precision and recall use labels thresholded at 0.5
    (scores below 0.5 become 0, everything else 1). ROC AUC uses the raw
    scores: passing thresholded labels to ``roc_auc_score`` would reduce
    the ROC curve to a single operating point.
    """
    X_test = test[0]
    y_test = test[1]
    epoch_range = len(history.history['loss'])

    # Flatten to one score per row so the metric functions get 1-D input.
    pred_sigmoid = np.asarray(model.predict(X_test)).ravel()
    y_pred = np.where(pred_sigmoid < .5, 0, 1)

    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, pred_sigmoid)
    f = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(name + ' Evaluation: ')
    print('Accuracy:       ', acc)
    print('ROC AUC Score:  ', roc_auc)
    print('F1 Score:       ', f)
    print('Precision:      ', precision)
    print('Recall:         ', recall)

    plotFigures(history, epoch_range)

# Training Neural Networks

In [None]:
# DNN defaults apply to every call in this section: iterations = 50,
# early_stop = False, info = 0 (passed to fit as verbose). Each call
# returns the trained model and its fit history, kept for evaluation.
dnn_uni, hist_uni = DNN(train_uni, test_uni)

In [None]:
dnn_tfu, hist_tfu = DNN(train_tfu, test_tfu)

In [None]:
dnn_big, hist_big = DNN(train_big, test_big)

In [None]:
dnn_tfb, hist_tfb = DNN(train_tfb, test_tfb)

In [None]:
dnn_runi, hist_runi = DNN(train_runi, test_runi)

In [None]:
dnn_rtfu, hist_rtfu = DNN(train_rtfu, test_rtfu)

In [None]:
dnn_rbig, hist_rbig = DNN(train_rbig, test_rbig)

In [None]:
dnn_rtfb, hist_rtfb = DNN(train_rtfb, test_rtfb)

# Evaluate Neural Networks

In [None]:
# Each call prints the metric report for one trained network under the
# given label, then plots its loss/accuracy curves via plotFigures; the
# number of plotted epochs is the length of the history's 'loss' series.
evaluate(dnn_uni, hist_uni, test_uni, 'DNN Unigram')

In [None]:
evaluate(dnn_tfu, hist_tfu, test_tfu, 'DNN Unigram-Tfidf')

In [None]:
evaluate(dnn_big, hist_big, test_big, 'DNN Bigram')

In [None]:
evaluate(dnn_tfb, hist_tfb, test_tfb, 'DNN Bigram-Tfidf')

In [None]:
evaluate(dnn_runi, hist_runi, test_runi, 'DNN Reduced Unigram')

In [None]:
evaluate(dnn_rtfu, hist_rtfu, test_rtfu, 'DNN Reduced Unigram-Tfidf')

In [None]:
evaluate(dnn_rbig, hist_rbig, test_rbig, 'DNN Reduced Bigram')

In [None]:
evaluate(dnn_rtfb, hist_rtfb, test_rtfb, 'DNN Reduced Bigram-Tfidf')

# Training Neural Networks - Early Stopping

In [None]:
# Same feature sets as the previous training section, now with
# early_stop = True so DNN attaches its EarlyStopping(monitor = 'val_loss')
# callback. Inside DNN the test pair is also the validation data, so the
# stopping point is chosen on the test split.
dnn_uni_s, hist_uni_s = DNN(train_uni, test_uni, early_stop = True)

In [None]:
dnn_tfu_s, hist_tfu_s = DNN(train_tfu, test_tfu, early_stop = True)

In [None]:
dnn_big_s, hist_big_s = DNN(train_big, test_big, early_stop = True)

In [None]:
dnn_tfb_s, hist_tfb_s = DNN(train_tfb, test_tfb, early_stop = True)

In [None]:
dnn_runi_s, hist_runi_s = DNN(train_runi, test_runi, early_stop = True)

In [None]:
dnn_rtfu_s, hist_rtfu_s = DNN(train_rtfu, test_rtfu, early_stop = True)

In [None]:
dnn_rbig_s, hist_rbig_s = DNN(train_rbig, test_rbig, early_stop = True)

In [None]:
dnn_rtfb_s, hist_rtfb_s = DNN(train_rtfb, test_rtfb, early_stop = True)

# Evaluate Neural Networks - Early Stopping

In [None]:
# Reports for the early-stopped networks. evaluate derives the epoch
# count from each history's 'loss' series, so the plotted curves cover
# only the epochs that were actually run.
evaluate(dnn_uni_s, hist_uni_s, test_uni, 'DNN Early Stop Unigram')

In [None]:
evaluate(dnn_tfu_s, hist_tfu_s, test_tfu, 'DNN Early Stop Unigram-Tfidf')

In [None]:
evaluate(dnn_big_s, hist_big_s, test_big, 'DNN Early Stop Bigram')

In [None]:
evaluate(dnn_tfb_s, hist_tfb_s, test_tfb, 'DNN Early Stop Bigram-Tfidf')

In [None]:
evaluate(dnn_runi_s, hist_runi_s, test_runi, 'DNN Early Stop Reduced Unigram')

In [None]:
evaluate(dnn_rtfu_s, hist_rtfu_s, test_rtfu, 'DNN Early Stop Reduced Unigram-Tfidf')

In [None]:
evaluate(dnn_rbig_s, hist_rbig_s, test_rbig, 'DNN Early Stop Reduced Bigram')

In [None]:
evaluate(dnn_rtfb_s, hist_rtfb_s, test_rtfb, 'DNN Early Stop Reduced Bigram-Tfidf')