In [None]:
import numpy as np
import pandas as pd
import time
import os

In [None]:
def loadCSV(filename):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Path of the file to read. '.csv' is appended when the final path
        component does not already contain it.

    Returns
    -------
    pandas.DataFrame
        The parsed file, decoded as ISO-8859-1.
    """
    file = filename
    # Only inspect the file name itself: a parent directory that happens to
    # contain '.csv' (e.g. 'archive.csv_files/tweets') must not suppress the
    # extension.
    if '.csv' not in os.path.basename(filename):
        file += '.csv'
    return pd.read_csv(file, encoding = 'ISO-8859-1')

In [None]:
def loadNLPVectors(filename):
    """Load the NumPy array stored at ``nlp_data/<filename>.npy``."""
    # Callers pass only the bare array name; the folder prefix and the
    # '.npy' suffix are always added here.
    path = ''.join(('nlp_data/', filename, '.npy'))
    vectors = np.load(path)
    return vectors

In [None]:
def loadLabels():
    """Return the array that loadNLPVectors finds under the name "labels"."""
    label_vectors = loadNLPVectors("labels")
    return label_vectors

In [None]:
from sklearn.model_selection import train_test_split
def genData(nlp, target = None):
    """Split a feature matrix and its labels into stratified train/test sets.

    Parameters
    ----------
    nlp : array-like
        Feature rows to split.
    target : array-like, optional
        Labels aligned with ``nlp``. When omitted, the notebook-level
        ``labels`` array is used, so existing ``genData(features)`` calls
        behave exactly as before.

    Returns
    -------
    train, test : list, list
        ``[X_train, y_train]`` and ``[X_test, y_test]``.
    """
    if target is None:
        # Resolved at call time: ``labels`` only has to exist before the first
        # call, not before this definition runs.
        target = labels

    # 80/20 shuffled split with a fixed seed; stratifying on the labels keeps
    # the class proportions the same in both parts.
    X_train, X_test, y_train, y_test = train_test_split(nlp, target,
                                                        test_size = 0.2,
                                                        random_state = 42,
                                                        shuffle = True,
                                                        stratify = target)

    train = [X_train, y_train]
    test = [X_test, y_test]

    return train, test

# Load NLP Data

In [None]:
# Bare names of the saved feature arrays. They are passed to loadNLPVectors,
# which adds the 'nlp_data/' folder and the '.npy' extension.
unigram_array = "feature_array_unigram"
bigram_array = "feature_array_bigram"
tfidf_array = "feature_array_tfidf"
wordvec_array = "feature_array_word2vec"
unigram_reduced = "reduced_unigram"
bigram_reduced = "reduced_bigram"
tfidf_reduced = "reduced_tfidf"

In [None]:
# Load every feature array from disk, one per representation.
unigram = loadNLPVectors(unigram_array)
bigram = loadNLPVectors(bigram_array)
tfidf = loadNLPVectors(tfidf_array)
word2vec = loadNLPVectors(wordvec_array)
reduced_unigram = loadNLPVectors(unigram_reduced)
reduced_bigram = loadNLPVectors(bigram_reduced)
reduced_tfidf = loadNLPVectors(tfidf_reduced)
# genData reads this global as the target for every split, so it must be
# defined before any genData call runs.
labels = loadLabels()

# Generate Training and Testing Data

In [None]:
# Stratified 80/20 split of the unigram features; each side is [X, y].
train_uni, test_uni = genData(unigram)

In [None]:
# Stratified 80/20 split of the bigram features; each side is [X, y].
train_big, test_big = genData(bigram)

In [None]:
# Stratified 80/20 split of the TF-IDF features; each side is [X, y].
train_tfidf, test_tfidf = genData(tfidf)

In [None]:
# Stratified 80/20 split of the word2vec features; each side is [X, y].
train_vec, test_vec = genData(word2vec)

In [None]:
# Stratified 80/20 split of the reduced unigram features; each side is [X, y].
train_runi, test_runi = genData(reduced_unigram)

In [None]:
# Stratified 80/20 split of the reduced bigram features; each side is [X, y].
train_rbig, test_rbig = genData(reduced_bigram)

In [None]:
# Stratified 80/20 split of the reduced TF-IDF features; each side is [X, y].
train_rtfidf, test_rtfidf = genData(reduced_tfidf)

# Keras - Neural Networks

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
def plotFigures(hist, epoch_range):
    """Plot the per-epoch validation loss and validation accuracy.

    Parameters
    ----------
    hist : object
        Training history whose ``history`` mapping contains the keys
        'val_loss' and 'val_accuracy'.
    epoch_range : int
        Number of epochs recorded; the x axis runs from 1 to this value.
    """
    # Read both series up front so a missing key fails before any figure
    # is created.
    loss_curve = hist.history['val_loss']
    accuracy_curve = hist.history['val_accuracy']

    x_axis = range(1, epoch_range + 1)

    # (figure number, title, series, marker style, y-axis label)
    panels = (
        (1, "Loss", loss_curve, 'bo', 'Validation Loss'),
        (2, "Accuracy", accuracy_curve, 'ro', 'Validation Accuracy'),
    )
    for number, heading, series, marker, y_label in panels:
        plt.figure(number)
        plt.title(heading)
        plt.plot(x_axis, series, marker)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)

    plt.show()

In [None]:
def evaluate(model, history, test, name):
    """Print classification metrics for a trained model and plot its curves.

    Parameters
    ----------
    model : object
        Trained binary classifier whose ``predict`` returns one sigmoid
        score per sample.
    history : object
        Fit history; ``history.history['loss']`` gives the number of epochs
        that were run, and the validation series are plotted by plotFigures.
    test : sequence
        ``[X_test, y_test]``.
    name : str
        Label printed at the top of the report.
    """
    X_test = test[0]
    y_test = test[1]
    epoch_range = len(history.history['loss'])

    # Flatten the (n_samples, 1) sigmoid output to a 1-D score vector.
    scores = np.asarray(model.predict(X_test)).ravel()
    # Hard labels: anything not strictly below 0.5 counts as the positive class.
    y_pred = np.where(scores < .5, 0, 1)

    acc = accuracy_score(y_test, y_pred)
    # ROC AUC ranks samples by score, so it needs the continuous sigmoid
    # outputs; feeding it the thresholded labels collapses the curve to a
    # single operating point.
    roc_auc = roc_auc_score(y_test, scores)
    f = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(name + ' Evaluation: ')
    print('Accuracy:       ', acc)
    print('ROC AUC Score:  ', roc_auc)
    print('F1 Score:       ', f)
    print('Precision:      ', precision)
    print('Recall:         ', recall)

    plotFigures(history, epoch_range)

# Generate Sequential Neural Network

In [None]:
def _dnn_hidden_units(dim):
    """Return the hidden Dense layer widths DNN uses for input width ``dim``."""
    if dim > 15000:
        return (500, 64, 4)
    if dim > 10000:
        return (200, 32, 4)
    # The remaining architectures are keyed on an exact input width.
    return {500: (64, 16), 200: (32, 4), 50: (16, 4)}.get(dim, ())


def DNN(train, test, iterations = 50, early_stop = False):
    """Build, train and score a fully connected binary classifier.

    Parameters
    ----------
    train, test : sequence
        ``[X, y]`` pairs. The width of the first training row selects the
        hidden-layer sizes.
    iterations : int, optional
        Maximum number of training epochs (default 50).
    early_stop : bool, optional
        When True, training stops as soon as validation loss fails to
        improve by at least 0.0001. When False (default) every epoch runs.

    Returns
    -------
    model, history
        The trained model and the object returned by ``model.fit``.
    """
    X_train, y_train = train[0], train[1]
    X_test, y_test = test[0], test[1]

    dim = len(X_train[0])

    model = models.Sequential()
    # First hidden layer declares the input shape; every later hidden layer
    # is preceded by 50% dropout.
    for position, units in enumerate(_dnn_hidden_units(dim)):
        if position == 0:
            model.add(layers.Dense(units, activation = 'relu', input_shape = (dim,)))
        else:
            model.add(layers.Dropout(0.5))
            model.add(layers.Dense(units, activation = 'relu'))
    # NOTE(review): an input width that matches none of the cases above gets
    # no hidden layers, leaving only this output unit - confirm that is the
    # intended fallback.
    model.add(layers.Dense(1, activation = 'sigmoid'))

    model.summary()
    model.compile(loss = 'binary_crossentropy',
                  optimizer = 'adam',
                  metrics = ['accuracy'])

    # Attach the callback only when asked for; previously it was passed on
    # both branches, so early_stop = False still stopped early.
    callbacks = None
    if early_stop:
        callbacks = [EarlyStopping(monitor = 'val_loss', min_delta = .0001)]

    start = time.time()
    # NOTE(review): the test split doubles as validation data here, so the
    # validation curves and the final score come from the same samples.
    history = model.fit(X_train, y_train,
                        epochs = iterations,
                        verbose = 0,
                        batch_size = 128,
                        callbacks = callbacks,
                        validation_data = (X_test, y_test))
    elapsed = time.time() - start

    test_vals = model.evaluate(X_test, y_test)

    print("Training Time:  ", elapsed)
    print("Model Loss:     ", test_vals[0])
    print("Model Accuracy: ", test_vals[1])

    return model, history

# Generate Tokenized Data for CNN and RNN

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [None]:
def tokenData():
    """Tokenize the tweet text and encode its labels for the sequence models.

    Reads "binary_classification" through loadCSV, makes a stratified 80/20
    split on the 'tweet_class' column, converts 'tweet_text' to integer
    sequences and pads them to a common length.

    Returns
    -------
    train, test : list, list
        ``[padded_sequences, encoded_labels]`` for each split.
    max_len : int
        Length every sequence is padded to.
    vocab_size : int
        Number of distinct tokens seen in the training text, plus one.
    """
    tokenizer = Tokenizer()
    encoder = LabelEncoder()

    data = loadCSV("binary_classification")
    X_train, X_test, y_train, y_test = train_test_split(data["tweet_text"], data["tweet_class"],
                                                        test_size = 0.2, random_state = 42,
                                                        shuffle = True,
                                                        stratify = data["tweet_class"])

    # The vocabulary is learned from the training text only.
    tokenizer.fit_on_texts(X_train)
    train_seq = tokenizer.texts_to_sequences(X_train)
    test_seq = tokenizer.texts_to_sequences(X_test)

    # Pad to the longest sequence found in either split.
    max_len = max(len(seq) for seq in train_seq + test_seq)

    train_seq_array = pad_sequences(train_seq, maxlen = max_len)
    test_seq_array = pad_sequences(test_seq, maxlen = max_len)

    # Fit the label mapping on the training labels and reuse it for the test
    # labels; re-fitting on the test split could assign different codes.
    train_label = encoder.fit_transform(y_train)
    test_label = encoder.transform(y_test)

    train = [train_seq_array, train_label]
    test = [test_seq_array, test_label]

    vocab_size = len(tokenizer.word_index) + 1

    return train, test, max_len, vocab_size

# Generate Convolutional Neural Network

In [None]:
def CNN(iterations = 50, early_stop = False):
    """Train a 1-D convolutional binary classifier on the tokenized tweets.

    Parameters
    ----------
    iterations : int, optional
        Maximum number of training epochs.
    early_stop : bool, optional
        When True, an EarlyStopping callback monitoring validation loss is
        passed to ``fit``.

    Returns
    -------
    model, history, test
        The trained model, the object returned by ``model.fit`` and the
        ``[X_test, y_test]`` split produced by tokenData.
    """
    train, test, max_len, vocab_size = tokenData()
    X_train, y_train = train[0], train[1]
    X_test, y_test = test[0], test[1]

    # Embedding -> two convolution stages -> small dense head, with dropout
    # between the stages.
    stack = [
        layers.Embedding(input_dim = vocab_size, output_dim = 32, input_length = max_len),
        layers.Dropout(0.5),
        layers.Conv1D(filters = 64, kernel_size = 3, padding = 'same', activation = 'relu'),
        layers.MaxPooling1D(pool_size = 2),
        layers.Conv1D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(4, activation = 'relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation = 'sigmoid'),
    ]
    model = models.Sequential()
    for layer in stack:
        model.add(layer)

    model.summary()
    model.compile(loss = 'binary_crossentropy',
                  optimizer = 'adam',
                  metrics = ['accuracy'])

    callback = [EarlyStopping(monitor = 'val_loss', min_delta = .0001)]

    # Both modes share the same fit arguments; only the callback differs.
    fit_options = {
        'epochs': iterations,
        'batch_size': 128,
        'validation_data': (X_test, y_test),
    }
    if early_stop:
        fit_options['callbacks'] = callback

    started_at = time.time()
    history = model.fit(X_train, y_train, **fit_options)
    elapsed = time.time() - started_at

    test_vals = model.evaluate(X_test, y_test)

    print("Training Time:  ", elapsed)
    print("Model Loss:     ", test_vals[0])
    print("Model Accuracy: ", test_vals[1])

    return model, history, test

# Generate Recurrent Neural Network with LSTM

In [None]:
def RNN(iterations = 50, early_stop = False):
    """Train an LSTM binary classifier on the tokenized tweets.

    Parameters
    ----------
    iterations : int, optional
        Maximum number of training epochs.
    early_stop : bool, optional
        When True, an EarlyStopping callback monitoring validation loss is
        passed to ``fit``.

    Returns
    -------
    model, history, test
        The trained model, the object returned by ``model.fit`` and the
        ``[X_test, y_test]`` split produced by tokenData.
    """
    train, test, max_len, vocab_size = tokenData()

    X_train = train[0]
    y_train = train[1]

    X_test = test[0]
    y_test = test[1]

    model = models.Sequential()
    # The keyword is ``input_length`` (as in CNN); the previous spelling
    # ``input_len`` is not an Embedding argument and made the call fail.
    model.add(layers.Embedding(input_dim = vocab_size, output_dim = 128, input_length = max_len))
    model.add(layers.SpatialDropout1D(0.5))
    model.add(layers.LSTM(128, dropout = 0.5, recurrent_dropout = 0.5))
    model.add(layers.Dense(32, activation = 'relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation = 'sigmoid'))

    model.summary()
    model.compile(loss = 'binary_crossentropy',
                  optimizer = 'adam',
                  metrics = ['accuracy'])

    # Both modes share the same fit arguments; only the callback differs.
    fit_options = {
        'epochs': iterations,
        'batch_size': 128,
        'validation_data': (X_test, y_test),
    }
    if early_stop:
        fit_options['callbacks'] = [EarlyStopping(monitor = 'val_loss', min_delta = .0001)]

    start = time.time()
    history = model.fit(X_train, y_train, **fit_options)
    elapsed = time.time() - start

    test_vals = model.evaluate(X_test, y_test)

    print("Training Time:  ", elapsed)
    print("Model Loss:     ", test_vals[0])
    print("Model Accuracy: ", test_vals[1])

    return model, history, test

# Training Deep Neural Networks

### DNN Reduced Unigram

In [None]:
# Fit the dense network on the reduced unigram split (default: up to 50 epochs).
dnn_runi, dnn_runi_history = DNN(train_runi, test_runi)

### DNN Reduced Bigram

In [None]:
# Fit the dense network on the reduced bigram split (default: up to 50 epochs).
dnn_rbig, dnn_rbig_history = DNN(train_rbig, test_rbig)

### DNN Reduced TFIDF

In [None]:
# Fit the dense network on the reduced TF-IDF split (default: up to 50 epochs).
dnn_rtfidf, dnn_rtfidf_history = DNN(train_rtfidf, test_rtfidf)

### DNN Word2Vec

In [None]:
# Fit the dense network on the word2vec split (default: up to 50 epochs).
dnn_vec, dnn_vec_history = DNN(train_vec, test_vec)

### DNN Unigram

In [None]:
# Fit the dense network on the full unigram split (default: up to 50 epochs).
dnn_uni, dnn_uni_history = DNN(train_uni, test_uni)

### DNN Bigram

In [None]:
# Fit the dense network on the full bigram split (default: up to 50 epochs).
dnn_big, dnn_big_history = DNN(train_big, test_big)

### DNN TFIDF

In [None]:
# Fit the dense network on the full TF-IDF split (default: up to 50 epochs).
dnn_tfidf, dnn_tfidf_history = DNN(train_tfidf, test_tfidf)

## Evaluating Deep Neural Networks

In [None]:
# Print test-set metrics and plot the validation curves for the unigram DNN.
evaluate(dnn_uni, dnn_uni_history, test_uni, 'DNN Unigram')

In [None]:
# Print test-set metrics and plot the validation curves for the bigram DNN.
evaluate(dnn_big, dnn_big_history, test_big, 'DNN Bigram')

In [None]:
# Print test-set metrics and plot the validation curves for the TF-IDF DNN.
evaluate(dnn_tfidf, dnn_tfidf_history, test_tfidf, 'DNN TFIDF')

In [None]:
# Print test-set metrics and plot the validation curves for the reduced unigram DNN.
evaluate(dnn_runi, dnn_runi_history, test_runi, 'DNN Reduced Unigram')

In [None]:
# Print test-set metrics and plot the validation curves for the reduced bigram DNN.
evaluate(dnn_rbig, dnn_rbig_history, test_rbig, 'DNN Reduced Bigram')

In [None]:
# Print test-set metrics and plot the validation curves for the reduced TF-IDF DNN.
evaluate(dnn_rtfidf, dnn_rtfidf_history, test_rtfidf, 'DNN Reduced TFIDF')

In [None]:
# Print test-set metrics and plot the validation curves for the word2vec DNN.
evaluate(dnn_vec, dnn_vec_history, test_vec, 'DNN Word2Vec')

# Training and Evaluating Convolutional Neural Networks

In [None]:
# Train the CNN; it tokenizes the tweets itself and hands back its test split
# so the evaluation below uses the same data.
cnn_model, cnn_model_history, test_cnn = CNN()

In [None]:
# Print test-set metrics and plot the validation curves for the CNN.
evaluate(cnn_model, cnn_model_history, test_cnn, 'Convolutional Neural Network')

# Training and Evaluating Recurrent Neural Networks

In [None]:
# Train the LSTM network; it tokenizes the tweets itself and hands back its
# test split so the evaluation below uses the same data.
rnn_model, rnn_model_history, test_rnn = RNN()

In [None]:
# Print test-set metrics and plot the validation curves for the LSTM network.
evaluate(rnn_model, rnn_model_history, test_rnn, 'Recurrent Neural Network')

# Training Deep Neural Networks - Early Stopping

### DNN Reduced Unigram

### DNN Reduced Bigram

### DNN Reduced TFIDF

### DNN Word2Vec

### DNN Unigram

### DNN Bigram

### DNN TFIDF

# Evaluating Deep Neural Networks - Early Stopping

# Training and Evaluating Convolutional Neural Network - Early Stopping

# Training and Evaluating Recurrent Neural Network - Early Stopping