Tests performance of LSTM models created in LSTM.ipynb

In [11]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, BatchNormalization
from keras.layers.core import Dense, Activation, Dropout
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import itertools

In [2]:
def load_data():
    train_data = pd.read_csv('TRAIN_DATA.csv')
    train_X = train_data.iloc[:, :-1]
    train_y = train_data[train_data.columns[-1]]
    val_data = pd.read_csv('VAL_DATA.csv')
    val_X = val_data.iloc[:, :-1]
    val_y = val_data[val_data.columns[-1]]
    test_data = pd.read_csv('TEST_DATA.csv')
    test_X = test_data.iloc[:, :-1]
    test_y = test_data[test_data.columns[-1]]

    train_X = Convert(train_X)
    val_X = Convert(val_X)
    test_X = Convert(test_X)
    return train_X, train_y, val_X, val_y, test_X, test_y

def Convert(x):
    data = []
    for index, row in x.iterrows():
        row_lst = []
        for post in row:
            if isinstance(post, float) and np.isnan(post):
                row_lst.append(np.zeros(768))
            else:
                a = post.split()
                lst = []
                for val in a:
                    if val != '[' and val[-1] != ']' and val[0] != '[':
                        b = float(val)
                        lst.append(b)
                    elif val[-1] == ']' and len(val) > 1:
                        val = val[:-1]
                        b = float(val)
                        lst.append(b)
                    elif val[0] == '[' and len(val) > 1:
                        val = val[1:]
                        b = float(val)
                        lst.append(b)
                row_lst.append(np.array(lst))
        data.append(np.array(row_lst))
    parsed_test_x = np.array(data)
    return parsed_test_x

In [3]:
train_X, train_y, val_X, val_y, test_X, test_y = load_data()

# Test Best Model 1
Running Model 1 in LSTM.ipynb 10 times and averaging the statistics. Note each iteration does not show the output because of the formatting. I fixed it for Test 2

In [60]:
def createLSTM(X_data, loss):
    model = Sequential()

    # layer 1: LSTM
    model.add(LSTM(400, input_shape=(X_data.shape[1], X_data.shape[2])
                   , return_sequences=True))
    model.add(Dropout(0.2))
    # layer 2: LSTM
    model.add(LSTM(150, return_sequences=False))
    model.add(Dropout(0.2))

    # output
    model.add(Dense(1))
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

In [64]:
def getLossModel(train_X, train_y, loss):
    model = createLSTM(train_X, loss)
    # fit network
    history = model.fit(train_X, train_y, epochs=40, batch_size=120, validation_data=(val_X, val_y), verbose=0)

    return model

In [65]:
loss = 'mean_squared_error'
num_of_test_samples = test_y.shape[0]
batch_size = 120
n = 10

precision_total = 0
recall_total = 0
f1_total = 0
accuracy_total = 0

for i in range(n):
    model = getLossModel(train_X, train_y, loss)
    test_mse = model.evaluate(test_X, test_y, verbose=1)
    y_pred = model.predict_classes(test_X, num_of_test_samples // batch_size+1)
    report = classification_report(test_y, y_pred, target_names=target_names, output_dict=True)
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1 = report['macro avg']['f1-score']
    accuracy = report['accuracy']
    
    print("Iteration: %d, precision: %d, recall: %d, F1: %d, accuracy: %d" % (i, precision, recall, f1, accuracy))
    
    precision_total += precision
    recall_total += recall
    f1_total += f1
    accuracy_total += accuracy

precision_total = precision_total / n * 100
recall_total = recall_total / n * 100
f1_total = f1_total / n * 100
accuracy_total = accuracy_total / n * 100
print("Average, precision: %d, recall: %d, F1: %d, accuracy: %d" % (precision_total, recall_total, f1_total, accuracy_total))

    

Iteration: 0, precision: 0, recall: 0, F1: 0, accuracy: 0
Iteration: 1, precision: 0, recall: 0, F1: 0, accuracy: 0
Iteration: 2, precision: 0, recall: 0, F1: 0, accuracy: 0
Iteration: 3, precision: 0, recall: 0, F1: 0, accuracy: 0
Iteration: 4, precision: 0, recall: 0, F1: 0, accuracy: 0
Iteration: 5, precision: 0, recall: 0, F1: 0, accuracy: 0
Iteration: 6, precision: 0, recall: 0, F1: 0, accuracy: 0
Iteration: 7, precision: 0, recall: 0, F1: 0, accuracy: 0
Iteration: 8, precision: 0, recall: 0, F1: 0, accuracy: 0
Iteration: 9, precision: 0, recall: 0, F1: 0, accuracy: 0
Average, precision: 91, recall: 91, F1: 91, accuracy: 94


# Test Best Model 2

In [66]:
def createLSTM2(X_data, loss):
    model = Sequential()

    # layer 1: LSTM
    model.add(LSTM(768, input_shape=(X_data.shape[1], X_data.shape[2]), return_sequences=False))
    
    # output
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

In [71]:
def getLossModel2(train_X, train_y, loss):
    model = createLSTM2(train_X, loss)
    # fit network
    history = model.fit(train_X, train_y, epochs=40, batch_size=120, validation_data=(val_X, val_y), verbose=0)

    return model

In [72]:
loss = 'binary_crossentropy'
num_of_test_samples = test_y.shape[0]
batch_size = 120
n = 10

precision_total = 0
recall_total = 0
f1_total = 0
accuracy_total = 0

for i in range(n):
    model = getLossModel2(train_X, train_y, loss)
    test_mse = model.evaluate(test_X, test_y, verbose=1)
    y_pred = model.predict_classes(test_X, num_of_test_samples // batch_size+1)
    report = classification_report(test_y, y_pred, target_names=target_names, output_dict=True)
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1 = report['macro avg']['f1-score']
    accuracy = report['accuracy']
    
    print("Iteration: %d, precision: %.2f, recall: %.2f, F1: %.2f, accuracy: %.2f" % (i, precision, recall, f1, accuracy))
    
    precision_total += precision
    recall_total += recall
    f1_total += f1
    accuracy_total += accuracy

precision_total = precision_total / n * 100
recall_total = recall_total / n * 100
f1_total = f1_total / n * 100
accuracy_total = accuracy_total / n * 100
print("Average, precision: %d, recall: %d, F1: %d, accuracy: %d" % (precision_total, recall_total, f1_total, accuracy_total))

Iteration: 0, precision: 0.93, recall: 0.94, F1: 0.93, accuracy: 0.96
Iteration: 1, precision: 0.94, recall: 0.93, F1: 0.94, accuracy: 0.96
Iteration: 2, precision: 0.93, recall: 0.94, F1: 0.93, accuracy: 0.96
Iteration: 3, precision: 0.94, recall: 0.93, F1: 0.93, accuracy: 0.96
Iteration: 4, precision: 0.93, recall: 0.94, F1: 0.93, accuracy: 0.96
Iteration: 5, precision: 0.93, recall: 0.93, F1: 0.93, accuracy: 0.96
Iteration: 6, precision: 0.91, recall: 0.95, F1: 0.93, accuracy: 0.95
Iteration: 7, precision: 0.94, recall: 0.94, F1: 0.94, accuracy: 0.96
Iteration: 8, precision: 0.93, recall: 0.94, F1: 0.93, accuracy: 0.96
Iteration: 9, precision: 0.93, recall: 0.94, F1: 0.94, accuracy: 0.96
Average, precision: 92, recall: 93, F1: 93, accuracy: 95
