# Multilayer Perceptron Classifier

## Global Constants

In [None]:
# Number of rows packed into each batch by corpus_generator; also the divisor
# used when computing the number of steps per epoch / evaluation pass.
BATCH_SIZE = 1000
# Numerator for the training steps_per_epoch computation.
# Presumably the number of records in each training file -- TODO confirm.
NUM_TRAINING_DATA = 529488
# Numerator for the evaluation steps computation.
# Presumably the number of records in each testing file -- TODO confirm.
NUM_TESTING_DATA = 125990
# Width of the one-hot label vectors and of the classifier's output layer.
NUM_AUTHORS = 13440

## Generator Functions

Defines the functions used to stream feature vectors and labels in batches from the provided training/testing files.

In [None]:
import json
import numpy as np

def json_output_to_vector(json_output):
    """Parse the serialized feature vector of a post into a list of floats.

    Args:
        json_output: The ``output`` field of one doc2vec record: a string
            holding a bracketed, comma-separated list of numbers, for example
            ``"[0.1, -2.5]"``.

    Returns:
        A list of floats in the order they appear in the string, or an empty
        list when the string contains no numbers (for example ``"[]"``).

    Raises:
        ValueError: If any item between the commas is not a valid float.
    """
    # Trim whitespace before the brackets: a trailing newline would otherwise
    # stop strip('][') from removing the closing bracket.
    body = json_output.strip().strip('][').strip()
    if not body:
        # ''.split(',') is [''], and float('') raises, so handle it up front.
        return []
    return [float(item) for item in body.split(',')]

seen_aids = set()  # Every author id encountered so far
aid_to_id = {}  # author id -> column index in the one-hot label
def aid_to_label(json_id):
    """Return the one-hot label for the author id of a post.

    Author ids are assigned column indices in the order they are first seen,
    and the assignment is remembered in the module-level ``seen_aids`` and
    ``aid_to_id`` so repeated ids map to the same column.

    Args:
        json_id: The author id of a record; anything ``int()`` accepts.

    Returns:
        A numpy array of length NUM_AUTHORS that is zero everywhere except
        for a 1 at the author's column.

    Raises:
        IndexError: If more than NUM_AUTHORS distinct author ids are seen.
    """
    author = int(json_id)
    one_hot = np.zeros((NUM_AUTHORS))

    # Register a new author under the next free column. The containers are
    # only mutated, never rebound, so no ``global`` declaration is needed.
    if author not in seen_aids:
        column = len(seen_aids)
        seen_aids.add(author)
        aid_to_id[author] = column

    one_hot[aid_to_id[author]] = 1
    return one_hot

def corpus_generator(filepath):
    """Endlessly yield (features, labels) batches read from a doc2vec file.

    Each line of the file is parsed as a JSON object whose ``output`` field is
    turned into a feature row and whose ``author_id`` field is turned into a
    one-hot label row. When the end of the file is reached the generator
    rewinds and starts over, so it never terminates on its own.

    Only rows that were actually read are yielded: the last batch of a pass
    over the file is shorter than BATCH_SIZE instead of being padded with
    all-zero feature/label rows, and no empty batch is emitted when the file
    length is an exact multiple of BATCH_SIZE. This keeps one pass equal to
    ``ceil(num_records / BATCH_SIZE)`` batches.

    Args:
        filepath: Path of the file to stream, one JSON record per line.

    Yields:
        A tuple ``(batch_features, batch_labels)`` of numpy arrays with shapes
        ``(rows, 50)`` and ``(rows, NUM_AUTHORS)``, where
        ``1 <= rows <= BATCH_SIZE``.

    Raises:
        ValueError: If the file contains no records at all.
    """
    # The context manager closes the file when the generator is closed or
    # garbage collected; the original handle was never released.
    with open(filepath, "r") as file:
        rows_this_pass = 0
        while True:
            batch_features = np.zeros((BATCH_SIZE, 50))
            batch_labels = np.zeros((BATCH_SIZE, NUM_AUTHORS))

            filled = 0
            while filled < BATCH_SIZE:
                line = file.readline()
                if not line:  # readline() returns '' only at end of file
                    break
                data = json.loads(line)
                batch_features[filled] = json_output_to_vector(data['output'])
                batch_labels[filled] = aid_to_label(data['author_id'])
                filled += 1
            rows_this_pass += filled

            if filled < BATCH_SIZE:
                # End of file reached: rewind for the next pass.
                if rows_this_pass == 0:
                    # Without this guard an empty file would spin forever.
                    raise ValueError("no records found in " + repr(filepath))
                file.seek(0)
                rows_this_pass = 0

            if filled:
                # Drop the unfilled tail so padding rows are never used as
                # training or evaluation samples.
                yield batch_features[:filled], batch_labels[:filled]

## MLP Model

Defines the MLP model.

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import SGD
import math

def create_classifier():
    """Build and compile the multilayer perceptron used for author attribution.

    The network takes a 50-dimensional input, passes it through two ReLU
    hidden layers (256 and 512 units) and ends in a softmax layer with one
    unit per author (NUM_AUTHORS).

    Returns:
        A compiled keras ``Sequential`` model using the rmsprop optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    model = Sequential()
    model.add(Dense(256, activation='relu', input_dim=50))
    model.add(Dense(512, activation='relu'))
    # The labels are one-hot vectors over mutually exclusive authors, so the
    # output must be a probability distribution: softmax, not an unbounded
    # ReLU.
    model.add(Dense(NUM_AUTHORS, activation='softmax'))
    # categorical_crossentropy matches one-hot multi-class targets. With
    # binary_crossentropy, 'accuracy' is computed per output unit, so a model
    # predicting all zeros would score close to 100% across NUM_AUTHORS units.
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

## Train and Evaluate Classifier: Unigram Chars

In [None]:
# Train/test vector files for the unigram-chars feature set.
train_filepath = "doc2vec_outputs/inferred_training_unigram_chars.json"
test_filepath = "doc2vec_outputs/inferred_testing_unigram_chars.json"

# Start from a fresh, untrained network for this feature set.
unigram_chars = create_classifier()

# Ten epochs, each drawing ceil(NUM_TRAINING_DATA / BATCH_SIZE) batches.
unigram_chars.fit_generator(
    corpus_generator(train_filepath),
    epochs=10,
    steps_per_epoch=math.ceil(NUM_TRAINING_DATA / BATCH_SIZE),
)

# Evaluate on ceil(NUM_TESTING_DATA / BATCH_SIZE) batches of the test file;
# index 1 of the result is what gets reported as accuracy.
result = unigram_chars.evaluate_generator(
    corpus_generator(test_filepath),
    steps=math.ceil(NUM_TESTING_DATA / BATCH_SIZE),
)
print("Unigram Chars Accuracy:", result[1])

## Train and Evaluate Classifier: Bigram Chars

In [None]:
# Train/test vector files for the bigram-chars feature set.
train_filepath = "doc2vec_outputs/inferred_training_bigram_chars.json"
test_filepath = "doc2vec_outputs/inferred_testing_bigram_chars.json"

# Start from a fresh, untrained network for this feature set.
bigram_chars = create_classifier()

# Ten epochs, each drawing ceil(NUM_TRAINING_DATA / BATCH_SIZE) batches.
bigram_chars.fit_generator(
    corpus_generator(train_filepath),
    epochs=10,
    steps_per_epoch=math.ceil(NUM_TRAINING_DATA / BATCH_SIZE),
)

# Evaluate on ceil(NUM_TESTING_DATA / BATCH_SIZE) batches of the test file;
# index 1 of the result is what gets reported as accuracy.
result = bigram_chars.evaluate_generator(
    corpus_generator(test_filepath),
    steps=math.ceil(NUM_TESTING_DATA / BATCH_SIZE),
)
print("Bigram Chars Accuracy:", result[1])

## Train and Evaluate Classifier: Trigram Chars

In [None]:
# Train/test vector files for the trigram-chars feature set.
train_filepath = "doc2vec_outputs/inferred_training_trigram_chars.json"
test_filepath = "doc2vec_outputs/inferred_testing_trigram_chars.json"

# Start from a fresh, untrained network for this feature set.
trigram_chars = create_classifier()

# Ten epochs, each drawing ceil(NUM_TRAINING_DATA / BATCH_SIZE) batches.
trigram_chars.fit_generator(
    corpus_generator(train_filepath),
    epochs=10,
    steps_per_epoch=math.ceil(NUM_TRAINING_DATA / BATCH_SIZE),
)

# Evaluate on ceil(NUM_TESTING_DATA / BATCH_SIZE) batches of the test file;
# index 1 of the result is what gets reported as accuracy.
result = trigram_chars.evaluate_generator(
    corpus_generator(test_filepath),
    steps=math.ceil(NUM_TESTING_DATA / BATCH_SIZE),
)
print("Trigram Chars Accuracy:", result[1])

## Train and Evaluate Classifier: Unigram Words

In [None]:
# Train/test vector files for the unigram-words feature set.
train_filepath = "doc2vec_outputs/inferred_training_unigram_words.json"
test_filepath = "doc2vec_outputs/inferred_testing_unigram_words.json"

# Start from a fresh, untrained network for this feature set.
unigram_words = create_classifier()

# Ten epochs, each drawing ceil(NUM_TRAINING_DATA / BATCH_SIZE) batches.
unigram_words.fit_generator(
    corpus_generator(train_filepath),
    epochs=10,
    steps_per_epoch=math.ceil(NUM_TRAINING_DATA / BATCH_SIZE),
)

# Evaluate on ceil(NUM_TESTING_DATA / BATCH_SIZE) batches of the test file;
# index 1 of the result is what gets reported as accuracy.
result = unigram_words.evaluate_generator(
    corpus_generator(test_filepath),
    steps=math.ceil(NUM_TESTING_DATA / BATCH_SIZE),
)
print("Unigram Words Accuracy:", result[1])

## Train and Evaluate Classifier: Bigram Words

In [None]:
# Train/test vector files for the bigram-words feature set.
train_filepath = "doc2vec_outputs/inferred_training_bigram_words.json"
test_filepath = "doc2vec_outputs/inferred_testing_bigram_words.json"

# Start from a fresh, untrained network for this feature set.
bigram_words = create_classifier()

# Ten epochs, each drawing ceil(NUM_TRAINING_DATA / BATCH_SIZE) batches.
bigram_words.fit_generator(
    corpus_generator(train_filepath),
    epochs=10,
    steps_per_epoch=math.ceil(NUM_TRAINING_DATA / BATCH_SIZE),
)

# Evaluate on ceil(NUM_TESTING_DATA / BATCH_SIZE) batches of the test file;
# index 1 of the result is what gets reported as accuracy.
result = bigram_words.evaluate_generator(
    corpus_generator(test_filepath),
    steps=math.ceil(NUM_TESTING_DATA / BATCH_SIZE),
)
print("Bigram Words Accuracy:", result[1])

## Train and Evaluate Classifier: Trigram Words

In [None]:
# Train/test vector files for the trigram-words feature set.
train_filepath = "doc2vec_outputs/inferred_training_trigram_words.json"
test_filepath = "doc2vec_outputs/inferred_testing_trigram_words.json"

# Start from a fresh, untrained network for this feature set.
trigram_words = create_classifier()

# Ten epochs, each drawing ceil(NUM_TRAINING_DATA / BATCH_SIZE) batches.
trigram_words.fit_generator(
    corpus_generator(train_filepath),
    epochs=10,
    steps_per_epoch=math.ceil(NUM_TRAINING_DATA / BATCH_SIZE),
)

# Evaluate on ceil(NUM_TESTING_DATA / BATCH_SIZE) batches of the test file;
# index 1 of the result is what gets reported as accuracy.
result = trigram_words.evaluate_generator(
    corpus_generator(test_filepath),
    steps=math.ceil(NUM_TESTING_DATA / BATCH_SIZE),
)
print("Trigram Words Accuracy:", result[1])

## Train and Evaluate Classifier: Unigram POS

In [None]:
# Train/test vector files for the unigram-pos feature set.
train_filepath = "doc2vec_outputs/inferred_training_unigram_pos.json"
test_filepath = "doc2vec_outputs/inferred_testing_unigram_pos.json"

# Start from a fresh, untrained network for this feature set.
unigram_pos = create_classifier()

# Ten epochs, each drawing ceil(NUM_TRAINING_DATA / BATCH_SIZE) batches.
unigram_pos.fit_generator(
    corpus_generator(train_filepath),
    epochs=10,
    steps_per_epoch=math.ceil(NUM_TRAINING_DATA / BATCH_SIZE),
)

# Evaluate on ceil(NUM_TESTING_DATA / BATCH_SIZE) batches of the test file;
# index 1 of the result is what gets reported as accuracy.
result = unigram_pos.evaluate_generator(
    corpus_generator(test_filepath),
    steps=math.ceil(NUM_TESTING_DATA / BATCH_SIZE),
)
print("Unigram POS Accuracy:", result[1])

## Train and Evaluate Classifier: Bigram POS

In [None]:
# Train/test vector files for the bigram-pos feature set.
train_filepath = "doc2vec_outputs/inferred_training_bigram_pos.json"
test_filepath = "doc2vec_outputs/inferred_testing_bigram_pos.json"

# Start from a fresh, untrained network for this feature set.
bigram_pos = create_classifier()

# Ten epochs, each drawing ceil(NUM_TRAINING_DATA / BATCH_SIZE) batches.
bigram_pos.fit_generator(
    corpus_generator(train_filepath),
    epochs=10,
    steps_per_epoch=math.ceil(NUM_TRAINING_DATA / BATCH_SIZE),
)

# Evaluate on ceil(NUM_TESTING_DATA / BATCH_SIZE) batches of the test file;
# index 1 of the result is what gets reported as accuracy.
result = bigram_pos.evaluate_generator(
    corpus_generator(test_filepath),
    steps=math.ceil(NUM_TESTING_DATA / BATCH_SIZE),
)
print("Bigram POS Accuracy:", result[1])

## Train and Evaluate Classifier: Trigram POS

In [None]:
# Train/test vector files for the trigram-pos feature set.
train_filepath = "doc2vec_outputs/inferred_training_trigram_pos.json"
test_filepath = "doc2vec_outputs/inferred_testing_trigram_pos.json"

# Start from a fresh, untrained network for this feature set.
trigram_pos = create_classifier()

# Ten epochs, each drawing ceil(NUM_TRAINING_DATA / BATCH_SIZE) batches.
trigram_pos.fit_generator(
    corpus_generator(train_filepath),
    epochs=10,
    steps_per_epoch=math.ceil(NUM_TRAINING_DATA / BATCH_SIZE),
)

# Evaluate on ceil(NUM_TESTING_DATA / BATCH_SIZE) batches of the test file;
# index 1 of the result is what gets reported as accuracy.
result = trigram_pos.evaluate_generator(
    corpus_generator(test_filepath),
    steps=math.ceil(NUM_TESTING_DATA / BATCH_SIZE),
)
print("Trigram POS Accuracy:", result[1])