# Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Input files; fill_Xy reads them one JSON document per line
train_filepath = "doc2vec_outputs/cutdown_training_unigram_pos.json"
test_filepath = "doc2vec_outputs/cutdown_testing_unigram_pos.json"

# Feature vectors (X_*) and labels (y_*); four independent lists that start empty
X_train, y_train = [], []
X_test, y_test = [], []

Fill the data structures

In [None]:
def json_output_to_vector(json_output):
    """Parse a bracketed, comma-separated string of numbers into a list of floats.

    Leading/trailing '[' and ']' characters are stripped, the remainder is split
    on commas and every piece is converted with float().

    Raises:
        ValueError: if any piece is not a valid float literal.
    """
    inner = json_output.strip('][')
    return [float(token) for token in inner.split(',')]

def aid_to_label(json_id):
    """Convert an author id value into the integer used as the class label."""
    label = int(json_id)
    return label

#read in json data, split into features and responses, and append to given lists X and y
def fill_Xy(filepath, X, y):
    """Load a line-delimited JSON file and append its rows to X and y in place.

    Each non-blank line is parsed as one JSON document. Its 'output' field is
    converted with json_output_to_vector and appended to X; its 'author_id'
    field is converted with aid_to_label and appended to y.

    Args:
        filepath: path of the file to read.
        X: list that receives one feature vector per record.
        y: list that receives one label per record.

    Returns:
        None; X and y are mutated.
    """
    import json

    # the context manager closes the handle even if a line fails to parse
    with open(filepath, "r") as source:
        for line in source:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            data = json.loads(line)
            X.append(json_output_to_vector(data['output']))
            y.append(aid_to_label(data["author_id"]))

The training sets are far too large to fit all at once, so fitting must be done in batches.

In [None]:
#split the given sequence into segments of given length
def split_by(sequence, length):
    """Yield consecutive segments of `sequence`, each a list of at most `length` items.

    Every segment except possibly the last holds exactly `length` items; the
    last one holds whatever remains. Nothing is yielded for an empty sequence
    or for a non-positive `length`.

    Args:
        sequence: any iterable of items.
        length: maximum number of items per segment.

    Yields:
        Lists of items, in their original order.
    """
    # a non-positive length can never fill a segment; yield nothing
    if length <= 0:
        return
    segment = []
    for item in sequence:
        segment.append(item)
        if len(segment) == length:
            yield segment
            # start a fresh list so segments already handed out are not mutated
            segment = []
    # emit the final, shorter segment if the items did not divide evenly
    if segment:
        yield segment

Generate a random forest classifier for each batch

In [None]:
#train a random forest classifier on a subset of the data
def generate_rf(X, y):
    """Fit and return a new RandomForestClassifier on the given samples.

    The forest is built with 100 estimators and max_features=None.

    Args:
        X: feature vectors to fit on.
        y: labels matching X.

    Returns:
        The fitted classifier.
    """
    forest_params = {"n_estimators": 100, "max_features": None}
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X, y)
    return forest

Combine classifiers

In [None]:
#combine two random forest classifiers
def combine_rfs(rf_a, rf_b):
    """Merge the estimators of rf_b into rf_a and return rf_a.

    rf_a is modified in place: its estimators_ gains every estimator of rf_b
    and its n_estimators is updated to the new count. rf_b is left untouched.

    NOTE(review): presumably both forests must have been fitted on the same
    set of class labels for the merged predictions to be meaningful - confirm.
    """
    merged = rf_a.estimators_
    merged += rf_b.estimators_
    rf_a.estimators_ = merged
    rf_a.n_estimators = len(merged)
    return rf_a

Put it together

In [None]:
import functools

#train classifiers on batches of data and combine them
def generate_meta_rf(X, y, batch_size):
    """Train one forest per batch of (X, y) and fold them into a single forest.

    X and y are each cut into consecutive slices of `batch_size` items, the
    slices are paired up, generate_rf is called on every pair, and the
    resulting forests are merged left to right with combine_rfs.

    Args:
        X: feature vectors.
        y: labels matching X.
        batch_size: number of samples per batch.

    Returns:
        The combined classifier produced by reducing with combine_rfs.
    """
    def batches(seq):
        # consecutive slices of at most batch_size items
        return [seq[start:start + batch_size]
                for start in range(0, len(seq), batch_size)]

    forests = []
    for X_subset, y_subset in zip(batches(X), batches(y)):
        forests.append(generate_rf(X_subset, y_subset))
    return functools.reduce(combine_rfs, forests)

Prepare data

In [None]:
# fill_Xy appends to the lists it is given, so start from fresh empty lists;
# otherwise re-running this cell would load every record a second time
X_train, y_train = [], []
X_test, y_test = [], []
fill_Xy(train_filepath, X_train, y_train)
fill_Xy(test_filepath, X_test, y_test)

Create the meta-model

In [None]:
author_attribution = generate_meta_rf(X_train, y_train, 1000)

Score on the test set

In [None]:
# Predict the whole test set once, then compare each prediction with the
# label at the same position. (Calling predict on the full set inside the
# loop and reading pred[0] would compare every label against the first
# sample's prediction only.)
predictions = author_attribution.predict(X_test)

total_tests = len(y_test)
total_correct = sum(1 for pred, actual in zip(predictions, y_test) if pred == actual)

# fraction of test samples whose predicted label matches the true label
results = total_correct / total_tests
print(results)