# Classifier

In [1]:
from sklearn.ensemble import RandomForestClassifier

X_train = []
y_train = []
X_test = []
y_test = []
train_filepath = "doc2vec_outputs/inferred_training_unigram_words.json"
test_filepath = "doc2vec_outputs/inferred_testing_unigram_words.json"

Fill the data structures

In [2]:
#read in json data, split into features and responses, and append to given lists X and y
def fill_Xy(filepath, X, y):
    import json
    
    for line in open(filepath, "r"):
        data = json.loads(line)
        X.append(list(json.loads(data["output"])))
        y.append(data["author_id"])

Training sets are far too large for loading all at once, fitting must be done in batches

In [3]:
#split the given sequence into segments of given length
def split_by(sequence, length):
    iterable = iter(sequence)
    def yield_length():
        for i in range(length):
             yield next(iterable)
    while True:
        res = list(yield_length())
        if not res:
            return
        yield res

Generate a random forest classifier for each batch

In [4]:
#train a random forest classifier on a subset of the data
def generate_rf(X, y):
    rf = RandomForestClassifier(n_estimators=100, max_features=None)
    rf.fit(X, y)
    return rf

Combine classifiers

In [5]:
#combine two random forest classifiers
def combine_rfs(rf_a, rf_b):
    rf_a.estimators_ += rf_b.estimators_
    rf_a.n_estimators = len(rf_a.estimators_)
    return rf_a

Put it together

In [6]:
#train classifiers on batches of data and combine them
def generate_meta_rf(X, y, batch_size):
    rfs = [generate_rf(X_subset, y_subset) 
           for X_subset, y_subset 
           in zip(split_by(X, batch_size), split_by(y, batch_size))]
    combined = reduce(combine_rfs, rfs)
    return combined

Prepare data

In [7]:
fill_Xy(train_filepath, X_train, y_train)
fill_Xy(test_filepath, X_test, y_test)

Create the meta-model

In [8]:
author_attribution = generate_meta_rf(X_train, y_train, 1000)

RuntimeError: generator raised StopIteration

Score on the test set

In [None]:
results = author_attribution.score(X_test, y_test)
print(results)