# Project Ruby - Baseline

## Goals: 
1. Predict how many comments will follow a given comment.
2. Generate comments that will generate most children comments.

In [1]:
import json

def read_dataset():
    comments = []
    features = ['body', 'author', 'subreddit', 'id', 'parent_id', 'score', 'controversiality', 'link_id']
    subreddits = ['news']
    comments_size = 100000
    years = ["2017"]
    months = ["01"]
    for year in years:
        for month in months:
            filename = "reddit/" + year + "/RC_" + year + "-" + month
            with open(filename, "r") as data:
                for line in data:
                    if len(comments) < comments_size:
                        comment_data = json.loads(line)
                        if comment_data['subreddit'] in subreddits:
                            comment = dict()
                            for feature in features:
                                comment[feature] = comment_data[feature]
                            comments.append(comment)
                    else:
                        break
    print ("Example Comment:")
    print (comments[0])
    return comments

comments = read_dataset()

Example Comment:
{'body': "'Cause your dick might hit a butt and make baby Jesus cry. ", 'author': 'Number6isNo1', 'subreddit': 'news', 'id': 'dbumnrk', 'parent_id': 't1_dbumi82', 'score': 26, 'controversiality': 0, 'link_id': 't3_5lbkgm'}


# Baseline 1: Logistic Regression with Unigram Features

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

def build_vocabulary(comments):
    vocabulary = set()
    for comment in comments:
        for word in comment:
            vocabulary.add(word)
    return vocabulary

def prepare_train(comments, vocabulary, train_size):
    train = comments[:int(train_size * len(comments))]
    test = comments[len(comments)-len(train):]
    vocabulary = list(vocabulary)
    cv = CountVectorizer(vocabulary=vocabulary)
    
    train_X = []
    train_Y = []
    test_X = []
    test_Y = []

    for comment in train:
        train_X.append(comment['body'])
        train_Y.append(comment['score'])
    train_X = cv.fit_transform(train_X)

    for comment in test:
        test_X.append(comment['body'])
        test_Y.append(comment['score'])
    test_X = cv.fit_transform(test_X)
    
    return train_X, train_Y, test_X, test_Y

vocabulary = build_vocabulary(comments)
train_X, train_Y, test_X, test_Y = prepare_train(comments, vocabulary, train_size=0.9)

In [3]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression
import numpy as np

lr = LinearRegression()
lr = lr.fit(train_X, train_Y)
print (lr.score(train_X, train_Y))
prediction = lr.predict(test_X)
# prediction = np.zeros(len(test_Y))
print (metrics.mean_absolute_error(test_Y, prediction))
print (metrics.mean_squared_error(test_Y, prediction))

6.3401532695e-06
21.3334794347
34704.3763216


# Baseline 2: Neural Network Regression with GloVe

In [4]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.txt'
word2vec_output_file = 'word2vec.txt'
model = glove2word2vec(glove_input_file, word2vec_output_file)

from gensim.models import KeyedVectors
filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

[('queen', 0.775162398815155)]


In [11]:
import numpy as np

def prepare_train(comments, train_size):
    train = comments[:int(train_size * len(comments))]
    test = comments[len(comments)-len(train):]
    wordvec_dim = model['test'].shape
    
    train_X = []
    train_Y = []
    test_X = []
    test_Y = []

    for comment in train:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
        sentence_vector /= count
        train_X.append(sentence_vector)
        train_Y.append(comment['score'])

    for comment in test:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
        sentence_vector /= count
        test_X.append(sentence_vector)
        test_Y.append(comment['score'])
    
    return train_X, train_Y, test_X, test_Y

train_X, train_Y, test_X, test_Y = prepare_train(comments, train_size=0.9)

In [12]:
from sklearn.neural_network import MLPRegressor
import numpy as np
import matplotlib.pyplot as plt

nn = MLPRegressor(
    hidden_layer_sizes=(100,) * 4,  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn.fit(train_X, train_Y)
prediction = nn.predict(test_X)
print (nn.score(train_X, train_Y))
print (metrics.mean_absolute_error(test_Y, prediction))
print (metrics.mean_squared_error(test_Y, prediction))

0.0713716073349
0.219620100935
0.233813820107


# Baseline 3: Predicting Number of Child Comments
with Neural Network and GloVe Vectors

In [4]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.txt'
word2vec_output_file = 'word2vec.txt'
model = glove2word2vec(glove_input_file, word2vec_output_file)

from gensim.models import KeyedVectors
filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

[('queen', 0.775162398815155)]


In [32]:
import json

def read_dataset():
    comments = []
    features = ['body', 'author', 'subreddit', 'id', 'parent_id', 'score', 'controversiality', 'link_id', "num_child_comments"]
    subreddits = ['news']
    comments_size = 100000
    years = ["2017"]
    months = ["01"]
    for year in years:
        for month in months:
            filename = "reddit/" + year + "/RC_" + year + "-" + month + "A"
            with open(filename, "r") as data:
                for line in data:
                    if len(comments) < comments_size:
                        comment_data = json.loads(line)
                        if comment_data['subreddit'] in subreddits:
                            comment = dict()
                            for feature in features:
                                comment[feature] = comment_data[feature]
                            comments.append(comment)
                    else:
                        break
    print ("Example Comment:")
    print (comments[1])
    return comments

comments = read_dataset()

Example Comment:
{'body': 'I think it was more for the fact that they were saying to do it, in order to save money.', 'author': 'potrg801', 'subreddit': 'news', 'id': 'dbumnt2', 'parent_id': 't1_dbtt6ak', 'score': 1, 'controversiality': 0, 'link_id': 't3_5l77ad', 'num_child_comments': 0}


In [33]:
import numpy as np

def prepare_train(comments, train_size):
    train = comments[:int(train_size * len(comments))]
    test = comments[len(comments)-len(train):]
    wordvec_dim = model['test'].shape
    
    train_X = []
    train_Y = []
    test_X = []
    test_Y = []

    for comment in train:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
#         sentence_vector /= count
        train_X.append(sentence_vector)
        train_Y.append(comment['num_child_comments'])

    for comment in test:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
#         sentence_vector /= count
        test_X.append(sentence_vector)
        test_Y.append(comment['num_child_comments'])
    
    return train_X, train_Y, test_X, test_Y

train_X, train_Y, test_X, test_Y = prepare_train(comments, train_size=0.9)

In [34]:
from sklearn.neural_network import MLPRegressor
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt

nn = MLPRegressor(
    hidden_layer_sizes=(100,) * 4,  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn.fit(train_X, train_Y)
prediction = nn.predict(test_X)
# prediction = np.zeros(len(test_X))
print (nn.score(train_X, train_Y))
print (metrics.mean_absolute_error(test_Y, prediction))
print (metrics.mean_squared_error(test_Y, prediction))

-0.34002538221
0.284969198496
1.11329125027


# Baseline 5: Predicting Number of Child Comments
with Neural Network and GloVe Vectors + Exclude comments with zero child comments

In [35]:
import json

def read_dataset():
    comments = []
    features = ['body', 'author', 'subreddit', 'id', 'parent_id', 'score', 'controversiality', 'link_id', "num_child_comments"]
    subreddits = ['news']
    comments_size = 10000
    years = ["2017"]
    months = ["01"]
    for year in years:
        for month in months:
            filename = "reddit/" + year + "/RC_" + year + "-" + month + "A"
            with open(filename, "r") as data:
                for line in data:
                    if len(comments) < comments_size:
                        comment_data = json.loads(line)
                        if comment_data['subreddit'] in subreddits and comment_data['num_child_comments'] > 0:
                            comment = dict()
                            for feature in features:
                                comment[feature] = comment_data[feature]
                            comments.append(comment)
                    else:
                        break
    print ("Example Comment:")
    print (comments[1])
    return comments

comments = read_dataset()

Example Comment:
{'body': '[removed]', 'author': '[deleted]', 'subreddit': 'news', 'id': 'dbumpip', 'parent_id': 't3_5l5aew', 'score': 1, 'controversiality': 0, 'link_id': 't3_5l5aew', 'num_child_comments': 1}


In [36]:
import numpy as np

def prepare_train(comments, train_size):
    train = comments[:int(train_size * len(comments))]
    test = comments[len(comments)-len(train):]
    wordvec_dim = model['test'].shape
    
    train_X = []
    train_Y = []
    test_X = []
    test_Y = []

    for comment in train:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
#         sentence_vector /= count
        train_X.append(sentence_vector)
        train_Y.append(comment['num_child_comments'])

    for comment in test:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
#         sentence_vector /= count
        test_X.append(sentence_vector)
        test_Y.append(comment['num_child_comments'])
    
    return train_X, train_Y, test_X, test_Y

train_X, train_Y, test_X, test_Y = prepare_train(comments, train_size=0.9)

In [37]:
from sklearn.neural_network import MLPRegressor
import numpy as np
import matplotlib.pyplot as plt

nn = MLPRegressor(
    hidden_layer_sizes=(100,) * 4,  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn.fit(train_X, train_Y)
prediction = nn.predict(test_X)
print (nn.score(train_X, train_Y))
print (metrics.mean_absolute_error(test_Y, prediction))
print (metrics.mean_squared_error(test_Y, prediction))

-0.00064974642348
7.65527630524
2218.28397051


# Baseline: Neural Network with GloVe
Multiclass Classification on 2016 Reddit Comments from r/news

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt

def evaluate(train_X, train_Y, dev_X, dev_Y, parameters=None):
    if parameters == None:
        activations = ['relu', 'logistic', 'tanh']
        solver = ['sgd', 'adam', 'lbfgs']
        randoms = [5, 10, 15]
        hidden_layers = [1, 2, 4]
    else:
        a, s, r, h = parameters
        activations = [a]
        solver = [s]
        randoms = [r]
        hidden_layers = [h]
#     hn = int(len(train_X) / (float(5) * train_X[0].shape[0]))
    hn = 10
    print ("Number of Hidden Neurons per layer = {}".format(hn))
    maxa, maxs, maxr, maxh, maxacc = 0, 0, 0, 0, 0
    for a in activations:
        for s in solver:
            for r in randoms:
                for h in hidden_layers:
                    print("For {}, {}, {}, {}:".format(a, s, r, h))
                    nn = MLPClassifier(activation=a, solver=s, alpha=1e-5, 
                                       hidden_layer_sizes=(hn,)*h, random_state=r,
                                       tol=0.0001, batch_size='auto', max_iter=1000,
                                       beta_1=0.9, beta_2=0.999)
                    nn.fit(train_X, train_Y)
                    prediction = nn.predict(dev_X)
                    train_acc = nn.score(train_X, train_Y)
                    dev_acc = metrics.accuracy_score(dev_Y, prediction)
                    print("Train Accuracy: {}".format(train_acc))
                    print("Dev Accuracy: {}".format(dev_acc))
                    if maxacc < dev_acc:
                        maxacc = dev_acc
                        maxa, maxs, maxr, maxh = a, s, r, h
    print("Max Performance is {} with {}, {}, {}, {}".format(maxacc,maxa,maxs,maxr,maxh))


    for i in range(0,6):
        p_count = 0
        a_count = 0
        c_count = 0
        w_count = 0
        for j in range(len(prediction)):
            if prediction[j] == i: p_count += 1
            if dev_Y[j] == i: a_count += 1
            if dev_Y[j] == i and prediction[j] == i: c_count += 1
            if dev_Y[j] == i and prediction[j] != i: w_count += 1
        print('Bucket ' + str(i) + ': Prediction = ' + str(p_count))
        print('Bucket ' + str(i) + ': Answer = ' + str(a_count))
        print('Bucket ' + str(i) + ': Got Correct = ' + str(c_count))
        print('Bucket ' + str(i) + ': Got Wrong = ' + str(w_count))

print("Evaluate Function loaded.")

Evaluate Function loaded.


In [2]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

print("Loading Glove in word2vec format...")
filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
print("Glove is loaded.")
print(model['test'].shape)
print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))

Loading Glove in word2vec format...
Glove is loaded.
(300,)
[('queen', 0.775162398815155)]


## Five Buckets with True Distribution

In [8]:
import re
import json
import numpy as np

def load_comments(filename, size, bucket_limit):
    bucket_count = dict()
    X = []
    Y = []
    with open(filename, "r") as data:
        for line in data:
            comment = json.loads(line)
            if len(X) < size:
                sentence_vector = np.zeros(model['test'].shape)
                count = 0
                for word in comment['body'].split():
                    word = word.lower()
                    regex = re.compile('[^a-zA-Z]')
                    word = regex.sub('',word)
                    if word in model.vocab:
                        count += 1
                        sentence_vector += model[word]
                if count != 0: sentence_vector /= count
                bucket = 0
                if comment['num_child_comments'] == 0:
                    bucket = 1
                elif comment['num_child_comments'] >= 1 and comment['num_child_comments'] <= 2:
                    bucket = 2
                elif comment['num_child_comments'] >= 3 and comment['num_child_comments'] <= 6:
                    bucket = 3
                elif comment['num_child_comments'] >= 7 and comment['num_child_comments'] <= 14:
                    bucket = 4
                elif comment['num_child_comments'] >= 15:
                    bucket = 5
                if bucket not in bucket_count.keys():
                    bucket_count[bucket] = 0
                if bucket_count[bucket] < bucket_limit:
                    bucket_count[bucket] += 1
                    X.append(sentence_vector)
                    Y.append(bucket)
    return X, Y

print("Loading Reddit Comments")
train_X, train_Y = load_comments('redditnews/RedditTrain', 100000, 100000)
dev_X, dev_Y = load_comments('redditnews/RedditDev', 10000, 10000)
print("Loaded Reddit Comments")
evaluate(train_X, train_Y, dev_X, dev_Y)
prediction = np.ones(len(dev_Y))
print(metrics.accuracy_score(dev_Y, prediction))

Loading Reddit Comments
Loaded Reddit Comments
Number of Hidden Neurons per layer = 66
For relu, sgd, 5, 1:
Train Accuracy: 0.46876
Dev Accuracy: 0.4712
For relu, sgd, 5, 2:
Train Accuracy: 0.46883
Dev Accuracy: 0.4711
For relu, sgd, 5, 4:
Train Accuracy: 0.47015
Dev Accuracy: 0.4694
For relu, sgd, 10, 1:
Train Accuracy: 0.46865
Dev Accuracy: 0.471
For relu, sgd, 10, 2:
Train Accuracy: 0.47006
Dev Accuracy: 0.4681
For relu, sgd, 10, 4:
Train Accuracy: 0.46955
Dev Accuracy: 0.4681
For relu, sgd, 15, 1:
Train Accuracy: 0.46878
Dev Accuracy: 0.4716
For relu, sgd, 15, 2:
Train Accuracy: 0.46898
Dev Accuracy: 0.4703
For relu, sgd, 15, 4:
Train Accuracy: 0.46888
Dev Accuracy: 0.4691
For relu, adam, 5, 1:
Train Accuracy: 0.5213
Dev Accuracy: 0.4262
For relu, adam, 5, 2:
Train Accuracy: 0.56882
Dev Accuracy: 0.3954
For relu, adam, 5, 4:
Train Accuracy: 0.56167
Dev Accuracy: 0.419
For relu, adam, 10, 1:
Train Accuracy: 0.52628
Dev Accuracy: 0.4229
For relu, adam, 10, 2:
Train Accuracy: 0.56501


## Five Buckets with Normalized Distribution

In [9]:
import re
import json
import numpy as np

def load_comments(filename, size, bucket_limit):
    bucket_count = dict()
    X = []
    Y = []
    with open(filename, "r") as data:
        for line in data:
            comment = json.loads(line)
            if len(X) < size:
                sentence_vector = np.zeros(model['test'].shape)
                count = 0
                for word in comment['body'].split():
                    word = word.lower()
                    regex = re.compile('[^a-zA-Z]')
                    word = regex.sub('',word)
                    if word in model.vocab:
                        count += 1
                        sentence_vector += model[word]
                if count != 0: sentence_vector /= count
                bucket = 0
                if comment['num_child_comments'] == 0:
                    bucket = 1
                elif comment['num_child_comments'] >= 1 and comment['num_child_comments'] <= 2:
                    bucket = 2
                elif comment['num_child_comments'] >= 3 and comment['num_child_comments'] <= 6:
                    bucket = 3
                elif comment['num_child_comments'] >= 7 and comment['num_child_comments'] <= 14:
                    bucket = 4
                elif comment['num_child_comments'] >= 15:
                    bucket = 5
                if bucket not in bucket_count.keys():
                    bucket_count[bucket] = 0
                if bucket_count[bucket] < bucket_limit:
                    bucket_count[bucket] += 1
                    X.append(sentence_vector)
                    Y.append(bucket)
    return X, Y

print("Loading Reddit Comments")
train_X, train_Y = load_comments('redditnews/RedditTrain', 100000, 20000)
dev_X, dev_Y = load_comments('redditnews/RedditDev', 10000, 2000)
print("Loaded Reddit Comments")
evaluate(train_X, train_Y, dev_X, dev_Y)
prediction = np.ones(len(dev_Y))
print(metrics.accuracy_score(dev_Y, prediction))

Loading Reddit Comments
Loaded Reddit Comments
Number of Hidden Neurons per layer = 66
For relu, sgd, 5, 1:
Train Accuracy: 0.28721
Dev Accuracy: 0.2761
For relu, sgd, 5, 2:
Train Accuracy: 0.29561
Dev Accuracy: 0.2812
For relu, sgd, 5, 4:
Train Accuracy: 0.29234
Dev Accuracy: 0.2803
For relu, sgd, 10, 1:
Train Accuracy: 0.28767
Dev Accuracy: 0.2829
For relu, sgd, 10, 2:
Train Accuracy: 0.29603
Dev Accuracy: 0.2804
For relu, sgd, 10, 4:
Train Accuracy: 0.35018
Dev Accuracy: 0.2732
For relu, sgd, 15, 1:
Train Accuracy: 0.28647
Dev Accuracy: 0.2761
For relu, sgd, 15, 2:
Train Accuracy: 0.29343
Dev Accuracy: 0.2821
For relu, sgd, 15, 4:
Train Accuracy: 0.35696
Dev Accuracy: 0.2684
For relu, adam, 5, 1:
Train Accuracy: 0.39418
Dev Accuracy: 0.2605
For relu, adam, 5, 2:
Train Accuracy: 0.45469
Dev Accuracy: 0.2456
For relu, adam, 5, 4:
Train Accuracy: 0.46246
Dev Accuracy: 0.2484
For relu, adam, 10, 1:
Train Accuracy: 0.38983
Dev Accuracy: 0.2618
For relu, adam, 10, 2:
Train Accuracy: 0.446

## Two Buckets (Zero or Not Zero)

In [11]:
import re
import json
import numpy as np

def load_comments(filename, size, bucket_limit):
    bucket_count = dict()
    X = []
    Y = []
    with open(filename, "r") as data:
        for line in data:
            comment = json.loads(line)
            if len(X) < size:
                sentence_vector = np.zeros(model['test'].shape)
                count = 0
                for word in comment['body'].split():
                    word = word.lower()
                    regex = re.compile('[^a-zA-Z]')
                    word = regex.sub('',word)
                    if word in model.vocab:
                        count += 1
                        sentence_vector += model[word]
                if count != 0: sentence_vector /= count
                bucket = 0
                if comment['num_child_comments'] == 0:
                    bucket = 1
                else:
                    bucket = 0
                if bucket not in bucket_count.keys():
                    bucket_count[bucket] = 0
                if bucket_count[bucket] < bucket_limit:
                    bucket_count[bucket] += 1
                    X.append(sentence_vector)
                    Y.append(bucket)
    return X, Y

print("Loading Reddit Comments")
# train_X, train_Y = load_comments('redditnews/RedditTrain', 10000, 5000)
# dev_X, dev_Y = load_comments('redditnews/RedditDev', 1000, 500)
print("Loaded Reddit Comments")
parameters = ('logistic', 'lbfgs', 10, 1) #a, s, r, h
evaluate(train_X, train_Y, dev_X, dev_Y)
prediction = np.ones(len(dev_Y))
print(metrics.accuracy_score(dev_Y, prediction))

Loading Reddit Comments
Loaded Reddit Comments
Number of Hidden Neurons per layer = 6
For relu, sgd, 5, 1:
Train Accuracy: 0.5588
Dev Accuracy: 0.54
For relu, sgd, 5, 2:
Train Accuracy: 0.5179
Dev Accuracy: 0.523
For relu, sgd, 5, 4:
Train Accuracy: 0.5134
Dev Accuracy: 0.534
For relu, sgd, 10, 1:
Train Accuracy: 0.5631
Dev Accuracy: 0.547
For relu, sgd, 10, 2:
Train Accuracy: 0.5671
Dev Accuracy: 0.581
For relu, sgd, 10, 4:
Train Accuracy: 0.4904
Dev Accuracy: 0.491
For relu, sgd, 15, 1:
Train Accuracy: 0.5066
Dev Accuracy: 0.529
For relu, sgd, 15, 2:
Train Accuracy: 0.5031
Dev Accuracy: 0.508
For relu, sgd, 15, 4:
Train Accuracy: 0.5075
Dev Accuracy: 0.521
For relu, adam, 5, 1:
Train Accuracy: 0.6487
Dev Accuracy: 0.562
For relu, adam, 5, 2:
Train Accuracy: 0.6465
Dev Accuracy: 0.559
For relu, adam, 5, 4:
Train Accuracy: 0.6844
Dev Accuracy: 0.575
For relu, adam, 10, 1:
Train Accuracy: 0.6389
Dev Accuracy: 0.55
For relu, adam, 10, 2:
Train Accuracy: 0.6574
Dev Accuracy: 0.553
For rel

## Two Buckets without Sentence Vector Normalization

In [5]:
import re
import json
import numpy as np

def load_comments(filename, size, bucket_limit):
    bucket_count = dict()
    X = []
    Y = []
    with open(filename, "r") as data:
        for line in data:
            comment = json.loads(line)
            if len(X) < size:
                sentence_vector = np.zeros(model['test'].shape)
                count = 0
                for word in comment['body'].split():
                    word = word.lower()
                    regex = re.compile('[^a-zA-Z]')
                    word = regex.sub('',word)
                    if word in model.vocab:
                        count += 1
                        sentence_vector += model[word]
#                 if count != 0: sentence_vector /= count
                bucket = 0
                if comment['num_child_comments'] == 0:
                    bucket = 1
                else:
                    bucket = 0
                if bucket not in bucket_count.keys():
                    bucket_count[bucket] = 0
                if bucket_count[bucket] < bucket_limit:
                    bucket_count[bucket] += 1
                    X.append(sentence_vector)
                    Y.append(bucket)
    return X, Y

print("Loading Reddit Comments")
train_X, train_Y = load_comments('redditnews/RedditTrain', 100000, 50000)
dev_X, dev_Y = load_comments('redditnews/RedditDev', 10000, 5000)
print("Loaded Reddit Comments")
evaluate(train_X, train_Y, dev_X, dev_Y)
prediction = np.ones(len(dev_Y))
print(metrics.accuracy_score(dev_Y, prediction))

Loading Reddit Comments
Loaded Reddit Comments
Number of Hidden Neurons per layer = 500
For relu, sgd, 10, 2:
Train Accuracy: 0.8901
Dev Accuracy: 0.545
Max Performance is 0.545 with relu, sgd, 10, 2
Bucket 0: Prediction = 4516
Bucket 0: Answer = 5000
Bucket 0: Got Correct = 2483
Bucket 0: Got Wrong = 2517
Bucket 1: Prediction = 5484
Bucket 1: Answer = 5000
Bucket 1: Got Correct = 2967
Bucket 1: Got Wrong = 2033
Bucket 2: Prediction = 0
Bucket 2: Answer = 0
Bucket 2: Got Correct = 0
Bucket 2: Got Wrong = 0
Bucket 3: Prediction = 0
Bucket 3: Answer = 0
Bucket 3: Got Correct = 0
Bucket 3: Got Wrong = 0
Bucket 4: Prediction = 0
Bucket 4: Answer = 0
Bucket 4: Got Correct = 0
Bucket 4: Got Wrong = 0
Bucket 5: Prediction = 0
Bucket 5: Answer = 0
Bucket 5: Got Correct = 0
Bucket 5: Got Wrong = 0
0.5


## Normalized Five Buckets without Sentence Vector Normlization

In [None]:
import re
import json
import numpy as np

def load_comments(filename, size, bucket_limit):
    bucket_count = dict()
    X = []
    Y = []
    with open(filename, "r") as data:
        for line in data:
            comment = json.loads(line)
            if len(X) < size:
                sentence_vector = np.zeros(model['test'].shape)
                count = 0
                for word in comment['body'].split():
                    word = word.lower()
                    regex = re.compile('[^a-zA-Z]')
                    word = regex.sub('',word)
                    if word in model.vocab:
                        count += 1
                        sentence_vector += model[word]
#                 if count != 0: sentence_vector /= count
                bucket = 0
                if comment['num_child_comments'] == 0:
                    bucket = 1
                elif comment['num_child_comments'] >= 1 and comment['num_child_comments'] <= 2:
                    bucket = 2
                elif comment['num_child_comments'] >= 3 and comment['num_child_comments'] <= 6:
                    bucket = 3
                elif comment['num_child_comments'] >= 7 and comment['num_child_comments'] <= 14:
                    bucket = 4
                elif comment['num_child_comments'] >= 15:
                    bucket = 5
                if bucket not in bucket_count.keys():
                    bucket_count[bucket] = 0
                if bucket_count[bucket] < bucket_limit:
                    bucket_count[bucket] += 1
                    X.append(sentence_vector)
                    Y.append(bucket)
    return X, Y

print("Loading Reddit Comments")
train_X, train_Y = load_comments('redditnews/RedditTrain', 1500000, 300000)
dev_X, dev_Y = load_comments('redditnews/RedditDev', 150000, 30000)
print("Loaded Reddit Comments")
evaluate(train_X, train_Y, dev_X, dev_Y)
prediction = np.ones(len(dev_Y))
print(metrics.accuracy_score(dev_Y, prediction))

# New Features
## Human Selected Basic Features

In [8]:
import re
import json
import numpy as np

def load_comments(filename, size, bucket_limit):
    bucket_count = dict()
    features = ["you", "?", "who", "what", "where", "when", "why", "right", "wrong", "think", "seriously", "know"]
    X = []
    Y = []
    with open(filename, "r") as data:
        for line in data:
            comment = json.loads(line)
            if len(X) < size:
                feature = np.zeros(len(features)+1, dtype=int)
                bucket = 0
                if comment['num_child_comments'] == 0:
                    bucket = 1
                elif comment['num_child_comments'] >= 1 and comment['num_child_comments'] <= 2:
                    bucket = 2
                elif comment['num_child_comments'] >= 3 and comment['num_child_comments'] <= 6:
                    bucket = 3
                elif comment['num_child_comments'] >= 7 and comment['num_child_comments'] <= 14:
                    bucket = 4
                elif comment['num_child_comments'] >= 15:
                    bucket = 5
                if bucket not in bucket_count.keys():
                    bucket_count[bucket] = 0
                if bucket_count[bucket] < bucket_limit:
                    bucket_count[bucket] += 1
                    for word in comment['body'].split():
                        word = word.lower()
                        regex = re.compile('[^a-zA-Z?]')
                        word = regex.sub('',word)
                        for i in range(len(features)):
                            if features[i] in word:
                                feature[i] += 1
                    feature[len(feature)-1] = len(comment['body'].split())
                    print(feature)
                    X.append(feature)
                    Y.append(bucket)

    return X, Y

print("Loading Reddit Comments")
train_X, train_Y = load_comments('redditnews/RedditTrain', 15000, 3000)
dev_X, dev_Y = load_comments('redditnews/RedditDev', 1500, 300)
print("Loaded Reddit Comments")
evaluate(train_X, train_Y, dev_X, dev_Y)
prediction = np.ones(len(dev_Y))
print(metrics.accuracy_score(dev_Y, prediction))

Loading Reddit Comments
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 0  0  0  0  0  0  0  0  0  0  0  0 13]
[ 1  0  0  2  0  0  0  0  0  0  0  0 10]
[ 2  0  1  0  0  0  0  1  1  0  0  0 39]
[ 0  0  0  1  0  0  0  0  0  0  0  0 12]
[  46   15    8   11    4    3    3    6    4    7    0   10 1541]
[ 0  0  0  0  0  1  0  0  0  0  0  0 57]
[  9   0   4   1   0   1   0   5   0   0   0   1 267]
[ 1  1  0  0  0  0  0  1  0  0  0  0 11]
[ 1  0  0  0  0  0  0  0  0  0  0  1 15]
[ 0  1  0  0  0  0  1  0  0  0  0  0 13]
[0 0 0 0 0 0 0 0 0 0 0 0 2]
[ 0  0  0  0  0  1  0  0  0  0  0  0 26]
[ 0  0  0  0  0  0  0  0  0  0  0  0 12]
[0 2 0 0 0 0 0 0 0 0 1 0 8]
[0 0 0 0 0 0 0 0 0 0 0 0 9]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 0  0  0  0  0  0  0  0  0  0  0  0 42]
[0 0 0 0 0 0 0 0 0 0 0 0 9]
[  6   5   0   0   0   0   2   0   0   0   0   2 122]
[ 0  1  0  0  0  0  0  1  0  0  0  0 32]
[ 0  0  0  0  0  0  0  0  0  0  0  0 18]
[ 3  1  0  0  0  0  0  0  1  0  0  0 21]
[  5   1   4   2   0   0   1   0   0   0   0   0 109]
[ 0  

[ 1  0  0  1  0  1  1  1  0  0  0  0 73]
[ 1  0  0  0  0  0  0  0  0  0  0  0 51]
[ 1  0  1  0  0  0  0  0  0  0  0  1 59]
[ 3  1  0  2  0  0  0  1  0  0  0  0 67]
[1 0 0 0 0 0 1 0 0 0 0 0 7]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[ 0  1  0  0  0  0  0  0  0  0  0  0 16]
[0 0 0 0 0 0 0 0 0 0 0 0 8]
[ 3  1  0  0  0  0  0  1  0  0  0  0 45]
[ 0  0  0  0  0  0  0  1  0  0  0  0 33]
[ 0  0  0  0  1  0  0  0  0  0  0  0 34]
[ 0  0  0  0  0  0  0  0  0  0  0  0 26]
[0 0 0 0 0 0 0 0 0 0 0 0 3]
[ 0  0  0  0  0  0  0  0  0  0  0  0 18]
[0 0 0 1 0 0 0 0 0 0 0 0 8]
[0 0 0 0 0 0 0 0 0 0 0 0 8]
[ 0  0  0  0  0  0  0  0  0  0  0  1 40]
[1 1 0 0 0 0 0 0 0 0 0 0 9]
[ 0  0  0  0  0  0  0  0  0  1  0  0 33]
[ 1  0  0  0  0  0  0  0  0  0  0  1 23]
[ 0  0  0  0  0  0  0  0  0  0  0  0 14]
[ 1  1  0  0  0  0  0  0  0  1  0  0 15]
[ 0  0  0  0  0  0  0  0  0  0  0  0 33]
[ 0  0  0  0  0  0  0  0  0  0  0  0 12]
[ 0  0  0  0  0  0  0  0  0  0  0  0 28]
[ 2  0  0  0  0  0  0  0  0  0  0  0 15]
[0 0 0 0 0 0 0 0 0 0 0 0 

[0 0 0 0 0 0 0 0 0 0 0 0 3]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[  2   0   1   0   0   0   0   0   0   0   0   0 145]
[  0   0   0   0   0   0   0   0   0   1   0   0 101]
[ 0  0  0  0  0  0  0  0  0  0  0  3 51]
[ 0  0  0  0  0  0  0  0  0  0  0  0 28]
[ 0  2  0  0  0  0  1  0  0  0  0  0 15]
[ 0  2  0  0  0  0  0  0  0  0  0  0 36]
[  1   0   0   2   0   0   0   0   0   0   0   0 112]
[ 2  0  0  0  0  0  0  2  0  0  0  0 15]
[ 1  1  0  0  0  0  0  0  0  0  0  0 18]
[ 0  0  1  0  0  0  0  0  0  0  0  0 50]
[0 0 0 0 0 0 0 0 0 0 0 0 6]
[0 0 0 0 0 0 0 0 0 0 0 0 8]
[ 0  0  0  0  0  0  0  0  0  0  0  0 71]
[0 0 0 0 0 0 0 0 0 0 0 0 4]
[0 0 0 0 0 0 0 0 0 0 0 0 6]
[1 0 0 0 0 0 0 0 0 0 0 1 9]
[ 0  0  0  0  0  0  0  0  0  0  0  0 19]
[ 0  2  0  1  0  0  0  1  0  0  0  0 24]
[ 2  1  0  0  0  0  0  0  0  0  0  0 12]
[ 1  0  0  0  0  0  0  0  0  0  0  0 14]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 0  0  0  0  0  0  0  0  0  0  0  0 11]
[ 0  0  0  1  0  0  0  0  0  0  0  4 97]
[ 0  0  0  0  0  0  0  0  0  0  0  0 49]

[2 0 0 0 0 0 0 0 0 0 0 0 9]
[1 1 0 0 0 0 0 0 0 1 0 0 9]
[ 0  0  0  0  0  0  0  0  0  0  0  0 12]
[ 2  0  0  0  1  0  0  0  0  0  0  0 35]
[ 4  0  0  0  0  0  0  1  0  0  0  0 42]
[ 2  0  0  0  0  0  0  0  0  0  0  0 33]
[ 0  0  0  0  0  0  0  0  0  0  0  0 11]
[ 0  0  0  0  1  0  0  0  0  0  0  0 63]
[ 0  1  0  0  0  0  0  0  0  0  0  0 10]
[ 0  0  0  0  0  0  0  0  0  1  0  0 77]
[ 1  0  0  0  0  0  0  1  0  0  0  0 45]
[ 0  0  0  0  1  0  0  0  0  0  0  0 17]
[0 0 0 0 0 0 0 0 0 0 0 0 4]
[ 2  0  0  1  0  0  0  0  0  0  0  0 11]
[ 3  0  0  0  0  0  0  0  0  0  0  1 32]
[ 1  0  0  0  0  0  0  0  0  0  0  0 16]
[0 0 0 0 0 0 0 0 0 0 0 0 4]
[ 1  0  0  0  0  0  0  0  1  0  0  0 31]
[ 3  0  2  0  0  0  0  0  0  0  0  0 25]
[ 0  0  0  0  0  0  0  0  0  0  0  0 23]
[ 1  0  0  1  0  0  0  0  0  0  0  0 35]
[ 0  2  0  0  0  0  0  0  0  0  0  0 90]
[ 1  0  2  0  0  0  0  0  0  0  0  0 50]
[ 1  0  0  0  0  0  0  0  0  0  0  0 13]
[ 2  0  0  0  0  0  0  0  0  0  0  1 21]
[  8   0   2   1   0   0   

[ 0  0  0  0  0  0  0  0  0  0  0  0 12]
[0 1 0 0 0 0 0 0 0 0 0 0 7]
[0 1 0 1 0 0 0 0 1 0 0 0 8]
[ 2  0  0  0  0  0  0  0  0  0  0  1 23]
[ 1  0  0  0  0  0  0  0  0  0  0  0 10]
[ 0  0  0  0  0  0  0  0  0  0  0  0 25]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 2  0  1  0  0  0  0  0  0  0  0  0 60]
[ 1  0  1  0  0  0  0  1  0  0  0  0 46]
[ 2  0  0  0  0  1  0  0  0  0  0  0 64]
[ 1  0  0  0  0  0  0  0  0  0  0  0 27]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 1  0  0  0  0  0  1  1  0  0  0  1 29]
[ 0  0  0  0  0  0  0  0  0  0  0  0 26]
[ 0  0  0  0  0  0  0  0  0  0  0  0 20]
[ 0  0  0  0  0  0  0  0  0  0  0  0 15]
[ 0  0  0  0  0  0  0  1  0  0  0  0 11]
[0 0 0 0 0 0 0 0 0 0 0 0 3]
[3 0 0 0 0 0 0 0 0 0 0 0 9]
[ 2  0  0  0  0  0  0  0  0  0  0  0 82]
[ 0  0  0  0  0  0  0  0  0  0  0  0 14]
[0 0 0 0 0 0 0 0 0 0 0 0 9]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[0 0 0 0 0 0 0 0 0 1 0 0 8]
[ 0  2  1  1  0  0  0  0  0  1  0  1 81]
[  5   0   0   0   0   0   0   0   0   0   0   0 101]
[ 0  0  0  0  0  1  0  0  0  0  0  0 7

[0 0 0 0 0 0 0 0 0 0 0 0 2]
[ 0  0  0  0  0  0  0  0  0  0  0  0 13]
[ 0  0  0  0  0  1  1  3  0  0  0  0 82]
[ 4  2  0  0  0  0  0  0  0  0  0  0 25]
[ 0  0  0  0  0  0  0  0  0  0  0  0 34]
[ 0  1  0  0  0  0  0  0  0  1  0  0 89]
[ 2  0  0  1  0  0  0  0  0  0  0  0 42]
[ 1  1  1  0  0  0  0  0  0  0  0  0 13]
[ 0  1  0  0  0  1  0  0  0  0  0  0 13]
[ 0  0  0  0  0  1  0  0  0  0  0  0 35]
[ 1  0  0  0  0  0  0  0  0  1  0  0 19]
[ 5  0  1  1  0  0  1  4  0  0  0  0 70]
[ 0  0  0  0  0  0  0  0  0  0  0  0 20]
[ 0  0  0  0  0  0  0  0  0  0  0  0 10]
[ 0  2  0  0  0  1  0  0  0  1  0  0 37]
[ 0  2  0  0  0  0  1  0  0  0  0  0 37]
[ 5  0  1  3  0  1  0  0  0  0  0  1 75]
[ 0  0  0  0  0  0  0  0  0  0  0  1 15]
[ 3  0  1  0  0  1  0  0  0  0  0  0 93]
[ 1  0  0  1  0  0  0  0  0  2  0  0 23]
[ 0  0  0  0  0  1  0  0  0  0  0  0 11]
[ 0  0  0  0  0  0  0  0  0  1  0  1 87]
[ 0  0  0  0  0  0  0  0  0  0  0  0 19]
[0 0 0 0 0 0 0 0 0 0 0 0 8]
[ 1  0  0  0  0  0  0  0  0  0  0  0 22]
[

[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 3  1  0  0  0  1  0  0  0  0  0  0 33]
[ 1  3  1  0  0  0  0  3  0  1  1  1 83]
[ 1  0  0  0  0  0  0  0  2  0  0  0 92]
[ 1  0  0  0  0  0  0  0  0  0  0  0 20]
[0 1 0 1 0 0 1 0 0 0 0 0 7]
[ 0  0  0  0  0  0  0  0  0  0  0  0 13]
[ 0  0  0  0  0  0  0  0  0  0  0  0 46]
[ 0  0  0  0  0  0  0  0  0  0  0  0 15]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[ 0  0  0  0  0  0  0  0  0  0  0  0 27]
[ 0  0  0  0  0  0  0  0  0  0  0  0 30]
[0 0 0 0 0 0 0 0 0 0 0 0 2]
[ 0  0  0  0  1  0  0  0  0  0  0  0 50]
[ 0  0  0  0  0  0  0  0  0  0  0  0 30]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[ 0  0  0  0  0  0  0  0  0  0  0  0 27]
[ 1  0  0  0  0  0  0  0  0  0  0  0 29]
[ 0  0  0  0  0  0  0  0  0  0  0  0 13]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[ 0  0  0  0  0  0  0  0  0  1  0  0 10]
[ 0  0  0  0  0  0  0  0  0  0  0  0 60]
[ 2  0  0  0  0  0  1  1  0  0  0  0 47]
[0 0 0 0 0 0 0 0 0 0 0 0 2]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[1 0 0 0 0 0 0 0 0 0 0 0 9]
[1 0 0 0 0 0 0 0 0 0 0 0 7]
[ 0  0  0  0  0  0  0  

[ 1  0  0  0  0  0  0  1  0  0  0  0 25]
[ 3  1  0  1  0  1  0  0  0  1  0  0 52]
[ 0  0  0  0  0  0  0  0  0  0  0  0 28]
[ 1  0  0  0  0  0  0  0  0  0  0  0 13]
[ 0  0  0  0  0  0  0  0  0  0  0  0 33]
[ 0  0  0  0  0  0  0  0  0  0  0  0 98]
[ 0  0  1  0  0  0  0  0  0  0  0  1 10]
[1 1 0 0 0 0 0 0 0 0 0 0 6]
[ 0  0  0  0  0  0  0  0  0  0  0  0 31]
[0 0 0 0 0 0 0 0 0 0 0 0 4]
[ 0  0  0  0  0  0  0  0  0  0  0  0 21]
[1 0 0 0 0 0 0 0 0 0 0 0 9]
[ 0  0  0  0  1  1  0  0  0  0  0  0 69]
[1 1 0 0 0 0 0 0 0 0 0 0 7]
[  7   0   0   1   0   0   1   1   1   0   0   0 258]
[0 1 0 0 0 0 0 0 0 0 0 0 3]
[ 2  1  0  1  0  0  1  2  0  0  0  1 74]
[0 0 0 0 0 0 0 0 0 0 0 0 9]
[ 3  1  0  0  0  0  0  0  0  0  0  0 70]
[ 0  1  0  0  0  0  0  0  0  0  0  0 10]
[ 1  1  0  1  0  0  0  0  0  0  0  0 18]
[0 1 1 0 0 0 0 0 0 0 0 0 4]
[ 3  0  0  0  0  0  0  0  0  0  0  0 31]
[0 1 0 0 0 0 0 0 0 0 0 0 7]
[0 1 0 0 0 0 0 0 0 0 0 0 4]
[ 0  0  0  0  0  0  0  0  0  0  0  1 28]
[0 0 0 0 0 0 0 0 0 0 0 0 3]
[0 0 0 0 0

[ 0  0  0  0  0  0  0  0  0  0  0  0 15]
[ 0  1  0  1  0  0  0  0  0  0  0  0 15]
[ 1  0  0  0  0  0  0  1  0  0  0  0 37]
[ 1  1  0  1  0  0  0  0  1  0  0  0 31]
[ 0  0  1  0  0  0  0  0  0  0  0  1 32]
[0 0 0 0 0 0 0 0 0 0 0 0 4]
[ 0  0  0  0  0  0  0  0  0  0  0  0 23]
[ 0  0  0  0  0  0  0  0  0  0  0  0 12]
[ 3  0  0  0  0  0  0  0  0  0  0  1 44]
[ 0  0  0  1  0  0  0  0  0  0  0  0 30]
[ 1  1  0  0  0  0  0  0  0  0  0  0 32]
[ 3  1  0  0  0  0  0  0  0  0  1  0 18]
[ 0  0  0  0  0  0  0  0  0  0  0  0 26]
[ 1  0  0  0  0  0  0  0  0  0  0  0 66]
[ 1  0  0  0  2  0  0  2  0  0  0  0 64]
[  1   1   0   1   0   1   0   1   0   1   0   0 122]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[  8   4   0   0   0   1   0   0   0   1   0   3 148]
[ 0  0  0  0  0  0  0  0  0  0  0  0 23]
[ 3  0  1  0  0  0  0  0  0  0  0  0 40]
[ 1  0  0  0  0  0  0  0  0  0  0  0 10]
[ 1  0  0  0  0  0  0  0  0  0  0  0 20]
[ 1  0  1  0  0  0  0  0  0  0  0  0 27]
[ 1  0  0  0  0  0  1  0  0  0  0  0 65]
[ 1  0  0  0  0 

[ 0  0  0  0  0  0  0  0  0  0  0  0 27]
[ 2  0  0  0  0  0  0  0  0  0  0  0 22]
[  4   0   0   1   1   1   0   0   1   0   0   0 136]
[ 0  0  0  0  0  0  0  1  0  0  0  0 24]
[ 0  0  0  0  0  0  0  0  0  0  0  0 32]
[1 0 0 0 0 0 0 0 0 0 0 0 7]
[ 0  0  0  0  0  0  0  0  0  0  0  0 35]
[  0   0   0   1   0   0   0   0   0   1   0   1 125]
[  7   3   0   0   0   0   0   3   0   0   0   1 351]
[ 0  0  0  0  0  0  0  0  0  0  0  0 21]
[0 0 0 0 0 0 0 0 0 0 0 0 2]
[ 3  0  0  1  0  0  0  0  0  0  0  0 45]
[ 0  1  0  0  0  0  0  0  0  0  0  0 15]
[ 1  2  1  1  0  0  0  0  0  0  0  0 65]
[ 0  0  0  0  0  0  0  0  0  0  0  0 14]
[ 0  0  0  0  0  0  0  0  0  0  0  0 40]
[ 3  0  0  0  0  0  0  0  0  0  0  0 22]
[1 1 0 1 0 0 0 0 0 0 0 0 7]
[ 0  0  0  0  0  0  0  0  0  0  0  0 60]
[ 0  0  0  0  0  0  0  0  0  0  0  0 13]
[ 0  0  0  0  0  0  0  0  0  0  0  0 10]
[ 1  0  0  0  0  0  0  0  0  0  0  0 51]
[ 2  1  0  0  0  0  0  1  0  0  0  0 23]
[ 0  0  0  0  0  0  0  0  0  0  0  0 16]
[ 1  1  0  0  0 

[ 0  0  0  0  0  0  0  0  0  1  0  0 10]
[ 2  0  0  0  1  0  0  0  0  0  0  0 28]
[  0   0   0   1   0   0   0   0   0   0   0   0 166]
[ 2  0  0  0  0  0  0  0  0  0  0  0 24]
[ 1  0  0  0  0  0  0  0  0  0  0  0 15]
[ 0  0  0  0  0  0  0  0  0  0  0  0 20]
[ 0  0  0  0  0  0  0  0  0  0  0  0 29]
[ 1  2  1  0  0  0  0  0  0  1  0  0 66]
[ 0  0  0  0  0  0  0  0  0  0  0  0 10]
[ 1  0  0  0  0  0  0  0  0  0  0  0 45]
[ 0  0  0  0  0  0  0  0  0  0  0  0 44]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 0  0  0  0  0  0  0  0  0  0  0  0 10]
[0 0 0 0 0 0 0 0 0 0 0 0 1]
[0 0 0 0 0 0 0 1 0 0 0 0 8]
[0 1 0 0 0 0 0 0 0 0 0 0 6]
[ 0  0  0  0  0  0  0  0  0  0  0  0 11]
[ 1  1  0  0  0  1  0  0  0  0  0  0 54]
[ 0  0  0  0  0  1  0  0  0  0  0  0 20]
[ 0  0  0  0  0  0  0  0  0  0  0  0 19]
[ 3  0  1  1  0  0  1  0  0  0  0  0 26]
[ 0  0  0  0  0  0  0  0  0  0  0  0 44]
[0 0 0 0 0 0 0 0 0 0 0 0 6]
[ 1  0  0  1  0  0  0  0  0  0  0  0 15]
[ 0  0  0  0  0  0  0  0  0  0  0  0 12]

[1 1 0 0 0 0 0 0 0 0 0 0 5]
[ 2  0  0  1  0  0  0  0  0  0  0  0 51]
[ 0  0  1  0  0  0  0  1  0  0  0  0 34]
[ 0  0  0  0  0  0  0  0  0  0  0  0 10]
[ 0  0  0  0  0  0  0  0  0  0  0  0 18]
[ 0  0  0  0  0  0  0  0  0  0  0  0 48]
[ 0  0  1  0  0  0  0  0  0  0  0  1 30]
[0 0 0 0 0 0 0 0 0 0 0 0 9]
[  0   0   0   0   0   0   0   0   0   0   0   0 115]
[0 0 1 0 0 0 0 0 0 1 0 1 7]
[ 1  0  0  0  0  0  0  0  0  0  0  0 28]
[ 1  0  0  0  0  0  0  0  1  0  0  0 26]
[ 2  1  0  1  0  1  0  0  0  0  0  0 18]
[0 0 0 0 0 0 0 0 0 0 0 0 9]
[ 0  0  0  0  1  0  0  0  0  0  0  0 50]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 0  1  0  0  0  0  0  1  0  1  0  0 27]
[ 2  0  0  2  0  0  0  0  0  0  0  0 20]
[ 1  1  0  0  1  0  0  0  0  0  0  0 16]
[ 2  2  0  0  0  0  1  0  0  0  0  0 23]
[ 0  0  0  0  0  0  0  2  0  0  0  0 57]
[  2   5   0   0   0   0   0   0   1   0   0   0 121]
[ 1  1  1  0  0  0  1  1  0  0  0  0 29]
[ 1  0  1  1  0  0  1  0  0  0  0  0 63]
[ 1  0  0  0  0  0  0  0  0  0  0  0 12]
[ 0  0  0  0  

[ 1  1  0  0  0  0  0  0  0  0  0  0 15]
[ 0  0  0  0  0  0  0  0  0  0  0  0 33]
[ 2  1  0  0  0  0  0  0  0  0  0  0 29]
[ 2  4  0  2  0  0  0  1  0  0  0  0 35]
[ 0  1  0  0  0  0  0  0  0  0  0  0 16]
[ 2  2  0  1  0  0  0  0  0  0  0  0 24]
[ 0  0  0  0  0  1  0  0  0  0  0  0 26]
[0 1 0 0 0 0 0 0 0 0 0 0 6]
[ 0  0  0  0  0  0  0  0  0  0  0  0 18]
[ 1  0  0  0  0  0  0  1  0  0  0  0 24]
[ 0  1  0  0  0  1  1  0  0  0  0  0 88]
[ 0  0  0  1  0  0  0  0  0  0  0  0 37]
[ 2  1  0  1  0  0  0  0  0  0  0  0 68]
[ 0  0  1  0  0  0  0  1  0  0  0  0 55]
[ 2  0  0  0  0  0  0  0  0  0  0  1 15]
[ 0  0  0  0  0  0  0  0  0  0  0  0 47]
[0 0 0 0 0 0 0 0 0 0 0 0 4]
[ 0  0  1  0  0  0  0  0  0  0  1  0 32]
[ 3  1  0  1  0  0  0  0  0  0  0  0 30]
[ 1  0  0  1  0  1  0  0  0  0  0  0 38]
[ 0  0  0  0  0  0  0  0  0  1  0  0 14]
[ 3  2  0  0  0  0  0  0  0  0  0  0 14]
[0 1 0 0 0 0 0 0 0 0 0 0 5]
[ 0  0  0  1  0  0  0  0  0  0  0  0 40]
[1 0 0 0 0 0 0 1 0 0 0 0 5]
[ 3  0  0  0  0  1  0  0  0

[ 2  0  0  0  0  0  0  0  0  0  0  0 42]
[ 0  0  1  0  0  0  0  0  0  0  0  0 33]
[ 0  0  0  0  0  0  0  1  0  0  0  0 42]
[  2   1   0   1   0   0   0   0   0   0   0   0 112]
[ 0  0  0  0  0  0  0  0  0  0  0  0 56]
[ 0  0  0  0  0  0  0  0  0  0  0  0 27]
[ 0  0  0  0  0  0  0  0  0  0  0  0 34]
[ 1  0  0  0  1  0  0  0  0  0  0  0 70]
[ 0  0  0  0  0  0  0  0  0  0  0  0 13]
[ 0  0  0  0  0  0  0  0  0  0  0  1 11]
[ 0  0  0  1  0  0  0  0  0  0  0  0 33]
[ 2  0  0  0  0  0  0  0  0  0  0  0 29]
[ 1  0  0  1  0  0  0  0  0  1  0  0 35]
[  6   9   6   3   0   0   1   1   0   2   1   0 292]
[ 0  0  0  0  0  0  0  0  0  0  0  0 17]
[ 0  0  0  0  0  0  0  0  0  0  0  0 17]
[ 0  1  0  0  0  0  0  0  0  0  0  0 14]
[0 0 0 0 0 0 0 0 0 0 0 0 6]
[ 2  0  0  0  0  0  0  0  0  0  0  0 31]
[ 0  0  0  0  0  0  0  0  0  0  0  0 13]
[ 0  0  0  0  0  0  0  0  0  0  0  0 24]
[ 3  0  0  0  0  0  0  0  0  0  0  0 28]
[  6   2   0   1   1   1   0   0   0   1   0   0 103]
[1 0 0 0 0 0 0 0 0 0 0 0 8]
[ 0

[ 2  0  0  0  0  0  0  1  0  0  0  0 75]
[  0   0   1   0   0   0   0   0   0   1   0   0 132]
[1 1 0 0 0 0 0 0 0 0 0 0 6]
[ 0  1  0  2  0  0  0  0  0  0  0  0 27]
[0 0 0 0 0 0 0 0 0 0 0 0 4]
[ 1  0  0  0  0  0  0  0  0  0  0  0 28]
[ 0  0  1  0  0  0  0  0  0  1  0  0 23]
[1 0 0 0 0 0 0 0 1 0 0 0 8]
[ 0  1  0  0  0  0  1  1  0  0  0  0 33]
[ 0  0  0  1  0  0  1  0  0  0  0  0 73]
[ 0  0  0  0  0  0  0  0  0  0  0  0 11]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[ 1  1  0  0  0  0  0  0  0  0  0  0 20]
[ 0  0  0  0  0  0  0  0  0  0  0  0 24]
[0 0 0 0 0 0 0 0 0 0 0 0 4]
[ 4  2  0  0  0  0  0  0  0  0  0  0 18]
[ 0  1  0  0  0  0  0  0  0  0  0  2 35]
[ 0  1  0  0  0  0  0  0  0  0  0  0 35]
[ 0  0  0  0  0  0  0  0  0  0  0  0 21]
[ 1  0  0  0  0  0  0  0  0  0  0  0 10]
[ 2  0  0  0  0  0  0  0  0  0  0  0 24]
[ 1  1  0  0  0  0  0  0  0  0  0  0 25]
[ 1  0  0  0  1  0  0  0  0  0  0  0 60]
[  0   0   0   0   0   1   0   0   1   0   0   0 112]
[  6   0   0   0   1   1   0   1   0   0   0   0 169]
[

[0 0 0 0 0 0 0 0 0 0 0 0 8]
[1 1 0 0 0 0 0 0 0 0 0 0 9]
[0 0 0 0 0 0 0 0 0 0 0 0 6]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[1 0 0 0 0 0 0 0 0 0 0 0 8]
[  3   1   0   0   0   0   0   0   0   1   0   0 107]
[0 0 0 0 0 0 0 0 0 0 0 0 9]
[ 0  0  0  0  0  0  0  0  0  0  0  0 33]
[ 0  0  0  0  0  0  0  0  0  0  0  0 21]
[ 0  0  0  0  1  0  0  0  0  0  0  0 34]
[ 2  0  0  0  0  0  0  0  0  0  0  0 35]
[  3   0   0   0   0   0   0   2   0   0   0   0 116]
[  0   0   0   0   1   0   1   0   0   0   0   1 108]
[ 3  1  0  0  0  0  0  0  0  0  0  0 94]
[ 0  0  0  0  0  1  0  0  0  0  0  0 19]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[0 0 0 0 0 0 0 0 0 0 0 0 8]
[ 0  0  0  0  0  0  0  0  0  0  0  0 35]
[ 0  0  1  0  0  0  0  0  0  1  0  0 60]
[1 1 0 0 0 0 0 0 0 0 0 0 6]
[ 0  0  0  0  0  0  0  0  0  1  0  0 34]
[ 0  2  0  1  0  0  1  0  0  0  0  0 18]
[0 1 0 1 0 0 0 0 0 0 0 0 2]
[ 1  1  0  0  0  0  0  0  0  0  0  0 43]
[ 3  1  0  0  0  1  0  0  0  0  0  0 18]
[ 0  0  0  0  0  0  0  0  0  0  0  0 46]
[ 0  1  0  0  0  0  0  0 

[  3   1   2   0   0   0   0   0   0   0   0   3 228]
[ 1  0  0  0  0  0  0  0  0  0  0  0 46]
[ 0  0  0  0  0  0  0  0  0  0  0  0 46]
[ 1  1  0  1  0  1  0  0  0  0  0  1 54]
[ 5  0  0  0  0  0  0  0  0  0  0  0 30]
[ 0  0  0  0  0  0  0  0  0  0  0  0 21]
[ 2  0  0  0  0  0  0  0  0  0  0  0 33]
[0 1 0 1 0 0 0 0 0 0 0 0 8]
[ 4  2  1  1  2  0  0  0  0  0  0  0 59]
[ 0  0  0  0  0  0  0  0  0  0  0  0 10]
[ 1  0  0  0  0  0  0  0  0  0  0  0 30]
[ 0  0  0  0  0  0  0  0  1  0  0  0 31]
[ 1  0  0  1  0  1  0  0  0  0  0  0 61]
[ 0  0  0  0  0  1  0  0  0  0  0  0 80]
[1 0 0 0 0 0 0 0 0 0 0 0 4]
[ 3  1  0  0  0  0  0  0  0  0  0  0 30]
[ 0  0  0  0  0  0  0  0  0  0  0  0 17]
[0 1 0 1 0 0 0 0 0 0 0 0 8]
[ 0  0  1  0  0  0  0  0  0  1  0  0 43]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 0  0  0  0  0  0  0  0  0  0  0  0 26]
[ 0  1  0  0  0  0  0  0  0  0  1  0 27]
[ 0  0  0  1  0  0  0  0  0  0  0  0 37]
[1 0 0 1 0 0 0 0 0 0 0 1 7]
[ 1  0  0  0  0  0  0  0  0  0  0  0 17]
[  8   0   1   1   1   1   

[0 1 0 0 0 0 1 0 0 0 0 0 5]
[ 4  0  0  0  0  0  0  0  0  0  0  0 46]
[ 0  0  0  0  0  0  0  0  0  0  0  0 30]
[ 0  2  0  0  0  0  0  1  0  1  0  1 39]
[ 2  1  0  1  0  0  0  0  0  1  0  0 19]
[ 0  2  0  0  0  0  0  0  0  0  0  0 24]
[ 1  2  0  0  0  0  0  0  0  0  0  0 34]
[ 0  0  0  0  0  0  0  0  0  0  0  0 29]
[ 0  0  0  0  0  0  1  0  0  1  0  0 70]
[ 0  0  0  0  0  0  0  0  0  0  0  0 16]
[  0   1   2   0   0   0   0   0   0   0   0   0 206]
[ 0  0  0  0  0  0  0  0  0  0  0  0 18]
[0 1 0 0 0 0 0 0 0 0 0 0 9]
[  0   0  10   0   0   0   0   0   0   0   0   0 411]
[ 3  0  1  0  1  0  0  0  0  1  0  0 78]
[ 0  0  0  0  0  0  0  1  0  0  0  0 15]
[0 0 0 0 0 0 0 0 0 0 0 0 2]
[ 6  0  1  0  0  0  0  1  0  0  0  0 46]
[ 0  1  0  0  0  0  0  0  0  0  0  0 37]
[  4   0   0   1   0   0   0   1   0   1   0   0 187]
[ 0  0  0  0  0  0  0  0  0  0  0  1 22]
[ 0  1  0  0  0  0  0  0  0  0  0  0 31]
[ 3  0  0  1  0  0  0  0  0  0  0  0 54]
[ 0  0  0  0  0  0  0  0  0  0  0  0 28]
[ 0  0  0  0  0 

[ 2  0  0  0  0  0  0  0  0  1  0  0 38]
[ 0  0  0  0  0  0  0  0  0  0  0  0 12]
[ 0  0  0  0  0  0  0  0  0  4  0  0 61]
[ 0  0  0  0  0  0  0  2  0  0  0  0 30]
[ 0  0  0  0  0  0  1  0  0  0  0  0 16]
[ 0  0  0  0  0  0  0  0  0  0  0  0 18]
[ 1  1  0  0  0  0  0  0  0  0  0  0 23]
[ 0  0  0  0  0  0  0  0  1  0  0  0 19]
[  1   0   1   0   1   0   1   1   0   0   0   1 124]
[ 2  1  1  1  0  0  0  0  0  1  0  0 60]
[ 4  0  1  0  0  0  0  0  0  0  0  0 74]
[ 0  0  0  0  0  0  0  0  0  0  0  0 56]
[ 1  0  0  0  0  0  0  0  0  0  0  0 17]
[ 2  1  0  0  0  0  0  0  0  0  0  0 13]
[ 6  0  0  0  0  0  0  1  0  0  0  0 77]
[ 0  0  0  0  0  0  0  0  0  0  0  0 22]
[ 3  0  0  0  0  0  0  0  0  0  0  0 51]
[0 0 0 0 0 0 0 0 0 0 0 0 4]
[  5   0   0   0   0   0   0   0   0   1   1   0 203]
[ 0  1  1  0  0  0  0  1  0  0  0  0 13]
[ 2  0  0  0  0  0  0  0  0  0  0  1 40]
[ 5  0  0  0  0  0  0  0  0  1  0  0 88]
[  0   0   0   0   0   1   0   0   0   1   0   0 137]
[ 10   1   3   1   3   0   0   

[ 2  1  1  0  0  0  1  0  1  0  0  1 68]
[ 1  2  0  0  0  0  0  0  0  0  0  0 21]
[0 1 0 0 0 0 1 0 0 0 0 0 7]
[0 0 0 0 0 0 0 0 0 0 0 0 3]
[ 3  2  0  0  0  1  1  0  0  0  0  0 60]
[ 0  0  0  0  0  0  0  0  0  0  0  0 13]
[ 2  0  0  0  0  0  0  0  0  0  0  0 26]
[ 1  1  0  0  0  0  0  1  0  1  0  0 19]
[ 0  1  1  0  0  0  0  0  0  0  0  0 22]
[ 0  0  1  1  0  0  0  0  0  0  0  1 47]
[  7   0   1   0   0   0   0   0   0   1   0   1 110]
[ 7  0  0  0  0  0  1  0  0  0  0  0 58]
[  8   0   0   0   0   0   0   2   0   0   0   0 127]
[ 0  1  0  1  0  1  0  0  0  0  0  0 17]
[ 0  0  0  0  0  0  0  0  0  0  0  0 26]
[  0   0   0   1   0   0   0   0   0   1   0   0 133]
[ 0  0  0  0  0  0  0  0  0  0  0  0 13]
[ 0  0  0  0  0  0  0  0  0  0  0  0 16]
[ 1  0  0  1  0  0  0  0  0  0  0  1 26]
[ 0  0  1  0  0  0  0  0  0  0  0  1 13]
[ 1  1  0  0  0  0  1  0  0  0  0  0 76]
[ 0  0  0  0  0  0  0  1  0  0  0  0 12]
[ 0  1  0  0  1  1  1  0  0  0  0  0 47]
[0 1 0 0 0 0 0 0 0 0 0 0 2]
[ 1  1  0  0  0 

[  4   1   1   0   0   1   0   0   0   1   0   1 128]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 2  0  0  0  0  0  0  0  0  0  0  0 59]
[ 0  1  0  0  0  0  0  0  0  0  0  0 24]
[ 0  0  0  0  0  0  0  2  0  0  0  0 32]
[0 0 0 0 0 0 0 0 0 0 0 0 9]
[0 0 0 0 0 0 0 0 0 0 0 0 9]
[ 3  0  0  0  0  0  0  0  0  0  0  0 86]
[ 1  2  1  0  0  0  1  0  0  0  0  0 32]
[ 0  3  0  1  0  0  0  0  0  0  0  0 82]
[ 7  0  0  0  0  0  0  0  0  0  0  0 82]
[ 4  0  1  0  0  0  0  0  0  0  0  0 53]
[ 1  0  0  0  0  0  1  0  0  0  0  1 22]
[ 1  1  0  0  0  0  0  0  0  0  0  0 18]
[ 1  1  0  0  0  0  0  0  0  1  0  0 13]
[ 0  1  0  0  0  0  0  0  0  0  0  0 18]
[0 0 0 0 0 0 0 0 0 0 0 0 6]
[ 2  0  2  0  0  0  0  1  1  0  0  0 71]
[ 1  0  0  0  0  0  0  0  2  0  0  0 91]
[1 0 0 0 0 0 0 0 0 0 0 0 5]
[ 2  0  0  0  0  0  2  0  0  0  0  0 36]
[0 0 0 0 0 0 0 0 0 0 0 0 3]
[ 0  0  0  0  0  0  0  0  0  0  0  0 90]
[ 0  0  0  0  0  0  0  0  0  0  0  0 25]
[ 2  0  0  0  0  0  1  0  0  0  0  0 23]
[0 0 0 0 0 0 0 0 0 0 0 0 6]
[ 0  2  0  0

[ 2  1  0  0  0  0  0  0  0  0  0  0 25]
[ 0  0  0  0  0  0  0  0  0  0  0  0 15]
[ 0  0  0  0  1  0  0  0  0  0  0  0 14]
[  1   0   0   0   1   3   0   0   0   0   0   0 183]
[ 2  0  0  0  0  0  0  0  0  1  0  0 14]
[ 23   6   1   1   7   1   1   1   1   1   0   3 838]
[0 0 0 1 0 0 0 0 0 0 0 0 7]
[ 0  0  0  0  0  0  0  0  0  0  0  1 12]
[ 1  1  0  1  0  0  0  0  0  0  0  0 24]
[ 0  0  0  0  0  0  0  0  0  0  0  0 10]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[0 2 0 0 0 0 1 0 0 0 0 0 9]
[ 0  0  0  0  0  0  0  1  0  0  0  0 63]
[ 2  1  1  0  0  0  0  0  0  0  0  0 24]
[ 0  0  0  1  0  1  0  0  0  0  0  0 27]
[  1   0   0   1   0   1   0   0   0   0   0   0 185]
[ 0  0  1  0  0  0  0  0  0  0  0  0 69]
[ 0  0  0  1  0  0  0  0  0  0  0  1 20]
[ 1  0  0  0  0  0  0  2  0  0  0  0 70]
[ 0  1  0  0  0  0  0  0  0  0  1  0 49]
[ 1  0  0  0  0  0  0  2  0  0  0  1 22]
[0 0 0 0 0 0 0 0 0 0 0 0 2]
[ 0  0  0  0  0  0  0  3  0  0  0  0 72]
[ 0  0  0  1  1  0  0  0  0  0  0  0 73]
[ 1  0  0  1  0  0  0  0  0  

[2 0 0 0 0 0 0 0 0 0 0 0 5]
[ 0  1  0  1  0  0  0  0  0  0  0  0 14]
[ 0  0  0  1  1  0  0  0  0  0  0  0 25]
[ 2  0  0  0  0  0  0  0  0  0  0  1 88]
[ 0  2  0  0  0  1  0  0  0  0  0  1 25]
[0 1 0 1 0 0 0 0 0 0 0 0 6]
[ 0  0  0  0  0  0  0  0  0  0  0  0 28]
[ 0  0  0  0  0  0  2  0  0  0  0  1 29]
[  2   1   0   0   0   0   0   0   1   0   0   0 126]
[ 0  0  0  0  0  0  0  0  0  0  0  0 27]
[ 0  0  0  0  0  0  0  0  0  0  0  0 23]
[0 1 0 0 0 0 0 0 0 0 0 0 5]
[ 0  0  0  0  0  0  0  0  0  1  0  0 12]
[ 0  0  0  0  0  0  0  0  0  0  0  0 13]
[ 0  0  0  0  0  0  0  0  0  1  0  0 17]
[ 0  0  0  0  0  0  0  0  0  0  0  0 23]
[0 0 0 0 0 0 0 0 0 0 0 0 3]
[ 0  0  0  0  0  0  0  0  0  0  0  0 21]
[ 0  0  1  0  0  0  0  0  0  0  0  0 87]
[ 0  0  0  0  0  0  0  1  0  0  0  0 20]
[ 0  0  0  0  0  0  0  0  0  0  0  0 14]
[ 0  0  0  0  0  0  0  2  0  0  0  0 25]
[ 0  0  0  0  0  1  0  0  0  1  0  0 50]
[ 2  0  0  0  0  0  0  0  0  0  0  0 42]
[0 1 0 0 0 0 0 0 0 0 0 0 3]
[ 0  0  0  0  0  0  0  0  0

[1 0 0 0 0 0 0 0 0 0 0 0 9]
[ 2  0  0  0  0  0  1  0  0  0  0  1 36]
[ 2  0  3  0  0  0  1  0  0  1  0  0 64]
[0 0 0 0 0 0 0 0 0 0 0 0 8]
[ 0  0  0  0  0  0  0  0  0  0  0  0 10]
[ 0  0  0  0  0  0  0  0  0  0  0  0 14]
[ 1  0  0  0  0  0  0  0  0  0  0  0 13]
[0 0 0 0 0 0 0 0 0 1 0 0 7]
[ 0  0  0  0  0  0  0  0  0  0  0  0 28]
[ 1  0  1  0  0  0  0  0  0  0  0  0 51]
[ 0  1  0  0  0  0  1  0  0  0  0  0 32]
[ 0  1  0  0  0  0  0  0  0  0  0  0 12]
[0 0 0 0 0 0 0 0 0 0 0 0 2]
[  6   4   1   1   0   0   0   0   0   0   0   0 118]
[  4   0   2   0   0   1   0   1   0   0   0   2 245]
[ 1  0  0  0  0  0  0  0  0  1  0  0 49]
[ 0  0  0  0  0  0  0  0  0  0  0  0 53]
[0 0 0 0 0 0 0 0 0 0 0 0 2]
[  9   3   1   3   1   0   0   0   0   1   0   0 180]
[1 0 0 0 0 0 0 0 0 0 0 0 4]
[ 0  0  0  1  0  0  1  0  0  0  0  0 29]
[ 0  0  0  0  0  0  0  1  0  0  0  0 35]
[ 4  0  0  0  0  0  0  1  0  0  0  0 40]
[0 0 0 0 0 0 0 0 0 0 0 0 8]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[ 1  1  0  1  0  0  0  0  0  1  0  0 61]

[0 1 0 0 0 0 0 0 0 0 0 0 9]
[  1   0   0   1   0   0   0   2   0   0   0   1 174]
[ 0  1  0  0  0  0  0  0  0  0  0  0 18]
[  4   0   0   2   0   0   0   0   0   0   0   0 107]
[ 0  0  0  0  0  0  0  1  0  0  0  0 22]
[ 1  0  2  0  0  0  0  0  0  0  0  0 29]
[ 0  0  0  0  0  0  0  0  0  0  0  0 37]
[ 0  0  0  0  0  0  0  1  0  0  0  0 13]
[ 1  0  0  0  0  0  0  0  0  0  0  0 18]
[ 0  0  0  1  0  0  0  0  0  1  0  0 51]
[ 0  0  1  0  0  0  0  0  0  0  0  0 38]
[ 1  0  0  0  0  0  0  0  0  0  0  0 37]
[ 0  0  0  1  0  0  0  0  0  0  0  0 51]
[ 0  0  0  0  0  0  0  0  0  0  0  0 11]
[ 0  0  1  0  0  0  0  0  0  0  0  0 58]
[0 0 0 0 0 0 0 0 0 0 0 0 7]
[ 0  1  0  0  0  0  0  0  0  0  0  0 17]
[ 0  0  0  0  0  0  0  0  0  0  0  0 36]
[ 0  1  0  0  0  0  0  0  0  0  0  0 28]
[ 3  0  0  0  0  0  0  0  0  0  0  0 50]
[ 0  0  1  0  0  0  0  0  0  0  0  0 18]
[ 7  0  0  0  0  0  0  1  0  0  0  0 55]
[  6   4   1   1   1   0   0   1   0   0   0   0 104]
[ 5  0  0  1  0  1  0  0  0  0  0  0 84]
[  

[ 0  0  0  0  0  1  0  0  0  1  0  0 46]
[ 1  0  0  0  0  0  0  0  0  0  0  1 44]
[ 0  0  1  0  0  0  0  0  0  0  0  0 37]
[0 0 0 0 0 0 0 0 0 0 0 0 5]
[ 1  1  1  0  0  0  2  0  0  0  0  1 32]
[ 0  0  0  0  0  0  0  0  0  0  0  0 73]
[ 0  0  0  0  0  0  0  0  0  0  0  0 50]
[ 0  0  0  0  0  0  0  0  0  0  0  0 11]
[ 1  1  0  0  0  0  0  0  0  0  0  0 20]
[0 1 0 0 0 0 0 0 0 0 0 0 9]
[ 3  0  0  0  0  0  0  0  0  0  0  0 34]
[ 1  0  0  0  0  0  0  0  0  0  0  0 14]
[ 0  0  0  0  0  0  0  0  0  0  0  0 22]
[0 0 0 0 0 0 0 0 0 0 0 1 6]
[ 0  0  0  0  0  0  0  1  0  0  0  0 37]
[ 1  0  0  0  0  0  0  0  0  0  0  0 17]
[ 0  0  0  0  0  0  0  0  0  0  1  0 35]
[ 0  0  0  0  2  0  0  0  0  0  0  0 48]
[ 3  2  0  0  0  0  0  0  0  0  0  0 36]
[ 1  1  0  0  0  0  0  0  1  1  0  0 81]
[ 2  2  0  1  0  0  0  0  0  0  0  0 77]
[ 1  0  0  1  0  0  0  0  0  0  0  0 44]
[ 3  3  0  2  0  0  0  2  0  0  0  0 43]
[ 0  1  0  0  0  0  0  0  0  0  0  0 22]
[ 0  0  0  1  0  0  0  0  0  0  0  0 23]
[ 0  0  0  0  

In [12]:
import re
import json
import numpy as np

def load_comments(filename, size, bucket_limit):
    bucket_count = dict()
    features = ["you", "?", "who", "what", "where", "when", "why", "right", "wrong", "think", "seriously", "know"]
    X = []
    Y = []
    with open(filename, "r") as data:
        for line in data:
            comment = json.loads(line)
            if len(X) < size:
                feature = np.zeros(len(features)+1)
                bucket = 0
                if comment['num_child_comments'] == 0:
                    bucket = 1
                else:
                    bucket = 0
                if bucket not in bucket_count.keys():
                    bucket_count[bucket] = 0
                if bucket_count[bucket] < bucket_limit:
                    bucket_count[bucket] += 1
                    for word in comment['body'].split():
                        word = word.lower()
                        regex = re.compile('[^a-zA-Z]')
                        word = regex.sub('',word)
                        for i in range(len(features)):
                            if features[i] in word:
                                feature[i] += 1
                    feature[len(feature)-1] = len(comment['body'])
                    X.append(feature)
                    Y.append(bucket)

    return X, Y

print("Loading Reddit Comments")
train_X, train_Y = load_comments('redditnews/RedditTrain', 100000, 50000)
dev_X, dev_Y = load_comments('redditnews/RedditDev', 10000, 5000)
print("Loaded Reddit Comments")
evaluate(train_X, train_Y, dev_X, dev_Y)
prediction = np.ones(len(dev_Y))
print(metrics.accuracy_score(dev_Y, prediction))

Loading Reddit Comments
Loaded Reddit Comments
Number of Hidden Neurons per layer = 1538
For relu, sgd, 5, 1:
Train Accuracy: 0.5427
Dev Accuracy: 0.5405
For relu, sgd, 5, 2:




KeyboardInterrupt: 

In [None]:
import re
import json
import numpy as np
from sklearn import metrics

def load_comments(filename, size, bucket_limit, vocab = None):
    bucket_count = dict()
    X = []
    Y = []
    comments = []
    vocabulary = set()
    with open(filename, "r") as data:
        for line in data:
            comment = json.loads(line)
            if len(X) < size:
                bucket = 0
                if comment['num_child_comments'] == 0:
                    bucket = 1
                else:
                    bucket = 0
                if bucket == 0:
                    for word in comment['body'].split():
                        word = word.lower()
                        regex = re.compile('[^a-zA-Z]')
                        word = regex.sub('',word)
                        vocabulary.add(word)
                if bucket not in bucket_count.keys():
                    bucket_count[bucket] = 0
                if bucket_count[bucket] < bucket_limit:
                    bucket_count[bucket] += 1
                    comments.append(comment['body'])
                    Y.append(bucket)
    if vocab == None:
        vocabulary = list(vocabulary)
    else:
        vocabulary = list(vocab)
    total = len(comments)
    count = 0.0
    last = 0
    for comment in comments:
        feature = np.zeros(len(vocabulary))
        for v in range(len(vocabulary)):
            if vocabulary[v] in comment:
                feature[v] += 1
        X.append(feature)
        count += 1.0
        if round(count/total*100) != last:
            last = round(count/total*100)
            if last % 10 == 0:
                print("Progress: {}%".format(last))
    return X, Y, vocabulary

print("Loading Reddit Comments")
# train_X, train_Y, vocabulary = load_comments('redditnews/RedditTrain', 10000, 5000)
print("Loaded Train Reddit Comments")
# dev_X, dev_Y, _ = load_comments('redditnews/RedditDev', 1000, 500, vocabulary)
print("Loaded Dev Reddit Comments")
# evaluate(train_X, train_Y, dev_X, dev_Y)
prediction = np.ones(len(dev_Y))
print(metrics.accuracy_score(dev_Y, prediction))

from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(train_X, train_Y)
prediction = clf.predict(dev_X)
train_acc = clf.score(train_X, train_Y)
dev_acc = metrics.accuracy_score(dev_Y, prediction)
print("Train Accuracy: {}".format(train_acc))
print("Dev Accuracy: {}".format(dev_acc))

Loading Reddit Comments
Loaded Train Reddit Comments
Loaded Dev Reddit Comments
0.5
