# Project Ruby - Baseline

## Goals: 
1. Predict how many comments will follow a given comment.
2. Generate comments that will generate most children comments.

In [1]:
import json

def read_dataset():
    comments = []
    features = ['body', 'author', 'subreddit', 'id', 'parent_id', 'score', 'controversiality', 'link_id']
    subreddits = ['news']
    comments_size = 100000
    years = ["2017"]
    months = ["01"]
    for year in years:
        for month in months:
            filename = "reddit/" + year + "/RC_" + year + "-" + month
            with open(filename, "r") as data:
                for line in data:
                    if len(comments) < comments_size:
                        comment_data = json.loads(line)
                        if comment_data['subreddit'] in subreddits:
                            comment = dict()
                            for feature in features:
                                comment[feature] = comment_data[feature]
                            comments.append(comment)
                    else:
                        break
    print ("Example Comment:")
    print (comments[0])
    return comments

comments = read_dataset()

Example Comment:
{'body': "'Cause your dick might hit a butt and make baby Jesus cry. ", 'author': 'Number6isNo1', 'subreddit': 'news', 'id': 'dbumnrk', 'parent_id': 't1_dbumi82', 'score': 26, 'controversiality': 0, 'link_id': 't3_5lbkgm'}


# Baseline 1: Logistic Regression with Unigram Features

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

def build_vocabulary(comments):
    vocabulary = set()
    for comment in comments:
        for word in comment:
            vocabulary.add(word)
    return vocabulary

def prepare_train(comments, vocabulary, train_size):
    train = comments[:int(train_size * len(comments))]
    test = comments[len(comments)-len(train):]
    vocabulary = list(vocabulary)
    cv = CountVectorizer(vocabulary=vocabulary)
    
    train_X = []
    train_Y = []
    test_X = []
    test_Y = []

    for comment in train:
        train_X.append(comment['body'])
        train_Y.append(comment['score'])
    train_X = cv.fit_transform(train_X)

    for comment in test:
        test_X.append(comment['body'])
        test_Y.append(comment['score'])
    test_X = cv.fit_transform(test_X)
    
    return train_X, train_Y, test_X, test_Y

vocabulary = build_vocabulary(comments)
train_X, train_Y, test_X, test_Y = prepare_train(comments, vocabulary, train_size=0.9)

In [3]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression
import numpy as np

lr = LinearRegression()
lr = lr.fit(train_X, train_Y)
print (lr.score(train_X, train_Y))
prediction = lr.predict(test_X)
# prediction = np.zeros(len(test_Y))
print (metrics.mean_absolute_error(test_Y, prediction))
print (metrics.mean_squared_error(test_Y, prediction))

6.3401532695e-06
21.3334794347
34704.3763216


# Baseline 2: Neural Network Regression with GloVe

In [4]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.txt'
word2vec_output_file = 'word2vec.txt'
model = glove2word2vec(glove_input_file, word2vec_output_file)

from gensim.models import KeyedVectors
filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

[('queen', 0.775162398815155)]


In [11]:
import numpy as np

def prepare_train(comments, train_size):
    train = comments[:int(train_size * len(comments))]
    test = comments[len(comments)-len(train):]
    wordvec_dim = model['test'].shape
    
    train_X = []
    train_Y = []
    test_X = []
    test_Y = []

    for comment in train:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
        sentence_vector /= count
        train_X.append(sentence_vector)
        train_Y.append(comment['score'])

    for comment in test:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
        sentence_vector /= count
        test_X.append(sentence_vector)
        test_Y.append(comment['score'])
    
    return train_X, train_Y, test_X, test_Y

train_X, train_Y, test_X, test_Y = prepare_train(comments, train_size=0.9)

In [12]:
from sklearn.neural_network import MLPRegressor
import numpy as np
import matplotlib.pyplot as plt

nn = MLPRegressor(
    hidden_layer_sizes=(100,) * 4,  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn.fit(train_X, train_Y)
prediction = nn.predict(test_X)
print (nn.score(train_X, train_Y))
print (metrics.mean_absolute_error(test_Y, prediction))
print (metrics.mean_squared_error(test_Y, prediction))

0.0713716073349
0.219620100935
0.233813820107


# Baseline 3: Predicting Number of Child Comments
with Neural Network and GloVe Vectors

In [4]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.txt'
word2vec_output_file = 'word2vec.txt'
model = glove2word2vec(glove_input_file, word2vec_output_file)

from gensim.models import KeyedVectors
filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

[('queen', 0.775162398815155)]


In [32]:
import json

def read_dataset():
    comments = []
    features = ['body', 'author', 'subreddit', 'id', 'parent_id', 'score', 'controversiality', 'link_id', "num_child_comments"]
    subreddits = ['news']
    comments_size = 100000
    years = ["2017"]
    months = ["01"]
    for year in years:
        for month in months:
            filename = "reddit/" + year + "/RC_" + year + "-" + month + "A"
            with open(filename, "r") as data:
                for line in data:
                    if len(comments) < comments_size:
                        comment_data = json.loads(line)
                        if comment_data['subreddit'] in subreddits:
                            comment = dict()
                            for feature in features:
                                comment[feature] = comment_data[feature]
                            comments.append(comment)
                    else:
                        break
    print ("Example Comment:")
    print (comments[1])
    return comments

comments = read_dataset()

Example Comment:
{'body': 'I think it was more for the fact that they were saying to do it, in order to save money.', 'author': 'potrg801', 'subreddit': 'news', 'id': 'dbumnt2', 'parent_id': 't1_dbtt6ak', 'score': 1, 'controversiality': 0, 'link_id': 't3_5l77ad', 'num_child_comments': 0}


In [33]:
import numpy as np

def prepare_train(comments, train_size):
    train = comments[:int(train_size * len(comments))]
    test = comments[len(comments)-len(train):]
    wordvec_dim = model['test'].shape
    
    train_X = []
    train_Y = []
    test_X = []
    test_Y = []

    for comment in train:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
#         sentence_vector /= count
        train_X.append(sentence_vector)
        train_Y.append(comment['num_child_comments'])

    for comment in test:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
#         sentence_vector /= count
        test_X.append(sentence_vector)
        test_Y.append(comment['num_child_comments'])
    
    return train_X, train_Y, test_X, test_Y

train_X, train_Y, test_X, test_Y = prepare_train(comments, train_size=0.9)

In [34]:
from sklearn.neural_network import MLPRegressor
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt

nn = MLPRegressor(
    hidden_layer_sizes=(100,) * 4,  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn.fit(train_X, train_Y)
prediction = nn.predict(test_X)
# prediction = np.zeros(len(test_X))
print (nn.score(train_X, train_Y))
print (metrics.mean_absolute_error(test_Y, prediction))
print (metrics.mean_squared_error(test_Y, prediction))

-0.34002538221
0.284969198496
1.11329125027


# Baseline 5: Predicting Number of Child Comments
with Neural Network and GloVe Vectors + Exclude comments with zero child comments

In [35]:
import json

def read_dataset():
    comments = []
    features = ['body', 'author', 'subreddit', 'id', 'parent_id', 'score', 'controversiality', 'link_id', "num_child_comments"]
    subreddits = ['news']
    comments_size = 10000
    years = ["2017"]
    months = ["01"]
    for year in years:
        for month in months:
            filename = "reddit/" + year + "/RC_" + year + "-" + month + "A"
            with open(filename, "r") as data:
                for line in data:
                    if len(comments) < comments_size:
                        comment_data = json.loads(line)
                        if comment_data['subreddit'] in subreddits and comment_data['num_child_comments'] > 0:
                            comment = dict()
                            for feature in features:
                                comment[feature] = comment_data[feature]
                            comments.append(comment)
                    else:
                        break
    print ("Example Comment:")
    print (comments[1])
    return comments

comments = read_dataset()

Example Comment:
{'body': '[removed]', 'author': '[deleted]', 'subreddit': 'news', 'id': 'dbumpip', 'parent_id': 't3_5l5aew', 'score': 1, 'controversiality': 0, 'link_id': 't3_5l5aew', 'num_child_comments': 1}


In [36]:
import numpy as np

def prepare_train(comments, train_size):
    train = comments[:int(train_size * len(comments))]
    test = comments[len(comments)-len(train):]
    wordvec_dim = model['test'].shape
    
    train_X = []
    train_Y = []
    test_X = []
    test_Y = []

    for comment in train:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
#         sentence_vector /= count
        train_X.append(sentence_vector)
        train_Y.append(comment['num_child_comments'])

    for comment in test:
        sentence_vector = np.zeros(wordvec_dim)
        count = 0
        for word in comment['body']:
            if word in model.vocab:
                count += 1
                sentence_vector += model[word]
#         sentence_vector /= count
        test_X.append(sentence_vector)
        test_Y.append(comment['num_child_comments'])
    
    return train_X, train_Y, test_X, test_Y

train_X, train_Y, test_X, test_Y = prepare_train(comments, train_size=0.9)

In [37]:
from sklearn.neural_network import MLPRegressor
import numpy as np
import matplotlib.pyplot as plt

nn = MLPRegressor(
    hidden_layer_sizes=(100,) * 4,  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn.fit(train_X, train_Y)
prediction = nn.predict(test_X)
print (nn.score(train_X, train_Y))
print (metrics.mean_absolute_error(test_Y, prediction))
print (metrics.mean_squared_error(test_Y, prediction))

-0.00064974642348
7.65527630524
2218.28397051


# Baseline 6: Neural Network with GloVe (Larger Dataset)
On 2016 Reddit Comments from r/news

Bucketizing into 5 buckets

In [None]:
import json
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))

def load_comments(filename):
    X = []
    Y = []
    with open(filename, "r") as data:
        for line in data:
            comment = json.loads(line)
            sentence_vector = np.zeros(model['test'].shape)
            count = 0
            for word in comment['body']:
                if word in model.vocab:
                    count += 1
                    sentence_vector += model[word]
#             sentence_vector /= count
            X.append(sentence_vector)
            bucket = 0
            if comment['num_child_comments'] == 0:
                bucket = 1
            elif comment['num_child_comments'] >= 1 and comment['num_child_comments'] <= 2:
                bucket = 2
            elif comment['num_child_comments'] >= 3 and comment['num_child_comments'] <= 6:
                bucket = 3
            elif comment['num_child_comments'] >= 7 and comment['num_child_comments'] <= 14:
                bucket = 4
            elif comment['num_child_comments'] >= 15:
                bucket = 5
            Y.append(bucket)
    return X, Y

train_X, train_Y = load_comments('redditnews/RedditTrain')
dev_X, dev_Y = load_comments('redditnews/RedditDev')
print('Loaded Reddit Comments from 2016')

In [None]:
from sklearn.neural_network import MLPRegressor
import numpy as np
import matplotlib.pyplot as plt

nn = MLPRegressor(
    hidden_layer_sizes=(100,) * 4,  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn.fit(train_X, train_Y)
prediction = nn.predict(dev_X)
print (nn.score(train_X, train_Y))
print (metrics.mean_absolute_error(dev_Y, prediction))
print (metrics.mean_squared_error(dev_Y, prediction))

In [None]:
from sklearn.neural_network import MLPClassifier
import numpy as np
import matplotlib.pyplot as plt

nn = MLPClassifier(solver='adam', alpha=1e-5,
                   hidden_layer_sizes=(100,) * 4, random_state=1)
nn.fit(train_X, train_Y)
prediction = nn.predict(dev_X)
print (nn.score(train_X, train_Y))
print (metrics.mean_absolute_error(dev_Y, prediction))
print (metrics.mean_squared_error(dev_Y, prediction))

Binary (0 or not)

In [None]:
import json
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))

def load_comments(filename):
    X = []
    Y = []
    with open(filename, "r") as data:
        for line in data:
            comment = json.loads(line)
            sentence_vector = np.zeros(model['test'].shape)
            count = 0
            for word in comment['body']:
                if word in model.vocab:
                    count += 1
                    sentence_vector += model[word]
#             sentence_vector /= count
            X.append(sentence_vector)
            bucket = 0
            if comment['num_child_comments'] != 0:
                bucket = 1
            Y.append(bucket)
    return X, Y

train_X, train_Y = load_comments('redditnews/RedditTrain')
dev_X, dev_Y = load_comments('redditnews/RedditDev')
print('Loaded Reddit Comments from 2016')

In [None]:
from sklearn.neural_network import MLPClassifier
import numpy as np
import matplotlib.pyplot as plt

nn = MLPClassifier(solver='adam', alpha=1e-5,
                   hidden_layer_sizes=(100,) * 4, random_state=1)
nn.fit(train_X, train_Y)
prediction = nn.predict(dev_X)
print (nn.score(train_X, train_Y))
print (metrics.mean_absolute_error(dev_Y, prediction))
print (metrics.mean_squared_error(dev_Y, prediction))