In [2]:
import math
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Tom
data = '/usr/src/app/data/ka-comments.csv'
model_dir = '/usr/src/app/model-data/'

In [None]:
# Kyle
data = '/Users/koza/Documents/UCBerkeley/266/FinalProject/data/ka-comments.csv'
model_dir = '/Users/koza/Documents/UCBerkeley/266/FinalProject/model-data/'

In [7]:
# Cory
data = '/path/to/data/ka-comments.csv'
model_dir = '/path/to/model-data/'

In [3]:
comments = pd.read_csv(data)

print "Total size: %d" % len(comments)

print comments[:10]

Total size: 1145440
                  video                                        id  \
0  ancient-temples-nara  33ae6dffe9efdf1bba47412945f3f96ee85e39f7   
1  ancient-temples-nara  ea997873e7cd069c8a6bf2f14809950ac237001c   
2  ancient-temples-nara  61b80f281d1ac24d102535496ad3a963e226cdeb   
3  ancient-temples-nara  49349c01d95f512a04da8f32d54e4aa22d7ec8a9   
4  ancient-temples-nara  847d456003e470c236375a5195d87dc94b46cfc1   
5  ancient-temples-nara  348392e7888687e821de048609cace4a08da4e18   
6  ancient-temples-nara  58f2adc07ebca4285963c55b2736449f900458bd   
7  ancient-temples-nara  96ca02013d5c2bfe8a1bc29932764a3484d47979   
8  ancient-temples-nara  a0c8b56cac318667d2a8aa124b63ff1d00f97fb0   
9  ancient-temples-nara  dac041cb6faff46c806323837292d153b478c4b6   

                                             content  \
0  Why does the pagoda have 5 stories? Wouldn't t...   
1  5 stories represent  5 elements make up our wo...   
2  Because it's easier to calculate the middle wi...

In [4]:

plt.hist(comments["sumVotesIncremented"], 100, facecolor='green', log=True, alpha=0.75)
plt.show()

In [5]:
def transform_scores(data):
    # Turn `sumVotesIncremented` into a binary variable
    data["hasVotes"] = data["sumVotesIncremented"] > 1

    # Count positives
    positive_count = np.sum(data["hasVotes"])
    print "%d / %d comments have votes" % (positive_count, len(data))

    # Generate a balanced set, interleaved TFTF... so we can easily
    # create balanced training/test sets by taking slices
    positive_indices = np.where(data["hasVotes"] == True)[0]
    negative_indices = np.where(data["hasVotes"] == False)[0]
    balanced_indices = [val for pair in zip(positive_indices, negative_indices[:len(positive_indices)]) for val in pair]
    print "%d balanced indices" % len(balanced_indices)

    return data.iloc[balanced_indices]
    
balanced_comments = transform_scores(comments)

VALIDATE_SIZE = 5000
TEST_SIZE = 5000
TRAIN_SIZE = len(balanced_comments) - VALIDATE_SIZE - TEST_SIZE
print "Training set %d, Validate set %d, Test set 5000 %d" % (TRAIN_SIZE, VALIDATE_SIZE, TEST_SIZE)

print balanced_comments[:10] 


254896 / 1145440 comments have votes
509792 balanced indices
Training set 499792, Validate set 5000, Test set 5000 5000
                   video                                        id  \
0   ancient-temples-nara  33ae6dffe9efdf1bba47412945f3f96ee85e39f7   
6   ancient-temples-nara  58f2adc07ebca4285963c55b2736449f900458bd   
1   ancient-temples-nara  ea997873e7cd069c8a6bf2f14809950ac237001c   
12  ancient-temples-nara  bd7b4e50832d4edb4b4383ba518198574878b1cb   
2   ancient-temples-nara  61b80f281d1ac24d102535496ad3a963e226cdeb   
13  ancient-temples-nara  e686507bd6e75f003b0d5bd9618444a932379e75   
3   ancient-temples-nara  49349c01d95f512a04da8f32d54e4aa22d7ec8a9   
14  ancient-temples-nara  c88f018582f46316de75bb2671dd82335bf3cde3   
4   ancient-temples-nara  847d456003e470c236375a5195d87dc94b46cfc1   
15  ancient-temples-nara  37b39a27c8fbac57d934266708d94a0fed95de39   

                                              content  \
0   Why does the pagoda have 5 stories? Wouldn't t..

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.random_projection import GaussianRandomProjection

def vectorize(data_name, vectorizer, data, train_size, validate_size, test_size):
    print "Vectorizing %s..." % data_name

    model = {}
    
    model['X_train'] = vectorizer.fit_transform(comments["content"][:train_size])
    model['Y_train'] = comments["hasVotes"][:train_size]
    
    print "Vocab size %d" % len(vectorizer.vocabulary_)
        
    model['X_validate'] = vectorizer.transform(comments["content"][train_size:train_size+validate_size])
    model['Y_validate'] = comments["hasVotes"][train_size:train_size+validate_size]
    
    model['X_test'] = vectorizer.transform(comments["content"][train_size+validate_size:train_size+validate_size+test_size])
    model['Y_test'] = comments["hasVotes"][train_size+validate_size:train_size+validate_size+test_size]
    
    with open(model_dir+"%s.pickle" % data_name, "wb") as f:
        pickle.dump(model, f)

# 1-grams
vectorize("vec-tfidf-1", TfidfVectorizer(min_df=0.0005),
          balanced_comments, TRAIN_SIZE, VALIDATE_SIZE, TEST_SIZE)

# 2-grams
vectorize("vec-tfidf-2", TfidfVectorizer(min_df=0.0005, ngram_range=(1,2)),
          balanced_comments, TRAIN_SIZE, VALIDATE_SIZE, TEST_SIZE)

# 3-grams
vectorize("vec-tfidf-3", TfidfVectorizer(min_df=0.0005, ngram_range=(1,3)),
          balanced_comments, TRAIN_SIZE, VALIDATE_SIZE, TEST_SIZE)

print "Done."


Vectorizing vec-tfidf-1...
Vocab size 3258


IOError: [Errno 2] No such file or directory: '/Users/koza/Documents/UCBerkeley/266/FinalProject/model-data/vec-tfidf-1.pickle'

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

def load_model(data_name):
    with open("/usr/src/app/model-data/%s.pickle" % data_name, "rb") as f:
        return pickle.load(f)

def fit_models(data_names, model_types):
    for data_name in data_names:
        print "Loading model %s..." % data_name
        data_model = load_model(data_name)
        print np.shape(data_model['X_train'])

        print "Training models."
        for model_name, model in model_types:
            model.fit(data_model['X_train'], data_model['Y_train'])
            score = model.score(data_model['X_validate'], data_model['Y_validate'])
            print "## %20s %15s accuracy: %0.1f %%" % (model_name, data_name, score * 100)
    
fit_models(["vec-tfidf-1", "vec-tfidf-2", "vec-tfidf-3"], [
    ["MultinomialNB", MultinomialNB()],
    ["LinearSVC", LinearSVC()],
    ["MLP", MLPClassifier(hidden_layer_sizes=(20,20), early_stopping=True)],
])
