In [1]:
import numpy as np
import pandas as pd
import sqlite3
import gensim
import re
from nltk.corpus import stopwords
import nltk

In [2]:
sql_conn = sqlite3.connect('../data/database.sqlite')

In [3]:
# These functions are needed for processing later

# Takes a sentence in a comment and converts it to a list of words.
def comment_to_wordlist(comment, remove_stopwords=False ):
    comment = re.sub("[^a-zA-Z]"," ", comment)
    words = comment.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    return(words)

# Takes a comment and converts it to an array of sentences
def comment_to_sentence(comment, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(comment.strip())
    
    sentences = []
    for s in raw_sentences:
        if len(s)>0:
            sentences.append(comment_to_wordlist(s, remove_stopwords))
    #rof
    return sentences

### Choice of subreddits

These subreddits contain discussions on 11 fairly distinct topic, from a human point of view.

In [4]:
mathematics = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'mathematics'",sql_conn)

computerscience = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'computerscience'",sql_conn)

history = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'history'",sql_conn)

philosophy = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'philosophy'",sql_conn)

elifive = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'explainlikeimfive'",sql_conn)

askanthro = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'AskAnthropology'",sql_conn)

homebrewing = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'Homebrewing'",sql_conn)

bicycling = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'bicycling'", sql_conn)

food = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'food'", sql_conn)

gaming = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'gaming'", sql_conn)

politics = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'politics'", sql_conn)

In [5]:
# Array of tuples, with df and subject
subreddits = [(bicycling,'bicycling'),(history,'history'),(philosophy,'philosophy'),
              (elifive,'explain'),(homebrewing,'homebrew'),(askanthro,'anthropology'),
              (mathematics,'mathematics'),(computerscience,'cs'),
              (food,"food"),(gaming,"gaming"),(politics,"politics")]

In [18]:
all_frames = [bicycling, history, philosophy, elifive, homebrewing, askanthro, mathematics,\
              computerscience, food, gaming, politics]
model_training_data = pd.concat(all_frames, ignore_index=True)
model_training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999300 entries, 0 to 999299
Data columns (total 2 columns):
subreddit    999300 non-null object
body         999300 non-null object
dtypes: object(2)
memory usage: 22.9+ MB


In [6]:
# Get all model files
from os import listdir
from os.path import isfile, join
models = [f for f in listdir('models/') if isfile(join('models/', f)) and not f.endswith('.npy')]
print(models)

['300features_50minwords_10context', '300features_40minwords_3context', '300features_20minwords_3context', '300features_40minwords_5context', '300features_10minwords_3context', '300features_10minwords_5context', '300features_50minwords_5context', '300features_30minwords_5context', '300features_40minwords_10context', '300features_20minwords_5context', '300features_10minwords_10context', '300features_20minwords_10context', '300features_30minwords_3context', '300features_50minwords_3context', '300features_30minwords_10context']


In [7]:
import math

# Method for computing similarity to a specific label
# Note this returns the cosine similarity between 
# the vector representation of the words and label
def similarities(comment, label, model):
    dists = []
    for word in comment:
        if word in model.vocab:
            dists.append(model.similarity(word,label))
    return dists

def label_comment(comment, labels, model):
    # Set initial distance to be 1-(-1) (complete dissimilarity)
    best_distance = 2
    best_label = ""
    for label in labels:
        # Range will be from [-1,1]
        word_dists_to_label = similarities(comment, label, model)
        
        # We want to choose the label with overall sum closest to 1
        # or
        # We want to minimize our distance to 1
        dist = 1 - sum(word_dists_to_label)
        if dist < best_distance:
            best_label = label
            best_distance = dist
    return best_label

In [15]:
from gensim.models import word2vec

current_model = word2vec.Word2Vec.load('models/' + models[2])
print(models[0])
# Attempt to load each model
#for m in models:
#    print(m)
#    current_model = word2vec.Word2Vec.load('models/' + m);
    
    # Take each comment, compare 

300features_50minwords_10context


In [16]:
# This is not good... where is this coming from?
current_model.most_similar("man")

[(u'scumbag', 0.5574318766593933),
 (u'hates', 0.5410973429679871),
 (u'guy', 0.5326985716819763),
 (u'liar', 0.5264509916305542),
 (u'kidnapped', 0.519798219203949),
 (u'grandpa', 0.5184828042984009),
 (u'bastard', 0.5181668400764465),
 (u'neckbeards', 0.5177206993103027),
 (u'virgins', 0.5131043195724487),
 (u'smug', 0.5101327300071716)]

Including r/politics and r/gaming into my data set seemingly ruined good associations. People on the internet are mean.

In [None]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

comments = []
ground_truth_labels = []
for i in range(len(model_training_data)):
    comments.append(comment_to_wordlist(model_training_data.iloc[i]['body'], tokenizer))
    ground_truth_labels.append(model_training_data.iloc[i]['subreddit']) # Could also just slice this array out

In [21]:
if len(comments)==len(ground_truth_labels):
    print("Length is the same: ",len(comments))

('Length is the same: ', 999300)


In [59]:
#labels = np.unique(model_training_data['subreddit'])
#subreddits = [(bicycling,'bicycling'),(history,'history'),(philosophy,'philosophy'),
#              (elifive,'explain'),(homebrewing,'homebrew'),(askanthro,'anthropology'),
#              (mathematics,'mathematics'),(computerscience,'computer science'),
#              (food,"food"),(gaming,"gaming"),(politics,"politics")]

In [22]:
subreddit_labels = []
for frame,name in subreddits:
    subreddit_labels.append(name)
    
trained_labels = []

In [23]:
import time

#trained_labels = [None] * len(comments)
start = time.time()
for i in range(100000):
    trained_labels.append(label_comment(comments[i],subreddit_labels,current_model))
end = time.time()
duration = end-start

In [30]:
print("Took " + str(duration) + " time to label 100000 comments.")

Took 617.820972919 time to label 100000 comments.


In [31]:
len(trained_labels)

200000

In [41]:
mis_rate = 0
successes = []
for i in range(len(trained_labels)):
    if trained_labels[i] != ground_truth_labels[i]:
        mis_rate += 1
    else:
        successes.append((comments[i],trained_labels[i]))
error_rate = mis_rate/float(len(trained_labels))
success_rate = 1-error_rate

In [43]:
len(successes)

24341