In [1]:
import numpy as np
import pandas as pd
import sqlite3
import gensim
import re
from nltk.corpus import stopwords
import nltk

In [2]:
# Open the SQLite database file (path is relative to the notebook's working
# directory); the queries in later cells all run through this connection.
sql_conn = sqlite3.connect('../data/database.sqlite')

In [3]:
# These functions are needed for processing later

# Turns one sentence of a comment into a list of lowercase words.
def comment_to_wordlist(comment, remove_stopwords=False):
    """Split a piece of text into lowercase, letters-only words.

    Every character outside a-z / A-Z is replaced by a space before the
    text is lowercased and split on whitespace, so digits and punctuation
    act as word separators.

    Parameters
    ----------
    comment : str
        Text to split.
    remove_stopwords : bool, optional
        When true, words found in NLTK's English stopword list are dropped.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", comment)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens

    english_stops = set(stopwords.words("english"))
    return [tok for tok in tokens if tok not in english_stops]

# Splits a comment into sentences, each one returned as a list of words.
def comment_to_sentence(comment, tokenizer, remove_stopwords=False):
    """Convert a comment into a list of word lists, one per sentence.

    Parameters
    ----------
    comment : str
        Raw comment text; surrounding whitespace is stripped first.
    tokenizer : object
        Anything with a ``tokenize(text)`` method that yields sentence
        strings.
    remove_stopwords : bool, optional
        Passed through to ``comment_to_wordlist`` for every sentence.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in tokenizer order.
    """
    pieces = tokenizer.tokenize(comment.strip())
    return [comment_to_wordlist(piece, remove_stopwords)
            for piece in pieces
            if len(piece) > 0]

### Choice of subreddits

These subreddits contain discussions on 11 topics that are fairly distinct from a human point of view.

In [4]:
def load_subreddit(name):
    """Load the subreddit and body columns of the May2015 rows of one subreddit.

    The subreddit name is bound as a query parameter rather than pasted
    into the SQL text, so one query string serves every subreddit.

    Parameters
    ----------
    name : str
        Exact subreddit name to match (the comparison is the same
        equality test the query has always used).

    Returns
    -------
    pandas.DataFrame
        Columns ``subreddit`` and ``body``.
    """
    query = "SELECT subreddit, body FROM May2015 WHERE subreddit == ?"
    return pd.read_sql(query, sql_conn, params=(name,))


mathematics = load_subreddit('mathematics')

computerscience = load_subreddit('computerscience')

history = load_subreddit('history')

philosophy = load_subreddit('philosophy')

elifive = load_subreddit('explainlikeimfive')

askanthro = load_subreddit('AskAnthropology')

homebrewing = load_subreddit('Homebrewing')

bicycling = load_subreddit('bicycling')

food = load_subreddit('food')

gaming = load_subreddit('gaming')

politics = load_subreddit('politics')

In [5]:
# List of (dataframe, subject label) pairs, one per subreddit
subreddits = [
    (bicycling, 'bicycling'),
    (history, 'history'),
    (philosophy, 'philosophy'),
    (elifive, 'explain'),
    (homebrewing, 'homebrew'),
    (askanthro, 'anthropology'),
    (mathematics, 'mathematics'),
    (computerscience, 'computer science'),
    (food, 'food'),
    (gaming, 'gaming'),
    (politics, 'politics'),
]

In [6]:
# Stack the per-subreddit frames into one frame with a fresh 0..n-1 index,
# then print its summary.
all_frames = [
    bicycling, history, philosophy, elifive, homebrewing, askanthro,
    mathematics, computerscience, food, gaming, politics,
]
model_training_data = pd.concat(all_frames, ignore_index=True)
model_training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999300 entries, 0 to 999299
Data columns (total 2 columns):
subreddit    999300 non-null object
body         999300 non-null object
dtypes: object(2)
memory usage: 22.9+ MB


In [15]:
# Collect the names of the saved model files
from os import listdir
from os.path import isfile, join

# Keep regular files directly under models/, skipping names ending in '.npy'
models = []
for entry in listdir('models/'):
    if isfile(join('models/', entry)) and not entry.endswith('.npy'):
        models.append(entry)
print(models)

['300features_50minwords_10context', '300features_40minwords_3context', '300features_20minwords_3context', '300features_40minwords_5context', '300features_10minwords_3context', '300features_10minwords_5context', '300features_50minwords_5context', '300features_30minwords_5context', '300features_40minwords_10context', '300features_20minwords_5context', '300features_10minwords_10context', '300features_20minwords_10context', '300features_30minwords_3context', '300features_50minwords_3context', '300features_30minwords_10context']


In [16]:
def f(word, label):
    """Return the similarity between `word` and `label` from the global `model`.

    Reads the module-level name `model`, which must already be bound to an
    object exposing ``similarity(word, label)``.
    """
    return model.similarity(word, label)


def label_comment(comment, labels, model=None):
    """Pick the label whose average similarity to the comment's words is highest.

    For every candidate label the similarity of each word to that label is
    averaged; the label with the largest average wins. Similarity grows
    with closeness, so the maximum (not the minimum) is the best match.

    Parameters
    ----------
    comment : iterable of str
        The words of one comment.
    labels : iterable of str
        Candidate labels.
    model : object, optional
        Anything exposing ``similarity(word, label)``. When omitted, the
        module-level `model` is used through `f`.

    Returns
    -------
    str
        The best-scoring label (the first one on ties), or ``""`` when no
        label could be scored, e.g. for an empty comment.
    """
    similarity = f if model is None else model.similarity

    best_similarity = float("-inf")
    best_label = ""
    for label in labels:
        scores = []
        for word in comment:
            try:
                scores.append(similarity(word, label))
            except KeyError:
                # gensim signals a word (or label) missing from the
                # vocabulary with KeyError; skip it instead of aborting.
                continue
        if not scores:
            # Nothing to average for this label; also avoids dividing by zero.
            continue
        mean_similarity = sum(scores) / len(scores)
        if mean_similarity > best_similarity:
            best_label = label
            best_similarity = mean_similarity
    return best_label

In [18]:
from gensim.models import word2vec

# Load every saved model in turn. Each load rebinds current_model, so once
# the loop finishes only the last model in `models` is still referenced.
for m in models:
    print(m)
    model_path = 'models/' + m
    current_model = word2vec.Word2Vec.load(model_path)

    # Take each comment, compare

300features_50minwords_10context
300features_40minwords_3context
300features_20minwords_3context
300features_40minwords_5context
300features_10minwords_3context
300features_10minwords_5context
300features_50minwords_5context
300features_30minwords_5context
300features_40minwords_10context
300features_20minwords_5context
300features_10minwords_10context
300features_20minwords_10context
300features_30minwords_3context
300features_50minwords_3context
300features_30minwords_10context


In [60]:
# Spot check: the words the model ranks as most similar to "fallout".
# current_model is whichever model was bound last when this cell ran.
current_model.most_similar("fallout")

[(u'vegas', 0.707527756690979),
 (u'scrolls', 0.6396520137786865),
 (u'morrowind', 0.6132481098175049),
 (u'eso', 0.6038081645965576),
 (u'nv', 0.5972005128860474),
 (u'tes', 0.5942018032073975),
 (u'skyrim', 0.5849278569221497),
 (u'oblivion', 0.5718768835067749),
 (u'obsidian', 0.5705356597900391),
 (u'series', 0.5462707281112671)]