In [1]:
import numpy as np
import pandas as pd
import sqlite3
import gensim
import re
from nltk.corpus import stopwords
import nltk

In [2]:
sql_conn = sqlite3.connect('../data/database.sqlite')

In [3]:
mathematics = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'mathematics'",sql_conn)

In [4]:
computerscience = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'computerscience'",sql_conn)

In [5]:
history = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'history'",sql_conn)

In [6]:
philosophy = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'philosophy'",sql_conn)

In [7]:
elifive = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'explainlikeimfive'",sql_conn)

In [8]:
askanthro = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'AskAnthropology'",sql_conn)

In [9]:
homebrewing = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'Homebrewing'",sql_conn)

In [10]:
bicycling = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'bicycling'", sql_conn)

In [11]:
food = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'food'", sql_conn)

In [12]:
science = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'science'", sql_conn)

In [13]:
movies = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'movies'", sql_conn)

In [14]:
books = pd.read_sql("SELECT subreddit, body FROM May2015 WHERE subreddit == 'books'", sql_conn)

In [18]:
# Array of tuples, with df and subject
subreddits = [(bicycling,'bicycling'),(history,'history'),(philosophy,'philosophy'),
              (elifive,'explain'),(homebrewing,'homebrew'),(askanthro,'anthropology'),
              (mathematics,'mathematics'),(computerscience,'computer'),
              (food,"food"),(science,"science"),(movies,"movies"),(books,"books")]

In [19]:
all_frames = [bicycling, history, philosophy, elifive, homebrewing, askanthro, mathematics,\
              computerscience, food, science, movies, books]
all_data = pd.concat(all_frames, ignore_index=True)
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 931315 entries, 0 to 931314
Data columns (total 2 columns):
subreddit    931315 non-null object
body         931315 non-null object
dtypes: object(2)
memory usage: 21.3+ MB


In [20]:
# Takes a sentence in a comment and converts it to a list of words.
def comment_to_wordlist(comment, remove_stopwords=False ):
    comment = re.sub("[^a-zA-Z]"," ", comment)
    words = comment.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    return(words)

# Takes a comment and converts it to an array of sentences
def comment_to_sentence(comment, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(comment.strip())
    
    sentences = []
    for s in raw_sentences:
        if len(s)>0:
            sentences.append(comment_to_wordlist(s, remove_stopwords))
    return sentences

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

comments = []
ground_truth_labels = []
for i in range(len(all_data)):
    comments.append(comment_to_wordlist(all_data.iloc[i]['body'], tokenizer))
    ground_truth_labels.append(all_data.iloc[i]['subreddit']) # Could also just slice this array out

In [21]:
def reduce_comments(words, model):
    # Pre-allocate average vector representation
    average = np.zeros(300, dtype=np.float64)
    in_vocab = 0
    for word in words:
        if word in model.vocab:
            in_vocab += 1
            average = average + np.array(model[word], dtype=float)
    return np.divide(average,in_vocab)

In [22]:
from gensim.models import word2vec

m = "300features_10minwords_10context"
current_model = word2vec.Word2Vec.load('new_models/' + m);

single_rep_comments = map(lambda comment: reduce_comments(comment, current_model), comments)

In [23]:
len(single_rep_comments)

931315

In [24]:
# Check to make sure we have a 1x300 vector representation of each *comment*
for c in single_rep_comments:
    nans = 0
    if not len(c) == 300:
        print len(c)
    if True in np.isnan(c):
        nans += 1
# No output is good!

In [24]:
# Convert our vector representation to a nice numpy matrix
d = np.asarray(single_rep_comments, dtype=np.float64)

In [25]:
d.shape

(931315, 300)

In [28]:
# Pull out indices of rows where the vector representation does not contain a NaN
# Not sure how NaNs crept in here, possibly due to overflow issues
ix_non_nans = np.where(~(np.isnan(d).any(axis=1)))[0]

In [29]:
# Filter out NaNs on comments and labels
d = d[ix_non_nans]
gtls = np.asarray(ground_truth_labels)[ix_non_nans]

In [30]:
# Cluster using KMeans, initializing k = 12 (one for each of the subreddits we drew from)
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 12)
kmeans.fit(d)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=12, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [74]:
centroids = kmeans.cluster_centers_
centroids.shape

from tempfile import TemporaryFile
outfile = TemporaryFile()
np.save(outfile, centroids)

In [77]:
import pickle

output = open('centroids.pkl', 'wb')
pickle.dump(centroids, output)
output.close()

In [76]:
# Load back in:
pkl_file = open('centroids.pkl','rb')
centroids = pickle.load(pkl_file)
pkl_file.close()

In [73]:
#list(centroids[0])
current_model.most_similar(positive=[centroids[11]], topn=5)

[(u'storyteller', 0.8631452322006226),
 (u'heroines', 0.8625438809394836),
 (u'toho', 0.8603470325469971),
 (u'worldbuilding', 0.8569574356079102),
 (u'dcau', 0.856163501739502)]

In [29]:
print(np.unique(gtl))

[u'AskAnthropology' u'Homebrewing' u'bicycling' u'books' u'computerscience'
 u'explainlikeimfive' u'food' u'history' u'mathematics' u'movies'
 u'philosophy' u'science']


In [30]:
len(d) == len(gtl)

True

In [31]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=50, random_state=0)
svd_src = svd.fit_transform(d)

In [32]:
# Reduced to 50 dimensions from 300
svd_src.shape

(924473, 50)

In [33]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 3)
red_svd_src = pca.fit_transform(svd_src)

In [34]:
red_svd_src.shape

(924473, 3)

In [35]:
dfdict = {'x': red_svd_src[:,0],
          'y': red_svd_src[:,1],
          'z': red_svd_src[:,2],
          'subreddit': gtl}

df = pd.DataFrame(dfdict)
#df.to_csv('reduced_data.csv',encoding='utf8')