# We can pick up here, loading pkl from part1

# Part 2: Modelling

In [1]:
# if you broke things again and need to restart the kernel, ... well, start here!
# Takes about 4 minutes to run, but it's a good place to pick up.

%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import json
import time
import unidecode as ud
import re

# a function used for reconstructing a dictionary from a loaded dataframe.

def body_to_dict(dataframe):
    
    # turning the series of text into a list of lists of words
    list_of_list_of_words = dataframe.body.map(lambda x: x.split())

    dictionary = {}

    for list_of_words in list_of_list_of_words:
        for word in list_of_words:
            if not word in dictionary:
                dictionary[word] = 1
            else:
                dictionary[word] += 1
                
    return dictionary


df = pd.read_pickle('df2014d.pkl')
#abridged_dict = body_to_dict(df)

In [2]:
df.head()

Unnamed: 0,author,body,controversiality,created_utc,link_id,score,words,sentences,positive,reply_count,question,exclamation,hyperlinks
0,gbs5009,it may be legit in that the progressive group...,0,1390595859,t3_1vyqx3,1,91,5,1,0,1,1,0
1,bucknuggets,yeah please provide the quotes that show that...,0,1407849031,t3_2daoq2,13,51,3,1,0,0,0,0
2,[deleted],was clinton any good though last i remember h...,0,1398952953,t3_24dja2,1,50,4,1,0,1,0,0
3,Dreamstakeroot,what if i told you that both democrats and re...,0,1396628643,t3_226zi6,-8,48,3,0,0,1,0,0
4,areyakiddinme,happy cake day,0,1414456941,t3_2kgqt2,1,3,1,1,0,0,1,0


##LDA on comments

Now we will attempt to extract topics from the comments via LDA. The hope is that their are common themes of these political discussion on Reddit, and that particular topics will have a determining factor on score or controversial.

In [3]:
import gensim

Our fist attempt performs LDA on the entire reduced text from the dataframe "body" fields, without consideration for stop words. As can be imagined, this does not produce very inspiring topics, since they end up consisting almost exlusively of stop words.

#Skip down to LDA with stop words

First we split the text in df.body into lists of words, so that texts is a list of lists.

In [59]:
texts = []
for text in df.body:
    texts.append(text.split())
texts

[['it',
  'may',
  'be',
  'legit',
  'in',
  'that',
  'the',
  'progressive',
  'group',
  'it',
  'was',
  'made',
  'through',
  "hasn't",
  'disavowed',
  'it',
  'but',
  'the',
  'thought',
  'pattern',
  'that',
  'went',
  'into',
  'them',
  'just',
  'screams',
  'republican',
  'who',
  'actually',
  'thinks',
  'young',
  'people',
  'go',
  'around',
  'saying',
  'man',
  'me',
  'and',
  'my',
  'bros',
  'sure',
  'love',
  'being',
  'promiscuous',
  'and',
  "i'm",
  'loving',
  'this',
  'ad',
  'campaign',
  'telling',
  'us',
  'we',
  'can',
  'have',
  'consequence',
  'free',
  'sex',
  'thanks',
  'to',
  'the',
  'government',
  "that's",
  'just',
  'right',
  'wing',
  'fantasy',
  'land',
  'young',
  'people',
  'and',
  "it's",
  'just',
  'jarring',
  'seeing',
  'these',
  'caricatures',
  'in',
  'a',
  'ostensibly',
  'pro',
  'aca',
  'advertisement',
  "it's",
  'like',
  'seeing',
  'anti',
  'racism',
  'advertisements',
  'with',
  'people',
  '

Next we use gensim to produce the dictionary based on the text lists, and then the corpus to be fed into the LDA model.

In [60]:
%%time

dictionary = gensim.corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

CPU times: user 11min 21s, sys: 10min 9s, total: 21min 30s
Wall time: 3h 41min 38s


We can store the corpus in a json file since the above process took almost 4 hours.

In [67]:
import json
with open('corpus.json', 'w') as outfile:
    json.dump(corpus, outfile)

Below we run the LDA model with 5 and 10 topics to see if there are any interesting results. As stated above, the results consist of stop words, so we must move on a more refined model.

In [69]:
lda5 = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5,
                                      update_every=1, chunksize=10000, passes=1)

In [70]:
lda5.print_topics(5)

[u'0.045*the + 0.026*i + 0.021*a + 0.019*and + 0.018*is + 0.018*to + 0.017*of + 0.015*this + 0.014*that + 0.012*in',
 u'0.051*the + 0.024*to + 0.023*a + 0.023*and + 0.023*of + 0.018*in + 0.014*for + 0.013*is + 0.012*that + 0.009*deleted',
 u'0.064*the + 0.025*a + 0.025*and + 0.022*to + 0.020*of + 0.020*he + 0.019*was + 0.019*in + 0.014*that + 0.011*his',
 u'0.042*to + 0.031*you + 0.023*that + 0.023*a + 0.022*the + 0.020*i + 0.018*and + 0.018*it + 0.016*is + 0.014*they',
 u'0.086*the + 0.041*of + 0.029*to + 0.027*is + 0.021*and + 0.021*a + 0.018*in + 0.016*that + 0.012*not + 0.012*for']

In [71]:
lda10 = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10,
                                      update_every=1, chunksize=10000, passes=1)

In [72]:
lda10.print_topics(10)

[u'0.080*the + 0.038*to + 0.029*of + 0.021*a + 0.019*that + 0.019*and + 0.018*is + 0.016*in + 0.014*it + 0.011*for',
 u'0.077*i + 0.033*a + 0.023*to + 0.020*it + 0.019*my + 0.018*that + 0.017*was + 0.016*and + 0.014*you + 0.012*but',
 u'0.053*the + 0.034*this + 0.028*you + 0.024*to + 0.023*for + 0.022*a + 0.021*article + 0.018*please + 0.018*of + 0.016*your',
 u'0.094*the + 0.052*of + 0.032*and + 0.030*in + 0.026*is + 0.019*a + 0.014*that + 0.013*to + 0.011*as + 0.011*are',
 u'0.043*you + 0.038*to + 0.031*that + 0.030*is + 0.025*the + 0.023*a + 0.020*it + 0.017*of + 0.016*not + 0.014*are',
 u'0.058*the + 0.034*a + 0.028*to + 0.027*and + 0.020*of + 0.019*in + 0.012*that + 0.011*he + 0.010*was + 0.010*is',
 u"0.056*they + 0.034*to + 0.030*and + 0.028*are + 0.025*their + 0.023*people + 0.021*the + 0.019*them + 0.018*we + 0.012*don't",
 u'0.049*the + 0.034*to + 0.026*a + 0.023*and + 0.022*of + 0.016*for + 0.016*is + 0.015*that + 0.015*in + 0.010*it',
 u'0.096*deleted + 0.031*1 + 0.029*year

##LDA on comments (stop words removed)

We us sklearn's list of stopwords and again generate the list of lists of words, where here we condition on whether the word is contained in the stop list. This produces text lists which are significantly smaller.

In [4]:
from sklearn.feature_extraction import text 
stopwords = text.ENGLISH_STOP_WORDS

In [21]:
%%time

texts2 = []
for text in df.body:
    comment = []
    for word in text.split():
        if word not in stopwords:
            comment.append(word)
    texts2.append(comment)

CPU times: user 1min 13s, sys: 49.7 s, total: 2min 2s
Wall time: 5min 5s


In [22]:
%%time

dictionary2 = gensim.corpora.Dictionary(texts2)

CPU times: user 2min 45s, sys: 9.37 s, total: 2min 55s
Wall time: 2min 54s


##Skip down to load LDA

We can skip producing the corpus and running the LDA, and move down to the point where we reload the LDA models.

In [30]:
%%time

corpus2 = [dictionary2.doc2bow(text) for text in texts2]

CPU times: user 3min 12s, sys: 5min 58s, total: 9min 11s
Wall time: 1h 45min 2s


In [31]:
import json
with open('corpus2.json', 'w') as outfile:
    json.dump(corpus2, outfile)

Now we run LDA with 10, 5, and 2 topics. The topics are noticeably improved, giving some that appear to be plausible collections for a political discussion.

In [32]:
%%time

lda10 = gensim.models.ldamodel.LdaModel(corpus=corpus2, id2word=dictionary2, num_topics=10,
                                      update_every=1, chunksize=10000, passes=1)

CPU times: user 49min 42s, sys: 15 s, total: 49min 57s
Wall time: 50min 17s


In [33]:
lda10.print_topics(10)

[u'0.026*years + 0.019*year + 0.018*1 + 0.016*2 + 0.011*3 + 0.011*10 + 0.010*5 + 0.010*school + 0.009*000 + 0.008*million',
 u"0.018*people + 0.017*gun + 0.015*police + 0.009*guns + 0.007*black + 0.006*cops + 0.006*don't + 0.006*crime + 0.006*it's + 0.006*use",
 u'0.027*law + 0.017*state + 0.016*laws + 0.016*government + 0.014*rights + 0.013*states + 0.012*court + 0.010*federal + 0.009*legal + 0.008*amendment',
 u"0.023*people + 0.019*money + 0.011*government + 0.010*don't + 0.010*it's + 0.008*make + 0.008*just + 0.008*work + 0.007*pay + 0.006*like",
 u"0.012*just + 0.012*don't + 0.012*it's + 0.011*i'm + 0.010*people + 0.010*you're + 0.010*think + 0.009*point + 0.007*that's + 0.007*article",
 u"0.022*just + 0.021*like + 0.015*don't + 0.015*people + 0.015*i'm + 0.014*it's + 0.012*know + 0.008*shit + 0.008*really + 0.008*good",
 u'0.018*vote + 0.015*party + 0.015*obama + 0.013*republicans + 0.011*republican + 0.010*democrats + 0.008*people + 0.008*president + 0.008*voting + 0.007*gop',
 

Notice that some of these topics are quite interesting mixes, that at least seem like plausible topics in a political discussion. Topic 2 contains the mixture of gun, crime, police, cops, black, which is all certainly suggestive of recent issues in the news around police brutality and race relations. Topic 3 appears to encapsulate discussions of state and federal jurisdiction in governance. The last topic appears to contain key words in a discussion of middle class wages and health benefits. While all of the topics are not as compelling, it is promissing that some have produced reasonable groupings of words.

In [35]:
lda10.save('lda10.model')
#lda10 = gensim.models.LdaModel.load('lda10.model')

In [36]:
%%time

lda5 = gensim.models.ldamodel.LdaModel(corpus=corpus2, id2word=dictionary2, num_topics=5,
                                      update_every=1, chunksize=10000, passes=1)

CPU times: user 55min 15s, sys: 13.9 s, total: 55min 28s
Wall time: 55min 35s


In [37]:
lda5.print_topics(5)

[u"0.022*deleted + 0.014*people + 0.009*just + 0.009*work + 0.008*job + 0.007*don't + 0.007*like + 0.007*years + 0.007*year + 0.007*time",
 u"0.017*people + 0.013*just + 0.013*don't + 0.011*it's + 0.010*like + 0.009*i'm + 0.009*think + 0.007*you're + 0.007*know + 0.006*right",
 u"0.007*world + 0.006*it's + 0.004*war + 0.004*american + 0.004*change + 0.004*data + 0.004*years + 0.004*military + 0.003*oil + 0.003*country",
 u"0.015*government + 0.014*money + 0.014*people + 0.008*it's + 0.008*tax + 0.007*don't + 0.006*just + 0.006*pay + 0.006*make + 0.005*want",
 u'0.013*vote + 0.011*party + 0.010*obama + 0.010*republicans + 0.008*just + 0.008*republican + 0.008*like + 0.007*democrats + 0.006*right + 0.006*president']

In [38]:
lda5.save('lda5.model')
#lda5 = gensim.models.LdaModel.load('lda5.model')

In [42]:
%%time

lda2 = gensim.models.ldamodel.LdaModel(corpus=corpus2, id2word=dictionary2, num_topics=2,
                                      update_every=1, chunksize=10000, passes=1)

CPU times: user 52min 9s, sys: 9.66 s, total: 52min 19s
Wall time: 52min 23s


In [43]:
lda2.print_topics(2)

[u"0.012*people + 0.008*money + 0.007*just + 0.007*it's + 0.006*don't + 0.005*work + 0.005*like + 0.005*pay + 0.005*government + 0.005*make",
 u"0.011*people + 0.009*just + 0.008*like + 0.008*it's + 0.008*don't + 0.007*deleted + 0.006*think + 0.006*right + 0.005*i'm + 0.004*know"]

In [44]:
lda2.save('lda2.model')
#lda2 = gensim.models.LdaModel.load('lda2.model')

###LDA with added stop words

Some of the topics above contained superfluous stop words like "it's" and "you're". It would be interesting to remove some of these and see what is produced.

In [14]:
stopwords3 = set(stopwords)
for s in ["it's", "don't", "i'm", "you're", "like", "just", "really", "s", "u", "@"]:
    stopwords3.add(s)

In [15]:
%%time

texts3 = []
for text in df.body:
    comment = []
    for word in text.split():
        if word not in stopwords3:
            comment.append(word)
    texts3.append(comment)

CPU times: user 1min 7s, sys: 38.4 s, total: 1min 46s
Wall time: 3min 49s


In [16]:
%%time

dictionary3 = gensim.corpora.Dictionary(texts3)

CPU times: user 2min 42s, sys: 7.9 s, total: 2min 50s
Wall time: 2min 50s


In [17]:
%%time

corpus3 = [dictionary3.doc2bow(text) for text in texts3]

CPU times: user 2min 41s, sys: 1min 55s, total: 4min 37s
Wall time: 13min 32s


In [18]:
%%time

lda10v3 = gensim.models.ldamodel.LdaModel(corpus=corpus3, id2word=dictionary3, num_topics=10,
                                      update_every=1, chunksize=10000, passes=1)

CPU times: user 48min 15s, sys: 17 s, total: 48min 32s
Wall time: 49min 12s


In [19]:
lda10v3.print_topics(10)

[u'0.023*obama + 0.013*president + 0.013*war + 0.009*bush + 0.009*did + 0.007*american + 0.007*congress + 0.006*right + 0.005*years + 0.005*wing',
 u"0.026*people + 0.014*think + 0.013*know + 0.009*that's + 0.009*good + 0.009*right + 0.009*say + 0.007*want + 0.007*shit + 0.006*going",
 u'0.024*vote + 0.020*party + 0.014*republicans + 0.012*republican + 0.012*state + 0.012*democrats + 0.011*voting + 0.011*people + 0.009*states + 0.008*election',
 u'0.015*people + 0.013*work + 0.011*years + 0.011*time + 0.010*job + 0.007*school + 0.006*live + 0.006*year + 0.005*working + 0.005*kids',
 u'0.019*money + 0.012*tax + 0.011*pay + 0.011*people + 0.008*wage + 0.008*taxes + 0.007*1 + 0.007*income + 0.007*minimum + 0.006*make',
 u'0.017*people + 0.016*gun + 0.014*police + 0.009*guns + 0.006*cops + 0.006*use + 0.006*crime + 0.005*illegal + 0.005*law + 0.005*drug',
 u'0.039*article + 0.024*politics + 0.022*news + 0.019*read + 0.016*post + 0.016*r + 0.015*source + 0.014*thank + 0.013*reason + 0.012*f

While the topics here aren't perfect, there appears to be some improvement in that there are less nonsensical words in the topics.

In [20]:
lda10v3.save('lda10v3.model')
#lda10v3 = gensim.models.LdaModel.load('lda10v3.model')

In [21]:
%%time

lda5v3 = gensim.models.ldamodel.LdaModel(corpus=corpus3, id2word=dictionary3, num_topics=5,
                                      update_every=1, chunksize=10000, passes=1)

CPU times: user 50min 57s, sys: 12.5 s, total: 51min 9s
Wall time: 51min 18s


In [22]:
lda5v3.print_topics(5)

[u'0.013*people + 0.012*money + 0.009*pay + 0.007*work + 0.006*tax + 0.006*government + 0.006*make + 0.004*job + 0.004*wage + 0.004*taxes',
 u'0.022*deleted + 0.013*vote + 0.011*party + 0.011*obama + 0.010*republicans + 0.008*republican + 0.007*democrats + 0.006*state + 0.006*right + 0.006*president',
 u'0.008*read + 0.008*article + 0.008*r + 0.007*politics + 0.007*know + 0.006*said + 0.006*post + 0.006*news + 0.006*information + 0.005*point',
 u'0.012*police + 0.005*time + 0.005*cops + 0.005*law + 0.004*article + 0.004*got + 0.004*did + 0.004*guy + 0.003*car + 0.003*video',
 u"0.026*people + 0.010*think + 0.007*right + 0.007*want + 0.006*make + 0.006*that's + 0.005*know + 0.005*way + 0.005*doesn't + 0.005*say"]

In [23]:
lda5v3.save('lda5v3.model')
#lda5v3 = gensim.models.LdaModel.load('lda5v3.model')

##Skip to here and reload LDA models

Above we saved the LDA models to files and can reload them.

In [23]:
lda10 = gensim.models.LdaModel.load('lda10.model')
lda5 = gensim.models.LdaModel.load('lda5.model')
lda2 = gensim.models.LdaModel.load('lda2.model')

Let's have a look at what the topics look like for an example input

In [24]:
lda5.get_document_topics(dictionary2.doc2bow(texts2[2]))

[(2, 0.25622115743990781), (3, 0.32466806091378592), (4, 0.40376020359691389)]

Now we double check that the df.body series has the same length as the texts2 list. We will have to iterate through texts2 and get the topics for each comment, and then add columns to df.

In [25]:
print len(df.body)==len(texts2)
len(texts2)

True


2677202

##Create LDA columns in the DataFrame

Next we create a dictionary keyed by integers (like the df) containing dictionaries of the topics for each comment in texts2. If a topic is not in the list of topics because the probability is low, then we will set it to zero.

In [52]:
%%time
dict_list_lda10 = []
count = 0
for text in texts2:
    temp_dict = dict(lda10.get_document_topics(dictionary2.doc2bow(text)))
    for i in range(10):
        if i not in temp_dict:
            temp_dict[i] = 0
    dict_list_lda10.append((count, temp_dict))
    count = count + 1
dict_dict_lda10 = dict(dict_list_lda10)

CPU times: user 38min 13s, sys: 1min 37s, total: 39min 51s
Wall time: 51min 20s


Next convert the dictionaries into lists for each LDA topic, which will then become our columns of df.

In [54]:
%%time

lda10_0 = []
lda10_1 = []
lda10_2 = []
lda10_3 = []
lda10_4 = []
lda10_5 = []
lda10_6 = []
lda10_7 = []
lda10_8 = []
lda10_9 = []

for i in range(len(dict_dict_lda10)):
    temp_dict = dict_dict_lda10[i]
    lda10_0.append(temp_dict[0])
    lda10_1.append(temp_dict[1])
    lda10_2.append(temp_dict[2])
    lda10_3.append(temp_dict[3])
    lda10_4.append(temp_dict[4])
    lda10_5.append(temp_dict[0])
    lda10_6.append(temp_dict[1])
    lda10_7.append(temp_dict[2])
    lda10_8.append(temp_dict[3])
    lda10_9.append(temp_dict[4])

CPU times: user 12.5 s, sys: 10.8 s, total: 23.3 s
Wall time: 3min


Now we perfomr the same action on the 5 topic LDA

In [67]:
dict_list_lda5 = []
count = 0
for text in texts2:
    temp_dict = dict(lda5.get_document_topics(dictionary2.doc2bow(text)))
    for i in range(5):
        if i not in temp_dict:
            temp_dict[i] = 0
    dict_list_lda5.append((count, temp_dict))
    count = count + 1
dict_dict_lda5 = dict(dict_list_lda5)

In [79]:
%%time

lda5_0 = []
lda5_1 = []
lda5_2 = []
lda5_3 = []
lda5_4 = []

for i in range(len(dict_dict_lda5)):
    temp_dict = dict_dict_lda5[i]
    lda5_0.append(temp_dict[0])
    lda5_1.append(temp_dict[1])
    lda5_2.append(temp_dict[2])
    lda5_3.append(temp_dict[3])
    lda5_4.append(temp_dict[4])

CPU times: user 5 s, sys: 2.71 s, total: 7.71 s
Wall time: 23.7 s


Here we check that the lengths of all the lists are the same as the length of our df (from above). Then we add columns to df for each list.

In [87]:
len(lda5_0), len(lda5_1), len(lda5_2), len(lda5_3), len(lda5_4)

(2677202, 2677202, 2677202, 2677202, 2677202)

In [90]:
df['lda5_0'] = lda5_0
df['lda5_1'] = lda5_1
df['lda5_2'] = lda5_2
df['lda5_3'] = lda5_3
df['lda5_4'] = lda5_4

In [55]:
len(lda10_0), len(lda10_1), len(lda10_2), len(lda10_3), len(lda10_4), len(lda10_5), len(lda10_6), len(lda10_7), len(lda10_8), len(lda10_9)

(2677202,
 2677202,
 2677202,
 2677202,
 2677202,
 2677202,
 2677202,
 2677202,
 2677202,
 2677202)

In [56]:
df['lda10_0'] = lda10_0
df['lda10_1'] = lda10_1
df['lda10_2'] = lda10_2
df['lda10_3'] = lda10_3
df['lda10_4'] = lda10_4
df['lda10_5'] = lda10_5
df['lda10_6'] = lda10_6
df['lda10_7'] = lda10_7
df['lda10_8'] = lda10_8
df['lda10_9'] = lda10_9

Check the DataFrame to see that our new columsn have been added properly.

In [57]:
df.head()

Unnamed: 0,author,body,controversiality,created_utc,link_id,score,words,sentences,positive,reply_count,question,exclamation,hyperlinks,lda5_0,lda5_1,lda5_2,lda5_3,lda5_4,lda10_0,lda10_1,lda10_2,lda10_3,lda10_4,lda10_5,lda10_6,lda10_7,lda10_8,lda10_9
0,gbs5009,it may be legit in that the progressive group...,0,1390595859,t3_1vyqx3,1,91,5,1,0,1,1,0,0.036305,0.52349,0.032853,0.095054,0.312298,0.0,0.0,0.0,0.0,0.188885,0.0,0.0,0.0,0.0,0.188885
1,bucknuggets,yeah please provide the quotes that show that...,0,1407849031,t3_2daoq2,13,51,3,1,0,0,0,0,0.0,0.57686,0.0,0.23274,0.171899,0.0,0.0,0.0,0.0,0.475752,0.0,0.0,0.0,0.0,0.475752
2,[deleted],was clinton any good though last i remember h...,0,1398952953,t3_24dja2,1,50,4,1,0,1,0,0,0.0,0.0,0.256218,0.32467,0.403762,0.0,0.103011,0.0,0.430058,0.095086,0.0,0.103011,0.0,0.430058,0.095086
3,Dreamstakeroot,what if i told you that both democrats and re...,0,1396628643,t3_226zi6,-8,48,3,0,0,1,0,0,0.0,0.0,0.0,0.611954,0.360255,0.0,0.0,0.0,0.680361,0.0,0.0,0.0,0.0,0.680361,0.0
4,areyakiddinme,happy cake day,0,1414456941,t3_2kgqt2,1,3,1,1,0,0,1,0,0.798848,0.050581,0.050047,0.05014,0.050383,0.025002,0.025,0.025001,0.025001,0.025,0.025002,0.025,0.025001,0.025001,0.025


Save the df for later use.

In [92]:
#df.to_pickle('df2014part2.pkl')
#df = pd.read_pickle('df2014part2.pkl')

In [28]:
#df.to_csv('df2014part2.csv')
df = pd.read_csv('df2014part2.csv', index_col=0)

In [29]:
df.head()

Unnamed: 0,author,body,controversiality,created_utc,link_id,score,words,sentences,positive,reply_count,question,exclamation,hyperlinks,lda5_0,lda5_1,lda5_2,lda5_3,lda5_4,lda10_0,lda10_1,lda10_2,lda10_3,lda10_4,lda10_5,lda10_6,lda10_7,lda10_8,lda10_9
0,gbs5009,it may be legit in that the progressive group...,0,1390595859,t3_1vyqx3,1,91,5,1,0,1,1,0,0.036305,0.52349,0.032853,0.095054,0.312298,0.0,0.0,0.0,0.0,0.188885,0.0,0.0,0.0,0.0,0.188885
1,bucknuggets,yeah please provide the quotes that show that...,0,1407849031,t3_2daoq2,13,51,3,1,0,0,0,0,0.0,0.57686,0.0,0.23274,0.171899,0.0,0.0,0.0,0.0,0.475752,0.0,0.0,0.0,0.0,0.475752
2,[deleted],was clinton any good though last i remember h...,0,1398952953,t3_24dja2,1,50,4,1,0,1,0,0,0.0,0.0,0.256218,0.32467,0.403762,0.0,0.103011,0.0,0.430058,0.095086,0.0,0.103011,0.0,0.430058,0.095086
3,Dreamstakeroot,what if i told you that both democrats and re...,0,1396628643,t3_226zi6,-8,48,3,0,0,1,0,0,0.0,0.0,0.0,0.611954,0.360255,0.0,0.0,0.0,0.680361,0.0,0.0,0.0,0.0,0.680361,0.0
4,areyakiddinme,happy cake day,0,1414456941,t3_2kgqt2,1,3,1,1,0,0,1,0,0.798848,0.050581,0.050047,0.05014,0.050383,0.025002,0.025,0.025001,0.025001,0.025,0.025002,0.025,0.025001,0.025001,0.025


In [43]:
grouped = df.groupby('author')

In [46]:
df_mean = grouped.mean()

In [47]:
df_mean[df_mean['controversiality']>0]

Unnamed: 0_level_0,controversiality,created_utc,score,words,sentences,positive,reply_count,question,exclamation,hyperlinks,lda5_0,lda5_1,lda5_2,lda5_3,lda5_4,lda10_0,lda10_1,lda10_2,lda10_3,lda10_4,lda10_5,lda10_6,lda10_7,lda10_8,lda10_9
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
-----BroAway-----,0.015625,1.402632e+09,2.475000,29.428125,2.031250,0.878125,0.156250,0.281250,0.053125,0.043750,0.149751,0.369235,0.141534,0.105649,0.231508,0.068059,0.052635,0.054621,0.070540,0.241986,0.068059,0.052635,0.054621,0.070540,0.241986
---nobody,0.014286,1.401388e+09,4.357143,97.042857,2.642857,0.914286,0.471429,0.557143,0.014286,0.242857,0.167795,0.471038,0.108493,0.195026,0.050848,0.116825,0.308697,0.089558,0.132026,0.092217,0.116825,0.308697,0.089558,0.132026,0.092217
--TheDoctor--,0.111111,1.390896e+09,2.222222,21.333333,0.777778,1.000000,0.000000,0.000000,0.000000,0.000000,0.091214,0.644312,0.101289,0.025250,0.137935,0.032559,0.102385,0.019997,0.007687,0.244464,0.032559,0.102385,0.019997,0.007687,0.244464
-Andar-,1.000000,1.392584e+09,-1.000000,44.333333,2.666667,0.333333,0.000000,0.000000,0.000000,0.000000,0.022223,0.401566,0.022223,0.439521,0.101076,0.011111,0.026123,0.011111,0.308199,0.233331,0.011111,0.026123,0.011111,0.308199,0.233331
-CORRECT-MY-GRAMMAR-,1.000000,1.408699e+09,1.000000,6.000000,1.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.040101,0.425027,0.040088,0.041322,0.453463,0.020000,0.020000,0.020006,0.020000,0.555121,0.020000,0.020000,0.020006,0.020000,0.555121
-DocHopper-,0.076923,1.408933e+09,-23.538462,34.076923,2.615385,0.384615,0.000000,0.230769,0.000000,0.000000,0.194231,0.435524,0.064571,0.116886,0.184709,0.084132,0.102220,0.051935,0.113483,0.164154,0.084132,0.102220,0.051935,0.113483,0.164154
-Grazzhoppa,0.111111,1.404743e+09,1.222222,12.444444,1.777778,0.666667,0.000000,0.222222,0.000000,0.000000,0.138463,0.261110,0.041595,0.360018,0.198813,0.116879,0.019922,0.334969,0.110660,0.093659,0.116879,0.019922,0.334969,0.110660,0.093659
-JaM-,0.250000,1.410332e+09,-0.500000,32.250000,3.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.424194,0.377952,0.071879,0.029565,0.091135,0.048267,0.186145,0.048936,0.039661,0.093173,0.048267,0.186145,0.048936,0.039661,0.093173
-LOLOCAUST-,0.111111,1.401006e+09,6.222222,65.000000,4.111111,0.666667,0.000000,0.333333,0.000000,0.000000,0.141580,0.268091,0.116643,0.303608,0.168134,0.048318,0.056129,0.098235,0.275026,0.165116,0.048318,0.056129,0.098235,0.275026,0.165116
-MURS-,0.100000,1.399574e+09,0.000000,8.800000,0.800000,0.500000,0.100000,0.300000,0.000000,0.000000,0.118259,0.499078,0.107927,0.174047,0.097910,0.032834,0.053619,0.032834,0.167492,0.202336,0.032834,0.053619,0.032834,0.167492,0.202336


###Look into the data to see which topics dominate

Of the 2677202 total comments, we look at how many have nonzero topic probabilities for each LDA topic, as well as the min, mean, and max. This gives us an idea of which topics are most frequent in the comments.

In [24]:
lda5_cols = ['lda5_0', 'lda5_1', 'lda5_2', 'lda5_3', 'lda5_4']

In [33]:
for topic in lda5_cols:
    print sum(df[topic]>0) , df[topic].min(), df[topic].mean(), df[topic].max()

2355462 0.0 0.197884117118 0.991362387593
2449491 0.0 0.292549138279 0.991751178752
2374244 0.0 0.13013009033 0.994666666554
2412210 0.0 0.183269960189 0.994712852879
2331553 0.0 0.192707090143 0.999203110957


In [31]:
lda10_cols = ['lda10_0', 'lda10_1', 'lda10_2', 'lda10_3', 'lda10_4', 'lda10_5', 'lda10_6', 'lda10_7', 'lda10_8', 'lda10_9']

In [34]:
for topic in lda10_cols:
    print sum(df[topic]>0), df[topic].min(), df[topic].mean(), df[topic].max()

1811215 0.0 0.0671996176413 0.977497586713
1790817 0.0 0.0861498834688 0.984997100226
1743190 0.0 0.0614827593443 0.979544807258
1965492 0.0 0.135786245312 0.98474310108
2007331 0.0 0.147798837425 0.984998454575
1811215 0.0 0.0671996176413 0.977497586713
1790817 0.0 0.0861498834688 0.984997100226
1743190 0.0 0.0614827593443 0.979544807258
1965492 0.0 0.135786245312 0.98474310108
2007331 0.0 0.147798837425 0.984998454575


####Here we look specifically at the comments tagged controversial.

In [36]:
df_cont = df[df['controversiality']==1]

In [37]:
for topic in lda5_cols:
    print sum(df_cont[topic]>0) , df_cont[topic].min(), df_cont[topic].mean(), df_cont[topic].max()

91146 0.0 0.183967667314 0.988016490365
96184 0.0 0.312209754535 0.988330092951
91429 0.0 0.120556469156 0.977258346735
93660 0.0 0.186279141303 0.986439656221
90313 0.0 0.193433662494 0.984832939073


In [38]:
for topic in lda10_cols:
    print sum(df_cont[topic]>0), df_cont[topic].min(), df_cont[topic].mean(), df_cont[topic].max()

67599 0.0 0.0605644274267 0.949993969388
68789 0.0 0.0937997700929 0.983633265912
66105 0.0 0.0584595570751 0.939997029742
74810 0.0 0.137747592544 0.983925723164
78113 0.0 0.160369227308 0.974999997685
67599 0.0 0.0605644274267 0.949993969388
68789 0.0 0.0937997700929 0.983633265912
66105 0.0 0.0584595570751 0.939997029742
74810 0.0 0.137747592544 0.983925723164
78113 0.0 0.160369227308 0.974999997685


#SVM

Train and SVM to predict positive based on lda5 columns. We have created the positive variable, which indicates whether the score is positive or not. We can try and see if this classification is explained by the mix of topics in a comment.

In [35]:
lda5_cols = ['lda5_0', 'lda5_1', 'lda5_2', 'lda5_3', 'lda5_4']

In [36]:
df[lda5_cols].head()

Unnamed: 0,lda5_0,lda5_1,lda5_2,lda5_3,lda5_4
0,0.036305,0.52349,0.032853,0.095054,0.312298
1,0.0,0.57686,0.0,0.23274,0.171899
2,0.0,0.0,0.256218,0.32467,0.403762
3,0.0,0.0,0.0,0.611954,0.360255
4,0.798848,0.050581,0.050047,0.05014,0.050383


In [37]:
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(xrange(df.shape[0]), train_size=0.7)

In [38]:
mask=np.ones(df.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

In [39]:
mask

array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)

In [40]:
mask.shape, mask.sum()

((2677202,), 1874041)

In [41]:
df.shape

(2677202, 18)

In [42]:
from sklearn.svm import LinearSVC

In [43]:
clfsvm=LinearSVC(loss='hinge')
Cs = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
Xmatrix=df[lda5_cols].values
Yresp=df['positive'].values

In [44]:
Xmatrix_train=Xmatrix[mask]
Xmatrix_test=Xmatrix[~mask]
Yresp_train=Yresp[mask]
Yresp_test=Yresp[~mask]

In [45]:
%%time

from sklearn.grid_search import GridSearchCV
parameters = {'C':Cs}
fitmodel = GridSearchCV(clfsvm, param_grid=parameters, cv=5)
fitmodel.fit(Xmatrix_train, Yresp_train)

CPU times: user 7min 33s, sys: 4.28 s, total: 7min 37s
Wall time: 7min 45s


In [46]:
fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_, fitmodel.grid_scores_

(LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
      penalty='l2', random_state=None, tol=0.0001, verbose=0),
 {'C': 0.001},
 0.79489029322197324,
 [mean: 0.79489, std: 0.00000, params: {'C': 0.001},
  mean: 0.79489, std: 0.00000, params: {'C': 0.01},
  mean: 0.79489, std: 0.00000, params: {'C': 0.1},
  mean: 0.79489, std: 0.00000, params: {'C': 1.0},
  mean: 0.79489, std: 0.00000, params: {'C': 10.0},
  mean: 0.79489, std: 0.00000, params: {'C': 100.0}])

In [48]:
from sklearn.metrics import accuracy_score
best = fitmodel.best_params_['C']
clfsvm2 = LinearSVC(C=best, loss='hinge')
clfsvm2.fit(Xmatrix_train, Yresp_train)
Ypredict2 = clfsvm2.predict(Xmatrix_test)
accuracy_score(Ypredict2, Yresp_test)

0.79433637838490667

###SVM to predict positive based on lda10 columns

In [60]:
lda10_cols = ['lda10_0', 'lda10_1', 'lda10_2', 'lda10_3', 'lda10_4', 'lda10_5', 'lda10_6', 'lda10_7', 'lda10_8', 'lda10_9']

In [61]:
df[lda10_cols].head()

Unnamed: 0,lda10_0,lda10_1,lda10_2,lda10_3,lda10_4,lda10_5,lda10_6,lda10_7,lda10_8,lda10_9
0,0.0,0.0,0.0,0.0,0.188885,0.0,0.0,0.0,0.0,0.188885
1,0.0,0.0,0.0,0.0,0.475752,0.0,0.0,0.0,0.0,0.475752
2,0.0,0.103011,0.0,0.430058,0.095086,0.0,0.103011,0.0,0.430058,0.095086
3,0.0,0.0,0.0,0.680361,0.0,0.0,0.0,0.0,0.680361,0.0
4,0.025002,0.025,0.025001,0.025001,0.025,0.025002,0.025,0.025001,0.025001,0.025


In [62]:
clfsvm_10=LinearSVC(loss='hinge')
Cs = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
Xmatrix_10=df[lda10_cols].values
Yresp_10=df['positive'].values

In [63]:
Xmatrix_train_10=Xmatrix_10[mask]
Xmatrix_test_10=Xmatrix_10[~mask]
Yresp_train_10=Yresp_10[mask]
Yresp_test_10=Yresp_10[~mask]

In [64]:
%%time

from sklearn.grid_search import GridSearchCV
parameters = {'C':Cs}
fitmodel_10 = GridSearchCV(clfsvm_10, param_grid=parameters, cv=5)
fitmodel_10.fit(Xmatrix_train_10, Yresp_train_10)

CPU times: user 10min 48s, sys: 12.1 s, total: 11min 1s
Wall time: 14min 54s


In [65]:
fitmodel_10.best_estimator_, fitmodel_10.best_params_, fitmodel_10.best_score_, fitmodel_10.grid_scores_

(LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
      penalty='l2', random_state=None, tol=0.0001, verbose=0),
 {'C': 0.001},
 0.79489029322197324,
 [mean: 0.79489, std: 0.00000, params: {'C': 0.001},
  mean: 0.79489, std: 0.00000, params: {'C': 0.01},
  mean: 0.79489, std: 0.00000, params: {'C': 0.1},
  mean: 0.79489, std: 0.00000, params: {'C': 1.0},
  mean: 0.79489, std: 0.00000, params: {'C': 10.0},
  mean: 0.79489, std: 0.00000, params: {'C': 100.0}])

In [66]:
from sklearn.metrics import accuracy_score
best_10 = fitmodel_10.best_params_['C']
clfsvm2_10 = LinearSVC(C=best_10, loss='hinge')
clfsvm2_10.fit(Xmatrix_train_10, Yresp_train_10)
Ypredict2_10 = clfsvm2_10.predict(Xmatrix_test_10)
accuracy_score(Ypredict2_10, Yresp_test_10)

0.79433637838490667

#OLS

Next we attempt to us regression to predict the score based on the topic mix for the 5 topic LDA.

In [39]:
import statsmodels.api as sm
from statsmodels.formula.api import glm, ols

In [50]:
ols_model_lda5 = ols('score ~ lda5_0 + lda5_1 + lda5_2 + lda5_3 + lda5_4', df).fit()
ols_model_lda5

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1b5e66fd0>

In [51]:
ols_model_lda5.summary()

0,1,2,3
Dep. Variable:,score,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,116.4
Date:,"Wed, 09 Dec 2015",Prob (F-statistic):,1.72e-123
Time:,22:03:41,Log-Likelihood:,-12871000.0
No. Observations:,2677202,AIC:,25740000.0
Df Residuals:,2677196,BIC:,25740000.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,11.8928,2.590,4.592,0.000,6.817 16.969
lda5_0,-8.1479,2.595,-3.140,0.002,-13.234 -3.062
lda5_1,-7.8589,2.600,-3.022,0.003,-12.955 -2.762
lda5_2,-7.7990,2.597,-3.003,0.003,-12.889 -2.709
lda5_3,-7.1910,2.606,-2.760,0.006,-12.298 -2.084
lda5_4,-6.2100,2.599,-2.389,0.017,-11.304 -1.116

0,1,2,3
Omnibus:,7537719.224,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,576785658601.228
Skew:,36.956,Prob(JB):,0.0
Kurtosis:,2275.703,Cond. No.,387.0


And we can look specifically at the controversial comments to see if there is a different relationship.

In [40]:
ols_model_lda5_cont = ols('score ~ lda5_0 + lda5_1 + lda5_2 + lda5_3 + lda5_4', df_cont).fit()
ols_model_lda5_cont

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x317920150>

In [41]:
ols_model_lda5_cont.summary()

0,1,2,3
Dep. Variable:,score,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,8.746
Date:,"Thu, 10 Dec 2015",Prob (F-statistic):,2.64e-08
Time:,20:02:53,Log-Likelihood:,-300210.0
No. Observations:,104095,AIC:,600400.0
Df Residuals:,104089,BIC:,600500.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,5.6671,1.929,2.937,0.003,1.886 9.449
lda5_0,-5.4078,1.934,-2.796,0.005,-9.198 -1.617
lda5_1,-5.1835,1.938,-2.675,0.007,-8.981 -1.386
lda5_2,-4.9205,1.936,-2.542,0.011,-8.714 -1.127
lda5_3,-5.1791,1.942,-2.667,0.008,-8.985 -1.373
lda5_4,-5.0582,1.934,-2.616,0.009,-8.849 -1.268

0,1,2,3
Omnibus:,132256.487,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,93355742.757
Skew:,6.485,Prob(JB):,0.0
Kurtosis:,149.136,Cond. No.,390.0


##Summary

Generally, we were able to observe intersting topics and trends in the comment data, however we were unable to accurately predict score or positive reception of comments based on the features we considered. We could consider doing more detail feature extraction, looking for nouns and adjectives, or running LDA on just the controversial comments to see if there are more pronounced topics in that data subset. It would also be interesting to analyse how comment thread length, and original article topics affect the tone of the political conversations.