In [1]:
import nltk
import pickle
import pandas as pd
import numpy as np
import re
import matplotlib as plt
import seaborn as sns
import string


In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import	stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.lancaster import	LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction import text 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim import corpora, models, similarities, matutils

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
with open('reddit_data','rb') as read_file:
    df_all_reddit = pickle.load(read_file)

In [4]:
df_all_reddit.columns = ['subreddit', 'comments_list']

In [5]:
df_all_reddit.head()

Unnamed: 0,subreddit,comments_list
0,documentaries,[There's nothing hypothetical about what ISPs ...
1,fitness,[It feels like this was written specifically f...
2,adviceanimals,[For fucks sake the previous post triggered AM...
3,oddlysatisfying,"[> Hi I made this, my name is Lawrence Becker ..."
4,music,[***Wanna do something about Net Neutrality??*...


### Dropping 2 subreddits that are overrepresented

In [8]:
df_all_reddit = df_all_reddit.drop(df_all_reddit[df_all_reddit.subreddit == 'leagueoflegends'].index)
df_all_reddit = df_all_reddit.drop(df_all_reddit[df_all_reddit.subreddit == 'writingprompts'].index)

In [9]:
df_all_reddit = df_all_reddit.reset_index(drop=True)

### Lower casing, remove alphanumerics

In [10]:
alphanumeric = lambda x: re.sub('\w*\d\w*', '', x)
punc_lower_1 = lambda x: re.sub(r'[^\w\s]', '', x.lower())
punc_lower_2 = lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x.lower())
stemmer = PorterStemmer()

In [11]:
def cleaning(cell):
    temp_cell = list(map(alphanumeric, cell))
    temp_cell = list(map(punc_lower_1, temp_cell))
    return list(map(punc_lower_2, temp_cell))

In [12]:
def stemming(string_list):
    clean_list = []
    for string in string_list:
        clean_word = ''
        for word in string.split():
            clean_word = clean_word + ' ' + stemmer.stem(word)
        clean_list.append(clean_word)
    return clean_list

In [13]:
df_all_reddit.comments_list = df_all_reddit.comments_list.apply(cleaning)
df_all_reddit.comments_list = df_all_reddit.comments_list.apply(stemming)

In [14]:
df_all_reddit.head()

Unnamed: 0,subreddit,comments_list
0,documentaries,[ there noth hypothet about what isp will do w...
1,fitness,[ it feel like thi wa written specif for mysel...
2,adviceanimals,[ for fuck sake the previou post trigger am wh...
3,oddlysatisfying,[ hi i made thi my name is lawrenc becker and ...
4,music,[ wanna do someth about net neutral here how t...


In [15]:
with open('clean_reddit_data', 'wb') as to_write:
    pickle.dump(df_all_reddit, to_write)

### Applying Count Vectorizer

In [17]:
custom_stop_words = ['pokemon']

In [18]:
new_stop_words = text.ENGLISH_STOP_WORDS.union(custom_stop_words)

In [19]:
comments = df_all_reddit.comments_list

In [20]:
token = CountVectorizer().build_tokenizer()

In [61]:
cv = CountVectorizer(tokenizer = lambda x: token(''.join(x)),  
                     lowercase = False, min_df = 0.15 ,max_df = 0.70, stop_words = new_stop_words, ngram_range=(1, 2))

In [62]:
word_matrix = cv.fit_transform(comments)

In [63]:
vectorized_subs = pd.DataFrame(word_matrix.toarray(), index=df_all_reddit.subreddit, columns=cv.get_feature_names())

In [64]:
vectorized_subs.head()

Unnamed: 0_level_0,ab,abandon,abil share,abl afford,abl make,abl use,abolish,abort,abroad,absolut amaz,...,youv seen,yr,yr old,yup,zealand,zip,zombi,zone,zoo,zoom
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
documentaries,0,0,0,0,0,0,0,0,3,0,...,0,1,1,0,3,0,0,0,0,0
fitness,13,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
adviceanimals,0,0,0,0,0,0,1,3,0,0,...,0,0,0,0,0,0,1,0,0,0
oddlysatisfying,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
music,0,1,1,0,0,0,0,1,2,0,...,0,0,0,0,0,1,0,0,0,0


In [65]:
vectorized_subs.shape

(98, 7455)

## LSA aka SVD

In [38]:
lsa = TruncatedSVD(8, random_state=30)
lsa_topics = lsa.fit_transform(word_matrix)
lsa.explained_variance_ratio_

array([0.09436618, 0.07246427, 0.06142212, 0.0547332 , 0.04138803,
       0.03977643, 0.03637122, 0.03149917])

In [39]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [40]:
display_topics(lsa, cv.get_feature_names(), 20)


Topic  0
neutral, net, net neutral, trump, presid, fcc, tax, data, senat, ban, republican, sex, isp, educ, email, elect, legal, app, teacher, batteri

Topic  1
sex, relationship, partner, remov remov, philosophi, tax, church, educ, sexual, condom, boyfriend, marriag, husband, cum, librari, brand, marri, argument, languag, congratul

Topic  2
trump, church, presid, tax, sex, russian, republican, russia, religion, nbsp, elect, investig, religi, primari, senat, christian, violat, white hous, climat, congress

Topic  3
sex, partner, relationship, neutral, net, net neutral, sexual, cum, condom, boyfriend, fcc, marriag, marri, husband, make feel, isp, porn, sexi, netflix, pleasur

Topic  4
trump, batteri, app, android, headphon, appl, jack, presid, devic, iphon, samsung, sex, russian, bank, russia, featur, nbsp, amazon, switch, microsoft

Topic  5
church, tax, batteri, app, appl, android, religion, headphon, religi, jack, devic, christian, iphon, cathol, samsung, amazon, exempt, gun, featur

### NMF (Non-Negative Matrix Factorization)

In [66]:
nmf_model = NMF(8)
nmf_topics = nmf_model.fit_transform(word_matrix)

In [67]:
nmf_model.components_.shape

(8, 7455)

In [68]:
display_topics(nmf_model, cv.get_feature_names(), 20)


Topic  0
neutral, net, net neutral, fcc, isp, netflix, senat, gift, ban, comcast, email, song, donat, admin, verizon, teacher, republican, pai, throttl, data

Topic  1
brand, librari, languag, militari, signal, cell, presid, blood, remov remov, underground, tax, muscl, plant, releas, weight, code, inspir, data, educ, chemic

Topic  2
trump, presid, russian, republican, russia, senat, elect, investig, nbsp, primari, congress, ban, net, email, neutral, white hous, net neutral, climat, democrat, violat

Topic  3
sex, relationship, partner, sexual, condom, cum, boyfriend, marriag, marri, husband, make feel, emot, pressur, congratul, porn, thank share, sexi, girlfriend, pleasur, congrat

Topic  4
batteri, app, appl, android, headphon, devic, jack, iphon, amazon, samsung, featur, data, code, microsoft, upgrad, spotifi, download, switch, pc, review

Topic  5
church, tax, religion, religi, christian, trump, cathol, gun, exempt, cult, legal, economi, prayer, ban, abus, abort, satan, pay tax, b

In [31]:
documents_to_topics = pd.DataFrame(nmf_topics.round(3),
             index = df_all_reddit.subreddit,
             columns = ["internet","education","politics","relationships",
                       "technology","religion", "financial", "other"])
documents_to_topics.head()

ValueError: Shape of passed values is (8, 98), indices imply (7, 98)

In [46]:
documents_to_topics.sort_values(by = 'internet', ascending = False)

Unnamed: 0_level_0,internet,education,politics,relationships,technology,religion,financial
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
blog,11.980,1.089,0.000,0.044,0.000,0.274,1.207
technology,10.495,0.000,2.207,0.000,4.399,0.000,0.000
announcements,7.257,0.393,2.539,0.000,0.845,0.982,1.034
listentothis,5.276,0.409,0.000,0.234,0.198,0.000,0.000
television,4.817,0.921,1.204,1.228,0.000,0.024,0.000
iama,4.802,1.551,1.073,0.105,0.000,1.305,0.873
books,3.572,3.179,0.070,0.516,0.000,0.180,0.295
news,3.383,1.633,3.007,0.565,1.025,0.239,1.436
worldnews,3.338,1.546,6.302,0.118,0.149,0.030,0.409
videos,3.241,2.105,0.334,0.426,0.059,0.050,1.500


### Latent Dirichlet Allocation

In [363]:
cv_lda = CountVectorizer(tokenizer = lambda x: token(''.join(x)),  
                     lowercase = False, min_df = 0.15 ,max_df = 0.70, stop_words = new_stop_words, ngram_range=(1, 2))
cv_lda.fit(comments)


In [366]:
doc_word = cv_lda.transform(comments).transpose()

In [347]:
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [349]:
len(id2word)

7903

In [368]:
pd.DataFrame(doc_word.toarray(), cv_lda.get_feature_names(), columns = df_all_reddit.subreddit).head()

subreddit,documentaries,fitness,adviceanimals,oddlysatisfying,music,atheism,sex,gonewild,malefashionadvice,philosophy,...,listentothis,blog,natureisfuckinglit,pcmasterrace,askreddit,science,futureology,nsfw,historyporn,writingprompts
ab,0,13,0,0,0,0,0,3,0,0,...,1,1,0,0,2,0,0,2,0,1
abandon,0,0,0,0,1,1,0,0,0,0,...,0,3,0,0,1,0,0,0,1,22
abil share,0,0,0,0,1,0,0,0,0,0,...,1,2,0,0,0,0,0,0,0,1
abl afford,0,0,0,0,0,1,0,0,0,0,...,0,4,0,0,0,0,0,0,0,0
abl make,0,0,0,1,0,0,2,0,0,2,...,0,2,0,0,0,1,0,0,0,5


In [369]:
corpus = matutils.Sparse2Corpus(doc_word)

In [353]:
corpus.shape

(7903, 100)

In [371]:
lda = models.LdaModel(corpus=corpus, num_topics=7, id2word=id2word, passes=10)

2019-05-20 16:19:41,156 : INFO : using symmetric alpha at 0.14285714285714285
2019-05-20 16:19:41,157 : INFO : using symmetric eta at 0.14285714285714285
2019-05-20 16:19:41,159 : INFO : using serial LDA version on this node
2019-05-20 16:19:41,166 : INFO : running online (multi-pass) LDA training, 7 topics, 10 passes over the supplied corpus of 100 documents, updating model once every 100 documents, evaluating perplexity every 100 documents, iterating 50x with a convergence threshold of 0.001000
2019-05-20 16:19:43,315 : INFO : -9.568 per-word bound, 758.9 perplexity estimate based on a held-out corpus of 100 documents with 523442 words
2019-05-20 16:19:43,316 : INFO : PROGRESS: pass 0, at document #100/100
2019-05-20 16:19:43,652 : INFO : topic #5 (0.143): 0.004*"trump" + 0.003*"neutral" + 0.003*"net" + 0.002*"presid" + 0.002*"net neutral" + 0.002*"tax" + 0.001*"ban" + 0.001*"remov remov" + 0.001*"smoke" + 0.001*"data"
2019-05-20 16:19:43,653 : INFO : topic #3 (0.143): 0.004*"net" + 

In [372]:
lda.print_topics()

2019-05-20 16:22:16,247 : INFO : topic #0 (0.143): 0.005*"trump" + 0.004*"presid" + 0.004*"code" + 0.004*"educ" + 0.004*"prison" + 0.003*"philosophi" + 0.003*"neutral" + 0.003*"argument" + 0.003*"program" + 0.003*"data"
2019-05-20 16:22:16,249 : INFO : topic #1 (0.143): 0.160*"christ" + 0.158*"jesu christ" + 0.020*"tattoo" + 0.008*"bamboozl" + 0.006*"rall" + 0.006*"frog" + 0.004*"wednesday" + 0.004*"artist" + 0.004*"scar" + 0.004*"ban"
2019-05-20 16:22:16,250 : INFO : topic #2 (0.143): 0.005*"moon" + 0.004*"paint" + 0.003*"began" + 0.003*"stare" + 0.003*"bob" + 0.002*"seat" + 0.002*"alien" + 0.002*"scream" + 0.002*"window" + 0.002*"ship"
2019-05-20 16:22:16,252 : INFO : topic #3 (0.143): 0.010*"net" + 0.010*"neutral" + 0.009*"net neutral" + 0.005*"trump" + 0.004*"fcc" + 0.004*"ea" + 0.003*"song" + 0.003*"ban" + 0.003*"isp" + 0.002*"senat"
2019-05-20 16:22:16,252 : INFO : topic #4 (0.143): 0.004*"remov remov" + 0.004*"inspir" + 0.004*"weight" + 0.003*"data" + 0.003*"jar" + 0.003*"hawk" 

[(0,
  '0.005*"trump" + 0.004*"presid" + 0.004*"code" + 0.004*"educ" + 0.004*"prison" + 0.003*"philosophi" + 0.003*"neutral" + 0.003*"argument" + 0.003*"program" + 0.003*"data"'),
 (1,
  '0.160*"christ" + 0.158*"jesu christ" + 0.020*"tattoo" + 0.008*"bamboozl" + 0.006*"rall" + 0.006*"frog" + 0.004*"wednesday" + 0.004*"artist" + 0.004*"scar" + 0.004*"ban"'),
 (2,
  '0.005*"moon" + 0.004*"paint" + 0.003*"began" + 0.003*"stare" + 0.003*"bob" + 0.002*"seat" + 0.002*"alien" + 0.002*"scream" + 0.002*"window" + 0.002*"ship"'),
 (3,
  '0.010*"net" + 0.010*"neutral" + 0.009*"net neutral" + 0.005*"trump" + 0.004*"fcc" + 0.004*"ea" + 0.003*"song" + 0.003*"ban" + 0.003*"isp" + 0.002*"senat"'),
 (4,
  '0.004*"remov remov" + 0.004*"inspir" + 0.004*"weight" + 0.003*"data" + 0.003*"jar" + 0.003*"hawk" + 0.002*"map" + 0.002*"mass" + 0.002*"sun" + 0.002*"gym"'),
 (5,
  '0.004*"tax" + 0.004*"church" + 0.003*"trump" + 0.003*"smoke" + 0.002*"legal" + 0.002*"presid" + 0.002*"teacher" + 0.002*"vaccin" + 0.00

## Using Tfidf (TFIDF GIVES ME BETTER TOPICS)

In [50]:
tfidf = TfidfVectorizer(tokenizer = lambda x: token(''.join(x)),  
                     lowercase = False, min_df = 0.15 ,max_df = 0.70, stop_words = new_stop_words, ngram_range=(1, 2))

In [52]:
tfidf_matrix = tfidf.fit_transform(comments)

In [53]:
tfidf_vectorized_subs = pd.DataFrame(tfidf_matrix.toarray(), index=df_all_reddit.subreddit, columns=tfidf.get_feature_names())

In [54]:
tfidf_vectorized_subs.head()

Unnamed: 0_level_0,ab,abandon,abil share,abl afford,abl make,abl use,abolish,abort,abroad,absolut amaz,...,youv seen,yr,yr old,yup,zealand,zip,zombi,zone,zoo,zoom
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
documentaries,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015947,0.0,...,0.0,0.004288,0.005224,0.0,0.017186,0.0,0.0,0.0,0.0,0.0
fitness,0.076295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adviceanimals,0.0,0.0,0.0,0.0,0.0,0.0,0.008539,0.022727,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.007864,0.0,0.0,0.0
oddlysatisfying,0.0,0.0,0.0,0.0,0.015083,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
music,0.0,0.003966,0.00518,0.0,0.0,0.0,0.0,0.004595,0.009028,0.0,...,0.0,0.0,0.0,0.0,0.0,0.004514,0.0,0.0,0.0,0.0


### SVD Using TFIDF

In [63]:
lsa_tfidf = TruncatedSVD(7, random_state=30)
lsa_topics_tfidf = lsa_tfidf.fit_transform(tfidf_matrix)

In [64]:
display_topics(lsa_tfidf, cv.get_feature_names(), 20)


Topic  0
net neutral, neutral, net, trump, fcc, presid, ea, isp, senat, data, remov remov, vaccin, ban, gif, teacher, tax, batteri, elect, song, sex

Topic  1
net neutral, neutral, net, trump, fcc, isp, pai, senat, republican, comcast, ajit, verizon, repeal, congress, elect, ea, throttl, netflix, email, presid

Topic  2
trump, vaccin, presid, tax, church, remov remov, elect, suicid, abort, republican, autism, legal, health, investig, cathol, religion, obama, philosophi, politician, rape

Topic  3
sexi, tit, gorgeou, boob, trump, gif, pussi, tattoo, titti, porn, automat pleas, perform automat, question concern, bot thi, cum, action wa, pleas contact, wa perform, net neutral, stun

Topic  4
ea, pc, sexi, tit, boob, gorgeou, sex, player, porn, headphon, gamer, batteri, cum, titti, pussi, consol, steam, trump, thi game, vaccin

Topic  5
ea, pc, player, jar, star war, season, thi game, gamer, disney, actor, jon, steam, trailer, snow, epic, httpsiimgurcomjpg, santa, hype, episod, battlefron

### NMF Using TFIDF - This is my champion (most separable topics!)

In [68]:
# NMF

nmf_tfidf = NMF(8)
nmf_topics_tfidf = nmf_tfidf.fit_transform(tfidf_matrix)

In [69]:
display_topics(nmf_tfidf, tfidf.get_feature_names(), 20)


Topic  0
sex, remov remov, relationship, gym, prison, educ, philosophi, tax, suicid, church, health, medic, teacher, depress, drug, husband, doctor, advic, congratul, weight

Topic  1
net neutral, neutral, net, fcc, isp, senat, song, pai, comcast, netflix, repeal, verizon, ajit, throttl, email, congress, ajit pai, prison, band, republican

Topic  2
trump, presid, vaccin, elect, tax, republican, russia, russian, obama, ban, investig, senat, protest, eu, pardon, church, putin, admin, politician, white hous

Topic  3
sexi, tit, boob, gorgeou, porn, pussi, titti, gif, cum, sex, thigh, hip, stun, chick, breast, nake, verifi, brighten, flair, cam

Topic  4
ea, pc, gamer, steam, player, consol, thi game, star war, loot, releas, dlc, battlefront, epic, soni, gambl, monitor, destini, cancer, purchas, dem

Topic  5
gif, ice, shirt, comic, snow, frog, spider, bag, film, jar, stadium, vet, creepi, teacher, watch thi, bamboozl, smell, ador, gun, pizza

Topic  6
mountain, remov remov, hike, moon, p

In [87]:
documents_to_topics_tfidf = pd.DataFrame(nmf_topics_tfidf.round(3),
             index = df_all_reddit.subreddit,
             columns = ["education","internet","politics","relationships",
                       "gaming","funny_memes", "art", "technology"])
documents_to_topics_tfidf.head()

Unnamed: 0_level_0,education,internet,politics,relationships,gaming,funny_memes,art,technology
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
documentaries,0.261,0.143,0.102,0.0,0.003,0.065,0.012,0.0
fitness,0.229,0.0,0.0,0.05,0.0,0.006,0.006,0.0
adviceanimals,0.134,0.025,0.338,0.0,0.112,0.045,0.0,0.0
oddlysatisfying,0.0,0.0,0.0,0.0,0.0,0.298,0.141,0.006
music,0.057,0.201,0.014,0.0,0.001,0.137,0.01,0.0


In [95]:
with open('topics', 'wb') as to_write:
    pickle.dump(documents_to_topics_tfidf, to_write)

In [105]:
documents_to_topics_tfidf.sort_values(by = 'technology', ascending = False)

Unnamed: 0_level_0,education,internet,politics,relationships,gaming,funny_memes,art,technology
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gadgets,0.000,0.027,0.000,0.000,0.000,0.000,0.000,0.717
android,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.711
programming,0.076,0.016,0.011,0.000,0.045,0.000,0.042,0.299
crappydesign,0.000,0.275,0.000,0.005,0.000,0.116,0.000,0.253
dataisbeautiful,0.056,0.005,0.027,0.000,0.000,0.014,0.221,0.246
internetisbeautiful,0.045,0.255,0.000,0.000,0.001,0.048,0.102,0.208
lifehacks,0.022,0.000,0.000,0.000,0.000,0.256,0.000,0.194
technology,0.000,0.537,0.083,0.000,0.000,0.000,0.000,0.190
diy,0.008,0.000,0.000,0.000,0.000,0.235,0.183,0.185
buildapc,0.018,0.121,0.000,0.002,0.254,0.000,0.016,0.183
