# Topic Modeling

In [27]:
# Imports
from gensim import matutils, models
import scipy.sparse

## Load data

In [29]:
import pandas as pd

# Read the saved count matrix (the first CSV column becomes the index) and
# flip it, so that rows are terms and columns are comedians, as shown in the
# output below.
vec_df = pd.read_csv(
    'saves/3.stopwords_vectorized_df.csv',
    index_col=0,
).T
vec_df

Unnamed: 0,Lousic C.K.,Dave Chappelle,Ricky Gervais,Bo Burham,Bill Burr,Jim Jefferies,John Mulaney,Hasan Minhaj,Ali Wong,Anthony Jeselnik,Mike Birbiglia,Joe Rogan
aaaaah,0,0,0,0,1,0,0,0,0,0,0,0
aaaaahhhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaahhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaah,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
zillion,0,0,0,0,1,0,0,0,0,0,0,0
zombie,0,0,0,0,2,0,0,0,1,0,0,0
zone,0,0,0,0,1,0,0,0,0,0,0,0
zoo,0,0,1,0,0,0,0,0,0,0,0,0


## Overall topic modeling

In [30]:
# Store the term-by-comedian counts as a SciPy CSR matrix.
sparse_counts = scipy.sparse.csr_matrix(vec_df)

# Hand the matrix to gensim as a corpus. Each column (one comedian) is treated
# as one document; documents_columns=True is gensim's default, spelled out here.
corpus = matutils.Sparse2Corpus(sparse=sparse_counts, documents_columns=True)

In [31]:
import pickle

# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# this project produced itself.
# The `with` block closes the file handle as soon as the object is loaded.
with open("saves/3.vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Swap keys and values of vectorizer.vocabulary_ to get the id -> word lookup
# passed to LdaModel below (presumably vocabulary_ maps term -> column index;
# confirm against the notebook that fitted the vectorizer).
id2word = {index: term for term, index in vectorizer.vocabulary_.items()}

In [34]:
# Fit a 2-topic LDA model on the full vocabulary.
# LDA training is randomized; fixing random_state makes the printed topics
# reproducible under Restart & Run All.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=10,
    random_state=42,  # arbitrary fixed seed
)
lda.print_topics()

[(0,
  '0.007*"dad" + 0.006*"tell" + 0.006*"ve" + 0.006*"love" + 0.004*"day" + 0.004*"yeah" + 0.004*"really" + 0.004*"school" + 0.004*"mean" + 0.004*"joke"'),
 (1,
  '0.009*"shit" + 0.008*"yeah" + 0.007*"man" + 0.007*"ve" + 0.006*"kid" + 0.006*"day" + 0.006*"woman" + 0.006*"little" + 0.005*"cause" + 0.005*"life"')]

In [35]:
# Fit a 3-topic LDA model on the full vocabulary.
# LDA training is randomized; fixing random_state makes the printed topics
# reproducible under Restart & Run All.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    passes=10,
    random_state=42,  # arbitrary fixed seed
)
lda.print_topics()

[(0,
  '0.009*"ve" + 0.009*"joke" + 0.009*"yeah" + 0.007*"day" + 0.007*"love" + 0.006*"little" + 0.005*"tell" + 0.005*"year" + 0.005*"woman" + 0.005*"shit"'),
 (1,
  '0.009*"shit" + 0.007*"man" + 0.007*"kid" + 0.006*"yeah" + 0.006*"ve" + 0.006*"cause" + 0.006*"life" + 0.006*"day" + 0.006*"really" + 0.005*"tell"'),
 (2,
  '0.001*"yeah" + 0.001*"ve" + 0.000*"man" + 0.000*"kid" + 0.000*"tell" + 0.000*"cause" + 0.000*"life" + 0.000*"love" + 0.000*"shit" + 0.000*"day"')]

In [36]:
# Fit a 4-topic LDA model on the full vocabulary.
# LDA training is randomized; fixing random_state makes the printed topics
# reproducible under Restart & Run All.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    passes=10,
    random_state=42,  # arbitrary fixed seed
)
lda.print_topics()

[(0,
  '0.009*"man" + 0.009*"shit" + 0.008*"kid" + 0.008*"cause" + 0.008*"life" + 0.007*"ve" + 0.007*"woman" + 0.006*"really" + 0.006*"day" + 0.005*"girl"'),
 (1,
  '0.011*"shit" + 0.010*"man" + 0.009*"ahah" + 0.008*"black" + 0.007*"woman" + 0.006*"rape" + 0.005*"ve" + 0.005*"walk" + 0.005*"lot" + 0.005*"gay"'),
 (2,
  '0.009*"yeah" + 0.008*"ve" + 0.007*"day" + 0.006*"kid" + 0.006*"tell" + 0.006*"love" + 0.006*"dad" + 0.005*"year" + 0.005*"joke" + 0.005*"shit"'),
 (3,
  '0.008*"love" + 0.007*"shit" + 0.006*"yeah" + 0.006*"bo" + 0.006*"man" + 0.006*"ok" + 0.006*"stuff" + 0.005*"ve" + 0.005*"woman" + 0.005*"repeat"')]

In [37]:
# Fit a 5-topic LDA model on the full vocabulary.
# LDA training is randomized; fixing random_state makes the printed topics
# reproducible under Restart & Run All.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    passes=10,
    random_state=42,  # arbitrary fixed seed
)
lda.print_topics()

[(0,
  '0.012*"life" + 0.010*"tit" + 0.008*"cause" + 0.008*"kid" + 0.008*"shit" + 0.008*"old" + 0.007*"happen" + 0.007*"murder" + 0.006*"woman" + 0.006*"mean"'),
 (1,
  '0.011*"ve" + 0.009*"dad" + 0.007*"love" + 0.007*"day" + 0.007*"man" + 0.006*"girl" + 0.006*"gun" + 0.006*"kid" + 0.005*"tell" + 0.005*"life"'),
 (2,
  '0.013*"shit" + 0.008*"yeah" + 0.008*"man" + 0.007*"kid" + 0.007*"tell" + 0.006*"woman" + 0.006*"dude" + 0.006*"day" + 0.005*"ve" + 0.005*"lot"'),
 (3,
  '0.008*"love" + 0.007*"little" + 0.006*"ve" + 0.006*"walk" + 0.006*"yeah" + 0.006*"bo" + 0.005*"clinton" + 0.005*"old" + 0.005*"way" + 0.005*"stuff"'),
 (4,
  '0.010*"yeah" + 0.009*"ve" + 0.006*"mean" + 0.006*"cause" + 0.006*"year" + 0.006*"joke" + 0.006*"day" + 0.005*"really" + 0.005*"little" + 0.005*"jenny"')]

## Nouns topic modeling

In [39]:
import spacy

# Load spaCy's 'en_core_web_sm' pipeline for the nouns-only topic modeling
# section; the model package must already be installed in this environment.
nlp = spacy.load("en_core_web_sm")

