In [29]:
# Basic functionalities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [30]:
# Topic Modeling
from gensim import matutils, models
from gensim.models import LsiModel
import scipy.sparse

# Latent Dirichlet Allocation (LDA)

In [31]:
def train_LDA_model(data_matrix, vectorizer, num_topics=4, passes=500, random_state=None):
    """Fit a gensim LDA model and tag every document with its most probable topic.

    Parameters
    ----------
    data_matrix : pandas.DataFrame
        Weights with one column per document; the column labels are returned
        as the document names. Columns are treated as documents because
        ``Sparse2Corpus`` is called with its default orientation.
    vectorizer : object
        Fitted vectorizer exposing ``vocabulary_`` (term -> index). It is
        inverted to build gensim's ``id2word`` mapping.
        NOTE(review): presumably row ``i`` of ``data_matrix`` is the term with
        vocabulary index ``i`` -- confirm against the code that built the pickles.
    num_topics : int, default 4
        Number of topics passed to ``LdaModel``.
    passes : int, default 500
        Number of training passes passed to ``LdaModel``.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to ``LdaModel``. ``None`` (the default) keeps the previous
        unseeded behaviour; pass an int to make topics and assignments
        repeatable across runs.

    Returns
    -------
    tuple
        ``(lda, result)`` where ``lda`` is the trained model and ``result`` is a
        list of ``(topic_id, column_label)`` pairs in column order.
    """
    # df --> sparse matrix --> gensim corpus
    sparse_counts = scipy.sparse.csr_matrix(data_matrix)
    gensim_corpus = matutils.Sparse2Corpus(sparse_counts)
    # gensim needs index -> term; the vectorizer stores term -> index
    id2word = {index: term for term, index in vectorizer.vocabulary_.items()}
    lda = models.LdaModel(
        corpus=gensim_corpus,
        id2word=id2word,
        num_topics=num_topics,
        passes=passes,
        random_state=random_state,
    )

    # ``lda[corpus]`` hides topics below the model's probability floor, which can
    # leave a document with an empty list (and make max() raise) when there are
    # many topics. Asking for the unfiltered distribution keeps the same argmax
    # whenever the old code worked and never yields an empty list.
    doc_topics = lda.get_document_topics(gensim_corpus, minimum_probability=0.0)
    dominant_topics = [max(topics, key=lambda pair: pair[1])[0] for topics in doc_topics]
    result = list(zip(dominant_topics, data_matrix.columns))
    return lda, result

In [32]:
# Load all matrices: one *_dtm and one *_tim frame per pickled corpus
# (corpus0 -> clean_corpus, corpus1 -> corpus_noun, corpus2 -> corpus_na).
clean_corpus_dtm, clean_corpus_tim = (
    pd.read_pickle('./pickles/corpus0_dtm.pkl'),
    pd.read_pickle('./pickles/corpus0_tim.pkl'),
)
corpus_noun_dtm, corpus_noun_tim = (
    pd.read_pickle('./pickles/corpus1_dtm.pkl'),
    pd.read_pickle('./pickles/corpus1_tim.pkl'),
)
corpus_na_dtm, corpus_na_tim = (
    pd.read_pickle('./pickles/corpus2_dtm.pkl'),
    pd.read_pickle('./pickles/corpus2_tim.pkl'),
)

In [33]:
# Load all vectorizers
def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file before returning."""
    # NOTE: unpickling runs code embedded in the file -- only load pickles
    # produced by this project, never files from an untrusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


clean_corpus_cv = _load_pickle('./pickles/corpus0_cv.pkl')
clean_corpus_tf_idf = _load_pickle('./pickles/corpus0_tf_idf.pkl')
corpus_noun_cv = _load_pickle('./pickles/corpus1_cv.pkl')
corpus_noun_tf_idf = _load_pickle('./pickles/corpus1_tf_idf.pkl')
corpus_na_cv = _load_pickle('./pickles/corpus2_cv.pkl')
corpus_na_tf_idf = _load_pickle('./pickles/corpus2_tf_idf.pkl')

In [6]:
# LDA on clean_corpus_dtm (vocabulary from clean_corpus_cv); the last line displays the topics.
clean_corpus_lda, result1 = train_LDA_model(
    data_matrix=clean_corpus_dtm,
    vectorizer=clean_corpus_cv,
)
clean_corpus_lda.print_topics()

[(0,
  '0.017*"bitch" + 0.015*"man" + 0.014*"mom" + 0.009*"harry" + 0.008*"gon" + 0.008*"car" + 0.008*"okay" + 0.008*"slipper" + 0.007*"real" + 0.007*"ima"'),
 (1,
  '0.016*"black" + 0.013*"accent" + 0.011*"good" + 0.009*"white" + 0.009*"man" + 0.008*"hes" + 0.006*"trevor" + 0.006*"cat" + 0.006*"friends" + 0.006*"thing"'),
 (2,
  '0.012*"pool" + 0.012*"phone" + 0.008*"woman" + 0.008*"thing" + 0.008*"house" + 0.007*"gon" + 0.007*"guy" + 0.007*"god" + 0.007*"man" + 0.006*"motherfucker"'),
 (3,
  '0.014*"train" + 0.010*"new" + 0.010*"money" + 0.009*"god" + 0.008*"baby" + 0.008*"year" + 0.007*"chinese" + 0.007*"doors" + 0.007*"man" + 0.006*"kids"')]

In [7]:
# (topic_id, document label) pairs from the LDA run on clean_corpus_dtm.
result1

[(3, 'Amy Schumer'),
 (2, 'Arsenio Hall'),
 (3, 'Aziz Ansari'),
 (2, 'CHRIS ROCK'),
 (2, 'Chris Rock'),
 (3, 'Dave Chappelle'),
 (1, 'Hasan Minhaj'),
 (2, 'JACK WHITEHALL'),
 (0, 'JO KOY'),
 (1, 'Jimmy O. Yang'),
 (0, 'Jo Koy'),
 (1, 'Joe Rogan'),
 (0, 'Kevin Hart'),
 (1, 'MICHAEL CHE'),
 (0, 'Michael McIntyre'),
 (0, 'Mike Epps'),
 (1, 'Neal Brennan'),
 (3, 'Ronny Chieng'),
 (0, 'Russell Peters'),
 (2, 'Sebastian Maniscalco'),
 (1, 'Trevor Noah'),
 (3, 'Vir Das'),
 (0, 'Whitney Cummings')]

In [8]:
# LDA on clean_corpus_tim (vocabulary from clean_corpus_tf_idf); the last line displays the topics.
# NOTE(review): this rebinds clean_corpus_lda, so the model fitted on
# clean_corpus_dtm in the earlier cell is no longer reachable by name.
clean_corpus_lda, result2 = train_LDA_model(
    data_matrix=clean_corpus_tim,
    vectorizer=clean_corpus_tf_idf,
)
clean_corpus_lda.print_topics()

[(0,
  '0.002*"phone" + 0.002*"im" + 0.002*"shit" + 0.001*"youre" + 0.001*"train" + 0.001*"cat" + 0.001*"baby" + 0.001*"time" + 0.001*"people" + 0.001*"right"'),
 (1,
  '0.002*"black" + 0.002*"mom" + 0.001*"white" + 0.001*"dad" + 0.001*"people" + 0.001*"shit" + 0.001*"thats" + 0.001*"rights" + 0.001*"friends" + 0.001*"civil"'),
 (2,
  '0.002*"russian" + 0.001*"sexy" + 0.001*"men" + 0.001*"im" + 0.001*"sign" + 0.001*"diarrhea" + 0.001*"jumper" + 0.001*"pool" + 0.001*"hotel" + 0.001*"active"'),
 (3,
  '0.002*"slipper" + 0.002*"ethiopians" + 0.001*"accent" + 0.001*"josep" + 0.001*"keys" + 0.001*"money" + 0.001*"black" + 0.001*"sex" + 0.001*"yall" + 0.001*"man"')]

In [9]:
# (topic_id, document label) pairs from the LDA run on clean_corpus_tim.
result2

[(0, 'Amy Schumer'),
 (0, 'Arsenio Hall'),
 (0, 'Aziz Ansari'),
 (1, 'CHRIS ROCK'),
 (0, 'Chris Rock'),
 (3, 'Dave Chappelle'),
 (1, 'Hasan Minhaj'),
 (2, 'JACK WHITEHALL'),
 (3, 'JO KOY'),
 (1, 'Jimmy O. Yang'),
 (1, 'Jo Koy'),
 (0, 'Joe Rogan'),
 (0, 'Kevin Hart'),
 (1, 'MICHAEL CHE'),
 (2, 'Michael McIntyre'),
 (3, 'Mike Epps'),
 (1, 'Neal Brennan'),
 (0, 'Ronny Chieng'),
 (2, 'Russell Peters'),
 (0, 'Sebastian Maniscalco'),
 (3, 'Trevor Noah'),
 (0, 'Vir Das'),
 (3, 'Whitney Cummings')]

In [10]:
# LDA on corpus_noun_dtm (vocabulary from corpus_noun_cv); the last line displays the topics.
corpus_noun_lda, result3 = train_LDA_model(
    data_matrix=corpus_noun_dtm,
    vectorizer=corpus_noun_cv,
)
corpus_noun_lda.print_topics()

[(0,
  '0.015*"accent" + 0.014*"mom" + 0.011*"good" + 0.010*"black" + 0.009*"slipper" + 0.007*"trevor" + 0.007*"josep" + 0.006*"white" + 0.006*"hey" + 0.006*"dad"'),
 (1,
  '0.012*"pool" + 0.011*"right" + 0.011*"train" + 0.010*"money" + 0.009*"new" + 0.007*"guy" + 0.006*"chinese" + 0.006*"doors" + 0.006*"okay" + 0.006*"fuck"'),
 (2,
  '0.014*"bitch" + 0.010*"right" + 0.008*"fuck" + 0.008*"house" + 0.008*"okay" + 0.007*"oh" + 0.007*"good" + 0.007*"cause" + 0.007*"god" + 0.007*"harry"'),
 (3,
  '0.021*"phone" + 0.017*"russian" + 0.010*"black" + 0.009*"yeah" + 0.009*"white" + 0.007*"friends" + 0.006*"new" + 0.006*"hey" + 0.005*"years" + 0.004*"motherfucker"')]

In [11]:
# (topic_id, document label) pairs from the LDA run on corpus_noun_dtm.
result3

[(2, 'Amy Schumer'),
 (3, 'Arsenio Hall'),
 (0, 'Aziz Ansari'),
 (2, 'CHRIS ROCK'),
 (1, 'Chris Rock'),
 (3, 'Dave Chappelle'),
 (2, 'Hasan Minhaj'),
 (1, 'JACK WHITEHALL'),
 (0, 'JO KOY'),
 (0, 'Jimmy O. Yang'),
 (0, 'Jo Koy'),
 (2, 'Joe Rogan'),
 (2, 'Kevin Hart'),
 (1, 'MICHAEL CHE'),
 (2, 'Michael McIntyre'),
 (3, 'Mike Epps'),
 (3, 'Neal Brennan'),
 (1, 'Ronny Chieng'),
 (3, 'Russell Peters'),
 (1, 'Sebastian Maniscalco'),
 (0, 'Trevor Noah'),
 (2, 'Vir Das'),
 (2, 'Whitney Cummings')]

In [12]:
# LDA on corpus_noun_tim (vocabulary from corpus_noun_tf_idf); the last line displays the topics.
# NOTE(review): this rebinds corpus_noun_lda, so the model fitted on
# corpus_noun_dtm in the earlier cell is no longer reachable by name.
corpus_noun_lda, result4 = train_LDA_model(
    data_matrix=corpus_noun_tim,
    vectorizer=corpus_noun_tf_idf,
)
corpus_noun_lda.print_topics()

[(0,
  '0.002*"shit" + 0.002*"phone" + 0.002*"mom" + 0.002*"im" + 0.001*"time" + 0.001*"woman" + 0.001*"people" + 0.001*"men" + 0.001*"money" + 0.001*"cat"'),
 (1,
  '0.002*"russian" + 0.002*"slipper" + 0.001*"white" + 0.001*"josep" + 0.001*"black" + 0.001*"keys" + 0.001*"friends" + 0.001*"mistakes" + 0.001*"people" + 0.001*"house"'),
 (2,
  '0.002*"ethiopians" + 0.002*"train" + 0.001*"asian" + 0.001*"baby" + 0.001*"im" + 0.001*"chinese" + 0.001*"porn" + 0.001*"ha" + 0.001*"doors" + 0.001*"thats"'),
 (3,
  '0.001*"accent" + 0.001*"im" + 0.001*"birthday" + 0.001*"sign" + 0.001*"diarrhea" + 0.001*"trevor" + 0.001*"pool" + 0.001*"hotel" + 0.001*"active" + 0.001*"hes"')]

In [13]:
# (topic_id, document label) pairs from the LDA run on corpus_noun_tim.
result4

[(2, 'Amy Schumer'),
 (0, 'Arsenio Hall'),
 (2, 'Aziz Ansari'),
 (1, 'CHRIS ROCK'),
 (0, 'Chris Rock'),
 (0, 'Dave Chappelle'),
 (3, 'Hasan Minhaj'),
 (3, 'JACK WHITEHALL'),
 (1, 'JO KOY'),
 (2, 'Jimmy O. Yang'),
 (0, 'Jo Koy'),
 (0, 'Joe Rogan'),
 (0, 'Kevin Hart'),
 (0, 'MICHAEL CHE'),
 (0, 'Michael McIntyre'),
 (2, 'Mike Epps'),
 (1, 'Neal Brennan'),
 (2, 'Ronny Chieng'),
 (1, 'Russell Peters'),
 (0, 'Sebastian Maniscalco'),
 (3, 'Trevor Noah'),
 (0, 'Vir Das'),
 (0, 'Whitney Cummings')]

In [14]:
# LDA on corpus_na_dtm (vocabulary from corpus_na_cv); the last line displays the topics.
corpus_na_lda, result5 = train_LDA_model(
    data_matrix=corpus_na_dtm,
    vectorizer=corpus_na_cv,
)
corpus_na_lda.print_topics()

[(0,
  '0.013*"train" + 0.009*"money" + 0.009*"new" + 0.008*"hes" + 0.007*"kids" + 0.007*"baby" + 0.007*"year" + 0.007*"way" + 0.007*"chinese" + 0.007*"doors"'),
 (1,
  '0.011*"accent" + 0.011*"mom" + 0.010*"black" + 0.008*"phone" + 0.007*"fuck" + 0.007*"woman" + 0.007*"slipper" + 0.006*"white" + 0.006*"god" + 0.006*"yeah"'),
 (2,
  '0.019*"bitch" + 0.011*"harry" + 0.009*"gon" + 0.008*"okay" + 0.008*"car" + 0.008*"house" + 0.008*"ima" + 0.008*"russian" + 0.007*"face" + 0.007*"cause"'),
 (3,
  '0.010*"pool" + 0.008*"gon" + 0.008*"guy" + 0.008*"black" + 0.007*"years" + 0.007*"fuck" + 0.006*"ive" + 0.006*"men" + 0.006*"bandaids" + 0.006*"okay"')]

In [15]:
# (topic_id, document label) pairs from the LDA run on corpus_na_dtm.
result5

[(0, 'Amy Schumer'),
 (1, 'Arsenio Hall'),
 (0, 'Aziz Ansari'),
 (1, 'CHRIS ROCK'),
 (3, 'Chris Rock'),
 (3, 'Dave Chappelle'),
 (0, 'Hasan Minhaj'),
 (2, 'JACK WHITEHALL'),
 (1, 'JO KOY'),
 (1, 'Jimmy O. Yang'),
 (1, 'Jo Koy'),
 (1, 'Joe Rogan'),
 (2, 'Kevin Hart'),
 (3, 'MICHAEL CHE'),
 (3, 'Michael McIntyre'),
 (2, 'Mike Epps'),
 (1, 'Neal Brennan'),
 (0, 'Ronny Chieng'),
 (2, 'Russell Peters'),
 (3, 'Sebastian Maniscalco'),
 (1, 'Trevor Noah'),
 (1, 'Vir Das'),
 (2, 'Whitney Cummings')]

In [16]:
# LDA on corpus_na_tim (vocabulary from corpus_na_tf_idf); the last line displays the topics.
# NOTE(review): this rebinds corpus_na_lda, so the model fitted on
# corpus_na_dtm in the earlier cell is no longer reachable by name.
corpus_na_lda, result6 = train_LDA_model(
    data_matrix=corpus_na_tim,
    vectorizer=corpus_na_tf_idf,
)
corpus_na_lda.print_topics()

[(0,
  '0.002*"phone" + 0.001*"accent" + 0.001*"birthday" + 0.001*"trevor" + 0.001*"im" + 0.001*"oh" + 0.001*"hes" + 0.001*"dad" + 0.001*"white" + 0.001*"black"'),
 (1,
  '0.002*"money" + 0.002*"pool" + 0.001*"train" + 0.001*"im" + 0.001*"sexy" + 0.001*"chinese" + 0.001*"sex" + 0.001*"asian" + 0.001*"people" + 0.001*"men"'),
 (2,
  '0.002*"mom" + 0.002*"russian" + 0.002*"slipper" + 0.001*"baby" + 0.001*"cat" + 0.001*"black" + 0.001*"shit" + 0.001*"civil" + 0.001*"rights" + 0.001*"josep"'),
 (3,
  '0.002*"im" + 0.001*"shit" + 0.001*"religion" + 0.001*"ethiopians" + 0.001*"white" + 0.001*"fun" + 0.001*"god" + 0.001*"black" + 0.001*"bitch" + 0.001*"woman"')]

In [17]:
# (topic_id, document label) pairs from the LDA run on corpus_na_tim.
result6

[(3, 'Amy Schumer'),
 (0, 'Arsenio Hall'),
 (2, 'Aziz Ansari'),
 (3, 'CHRIS ROCK'),
 (3, 'Chris Rock'),
 (2, 'Dave Chappelle'),
 (0, 'Hasan Minhaj'),
 (1, 'JACK WHITEHALL'),
 (2, 'JO KOY'),
 (1, 'Jimmy O. Yang'),
 (2, 'Jo Koy'),
 (2, 'Joe Rogan'),
 (3, 'Kevin Hart'),
 (2, 'MICHAEL CHE'),
 (1, 'Michael McIntyre'),
 (3, 'Mike Epps'),
 (3, 'Neal Brennan'),
 (1, 'Ronny Chieng'),
 (2, 'Russell Peters'),
 (1, 'Sebastian Maniscalco'),
 (0, 'Trevor Noah'),
 (3, 'Vir Das'),
 (1, 'Whitney Cummings')]

In [20]:
# For each of the six LDA runs, collect the documents assigned to every topic id.
# ``groups`` keeps one {topic_id: [document, ...]} dict per run; ``group_df`` holds
# the same information as a table with one row per topic id and one column per run.
groups = []
group_df = pd.DataFrame(index=[0,1,2,3])
for i, result in enumerate([result1, result2, result3, result4, result5, result6]):
    group = {}
    for topic_id, document in result:
        group.setdefault(topic_id, []).append(document)
    # Align members with the row labels explicitly. The dict is ordered by the
    # first appearance of each topic id in ``result`` (not by topic id), so
    # assigning ``group.values()`` directly put topics in the wrong rows, and a
    # topic with no documents caused a length mismatch. Topic ids outside the
    # index are kept in ``groups`` but have no row in ``group_df``.
    group_df[f'result_{i+1}'] = pd.Series(
        [group.get(topic_id, []) for topic_id in group_df.index],
        index=group_df.index,
        dtype=object,
    )
    groups.append(group)

In [23]:
# Write the per-run topic membership table to group.csv in the current working directory.
group_df.to_csv(path_or_buf='group.csv')

# Latent Semantic Analysis (LSA)

In [67]:
def train_LSA_model(data_matrix, vectorizer, num_topics=5):
    """Fit a gensim LSI (LSA) model and tag every document with its strongest topic.

    Parameters
    ----------
    data_matrix : pandas.DataFrame
        Weights with one column per document; the column labels are returned
        as the document names. Columns are treated as documents because
        ``Sparse2Corpus`` is called with its default orientation.
    vectorizer : object
        Fitted vectorizer exposing ``vocabulary_`` (term -> index). It is
        inverted to build gensim's ``id2word`` mapping.
    num_topics : int, default 5
        Number of latent dimensions passed to ``LsiModel``.

    Returns
    -------
    tuple
        ``(lsi, result)`` where ``lsi`` is the trained model and ``result`` is a
        list of ``(topic_id, column_label)`` pairs in column order. ``topic_id``
        is ``None`` for a document whose projection is empty (for example an
        all-zero column); previously such a document raised ``ValueError``.
    """
    # df --> sparse matrix --> gensim corpus
    sparse_counts = scipy.sparse.csr_matrix(data_matrix)
    gensim_corpus = matutils.Sparse2Corpus(sparse_counts)
    # gensim needs index -> term; the vectorizer stores term -> index
    id2word = {index: term for term, index in vectorizer.vocabulary_.items()}
    lsi = LsiModel(gensim_corpus, num_topics=num_topics, id2word=id2word)

    corpus_transformed = lsi[gensim_corpus]
    # NOTE(review): this picks the largest *signed* projection, not the largest
    # magnitude -- kept as in the original; confirm that is the intended rule.
    dominant_topics = [
        max(projection, key=lambda pair: pair[1], default=(None, 0.0))[0]
        for projection in corpus_transformed
    ]
    result = list(zip(dominant_topics, data_matrix.columns))
    return lsi, result

In [68]:
# LSA on corpus_na_tim (vocabulary from corpus_na_tf_idf); the last line displays the topics.
corpus_na_lsa, res = train_LSA_model(
    data_matrix=corpus_na_tim,
    vectorizer=corpus_na_tf_idf,
)
corpus_na_lsa.print_topics()

[(0,
  '0.241*"im" + 0.186*"shit" + 0.185*"people" + 0.149*"black" + 0.146*"youre" + 0.141*"thats" + 0.139*"time" + 0.139*"man" + 0.126*"white" + 0.119*"money"'),
 (1,
  '0.424*"black" + 0.323*"white" + 0.198*"friends" + 0.163*"accent" + 0.157*"civil" + 0.148*"people" + -0.144*"mom" + 0.143*"rights" + 0.118*"rules" + -0.115*"money"'),
 (2,
  '0.414*"mom" + 0.250*"slipper" + 0.184*"josep" + 0.175*"dad" + 0.171*"keys" + -0.161*"religion" + 0.150*"foot" + -0.135*"fellas" + -0.135*"phone" + -0.128*"motherfucker"'),
 (3,
  '0.289*"mom" + -0.224*"pool" + 0.208*"shit" + 0.166*"slipper" + -0.166*"im" + 0.138*"religion" + 0.122*"josep" + 0.122*"woman" + 0.122*"black" + -0.121*"sign"'),
 (4,
  '0.313*"train" + 0.259*"money" + -0.225*"ethiopians" + -0.201*"phone" + 0.190*"chinese" + 0.180*"asian" + 0.157*"doors" + -0.130*"slipper" + 0.125*"doctors" + 0.118*"sex"')]

In [69]:
# (topic_id, document label) pairs from the LSA run on corpus_na_tim.
res

[(0, 'Amy Schumer'),
 (0, 'Arsenio Hall'),
 (0, 'Aziz Ansari'),
 (0, 'CHRIS ROCK'),
 (0, 'Chris Rock'),
 (1, 'Dave Chappelle'),
 (0, 'Hasan Minhaj'),
 (0, 'JACK WHITEHALL'),
 (2, 'JO KOY'),
 (4, 'Jimmy O. Yang'),
 (2, 'Jo Koy'),
 (0, 'Joe Rogan'),
 (0, 'Kevin Hart'),
 (1, 'MICHAEL CHE'),
 (0, 'Michael McIntyre'),
 (0, 'Mike Epps'),
 (1, 'Neal Brennan'),
 (4, 'Ronny Chieng'),
 (0, 'Russell Peters'),
 (0, 'Sebastian Maniscalco'),
 (0, 'Trevor Noah'),
 (0, 'Vir Das'),
 (0, 'Whitney Cummings')]

# Coherence Value

In [None]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary, used as id2word for the models
              doc_term_matrix : Gensim corpus the LSA models are trained on
              doc_clean : List of input texts, passed to CoherenceModel as `texts`
              stop : Upper bound on the number of topics (exclusive, as in range)
              start : First number of topics to try
              step : Increment between successive numbers of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models, one per number of topics tried
              coherence_values : Coherence values corresponding to the LSA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSA model (the original referenced an undefined name here)
        model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)  # train model
        model_list.append(model)
        # CoherenceModel is reached through the already-imported gensim ``models``
        # package because the notebook never imports the class by name.
        coherencemodel = models.CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
def plot_graph(doc_clean, start, stop, step):
    """Plot c_v coherence against the number of LSA topics.

    Input   : doc_clean : List of input texts, handed to prepare_corpus and
                          compute_coherence_values
              start, stop, step : Range of topic counts to evaluate, as in
                          range(start, stop, step)
    Output  : None; the figure is displayed with plt.show()
    """
    # NOTE(review): prepare_corpus is not defined in the visible part of this
    # notebook; it must return (dictionary, doc_term_matrix) -- confirm it exists.
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix, doc_clean,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    # Label the line itself: passing the bare string "coherence_values" to
    # legend() made matplotlib treat it as a sequence of one-character labels.
    plt.plot(x, coherence_values, label="coherence_values")
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(loc='best')
    plt.show()