In [1]:
# importing our libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# libraries that will let us do LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# library that will let us go ahead and visualize LDA results
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.sklearn


# helpful library to allow for computing coherence using Gensim
import tmtoolkit

# what we're using to pickle with
import pickle

# importing json to serialize the thingy
import json

# import timing library
import time

In [2]:
# function to override the pre-proc that occurs within the vectorizer
# just returns the original string -> because I already had it clean
def dummy_func(x):   
    return x

In [3]:
def cust_tokenizer(x):
    return x.split()

In [4]:
# initializing a TFIDF vectorizer with unigram representation
# it's cheaper and it will allow for relative pruning (those terms appearing within fewer than .05% of docs or in more than 99%)
# feeding in our own functions for splitting and cleaning because otherwise it will mess up our plan
tfidf_vectorizer = TfidfVectorizer(
    min_df =.005, max_df = .99,  preprocessor=dummy_func, tokenizer=cust_tokenizer)

In [6]:
# reading in the model
lda_k100 = pickle.load(open("min_act_lda_k100.pk", 'rb'))

In [7]:
%%time
# reading in the data
df = pd.read_json("min_act_lang_filtered_pre_proc.ndjson", lines=True)

CPU times: total: 3min 3s
Wall time: 3min 6s


In [8]:
%%time
# fitting the vectorizer
tf = tfidf_vectorizer.fit_transform(df["bo"])

CPU times: total: 2min 28s
Wall time: 2min 28s


In [9]:
# making a tokenized representation of the cleaned column -> this is needed for the coherence calculation
df["tokens"] = df["bo"].str.split()

In [16]:
%%time
# computing the coherence of the topic model with K=100
k100_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k100.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab


  from scipy.linalg.special_matrices import triu


CPU times: total: 4min 31s
Wall time: 6min 21s


In [17]:
with open("coh_perp.json") as f_in:
    coh_perp_dict = json.load(f_in)

In [18]:
# adding the coherence to the json file
# printing coherence and adding it to the dict
print("Coherence at K=100:", k100_coh)
coh_perp_dict["k100"].setdefault("coherence", k100_coh)


Coherence at K=100: [0.4882727401991226, 0.5332300526438136, 0.5113672902744522, 0.4832750920173118, 0.47971900652122407, 0.4903701882706744, 0.5310071536690575, 0.5701261003589326, 0.5361830886844807, 0.43442710127638334, 0.49738229021626995, 0.4931095460675186, 0.5292690419733748, 0.49937785337725416, 0.47407855918706054, 0.5212633475733971, 0.43826174173472915, 0.515180844903637, 0.4412932638544551, 0.4542853660175258, 0.5287637250690691, 0.4884525749141031, 0.4385969026751829, 0.5391703494041942, 0.5188878802166248, 0.4130327660759948, 0.5443414918171235, 0.4700444792117958, 0.44828466239007003, 0.43256071684015496, 0.43187636930880613, 0.5022712819616462, 0.48097699347536266, 0.5638121793016574, 0.4729011030276271, 0.4896228632148475, 0.5071928353581224, 0.540773730327407, 0.5203643661248433, 0.49507102019252447, 0.4741890122515775, 0.450221542785956, 0.5331021223962427, 0.5093219267092511, 0.5029006387676993, 0.5518212712475032, 0.5055823429678823, 0.5136054444534779, 0.473382739

[0.4882727401991226,
 0.5332300526438136,
 0.5113672902744522,
 0.4832750920173118,
 0.47971900652122407,
 0.4903701882706744,
 0.5310071536690575,
 0.5701261003589326,
 0.5361830886844807,
 0.43442710127638334,
 0.49738229021626995,
 0.4931095460675186,
 0.5292690419733748,
 0.49937785337725416,
 0.47407855918706054,
 0.5212633475733971,
 0.43826174173472915,
 0.515180844903637,
 0.4412932638544551,
 0.4542853660175258,
 0.5287637250690691,
 0.4884525749141031,
 0.4385969026751829,
 0.5391703494041942,
 0.5188878802166248,
 0.4130327660759948,
 0.5443414918171235,
 0.4700444792117958,
 0.44828466239007003,
 0.43256071684015496,
 0.43187636930880613,
 0.5022712819616462,
 0.48097699347536266,
 0.5638121793016574,
 0.4729011030276271,
 0.4896228632148475,
 0.5071928353581224,
 0.540773730327407,
 0.5203643661248433,
 0.49507102019252447,
 0.4741890122515775,
 0.450221542785956,
 0.5331021223962427,
 0.5093219267092511,
 0.5029006387676993,
 0.5518212712475032,
 0.5055823429678823,
 0.51

In [19]:
coh_perp_dict["k100"]

{'perplexity': 3382.639084473857,
 'coherence': [0.4882727401991226,
  0.5332300526438136,
  0.5113672902744522,
  0.4832750920173118,
  0.47971900652122407,
  0.4903701882706744,
  0.5310071536690575,
  0.5701261003589326,
  0.5361830886844807,
  0.43442710127638334,
  0.49738229021626995,
  0.4931095460675186,
  0.5292690419733748,
  0.49937785337725416,
  0.47407855918706054,
  0.5212633475733971,
  0.43826174173472915,
  0.515180844903637,
  0.4412932638544551,
  0.4542853660175258,
  0.5287637250690691,
  0.4884525749141031,
  0.4385969026751829,
  0.5391703494041942,
  0.5188878802166248,
  0.4130327660759948,
  0.5443414918171235,
  0.4700444792117958,
  0.44828466239007003,
  0.43256071684015496,
  0.43187636930880613,
  0.5022712819616462,
  0.48097699347536266,
  0.5638121793016574,
  0.4729011030276271,
  0.4896228632148475,
  0.5071928353581224,
  0.540773730327407,
  0.5203643661248433,
  0.49507102019252447,
  0.4741890122515775,
  0.450221542785956,
  0.5331021223962427,

In [20]:
# serializing the dict
# saving the the test run coherence and perplexity for each of the models
with open('coh_perp.json', 'w') as outfile:
    json.dump(coh_perp_dict, outfile)

In [10]:
%%time
# creating the viz & saving the viz

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k100, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak100_pyldavis.html")

  default_term_info = default_term_info.sort_values(


CPU times: total: 5h 42min 21s
Wall time: 8h 38min 58s
