In [77]:
# importing our libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# libraries that will let us do LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# library that will let us go ahead and visualize LDA results
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.sklearn


# helpful library to allow for computing coherence using Gensim
import tmtoolkit

# what we're using to pickle with
import pickle

# what I'm using to save the coherence and perplexity
import json

In [2]:
# function to override the pre-proc that occurs within the vectorizer
# just returns the original string -> because I already had it clean
def dummy_func(x):   
    return x

In [3]:
def cust_tokenizer(x):
    return x.split()

In [4]:
%%time
# reading in the data
df = pd.read_json("ext_act_lang_filtered_pre_proc.ndjson", lines=True)

CPU times: total: 10.7 s
Wall time: 10.7 s


In [5]:
# initializing a TFIDF vectorizer with unigram representation
# it's cheaper and it will allow for relative pruning (those terms appearing within fewer than .05% of docs or in more than 99%)
# feeding in our own functions for splitting and cleaning because otherwise it will mess up our plan
tfidf_vectorizer = TfidfVectorizer(
    min_df =.005, max_df = .99,  preprocessor=dummy_func, tokenizer=cust_tokenizer)

In [6]:
%%time
# fitting the vectorizer
tf = tfidf_vectorizer.fit_transform(df["bo"])

CPU times: total: 9.48 s
Wall time: 9.48 s


In [7]:
# making a tokenized representation of the cleaned column -> this is needed for the coherence calculation
df["tokens"] = df["bo"].str.split()

In [8]:
# testing at various levels of K (10-100, steps of 10)

In [9]:
# 1st time just trying from 10-50 to see how that works

In [10]:
%%time
# initializing the LDA model with a k of 10
lda_k10 = LatentDirichletAllocation(
    n_components=10,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k10.fit(tf)

# computing the coherence of the topic model with K=10
k10_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k10.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

  from scipy.linalg.special_matrices import triu


CPU times: total: 1min 5s
Wall time: 5min 7s


In [36]:
%%time
# initializing the LDA model with a k of 15
lda_k15 = LatentDirichletAllocation(
    n_components=15,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k15.fit(tf)

# computing the coherence of the topic model with K=10
k15_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k15.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

CPU times: total: 1min 8s
Wall time: 4min 56s


In [11]:
%%time
# initializing the LDA model with a k of 20
lda_k20 = LatentDirichletAllocation(
    n_components=20,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k20.fit(tf)

# computing the coherence of the topic model with K=20
k20_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k20.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

CPU times: total: 1min 15s
Wall time: 4min 58s


In [12]:
%%time
# initializing the LDA model with a k of 30
lda_k30 = LatentDirichletAllocation(
    n_components=30,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k30.fit(tf)

# computing the coherence of the topic model with K=30
k30_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k30.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

CPU times: total: 1min 14s
Wall time: 5min


In [13]:
%%time
# initializing the LDA model with a k of 40
lda_k40 = LatentDirichletAllocation(
    n_components=40,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k40.fit(tf)

# computing the coherence of the topic model with K=40
k40_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k40.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

CPU times: total: 1min 15s
Wall time: 4min 54s


In [14]:
%%time
# initializing the LDA model with a k of 50
lda_k50 = LatentDirichletAllocation(
    n_components=50,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k50.fit(tf)

# computing the coherence of the topic model with K=40
k50_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k50.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

CPU times: total: 1min 17s
Wall time: 4min 52s


In [16]:
%%time
# initializing the LDA model with a k of 60
lda_k60 = LatentDirichletAllocation(
    n_components=60,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k60.fit(tf)

# computing the coherence of the topic model with K=40
k60_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k60.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

CPU times: total: 1min 19s
Wall time: 4min 40s


In [17]:
%%time
# initializing the LDA model with a k of 70
lda_k70 = LatentDirichletAllocation(
    n_components=70,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k70.fit(tf)

# computing the coherence of the topic model with K=40
k70_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k70.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

CPU times: total: 1min 21s
Wall time: 4min 33s


In [18]:
%%time
# initializing the LDA model with a k of 80
lda_k80 = LatentDirichletAllocation(
    n_components=80,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k80.fit(tf)

# computing the coherence of the topic model with K=40
k80_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k80.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

CPU times: total: 1min 23s
Wall time: 4min 34s


In [19]:
%%time
# initializing the LDA model with a k of 90
lda_k90 = LatentDirichletAllocation(
    n_components=90,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k90.fit(tf)

# computing the coherence of the topic model with K=40
k90_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k90.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

CPU times: total: 1min 25s
Wall time: 4min 35s


In [20]:
%%time
# initializing the LDA model with a k of 100
lda_k100 = LatentDirichletAllocation(
    n_components=100,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k100.fit(tf)

# computing the coherence of the topic model with K=40
k100_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k100.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

CPU times: total: 1min 27s
Wall time: 4min 36s


In [24]:
print("Perplexity at K=10:", lda_k10.perplexity(tf))
print("Coherence at K=10:", k10_coh)
vis = pyLDAvis.sklearn.prepare(lda_k10, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=10: 631.4525628682134
Coherence at K=10: [0.22560086568448368, 0.18334230836921006, 0.24091832024912843, 0.30675242199961616, 0.3162184492753771, 0.27923251338410826, 0.30145025512209356, 0.3235284624543667, 0.2497124474033913, 0.29060148190202495]


  default_term_info = default_term_info.sort_values(


In [37]:
print("Perplexity at K=10:", lda_k15.perplexity(tf))
print("Coherence at K=10:", k15_coh)
vis = pyLDAvis.sklearn.prepare(lda_k15, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=10: 774.108110117537
Coherence at K=10: [0.2914679123993105, 0.24333955944529495, 0.34273984560934306, 0.33959699168407775, 0.34522764274122486, 0.2934061066346772, 0.35367127160678025, 0.3402903521778132, 0.25457348490710585, 0.3707112555717194, 0.28404928401222485, 0.3890754021377609, 0.332298630462361, 0.5209571565819519, 0.26234152239207587]


  default_term_info = default_term_info.sort_values(


In [26]:
print("Perplexity at K=20:", lda_k20.perplexity(tf))
print("Coherence at K=20:", k20_coh)
vis = pyLDAvis.sklearn.prepare(lda_k20, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=20: 887.5889038789646
Coherence at K=20: [0.3138553519886008, 0.29762388678573914, 0.3418469885338461, 0.34271708744643814, 0.32446392521781336, 0.26594236553847456, 0.32523122157312667, 0.39558836939983444, 0.2670268852794223, 0.3804918666097376, 0.31878985246694136, 0.3496089883127233, 0.29447293158286514, 0.5153701631007344, 0.3022776819234119, 0.2858197512919027, 0.2972009565292545, 0.2905172283334992, 0.34940387627397507, 0.31971041011086454]


  default_term_info = default_term_info.sort_values(


In [27]:
print("Perplexity at K=30:", lda_k30.perplexity(tf))
print("Coherence at K=30:", k30_coh)
vis = pyLDAvis.sklearn.prepare(lda_k30, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=30: 1169.3769557331866
Coherence at K=30: [0.28762658108567285, 0.2736007244332251, 0.35583801804682946, 0.343482539596291, 0.2867441764907145, 0.36806552767996614, 0.3303733588654318, 0.3846862128267877, 0.2781509304883308, 0.32916035128866045, 0.3576554573969238, 0.35514397314733875, 0.3035564362804253, 0.5080461083733276, 0.3120032112490919, 0.2930603716385204, 0.31105858853725793, 0.4922840230507555, 0.29241499762026585, 0.3223286295808255, 0.3996321425749184, 0.3186266533066705, 0.2924194442569161, 0.32378707936906886, 0.3022532095875311, 0.3378545070058089, 0.2779298818717707, 0.32050904176784156, 0.29071037300307523, 0.5137540201936097]


  default_term_info = default_term_info.sort_values(


In [28]:
print("Perplexity at K=40:", lda_k40.perplexity(tf))
print("Coherence at K=40:", k40_coh)
vis = pyLDAvis.sklearn.prepare(lda_k40, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=40: 1452.6971953649245
Coherence at K=40: [0.31064734238653824, 0.2693815347281546, 0.36459196529715293, 0.2945964833569271, 0.32630769660153863, 0.3830379833765179, 0.3721974750370406, 0.37186948789292407, 0.3094818723648586, 0.33881176608340313, 0.312318863533778, 0.32927364106725165, 0.3463341175227717, 0.5158146313018034, 0.3161849934327386, 0.33851722330157086, 0.2844940726970181, 0.4865695617821072, 0.2378672883812286, 0.3171659558187104, 0.3788855056970505, 0.29032036642405223, 0.31408148528827085, 0.3309349809870069, 0.3206059780511289, 0.28506551266818536, 0.28917112700842496, 0.32658224788541146, 0.3092035962518122, 0.5062843968829742, 0.40348347006156554, 0.3378051761052817, 0.25584966625457334, 0.3559836803741906, 0.33337462151548924, 0.307662630002581, 0.2940237873857193, 0.3767744073696791, 0.28862642479244854, 0.30399633570993817]


  default_term_info = default_term_info.sort_values(


In [29]:
print("Perplexity at K=50:", lda_k50.perplexity(tf))
print("Coherence at K=50:", k50_coh)
vis = pyLDAvis.sklearn.prepare(lda_k50, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=50: 1735.706772718406
Coherence at K=50: [0.3188913135582526, 0.2872851995077572, 0.35603498407207657, 0.3137232507293367, 0.3406629197341984, 0.3137157410140512, 0.3164513146073852, 0.3361649830338091, 0.30552751569542774, 0.3533820242478756, 0.32989618118179764, 0.3434386904227006, 0.31370252239832397, 0.4905833950811706, 0.25561911636611084, 0.3118610189292591, 0.31651404645956926, 0.5107067128781849, 0.24698613716015272, 0.3072041516886486, 0.37353667753963216, 0.2565001442351732, 0.3175015746371096, 0.3141432587611113, 0.31104049752962754, 0.23075002187020424, 0.27291551704701683, 0.3141652621304228, 0.28165620637762434, 0.5379712012210286, 0.3454885514898928, 0.3425227063688272, 0.22426019228099608, 0.3340903643532972, 0.26438190202732825, 0.30685661096268146, 0.2854957455591013, 0.3582620194075605, 0.2690328899619517, 0.2870586975041087, 0.2289045924774687, 0.35481707450689715, 0.2781314804394136, 0.3188266290411684, 0.31749670963945137, 0.26513683692854906, 0.31

  default_term_info = default_term_info.sort_values(


In [30]:
print("Perplexity at K=60:", lda_k60.perplexity(tf))
print("Coherence at K=60:", k60_coh)
vis = pyLDAvis.sklearn.prepare(lda_k60, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=60: 2062.168158870461
Coherence at K=60: [0.2901505330615127, 0.2704184210573187, 0.33942459740409414, 0.2898899670540163, 0.37211689156236, 0.30142197709459817, 0.3423638161456187, 0.3640792278209606, 0.30035547191243406, 0.3483016287678384, 0.31302752277360735, 0.3250713691859758, 0.32103145128111993, 0.46724465894678113, 0.32053896489702777, 0.3153081587267329, 0.2639841525350422, 0.5061942249829176, 0.24198460192568713, 0.32580126415415944, 0.3090592154521491, 0.289069254459568, 0.32136920538473135, 0.3324135242130718, 0.3228714490807626, 0.2695849514877185, 0.24398278682355912, 0.32183433972676717, 0.27473955371668657, 0.5039784714579956, 0.32415060290511427, 0.3011911649700814, 0.2199913004406288, 0.34030140010986965, 0.29232025299338205, 0.25555369442260173, 0.28498276719690513, 0.3582296977131996, 0.2959072204482188, 0.28474883033136716, 0.25623404216911255, 0.3630239164887933, 0.28597766944434855, 0.2989919304464653, 0.34506680204757073, 0.2765596500281794, 0.3

  default_term_info = default_term_info.sort_values(


In [32]:
print("Perplexity at K=70:", lda_k70.perplexity(tf))
print("Coherence at K=70:", k70_coh)
vis = pyLDAvis.sklearn.prepare(lda_k70, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=70: 2329.126307426612
Coherence at K=70: [0.2730698092451985, 0.27656203148927505, 0.35939796745409525, 0.32153573821431075, 0.3188953822858184, 0.294349408667519, 0.3375461464111962, 0.3123116320260751, 0.2737974494943224, 0.3442256196541608, 0.3146127176498424, 0.339000123321214, 0.3146120587074669, 0.2556743934053184, 0.2797896197494246, 0.33926558337001733, 0.286172815800439, 0.4797376125948804, 0.252259986951821, 0.3458063215773874, 0.3187535443716507, 0.2804962572504412, 0.31725732129100387, 0.3217530356383273, 0.30802824417484825, 0.23791031174657845, 0.27782427312128355, 0.3097256270987912, 0.2989198155288268, 0.5084408602092173, 0.33428378214953736, 0.30978233810297395, 0.23173013322247002, 0.2958234941616632, 0.3115859782211436, 0.2549846988216995, 0.321638328574766, 0.32331327424992573, 0.2842825145521738, 0.3080986141761563, 0.23726477602059406, 0.3868123143181831, 0.25089754104615103, 0.32309294346619816, 0.3370172856042083, 0.3120283192359667, 0.3390158504

  default_term_info = default_term_info.sort_values(


In [33]:
print("Perplexity at K=80:", lda_k80.perplexity(tf))
print("Coherence at K=80:", k80_coh)
vis = pyLDAvis.sklearn.prepare(lda_k80, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=80: 2628.8699205103458
Coherence at K=80: [0.26913440374552994, 0.25238666987641833, 0.30747420018775795, 0.32881330679289256, 0.3064234373094075, 0.3399608189094925, 0.3051055796483738, 0.34228456428717924, 0.2980046701529725, 0.34624242568421015, 0.28432977451988956, 0.3375586709715344, 0.319272023376348, 0.25992258554623543, 0.28229850773115167, 0.3506899952307707, 0.32142276210185117, 0.3076526228700293, 0.27979398745034284, 0.3184640344860349, 0.2739978365113296, 0.3021490058330924, 0.3155889487057616, 0.31620133262420935, 0.31723022302588316, 0.23623670076738112, 0.29269300365492495, 0.31044857617342003, 0.30143277202382424, 0.5154213147397717, 0.3840024975723303, 0.3056040112310871, 0.227813528974158, 0.3061814922629565, 0.2879971785906873, 0.26909236992253777, 0.36015133440962566, 0.31612215363501495, 0.3421247903147676, 0.2549873381769704, 0.2740274920966187, 0.3643242818915188, 0.289911732021557, 0.29355064354359117, 0.29123292208717894, 0.2848636197342388, 0.

  default_term_info = default_term_info.sort_values(


In [34]:
print("Perplexity at K=90:", lda_k90.perplexity(tf))
print("Coherence at K=90:", k90_coh)
vis = pyLDAvis.sklearn.prepare(lda_k90, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=90: 3008.3112771251253
Coherence at K=90: [0.2604904438103802, 0.2484079101773991, 0.28437522065000015, 0.29590555871877733, 0.33069930737387176, 0.3398609482377265, 0.32386498810295283, 0.323150118207861, 0.2805535852209446, 0.34177624072898655, 0.26972344042714036, 0.3301487771022344, 0.3335630927247534, 0.29747947481383086, 0.30626622071842974, 0.34006965927388244, 0.3160727386758243, 0.3195006008201432, 0.2980917650470206, 0.2913141598365804, 0.2722778909311604, 0.3198951094053431, 0.30154915875957194, 0.30879671545782317, 0.26236741912263095, 0.2595702556365877, 0.31811204295313844, 0.28435393454020147, 0.28650831790720516, 0.49965368547554084, 0.3165670824426756, 0.2941377269466569, 0.22656727662853, 0.3390773602221224, 0.29246849505138645, 0.25970189460328763, 0.3171288099885018, 0.3199717651684094, 0.30920356334582216, 0.26247222190928887, 0.28863101729521956, 0.36203363966778745, 0.27512199736255, 0.285746154409266, 0.30082418817837936, 0.27200388468232495, 0.3

  default_term_info = default_term_info.sort_values(


In [35]:
print("Perplexity at K=100:", lda_k100.perplexity(tf))
print("Coherence at K=100:", k100_coh)
vis = pyLDAvis.sklearn.prepare(lda_k100, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis

Perplexity at K=100: 3143.7455514356902
Coherence at K=100: [0.2602702385075107, 0.26083953428579126, 0.3036704434169414, 0.28122923316056947, 0.3234751673483036, 0.30022983111409135, 0.3650217637420303, 0.35257112837890153, 0.2616749155731667, 0.3662929648063272, 0.27250031273144526, 0.3566852957995512, 0.34626391259013783, 0.2982727112599552, 0.3074438221308992, 0.2999435412820093, 0.3107675987225365, 0.34816781860733037, 0.3077697155979512, 0.29466720600662194, 0.3261429303699236, 0.3269837257223093, 0.3215436462931792, 0.29130626734681925, 0.2517788271657365, 0.2859707960502327, 0.3127776624055154, 0.2815045341303028, 0.269331294292389, 0.4969901292873925, 0.38408395725836736, 0.32960632393985867, 0.24924219410865095, 0.3100693083792788, 0.3369893887813823, 0.2627011205121371, 0.3605613992338307, 0.2889239366870883, 0.2809482230432961, 0.27593009021357345, 0.23257630662181827, 0.37794058664172026, 0.28301911632101284, 0.2932228772542285, 0.2946186340017069, 0.25907909137971663, 0.3

  default_term_info = default_term_info.sort_values(


#### Saving Everything

In [51]:
# going ahead and saving the models created here (even though we're likely tossing them)

# here I'm making the lists to hold the lda models themselves and the file names we'll be using to save them
lda_list = [lda_k10, lda_k15, lda_k20, lda_k30, lda_k40, lda_k50, lda_k60, lda_k70, lda_k80, lda_k90, lda_k100]
lda_file_names = ["lda_k10.pk", "lda_k15.pk", "lda_k20.pk", "lda_k30.pk", "lda_k40.pk", "lda_k50.pk", "lda_k60.pk"
                  , "lda_k70.pk", "lda_k80.pk", "lda_k90.pk", "lda_k100.pk"]
lda_file_names = ["ext_act_1st_run_" + x for x in lda_file_names] # list comp to adjust names further

In [53]:
# looping through the models and filenames -> zip allows us to combine our iteration in one step (which is nice)
for lda, filename in zip(lda_list, lda_file_names):
    pickle.dump(lda, open(filename, "wb")) # dumping with pickle, the lda with the filename (with write binary as the option)

In [62]:
# making the lists for filenames
vis_file_names = [x.replace(".pk","_pyldavis.html") for x in lda_file_names] # list comp to adjust names for the vis saving

In [63]:
%%time
# running through the lda models and saving the visualizations
for lda, filename in zip(lda_list, vis_file_names):
    vis = pyLDAvis.sklearn.prepare(lda, tf, tfidf_vectorizer)
    pyLDAvis.save_html(vis, filename)

  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(


CPU times: total: 3min 10s
Wall time: 8min 18s


In [68]:
# making alist of coherence variables
coh_list = [k10_coh, k15_coh, k20_coh, k30_coh, k40_coh, k50_coh, k60_coh, k70_coh, k80_coh, k90_coh, k100_coh]
k_list = ["k10", "k15", "k20", "k30", "k40", "k50", "k60", "k70", "k80", "k90", "k100"]

In [74]:
# saving the perplexity scores & the coherence scores
coh_perp_dict = {}
for coh, lda, k in zip(coh_list, lda_list, k_list):
    coh_perp_dict.setdefault(k, {})
    coh_perp_dict[k].setdefault("coherence", coh)
    coh_perp_dict[k].setdefault("peroplexity", lda.perplexity(tf))

In [78]:
# saving the the test run coherence and perplexity for each of the models
with open('test_run_coh_perp.json', 'w') as outfile:
    json.dump(coh_perp_dict, outfile)