In [5]:
import gensim
import gensim.corpora as corpora
import importlib
import logging
import nltk
import os
import pandas as pd
import pickle
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import transform
import warnings

from collections import defaultdict
from datetime import datetime
from gensim.models import CoherenceModel, HdpModel, LsiModel, TfidfModel
from gensim.models.ldamodel import LdaModel
from glob import glob
from load import load_filters, load_data_from_json, load_data_from_psql
from nltk.corpus import stopwords
from pprint import pprint
from tqdm import tqdm_notebook

importlib.reload(transform)

LOGLEVEL = logging.ERROR

logging.basicConfig(level=LOGLEVEL)
warnings.filterwarnings("ignore", category=DeprecationWarning)

tqdm_notebook().pandas()

if 'stopwords' not in os.listdir(nltk.data.find("corpora")):
    nltk.download('stopwords')

STOP_WORDS = stopwords.words('englisgit h')
STOP_WORDS.extend(['gt', 'ymy', 'hi', 'get', 'thi', 'http', 
                   'ha', 'amp', 'nbsp', 'amp_nbsp', 'http', 'https', 'www', 
                   'com', 'message_moderators', 'like', 'also', 'regardless'])
STOP_WORDS.extend(load_filters("body"))
STOP_WORDS = set(STOP_WORDS)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

INFO:root:Blacklsits:data/filtering/body/custom_stopwords.txt





In [3]:
def run(tag, subreddit, num_topics_list=range(3, 13), ngrams=3, use_tfidf=False):
    root_path = "output/"+tag
    os.mkdir(root_path)
    
    # Load data
    observations = load_data_from_json(["AskWomen"], log_level=LOGLEVEL)
    #observations = load_data_from_psql(["AskWomen"], table_name="preprocessed_posts", log_level=LOGLEVEL)
    #observations = observations.sample(1000)
    print(str(observations.head()))
    logging.debug(observations.head())
        
    # Transform data
    observations = transform.transform(observations, ngrams=ngrams, threshold=50, stop_words=STOP_WORDS)
    tokenized_data = list(observations['process_body'].values)
    id2word = corpora.Dictionary(tokenized_data)

    texts = tokenized_data
    corpus = [id2word.doc2bow(text) for text in texts]
    
    # archive texts
    observations[['body', 'process_body']].to_csv(root_path+'/docs.csv', index=False)
                    
    # Word frequency
    word_freq = get_word_freq(id2word, corpus)
    with open(root_path+"/word_frequency.txt", "w") as outfile:
        outfile.write('\n'.join([str(w) for w in word_freq]))
    
    for num_topics in num_topics_list:
        # Make dir
        path = root_path+"/"+str(num_topics)
        os.mkdir(path)
        
        # Model
        lda_model = get_model(corpus, id2word, num_topics, use_tfidf=use_tfidf, use_lsi=False)

        with open(path+"/model.pkl", "wb") as pfile:
            pickle.dump(lda_model, pfile)
            
        # Topic weights
        pprint(lda_model.print_topics())
        with open(path+"/topics.txt", "w") as outfile:
            outfile.write('\n'.join([str(i) for i in lda_model.show_topics()]))
        
        # Save coherence model for metrics
        coherence_model_lda = CoherenceModel(model=lda_model, texts=texts,
            dictionary=id2word, coherence='c_v')
        with open(path+"/coherence_model.pkl", "wb") as pfile:
            pickle.dump(coherence_model_lda, pfile)
        
        # Viz
        vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
        with open(path+"/results.html", "w") as outfile:
            pyLDAvis.save_html(data=vis, fileobj=outfile)
            

######### Model
def get_model(corpus, id2word, num_topics, use_tfidf=False, use_lsi=False, use_hdp=False):

    if use_tfidf:
        tfidf_model = TfidfModel(corpus, id2word=id2word)
        corpus = tfidf_model[corpus]
        
    if use_hdp:
        hdp_model = HdpModel(corpus=corpus, id2word=id2word, chunksize=500,
                            max_chunks=num_topics)
        return hdp_model
    
    if use_lsi:
        lsi_model = LsiModel(corpus=corpus, id2word=id2word, chunksize=1000,
                            num_topics=num_topics)
        return lsi_model

    lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics,
                         update_every=1, chunksize=1000, passes=5, alpha='auto',
                         per_word_topics=True, iterations=35
    )
    return lda_model

######### Debug
def get_word_freq(id2word, corpus, limit=None):
    count = defaultdict(int)
    for cp in corpus:
        for tup in cp:
            count[id2word[tup[0]]] += tup[1]
    if limit==None:
        limit=len(count)
    return sorted(count.items(), reverse=True, key=lambda x: x[1])[:limit]

######### Metrics
def save_coherence_score(topics, tag):
    root_path = "output/{}/".format(tag)
    for i in topics:
        path = root_path+str(i)
        coherence_model = None
        with open(path + "/coherence_model.pkl", "rb") as cfile:
            coherence_model = pickle.load(cfile)
            coherence_score = str(coherence_model.get_coherence())
            print(str(i) + " Topics: " + coherence_score)
            with open(path+"/coherence_score.txt", "w") as outfile:
                outfile.write(coherence_score)


In [4]:
ngrams=3
topics = [35, 37, 40, 43, 45]
now = datetime.now().strftime("%Y_%m_%d_%H%M")
tag = "AskWomen_test_"+now

run(tag, "AskWomen", num_topics_list=topics, ngrams=ngrams, use_tfidf=True)

INFO:root:Observations: 117438 rows, 18 columns
INFO:root:Blacklsits:data/filtering/author/reddit_bots.txt data/filtering/author/suspicious_authors.txt
INFO:root:Removing bots and suspicious authors
INFO:root:Preprocessing


              author author_flair_css_class author_flair_text  \
45928       reagan92                 female                 ♀   
11860      [deleted]                   None              None   
105642  Ray_adverb12                 female                 ♀   
56764     pinkliquor                   None              None   
16698     Shannieann                 female                 ♀   

                                                     body  can_gild  \
45928   Your comment has been removed because:\n\nDisr...      True   
11860                                           [deleted]      True   
105642                              She's not a princess!      True   
56764   Oh my god this is my ex. I would never know if...      True   
16698   1.It's not being a jerk thats what reviews are...      True   

        controversiality  created_utc distinguished edited  gilded       id  \
45928                  0   1497302674     moderator  False       0  ditbk9y   
11860                  0

HBox(children=(IntProgress(value=0, max=902), HTML(value='')))

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO:gensim.models.phrases:collected 19333 word types from a corpus of 17522 words (unigram + bigrams) and 902 sentences
INFO:gensim.models.phrases:using 19333 counts as vocab in Phrases<0 vocab, min_count=5, threshold=50, max_vocab_size=40000000>
INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types





INFO:gensim.models.phrases:collected 19343 word types from a corpus of 17255 words (unigram + bigrams) and 902 sentences
INFO:gensim.models.phrases:using 19343 counts as vocab in Phrases<0 vocab, min_count=5, threshold=50, max_vocab_size=40000000>
INFO:gensim.models.phrases:source_vocab length 19333
INFO:gensim.models.phrases:Phraser built with 23 23 phrasegrams
INFO:gensim.models.phrases:source_vocab length 19343
INFO:gensim.models.phrases:Phraser built with 37 37 phrasegrams
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  observations['process_body'] = _make_ngrams_(data_words_nostops, ngrams, threshold)
INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(4337 unique tokens: ['act_upon_askwomen', 'askwomen',

INFO:gensim.models.ldamodel:optimized alpha [0.02647565, 0.026938368, 0.02645284, 0.026436524, 0.026127674, 0.026341897, 0.026979769, 0.02794347, 0.027108626, 0.026990874, 0.029209135, 0.028661996, 0.027051458, 0.026999721, 0.027428072, 0.02700907, 0.02816415, 0.028328005, 0.026801165, 0.025779862, 0.027213672, 0.02701626, 0.026243683, 0.026273087, 0.02741298, 0.026945718, 0.026574008, 0.027838672, 0.028155519, 0.027342347, 0.026642235, 0.026384419, 0.027116362, 0.026536724, 0.027688876]
INFO:gensim.models.ldamodel:topic #19 (0.026): 0.007*"love" + 0.006*"cheese" + 0.005*"chickens" + 0.005*"spermy" + 0.005*"make" + 0.004*"umm" + 0.004*"unladylike" + 0.004*"eyebrows" + 0.004*"chicken" + 0.004*"drive"
INFO:gensim.models.ldamodel:topic #4 (0.026): 0.005*"eh" + 0.004*"much" + 0.004*"season" + 0.004*"saliva" + 0.003*"owe" + 0.003*"ever" + 0.003*"risk" + 0.003*"care" + 0.003*"cake" + 0.003*"dye"
INFO:gensim.models.ldamodel:topic #17 (0.028): 0.005*"name" + 0.005*"sexist" + 0.004*"theyre" + 0

[(19,
  '0.007*"love" + 0.006*"cheese" + 0.005*"chickens" + 0.005*"spermy" + '
  '0.005*"make" + 0.004*"unladylike" + 0.004*"umm" + 0.004*"eyebrows" + '
  '0.004*"chicken" + 0.004*"drive"'),
 (4,
  '0.005*"eh" + 0.004*"much" + 0.004*"season" + 0.004*"saliva" + 0.003*"owe" + '
  '0.003*"ever" + 0.003*"risk" + 0.003*"care" + 0.003*"dye" + 0.003*"cake"'),
 (23,
  '0.004*"legal" + 0.004*"musk" + 0.004*"cultural" + 0.004*"torpedo" + '
  '0.003*"natural" + 0.003*"target" + 0.003*"swan" + 0.003*"trumpet" + '
  '0.003*"hardcore" + 0.003*"roller"'),
 (22,
  '0.005*"city" + 0.005*"weird" + 0.004*"doesnt" + 0.004*"feel" + 0.004*"show" '
  '+ 0.004*"mostly" + 0.003*"wear" + 0.003*"hobbies" + 0.003*"volunteer" + '
  '0.003*"single"'),
 (5,
  '0.006*"anymore" + 0.006*"doomsday" + 0.005*"fat" + 0.004*"talk" + '
  '0.004*"adults" + 0.004*"embrace" + 0.004*"school" + 0.004*"vacation" + '
  '0.004*"serena" + 0.004*"joy"'),
 (31,
  '0.004*"worth" + 0.004*"awful" + 0.004*"mildly" + 0.004*"cool" + '
  '0.0

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
INFO:gensim.models.tfidfmodel:collecting document frequencies
INFO:gensim.models.tfidfmodel:PROGRESS: processing document #0
INFO:gensim.models.tfidfmodel:calculating IDF weights for 902 documents and 4336 features (15181 matrix non-zeros)
INFO:gensim.models.ldamodel:using autotuned alpha, starting with [0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028, 0.027027028]
INFO:gensim.models.ldamodel:using symmetric eta at 0.0270

INFO:gensim.models.ldamodel:topic #14 (0.025): 0.004*"time" + 0.004*"instead" + 0.004*"anywhere" + 0.004*"event" + 0.004*"see" + 0.004*"disable" + 0.004*"rather" + 0.003*"bum" + 0.003*"good" + 0.003*"one"
INFO:gensim.models.ldamodel:topic #31 (0.027): 0.006*"permit_question_moderator_action" + 0.006*"click_reddit_compose_askwomen" + 0.005*"work" + 0.005*"school" + 0.004*"understand_reason_give_mod" + 0.004*"reddit_reddit_wiki" + 0.004*"askwomen_askwomen" + 0.004*"act_upon_askwomen" + 0.004*"askwomen_reddit" + 0.004*"burpees"
INFO:gensim.models.ldamodel:topic #21 (0.027): 0.006*"treat" + 0.006*"family" + 0.005*"time" + 0.005*"make" + 0.004*"us" + 0.004*"things" + 0.004*"one" + 0.004*"im" + 0.004*"straight" + 0.004*"feel"
INFO:gensim.models.ldamodel:topic #5 (0.027): 0.006*"check" + 0.005*"one" + 0.005*"friends" + 0.004*"hear" + 0.004*"time" + 0.004*"ive" + 0.004*"make" + 0.004*"manipulation" + 0.004*"bar" + 0.004*"search"
INFO:gensim.models.ldamodel:topic diff=0.059127, rho=0.447214
INF

[(25,
  '0.005*"eh" + 0.004*"care" + 0.004*"much" + 0.004*"practice" + 0.003*"dog" + '
  '0.003*"resent" + 0.003*"unfortunate" + 0.003*"cynical" + 0.003*"review" + '
  '0.003*"health"'),
 (14,
  '0.004*"time" + 0.004*"instead" + 0.004*"anywhere" + 0.004*"event" + '
  '0.004*"see" + 0.004*"disable" + 0.004*"rather" + 0.003*"bum" + 0.003*"good" '
  '+ 0.003*"one"'),
 (11,
  '0.007*"feel" + 0.006*"thank" + 0.005*"weird" + 0.005*"right" + 0.004*"much" '
  '+ 0.004*"bad" + 0.004*"dna" + 0.004*"good" + 0.004*"gals" + 0.004*"pal"'),
 (6,
  '0.005*"serious" + 0.005*"bubble" + 0.005*"obviously" + 0.004*"yea" + '
  '0.004*"sexist" + 0.004*"talk" + 0.004*"dudes" + 0.004*"care" + '
  '0.004*"anything" + 0.003*"hair"'),
 (32,
  '0.006*"boob" + 0.006*"dick" + 0.005*"aaw" + 0.004*"name" + 0.004*"saliva" + '
  '0.004*"agree" + 0.004*"mine" + 0.004*"shave" + 0.004*"dance" + 0.004*"fly"'),
 (28,
  '0.004*"annoy" + 0.004*"lab" + 0.004*"lol" + 0.004*"la" + 0.004*"certain" + '
  '0.004*"dive" + 0.003*"dist

INFO:gensim.models.tfidfmodel:collecting document frequencies
INFO:gensim.models.tfidfmodel:PROGRESS: processing document #0
INFO:gensim.models.tfidfmodel:calculating IDF weights for 902 documents and 4336 features (15181 matrix non-zeros)
INFO:gensim.models.ldamodel:using autotuned alpha, starting with [0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025]
INFO:gensim.models.ldamodel:using symmetric eta at 0.025
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online (multi-pass) LDA training, 40 topics, 5 passes over the supplied corpus of 902 documents, updating model once every 902 documents, evaluating perplexity every 902 documents, iterating 35x with a convergence threshold of 0.001000
INFO:gensim.models.

INFO:gensim.models.ldamodel:topic #27 (0.025): 0.025*"askwomen" + 0.021*"reddit" + 0.021*"askwomen_askwomen" + 0.021*"askwomen_reddit" + 0.021*"understand_reason_give_mod" + 0.021*"act_upon_askwomen" + 0.021*"reddit_reddit_wiki" + 0.017*"reddit_compose_askwomen" + 0.017*"question_moderator_action_click" + 0.009*"question"
INFO:gensim.models.ldamodel:topic #30 (0.025): 0.005*"happen" + 0.005*"cheat" + 0.005*"leave" + 0.004*"friendship" + 0.004*"friends" + 0.004*"person" + 0.004*"almost" + 0.004*"notice" + 0.004*"im" + 0.004*"always"
INFO:gensim.models.ldamodel:topic diff=0.070852, rho=0.447214
INFO:gensim.models.ldamodel:-15.394 per-word bound, 43045.6 perplexity estimate based on a held-out corpus of 902 documents with 3120 words
INFO:gensim.models.ldamodel:PROGRESS: pass 4, at document #902/902
INFO:gensim.models.ldamodel:optimized alpha [0.02436665, 0.023554089, 0.023580572, 0.024586584, 0.024990175, 0.024440972, 0.023337279, 0.025100194, 0.024188973, 0.02390967, 0.02302057, 0.022764

[(39,
  '0.005*"cloud" + 0.004*"point" + 0.004*"flower" + 0.004*"dna" + 0.004*"love" '
  '+ 0.004*"queue" + 0.004*"pronounce" + 0.004*"booty" + 0.004*"im" + '
  '0.003*"rip"'),
 (11,
  '0.006*"sith" + 0.006*"dog" + 0.005*"bucket" + 0.005*"perfect" + '
  '0.005*"treat" + 0.004*"wear" + 0.004*"thank" + 0.004*"find" + '
  '0.003*"brando" + 0.003*"cool"'),
 (18,
  '0.005*"rosaline" + 0.005*"awful" + 0.005*"scrub" + 0.004*"monster" + '
  '0.004*"coasters" + 0.004*"work" + 0.004*"fantastic" + 0.004*"terrible" + '
  '0.004*"turn" + 0.004*"finger"'),
 (35,
  '0.005*"work" + 0.005*"puppy" + 0.005*"lab" + 0.005*"one" + 0.004*"gay" + '
  '0.004*"certain" + 0.004*"netflix" + 0.004*"vacation" + 0.004*"bleach" + '
  '0.004*"cringey"'),
 (36,
  '0.005*"yea" + 0.005*"collection" + 0.005*"sex" + 0.004*"dvd" + '
  '0.004*"asshole" + 0.004*"time" + 0.004*"game" + 0.004*"first" + '
  '0.003*"comedy" + 0.003*"care"'),
 (10,
  '0.005*"together" + 0.005*"great" + 0.004*"price" + 0.004*"red" + '
  '0.004*"yea

INFO:gensim.models.tfidfmodel:collecting document frequencies
INFO:gensim.models.tfidfmodel:PROGRESS: processing document #0
INFO:gensim.models.tfidfmodel:calculating IDF weights for 902 documents and 4336 features (15181 matrix non-zeros)
INFO:gensim.models.ldamodel:using autotuned alpha, starting with [0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814, 0.023255814]
INFO:gensim.models.ldamodel:using symmetric eta at 0.023255813953488372
INFO:gensim.models.ldamodel:using serial LDA version on this no

INFO:gensim.models.ldamodel:topic #7 (0.022): 0.007*"dealbreaker" + 0.005*"man" + 0.004*"agree" + 0.004*"certain" + 0.004*"mine" + 0.004*"honest" + 0.004*"nothing" + 0.004*"scandinavian" + 0.004*"promotional" + 0.004*"im"
INFO:gensim.models.ldamodel:topic #28 (0.023): 0.010*"askwomen" + 0.008*"act_upon_askwomen" + 0.008*"understand_reason_give_mod" + 0.008*"reddit_reddit_wiki" + 0.008*"askwomen_askwomen" + 0.008*"askwomen_reddit" + 0.007*"reddit" + 0.007*"question" + 0.007*"answer" + 0.007*"question_moderator_action_click"
INFO:gensim.models.ldamodel:topic #9 (0.023): 0.005*"base" + 0.004*"big" + 0.004*"sense" + 0.004*"plant" + 0.004*"pony" + 0.004*"line" + 0.004*"youre" + 0.004*"rewatching" + 0.004*"close" + 0.003*"time"
INFO:gensim.models.ldamodel:topic #27 (0.024): 0.010*"love" + 0.007*"name" + 0.007*"time" + 0.007*"remember" + 0.006*"years" + 0.005*"life" + 0.005*"cant" + 0.005*"one" + 0.005*"hide" + 0.004*"call"
INFO:gensim.models.ldamodel:topic diff=0.068149, rho=0.447214
INFO:ge

[(6,
  '0.007*"dick" + 0.006*"mildly" + 0.005*"peanut" + 0.005*"saliva" + '
  '0.004*"butter" + 0.004*"miss" + 0.004*"cheater" + 0.003*"feet" + '
  '0.003*"put" + 0.003*"social"'),
 (7,
  '0.007*"dealbreaker" + 0.005*"man" + 0.004*"agree" + 0.004*"certain" + '
  '0.004*"mine" + 0.004*"honest" + 0.004*"nothing" + 0.004*"scandinavian" + '
  '0.004*"promotional" + 0.004*"theres"'),
 (16,
  '0.007*"sith" + 0.006*"flower" + 0.006*"hm" + 0.005*"weakest" + '
  '0.005*"imgur" + 0.005*"dive" + 0.004*"shortness" + 0.004*"unattractive" + '
  '0.004*"ginger" + 0.004*"oriental"'),
 (17,
  '0.007*"ignore" + 0.006*"dog" + 0.006*"hat" + 0.005*"embrace" + '
  '0.005*"tarot" + 0.005*"vacation" + 0.004*"bath" + 0.004*"everyone" + '
  '0.004*"im" + 0.004*"rip"'),
 (12,
  '0.007*"purple" + 0.007*"ask" + 0.006*"ouch" + 0.006*"recipe" + '
  '0.005*"question" + 0.005*"hoodie" + 0.005*"oral" + 0.004*"thread" + '
  '0.004*"want" + 0.004*"bartender"'),
 (34,
  '0.006*"princess" + 0.005*"zookeepers" + 0.005*"mate

INFO:gensim.models.tfidfmodel:collecting document frequencies
INFO:gensim.models.tfidfmodel:PROGRESS: processing document #0
INFO:gensim.models.tfidfmodel:calculating IDF weights for 902 documents and 4336 features (15181 matrix non-zeros)
INFO:gensim.models.ldamodel:using autotuned alpha, starting with [0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223, 0.022222223]
INFO:gensim.models.ldamodel:using symmetric eta at 0.022222222222222223
INFO:gensim.models.ldamodel:using ser

INFO:gensim.models.ldamodel:topic #38 (0.021): 0.006*"pony" + 0.005*"make" + 0.005*"chickens" + 0.005*"family" + 0.005*"us" + 0.005*"read" + 0.004*"embarrass" + 0.004*"eat" + 0.004*"men" + 0.004*"chicken"
INFO:gensim.models.ldamodel:topic #30 (0.022): 0.005*"big" + 0.005*"plant" + 0.005*"base" + 0.005*"username" + 0.005*"sock" + 0.004*"care" + 0.004*"plan" + 0.004*"butter" + 0.004*"dick" + 0.004*"breast"
INFO:gensim.models.ldamodel:topic #25 (0.022): 0.005*"day" + 0.005*"even" + 0.004*"listen" + 0.004*"miss" + 0.004*"another" + 0.004*"coffee" + 0.004*"close" + 0.004*"rewatching" + 0.004*"mildly" + 0.003*"isnt"
INFO:gensim.models.ldamodel:topic #8 (0.026): 0.012*"work" + 0.009*"time" + 0.009*"id" + 0.008*"make" + 0.008*"want" + 0.008*"never" + 0.008*"back" + 0.007*"still" + 0.007*"take" + 0.007*"even"
INFO:gensim.models.ldamodel:topic diff=0.085871, rho=0.447214
INFO:gensim.models.ldamodel:-15.729 per-word bound, 54304.4 perplexity estimate based on a held-out corpus of 902 documents wi

[(42,
  '0.006*"cholera" + 0.006*"overrate" + 0.006*"price" + 0.005*"treat" + '
  '0.005*"drink" + 0.005*"ye" + 0.005*"kinsey" + 0.004*"career" + '
  '0.004*"distance" + 0.004*"scale"'),
 (28,
  '0.007*"eat" + 0.005*"design" + 0.005*"beautiful" + 0.005*"umm" + '
  '0.005*"unladylike" + 0.004*"comfortable" + 0.004*"gush" + 0.004*"wonderful" '
  '+ 0.004*"assume" + 0.003*"effort"'),
 (6,
  '0.007*"dealbreaker" + 0.007*"current" + 0.005*"recipe" + 0.005*"thread" + '
  '0.004*"bump" + 0.004*"dip" + 0.003*"crunchy" + 0.003*"apple" + '
  '0.003*"fodder" + 0.003*"kcdx"'),
 (38,
  '0.006*"pony" + 0.005*"make" + 0.005*"chickens" + 0.005*"family" + '
  '0.005*"us" + 0.004*"embarrass" + 0.004*"read" + 0.004*"chicken" + '
  '0.004*"eat" + 0.004*"men"'),
 (44,
  '0.008*"purple" + 0.008*"course" + 0.008*"cowgirl" + 0.006*"la" + '
  '0.005*"exaggeration" + 0.005*"carpet" + 0.005*"certain" + 0.004*"talkative" '
  '+ 0.004*"barely" + 0.004*"preset"'),
 (39,
  '0.007*"manipulation" + 0.006*"every" + 0.0

In [46]:
save_coherence_score(topics, tag)
# [0.70, 0.67, 0.65, 0.64, 0.58, 0.52, 0.55, 0.51, 0.48, 0.49]

35 Topics: 0.4743048072504317
37 Topics: 0.47512955371823873
40 Topics: 0.48657291510407213
43 Topics: 0.48799713097491143
45 Topics: 0.49126764646996074
