# Build Dictionary & Corpus

In [1]:
import pickle
import random

import gensim
import pandas as pd
from gensim.models.coherencemodel import CoherenceModel
from IPython.display import HTML, Markdown, display
from tabulate import tabulate
from tqdm import tqdm_notebook

tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))






In [45]:
business_name = "In-N-Out Burger"
df = pd.read_parquet("data/" + business_name + '_reviews.parquet')
tokens = df.text.progress_apply(gensim.parsing.preprocess_string)
tokens = list(tokens)

HBox(children=(IntProgress(value=0, max=6919), HTML(value='')))




In [46]:
dictionary = gensim.corpora.Dictionary(tokens)
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=None)

corpus = [0] * len(tokens)

for i in tqdm_notebook(range(len(tokens))):
    corpus[i] = dictionary.doc2bow(tokens[i])

HBox(children=(IntProgress(value=0, max=6919), HTML(value='')))




# Model Building & Evaluation

In [47]:
list_num_topics = [3, 4, 5, 6, 7, 8]
list_passes = [15, 20, 25]
done = set()

results = []
for i in range(len(list_passes)):
    results.append([" "] * len(list_num_topics))

In [48]:
while len(done) < len(list_num_topics) * len(list_passes) * 0.8:
    topics_index = random.choice(range(len(list_num_topics)))
    passes_index = random.choice(range(len(list_passes)))
    num_topics = list_num_topics[topics_index]
    passes = list_passes[passes_index]
    tup = (num_topics, passes)
    if tup in done:
        continue
    coherence = 0
    num_tries = 1
    for i in range(num_tries):
        lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                            id2word=dictionary,
                                                            num_topics=num_topics, 
                                                            random_state=i,
                                                            chunksize=500,
                                                            passes=passes,
                                                            workers=7,
                                                            per_word_topics=True)
        cm = CoherenceModel(model=lda_model, texts=tokens, dictionary=dictionary, coherence='c_v')
        coherence += cm.get_coherence()
    done.add(tup)
    coherence = round(coherence / num_tries, 5)
    print("num topics:", num_topics, ", passes:", passes, ":", coherence)
    results[passes_index][topics_index] = coherence

num topics: 6 , passes: 25 : 0.48777
num topics: 4 , passes: 25 : 0.48422
num topics: 8 , passes: 20 : 0.47927
num topics: 3 , passes: 25 : 0.43387
num topics: 4 , passes: 15 : 0.43359
num topics: 8 , passes: 25 : 0.49326
num topics: 5 , passes: 20 : 0.49687
num topics: 6 , passes: 20 : 0.48972
num topics: 4 , passes: 20 : 0.46966
num topics: 6 , passes: 15 : 0.48026
num topics: 8 , passes: 15 : 0.47088
num topics: 5 , passes: 25 : 0.50537
num topics: 7 , passes: 20 : 0.47495
num topics: 7 , passes: 25 : 0.49342
num topics: 3 , passes: 15 : 0.42102


In [49]:
display(Markdown("## " + business_name))
display(HTML(tabulate(results, tablefmt='html', headers=[str(x) + ' topics' for x in list_num_topics])))

## In-N-Out Burger

3 topics,4 topics,5 topics,6 topics,7 topics,8 topics
0.42102,0.43359,,0.48026,,0.47088
,0.46966,0.49687,0.48972,0.47495,0.47927
0.43387,0.48422,0.50537,0.48777,0.49342,0.49326


# Display Model Topics

In [20]:
import re

In [33]:
for num_topics, num_passes in [(3, 20), (5, 20), (8, 20)]:
    lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                        id2word=dictionary,
                                                        num_topics=num_topics, 
                                                        random_state=1,
                                                        chunksize=500,
                                                        passes=num_passes,
                                                        workers=7,
                                                        per_word_topics=True)

    results = []
    for _, text in lda_model.print_topics():
        results.append([re.findall('[a-z]+', x)[0] for x in text.split(' + ')])
    display(Markdown("# " + business_name + ": " + str(num_passes) + " passes, " + str(num_topics) + " topics"))
    display(HTML(tabulate(results, tablefmt="html", headers=["Term " + str(x) for x in range(1, 11)])))

# The Cheesecake Factory: 20 passes, 3 topics

Term 1,Term 2,Term 3,Term 4,Term 5,Term 6,Term 7,Term 8,Term 9,Term 10
order,wait,time,minut,ask,tabl,came,server,manag,servic
great,good,servic,factori,menu,time,place,restaur,wait,locat
chicken,good,order,salad,like,chees,pasta,factori,sauc,fri


# The Cheesecake Factory: 20 passes, 5 topics

Term 1,Term 2,Term 3,Term 4,Term 5,Term 6,Term 7,Term 8,Term 9,Term 10
great,servic,good,place,factori,time,wait,locat,like,restaur
menu,factori,good,restaur,time,love,great,like,order,servic
chicken,good,order,salad,chees,like,pasta,fri,sauc,factori
order,time,server,manag,ask,servic,came,factori,locat,like
wait,minut,tabl,time,order,seat,ask,came,hour,server


# The Cheesecake Factory: 20 passes, 8 topics

Term 1,Term 2,Term 3,Term 4,Term 5,Term 6,Term 7,Term 8,Term 9,Term 10
great,servic,good,place,factori,amaz,love,best,like,experi
menu,factori,good,restaur,great,love,time,servic,locat,like
chicken,good,order,pasta,salad,love,sauc,great,dish,factori
order,time,server,locat,servic,manag,ask,factori,came,drink
tabl,wait,ask,minut,came,manag,order,said,server,time
order,good,cake,chocol,like,got,pasta,bread,slice,time
order,like,good,salad,burger,fri,chicken,steak,got,meal
wait,time,hour,long,minut,good,order,seat,servic,place
