# Build Corpus

In [1]:
import pandas as pd
import pickle
import spacy
from tqdm import tqdm_notebook

# Hook tqdm into pandas so that `progress_apply` is available on Series/DataFrames.
tqdm_notebook().pandas()

# Small English spaCy pipeline with the parser, NER and text-classifier components disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])


def _in_stop_word_list(s):
    """Return True when the lower-cased string is in spaCy's English stop-word set."""
    return s.lower() in spacy.lang.en.stop_words.STOP_WORDS


# Recompute the IS_STOP lexical flag from the lower-cased form of each string,
# so the stop-word check is case-insensitive.
nlp.vocab.add_flag(_in_stop_word_list, spacy.attrs.IS_STOP)

import gensim

from gensim.models.coherencemodel import CoherenceModel

# skip = {'$', 'CD', "POS", "BES", "ADD", "DT", "HVS"}

# def tokenise(text):
#     if type(text) != str:
#         return []
#     text = text.replace("\n\n", ".").replace("\n", ".").replace('.', '. ')
#     while "  " in text:
#         text = text.replace("  ", " ")
#     doc = nlp(text, disable=["parser", "ner", "textcat"])
#     tokens = [token.lemma_ for token in doc if (not token.is_stop and not token.is_punct and token.tag_ not in skip)]
#     return tokens

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))






In [17]:
# Pick the business to analyse and read its reviews from the matching parquet file.
business_name = "In-n-Out Burger"
reviews_path = "data/" + business_name + '_reviews.parquet'
df = pd.read_parquet(reviews_path)

# Run gensim's preprocess_string over every review text (with a progress bar)
# and keep the per-review token lists as a plain Python list.
tokens = list(df["text"].progress_apply(gensim.parsing.preprocess_string))

HBox(children=(IntProgress(value=0, max=6919), HTML(value='')))




In [61]:
# dictionary = gensim.corpora.Dictionary.load("processed_data/dictionary")
# dictionary = gensim.corpora.Dictionary.load("processed_data/" + business_name + "_dictionary")
# dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n = None)

In [18]:
# Build the vocabulary from the tokenised reviews, then prune it with
# filter_extremes(no_below=10, no_above=0.5) without capping its size (keep_n=None).
dictionary = gensim.corpora.Dictionary(tokens)
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=None)
# To persist the dictionary:
# dictionary.save("processed_data/"+business_name+"_dictionary")

# One bag-of-words vector per review, in the same order as `tokens`.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tqdm_notebook(tokens)]

# To persist the corpus:
# with open("processed_data/"+business_name+"_corpus.pkl", "wb") as f:
#     pickle.dump(corpus, f)

HBox(children=(IntProgress(value=0, max=6919), HTML(value='')))




In [63]:
# dictionary = gensim.corpora.Dictionary.load("processed_data/" + business_name + "_dictionary")
# with open("processed_data/" + business_name + "_corpus.pkl", "rb") as f:
#     corpus = pickle.load(f)

# Model Building & Evaluation

In [11]:
import random
from tabulate import tabulate
from IPython.display import display, HTML, Markdown

In [19]:
# Hyper-parameter grid: candidate topic counts and candidate numbers of training passes.
list_num_topics = [2, 3, 5, 8, 10]
list_passes = [1, 5, 10, 15, 20]

# (num_topics, passes) pairs that have already been evaluated.
done = set()

# Coherence table: one row per entry of `list_passes`, one column per entry of
# `list_num_topics`. Every cell starts as the placeholder " ".
results = [[" "] * len(list_num_topics) for _ in list_passes]

In [None]:
# Random search: keep drawing (num_topics, passes) pairs until 60% of the grid
# has been scored. Pairs already in `done` are skipped, so re-running this cell
# continues from where a previous run stopped.
target_count = len(list_num_topics) * len(list_passes) * 0.6
num_tries = 1  # number of models (seeds 0..num_tries-1) averaged per configuration

while len(done) < target_count:
    topics_index = random.randrange(len(list_num_topics))
    passes_index = random.randrange(len(list_passes))
    num_topics, passes = list_num_topics[topics_index], list_passes[passes_index]
    if (num_topics, passes) in done:
        continue

    total_coherence = 0
    for seed in range(num_tries):
        lda_model = gensim.models.ldamulticore.LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=seed,
            chunksize=500,
            passes=passes,
            workers=3,
            per_word_topics=True,
        )
        # c_v coherence of the trained model, computed against the tokenised reviews.
        cm = CoherenceModel(model=lda_model, texts=tokens, dictionary=dictionary, coherence='c_v')
        total_coherence += cm.get_coherence()

    done.add((num_topics, passes))
    coherence = round(total_coherence / num_tries, 5)
    print("num topics:", num_topics, ", passes:", passes, ":", coherence)
    # Rows are indexed by passes, columns by topic count.
    results[passes_index][topics_index] = coherence

num topics: 5 , passes: 1 : 0.40685
num topics: 3 , passes: 1 : 0.39361
num topics: 3 , passes: 10 : 0.48892
num topics: 2 , passes: 1 : 0.37219
num topics: 5 , passes: 15 : 0.51645
num topics: 3 , passes: 20 : 0.52528
num topics: 2 , passes: 10 : 0.47263


In [None]:
# Render the coherence grid as an HTML table under a heading with the business name.
# Each row of `results` belongs to one entry of `list_passes` and each column to one
# entry of `list_num_topics`; cells still holding " " were never evaluated.
display(Markdown("## " + business_name))

# Label the rows as well as the columns: without a leading "passes" column the
# table does not say which number of passes each row corresponds to.
column_headers = ["passes"] + [str(x) + ' topics' for x in list_num_topics]
labelled_rows = [[n_passes] + list(row) for n_passes, row in zip(list_passes, results)]
display(HTML(tabulate(labelled_rows, tablefmt='html', headers=column_headers)))

In [104]:
# Train the model that the later cells save and inspect: 5 topics, 10 passes, fixed seed.
final_lda_params = dict(
    num_topics=5,
    random_state=1,
    chunksize=500,
    passes=10,
    workers=3,
    per_word_topics=True,
)
lda_model = gensim.models.ldamulticore.LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    **final_lda_params
)

In [6]:
import os

# Persist the trained LDA model under models/lda/<business name>.
# Saving does not create missing directories, so make sure the target folder
# exists first; otherwise this cell fails on a fresh checkout.
os.makedirs('models/lda/', exist_ok=True)
lda_model.save('models/lda/' + business_name)

NameError: name 'model' is not defined

# Display Model Topics

In [67]:
import re

In [68]:
# To inspect a previously saved model instead of the one trained above:
# business_name = "Shake Shack"
# lda_model = gensim.models.ldamulticore.LdaMulticore.load('models/lda/' + business_name)

# Print the top words of each topic as a plain list.
# The words are read directly from show_topics(formatted=False), which yields
# (topic_id, [(word, weight), ...]) pairs. Regex-parsing the formatted
# '0.020*"word" + ...' string with '[a-z]+' would truncate any token containing
# other characters and raise IndexError for a token without a-z letters.
# num_topics=20 and num_words=10 mirror the defaults of print_topics().
for _, word_weights in lda_model.show_topics(num_topics=20, num_words=10, formatted=False):
    print([word for word, _ in word_weights])