# Build Dictionary & Corpus

In [58]:
import pickle
import random
import re

import gensim
import pandas as pd
from gensim.models.coherencemodel import CoherenceModel
from IPython.display import HTML, Markdown, display
from tabulate import tabulate
from tqdm import tqdm_notebook

tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [59]:
import spacy
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])

skip = {'$', 'CD'}
def tokenise(text):
    if type(text) != str:
        return []
    text = text.replace('\n\n', '.').replace('\n', '.').replace('!', '.').replace('?', '.').replace('.', '. ')
    while '  ' in text:
        text = text.replace('  ', ' ')
    doc = nlp(text, disable=["parser", "ner", "textcat"])
    tokens = []
    for token in doc:
        if (not token.is_punct and token.tag_ not in skip):
            if token.lemma_ == '-PRON-':
                tokens.append(token.text.lower())
            else:
                tokens.append(token.lemma_)
    return tokens

In [60]:
def tokenise(text):
    skip = {'$', 'CD', "POS"}
    if type(text) != str:
        return []
    text = text.replace('\n\n', '.').replace('\n', '.').replace('.', '. ')
    while '  ' in text:
        text = text.replace('  ', ' ')
    doc = nlp(text, disable=["parser", "ner", "textcat"])
    tokens = []
    for token in doc:
        if (not token.is_stop and not token.is_punct and token.tag_ not in skip and len(token) > 2):
            if token.lemma_ == '-PRON-':
                tokens.append(token.text.lower())
            else:
                tokens.append(token.lemma_)
    return tokens

In [125]:
business_name = "In-N-Out Burger"
df = pd.read_parquet("data/" + business_name + '_reviews.parquet')
#tokens = df.text.progress_apply(gensim.parsing.preprocess_string)
tokens = df.text.progress_apply(tokenise)
tokens = list(tokens)

HBox(children=(IntProgress(value=0, max=6919), HTML(value='')))

In [126]:
dictionary = gensim.corpora.Dictionary(tokens)
dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=None)

corpus = [0] * len(tokens)

for i in tqdm_notebook(range(len(tokens))):
    corpus[i] = dictionary.doc2bow(tokens[i])

HBox(children=(IntProgress(value=0, max=6919), HTML(value='')))

# Model Building & Evaluation

In [74]:
list_num_topics = [2, 4, 6, 8, 10]
list_passes = [10, 15, 20, 25, 30]
done = set()

results = []
for i in range(len(list_passes)):
    results.append([str(list_passes[i]) + " passes"] + [" "] * len(list_num_topics))

In [75]:
# while len(done) < len(list_num_topics) * len(list_passes) * 0.8:
#     topics_index = random.choice(range(len(list_num_topics)))
#     passes_index = random.choice(range(len(list_passes)))
for topics_index in tqdm_notebook(range(len(list_num_topics))):
    for passes_index in range(len(list_passes)):
        num_topics = list_num_topics[topics_index]
        passes = list_passes[passes_index]
#         tup = (num_topics, passes)
#         if tup in done:
#             continue
        coherence = 0
        num_tries = 1
        for i in range(num_tries):
            lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                                id2word=dictionary,
                                                                num_topics=num_topics, 
                                                                random_state=i,
                                                                chunksize=500,
                                                                passes=passes,
                                                                workers=7,
                                                                per_word_topics=True)
            cm = CoherenceModel(model=lda_model, texts=tokens, dictionary=dictionary, coherence='c_v')
            coherence += cm.get_coherence()
#         done.add(tup)
        coherence = round(coherence / num_tries, 5)
        print("num topics:", num_topics, ", passes:", passes, ":", coherence)
        results[passes_index][topics_index+1] = coherence

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

num topics: 2 , passes: 10 : 0.40033
num topics: 2 , passes: 15 : 0.39726
num topics: 2 , passes: 20 : 0.40255
num topics: 2 , passes: 25 : 0.40255
num topics: 2 , passes: 30 : 0.41561
num topics: 4 , passes: 10 : 0.43407
num topics: 4 , passes: 15 : 0.44121
num topics: 4 , passes: 20 : 0.45252
num topics: 4 , passes: 25 : 0.45827
num topics: 4 , passes: 30 : 0.4627
num topics: 6 , passes: 10 : 0.46261
num topics: 6 , passes: 15 : 0.48628
num topics: 6 , passes: 20 : 0.48428
num topics: 6 , passes: 25 : 0.49948
num topics: 6 , passes: 30 : 0.50265
num topics: 8 , passes: 10 : 0.45979
num topics: 8 , passes: 15 : 0.47176
num topics: 8 , passes: 20 : 0.47287
num topics: 8 , passes: 25 : 0.47892
num topics: 8 , passes: 30 : 0.48082
num topics: 10 , passes: 10 : 0.45708
num topics: 10 , passes: 15 : 0.46277
num topics: 10 , passes: 20 : 0.47434
num topics: 10 , passes: 25 : 0.48171
num topics: 10 , passes: 30 : 0.48636


In [76]:
display(Markdown("## " + business_name))
display(HTML(tabulate(results, tablefmt='html', headers=["No. Pass"] + [str(x) + ' topics' for x in list_num_topics])))

## In-N-Out Burger

No. Pass,2 topics,4 topics,6 topics,8 topics,10 topics
10 passes,0.40033,0.43407,0.46261,0.45979,0.45708
15 passes,0.39726,0.44121,0.48628,0.47176,0.46277
20 passes,0.40255,0.45252,0.48428,0.47287,0.47434
25 passes,0.40255,0.45827,0.49948,0.47892,0.48171
30 passes,0.41561,0.4627,0.50265,0.48082,0.48636


# Display Model Topics

In [127]:
for num_topics in [6, 8, 10]:
    lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                        id2word=dictionary,
                                                        num_topics=num_topics, 
                                                        random_state=1,
                                                        chunksize=500,
                                                        passes=30,
                                                        workers=7,
                                                        per_word_topics=True)

    results = []
    topic_num = 1
    for _, text in lda_model.print_topics():
        results.append(["Topic " + str(topic_num)] + [re.findall('[a-z]+', x)[0] for x in text.split(' + ')])
        topic_num += 1
    
    dist = {}
    
    for c in corpus:
        topic_distribution = lda_model.get_document_topics(c)
        for topic_index, distribution in topic_distribution:
            if topic_index not in dist:
                dist[topic_index] = 0
            dist[topic_index] += distribution
    
    for topic_index, distribution in dist.items():
        results[topic_index].append("%.2f" % (distribution * 100 / len(corpus)) + "%")
        
    display(Markdown("# " + business_name + ": " + str(num_topics) + " topics"))
    display(HTML(tabulate(results, tablefmt="html", headers=["Topic #"] + ["Term " + str(x) for x in range(1, 11)] + ["Dist."])))

# In-N-Out Burger: 6 topics

Topic #,Term 1,Term 2,Term 3,Term 4,Term 5,Term 6,Term 7,Term 8,Term 9,Term 10,Dist.
Topic 1,like,taste,fast,shake,eat,try,fresh,think,have,time,16.51%
Topic 2,vegas,location,fast,this,have,time,strip,fresh,visit,menu,13.16%
Topic 3,service,great,friendly,location,staff,clean,fast,this,busy,love,19.64%
Topic 4,style,double,animal,onion,cheese,grill,order,menu,shake,secret,20.19%
Topic 5,line,wait,long,drive,love,fast,time,eat,like,order,13.63%
Topic 6,order,time,get,go,drive,location,come,like,take,wait,15.64%


# In-N-Out Burger: 8 topics

Topic #,Term 1,Term 2,Term 3,Term 4,Term 5,Term 6,Term 7,Term 8,Term 9,Term 10,Dist.
Topic 1,like,taste,bun,patty,meat,fresh,eat,shake,lettuce,try,10.07%
Topic 2,location,time,strip,wait,vegas,close,this,fresh,have,get,7.50%
Topic 3,service,great,friendly,location,staff,clean,fast,this,busy,love,18.24%
Topic 4,style,animal,double,onion,cheese,grill,order,extra,shake,menu,15.34%
Topic 5,line,wait,long,love,drive,fast,order,eat,fresh,like,10.45%
Topic 6,order,drive,go,time,get,location,take,come,wait,like,13.22%
Topic 7,vegas,coast,double,time,west,east,try,eat,animal,come,10.36%
Topic 8,fast,menu,like,fresh,price,secret,quality,have,chain,eat,12.74%


# In-N-Out Burger: 10 topics

Topic #,Term 1,Term 2,Term 3,Term 4,Term 5,Term 6,Term 7,Term 8,Term 9,Term 10,Dist.
Topic 1,like,taste,bun,patty,fresh,meat,lettuce,shake,sauce,eat,7.84%
Topic 2,time,have,be,vegas,visit,love,location,eat,live,california,5.40%
Topic 3,service,great,friendly,fast,staff,clean,location,customer,employee,this,14.90%
Topic 4,animal,style,double,shake,vegas,cheese,menu,get,order,secret,7.54%
Topic 5,love,fresh,animal,style,menu,fast,try,simple,eat,shake,9.94%
Topic 6,order,go,get,time,drive,come,take,like,ask,eat,11.49%
Topic 7,coast,west,east,time,try,know,eat,visit,this,come,6.19%
Topic 8,fast,like,menu,people,have,chain,eat,think,joint,price,9.86%
Topic 9,location,line,wait,long,drive,busy,this,time,order,get,13.42%
Topic 10,double,style,animal,onion,grill,order,cheese,extra,protein,like,10.73%


# Building Word2Vec Model

## Building model from reviews

In [2]:
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases

In [125]:
df1 = pd.read_parquet("data/Shake Shack_reviews.parquet")
df2 = pd.read_parquet("data/In-N-Out Burger_reviews.parquet")
df3 = pd.read_parquet("data/The Cheesecake Factory_reviews.parquet")

In [135]:
len(tokens)

18314

In [132]:
tokens = df1.text.append(df2.text).append(df3.text).progress_apply(tokenise)
tokens = list(tokens)

HBox(children=(IntProgress(value=0, max=18314), HTML(value='')))




In [133]:
# common_terms = ["of", "with", "without", "and", "or", "the", "a"]
# Create the relevant phrases from the list of sentences:
phrases = Phrases(tokens)
# The Phraser object is used from now on to transform sentences
bigram = Phraser(phrases)
# Applying the Phraser to transform our sentences is simply
new_tokens = list(bigram[tokens])

In [136]:
model = gensim.models.Word2Vec(
    new_tokens,
    sg=1,
    size=150,
    window=7,
    min_count=3,
    workers=7)
model.train(new_tokens, total_examples=len(new_tokens), epochs=20)

(23923760, 36534080)

In [164]:
model.save("models/word2vec_3biz")

In [138]:
model.wv.most_similar(positive='service')

[('customer_service', 0.6847555637359619),
 ('staff', 0.5711835026741028),
 ('dante', 0.5680903792381287),
 ('tentative', 0.5398862361907959),
 ('heather', 0.5392563343048096),
 ('brenda', 0.5234328508377075),
 ('consistant', 0.518193244934082),
 ('food', 0.5143879652023315),
 ('absolutly', 0.5086746215820312),
 ('ethic', 0.5067006349563599)]

In [144]:
model.wv.save_word2vec_format("models/w2v.txt")

In [143]:
model.wv.save_word2vec_format('models/w2v', binary=True)

In [160]:
nlp = spacy.blank('en')

keys = []
for idx in range(len(model.wv.vocab)):
    keys.append(model.wv.index2word[idx])

nlp.vocab.vectors = spacy.vocab.Vectors(data=model.wv.syn0, keys=keys)

  import sys


## Loading model from disk

In [128]:
model = Word2Vec.load('models/w2v.obj')

In [129]:
model.wv.most_similar(positive=['service'])

[('service-', 0.7543092966079712),
 ('Service', 0.7183818817138672),
 ('sevice', 0.678348183631897),
 ('waitstaff', 0.6351033449172974),
 ('staff', 0.6129365563392639),
 ('serivce', 0.5980651378631592),
 ('-service', 0.5541418194770813),
 ('Waitstaff', 0.5533532500267029),
 ('Server', 0.5380331873893738),
 ('SERVICE', 0.5359606742858887)]

In [18]:
nlp = spacy.blank('en')

keys = []
for idx in range(len(model.wv.vocab)):
    keys.append(model.wv.index2word[idx])

nlp.vocab.vectors = spacy.vocab.Vectors(data=model.wv.syn0, keys=keys)

  import sys


In [20]:
nlp('reasonably priced')

reasonably priced

In [15]:
bigram = Phraser.load('models/bigram.obj')

In [55]:
list(bigram[["Customer", "Service"]])

TypeError: 'float' object is not subscriptable