In [1]:
import pandas as pd 
import numpy as np
import json
import re
from tqdm._tqdm_notebook import tqdm_notebook

# Load Data

The data from Yelp reviews is in the files `review.json` and `business.json`, which were downloaded from the following link (the link also contains detailed documentation on the structure and format of the dataset):

https://www.yelp.com/dataset/documentation/json

In [5]:
# Directory prefix that is prepended to the Yelp JSON file names when they are opened.
PATH = '../data/'

In [6]:
# review.json is newline-delimited JSON: one review object per line.
# The encoding is stated explicitly so the load does not depend on the
# platform's default text encoding (review text contains non-ASCII characters).
with open(PATH + 'review.json', encoding='utf-8') as f:
    review_data = pd.DataFrame(json.loads(line) for line in f)

KeyboardInterrupt: 

In [None]:
# business.json is newline-delimited JSON: one business object per line.
# The encoding is stated explicitly so the load does not depend on the
# platform's default text encoding.
with open(PATH + 'business.json', encoding='utf-8') as f:
    business_data = pd.DataFrame(json.loads(line) for line in f)

The reviews are related to several different types of businesses - we first filter the reviews down to only those related to restaurants.

In [167]:
# Stringify each categories entry, then keep the businesses whose
# category text mentions 'Restaurants'.
is_restaurant = business_data['categories'].apply(str).str.contains('Restaurants')
rest_biz = business_data[is_restaurant]

In [168]:
# Remove the business-level 'stars' column.
# NOTE(review): presumably so it cannot clash with the per-review 'stars'
# column when the two frames are merged — confirm.
rest_biz = rest_biz.drop('stars', axis=1)

In [169]:
# Order restaurants from most-reviewed to least-reviewed.
rest_biz = rest_biz.sort_values(by='review_count', ascending=False)

In [170]:
# Total restaurant review count per state, largest first.
reviews_per_state = rest_biz.groupby(by=['state'])['review_count'].sum()
reviews_per_state.sort_values(ascending=False)

state
NV     949953
AZ     837216
ON     414411
NC     180487
OH     154726
PA     143283
QC      98978
WI      69050
BW      24934
EDH     23747
IL      22186
SC       5981
MLN      1101
HLD       589
C         168
ELN       117
FIF       110
NYK       101
WLN        87
NY         73
NI         58
WA         40
01         24
PKN        24
ST         24
ESX        11
BY         10
KHL         7
RCC         7
XGL         6
3           5
HH          4
CA          4
WHT         4
ABE         3
ZET         3
Name: review_count, dtype: int64

In [171]:
# Attach every review to its restaurant. The join key is named explicitly:
# without `on=`, pandas joins on ALL columns the two frames share, so any
# other shared column name would silently change the join.
preprocessed_data = rest_biz.merge(review_data, how='inner', on='business_id')

# Keep only Nevada and Arizona.
in_target_states = preprocessed_data['state'].isin(['NV', 'AZ'])
preprocessed_data = preprocessed_data[in_target_states]

In [115]:
# Newest reviews first.
date_sorted = preprocessed_data.sort_values('date', ascending=False)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,...,review_count,state,cool,date,funny,review_id,stars,text,useful,user_id
625283,"3930 Las Vegas Blvd S, Ste 120","{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",lKq4Qsz13FDcAVgp49uukQ,"[Nightlife, Food, Italian, Bars, Restaurants, ...",Las Vegas,"{'Monday': '11:00-23:00', 'Tuesday': '11:00-23...",1,36.091262,-115.174247,Slice of Vegas,...,496,NV,0,2017-07-26,0,Qb8v6aB8dSE4YC3lm7cVBQ,4,"We originally wanted to eat next door, but the...",0,AiV8Qo73R09Eroj5LNJ74g
1511598,8000 S Priest Rd,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",8PBq-MHyLDrhkaTf72Foww,"[Nightlife, Restaurants, Bars, Mexican, Dive B...",Tempe,"{'Monday': '11:00-2:00', 'Tuesday': '11:00-2:0...",1,33.344151,-111.963680,Dos Gringos,...,164,AZ,0,2017-07-26,2,t0O0rhXTngxpeQxJtNHmtA,3,"The Guac and Chips are the best, I prefer the ...",0,mcheRwzAoaFKYqMlJn5Pig
1785108,5436 S Central Ave,"{'RestaurantsTableService': False, 'GoodForMea...",najluTSbgZEpCdLIbpvnxQ,"[Seafood, Fish & Chips, Sandwiches, Chicken Wi...",Phoenix,"{'Monday': '11:00-21:00', 'Tuesday': '11:00-21...",1,33.396343,-112.073687,Maryln's Fish & Chips,...,115,AZ,0,2017-07-26,0,Qtx6Jlespb99fxlv3ilR1Q,5,O'Shea has got to be the friendliest most atte...,2,A0vk6LeKbmyWWeYtPKkPyg
905222,"790 Coronado Center Dr, Ste 125","{'Alcohol': 'beer_and_wine', 'HasTV': True, 'N...",3oajqiPFhYQJsHHiVCchEQ,"[Restaurants, Sushi Bars, Japanese]",Henderson,"{'Monday': '11:30-4:00', 'Tuesday': '11:30-4:0...",1,36.002946,-115.107756,Sushi Wa,...,337,NV,0,2017-07-26,0,RK3Cx_I3L8fQIRxpNYsW2A,4,Man..... Sushi Wa... You have sushi that makes...,0,tLDY5ICCsrvY00kqAHZ2dg
85612,3950 S Las Vegas Blvd,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",Cni2l-VKG_pdospJ6xliXQ,"[Bars, Nightlife, Burgers, American (New), Res...",Las Vegas,"{'Monday': '11:00-23:00', 'Tuesday': '11:00-23...",1,36.094460,-115.176113,Burger Bar,...,2396,NV,1,2017-07-26,1,HhzW89-4Hhxe39McTide6Q,2,Yea it's nothing special. Heard that it used t...,1,DpT4ed85EGsNBOEZvdhfsw
1894724,451 N Old Litchfield Rd,"{'RestaurantsTableService': True, 'GoodForMeal...",IbsRfrHGK4g5OQd_KFtdkQ,"[Restaurants, American (Traditional), Breakfas...",Litchfield Park,"{'Monday': '11:00-22:00', 'Tuesday': '11:00-22...",1,33.498528,-112.357869,Red's Bar & Grill,...,101,AZ,0,2017-07-26,0,iKJLhclGVkvRaXqfXjKc8w,4,Great ambiance on a Friday HH.. our first time...,0,7sr9QjhdaApsd4KbhvQAAw
425164,"3635 Las Vegas Blvd S, Unit 100","{'RestaurantsTableService': True, 'GoodForMeal...",ysv6yhVYOoH9Pf7PlMyD0g,"[Restaurants, American (Traditional), Burgers]",Las Vegas,"{'Monday': '8:00-2:00', 'Tuesday': '8:00-2:00'...",1,36.113826,-115.171499,Wahlburgers,...,720,NV,0,2017-07-26,0,kAOVpJvToH-aoY2RvDcUBA,2,Yeah I watch the show and that's what drew me ...,1,BeWDEj2Lj9L31HtFfZGn1w
2424452,"7250 S Durango Dr, Ste 160","{'RestaurantsTableService': False, 'GoodForMea...",kE8hSuvO6eKtbRA1k541Jg,"[Delis, Restaurants, Sandwiches, Fast Food]",Las Vegas,"{'Monday': '10:30-21:00', 'Tuesday': '10:30-21...",1,36.056414,-115.278382,Firehouse Subs,...,42,NV,0,2017-07-26,0,_y1dpekAUgrblWssg2cGIQ,1,I ate at the firehouse sub in spring valley an...,0,ASehnTl6cZcq70qZuAvUaw
2057328,3241 E Shea Blvd,"{'Alcohol': 'full_bar', 'HasTV': False, 'Noise...",IQVka-wzJmN3jw2W70aCzw,"[Italian, Restaurants]",Phoenix,"{'Tuesday': '17:00-21:00', 'Friday': '17:00-21...",1,33.582071,-112.011458,Michelina's,...,80,AZ,0,2017-07-26,0,sqHXQOJcCUKNT7H6kis_PA,5,I love this place it is locally owned who is o...,0,HXWCmYxRWbeluttyLILCxQ
1492160,3410 S Jones Blvd,"{'RestaurantsTableService': True, 'GoodForMeal...",Cy8XYYDrZ5wd3Bq-toXMsg,"[Restaurants, Tapas/Small Plates, Japanese]",Las Vegas,"{'Monday': '17:30-2:30', 'Tuesday': '17:30-2:3...",1,36.127713,-115.224648,Hachi,...,168,NV,1,2017-07-26,1,SX_1RLDYSZJhHVCLIXVCZg,3,Ichiza is better. The only thing I like was t...,1,86AFTm6d4S85eFUK9gcG_Q


In [15]:
# Keep reviews dated strictly after 2017-01-01 (the comparison is against a string literal).
# NOTE(review): the variable name says 2015-01-01 but the cutoff is 2017-01-01 —
# confirm which window is intended.
is_recent = date_sorted['date'] > '2017-01-01'
data_20150101 = date_sorted[is_recent]

NameError: name 'date_sorted' is not defined

In [137]:
# Renumber the rows 0..n-1 and discard the old index.
data_20150101 = data_20150101.reset_index(drop=True)

In [139]:
# (rows, columns) of the date-filtered frame.
data_20150101.shape

(950204, 22)

In [140]:
# Only the business id, the reviewer id and the review text are carried forward.
kept_columns = ['business_id', 'user_id', 'text']
preprocessed_data = data_20150101[kept_columns]

In [142]:
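# The data for this analysis includes restaurant reviews from two states, Nevada and Arizona, for the past 2.5 years.
# NOTE(review): the date filter earlier in this notebook uses the cutoff '2017-01-01', which does not match a 2.5-year window — confirm which one is intended.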

In [141]:
# Persist the three-column frame in Feather format.
# NOTE(review): this writes under './data/', while the raw files are read from
# PATH ('../data/') and the later reload uses the bare path 'preprocessed_data' —
# confirm the intended location.
preprocessed_data.to_feather('./data/preprocessed_data')

Tokenization 

In [1]:
import json
import re

import numpy as np
import pandas as pd
import spacy
from tqdm._tqdm_notebook import tqdm_notebook

In [8]:
# Load the Feather file named 'preprocessed_data' from the current working directory.
# NOTE(review): the preprocessing stage writes to './data/preprocessed_data' —
# confirm that both paths refer to the same file.
preprocessed_data = pd.read_feather('preprocessed_data')

In [9]:
# (rows, columns) of the frame that was just loaded; there is no need to read
# the Feather file from disk a second time only to inspect its shape.
preprocessed_data.shape

(241221, 3)

In [15]:
# The tokenizer below only reads is_stop, is_alpha and lemma_, so the
# dependency parser and the named-entity recognizer are switched off.
# They dominate the per-document cost and nothing in this notebook uses them.
nlp = spacy.load('en', disable=['parser', 'ner'])

In [16]:
def clean_up_spacy(text):
    """Tokenize ``text`` with the global ``nlp`` pipeline and return lemmas.

    A token contributes its lemma only if it is not flagged as a stop word,
    is purely alphabetic, and is longer than two characters.

    Args:
        text: Raw text to process.

    Returns:
        list: Lemmas of the surviving tokens, in document order.
    """
    return [
        tok.lemma_
        for tok in nlp(text)
        if not tok.is_stop and tok.is_alpha and len(tok) > 2
    ]

In [12]:
 
# `progress_apply` only exists on pandas objects after tqdm has registered it;
# without this call the line below raises AttributeError.
tqdm_notebook.pandas()

# Tokenize and lemmatize every review, with a progress bar.
preprocessed_data['words'] = preprocessed_data['text'].progress_apply(clean_up_spacy)




In [13]:
# Cache the tokenized frame as CSV.
# NOTE(review): CSV stores each `words` list as its string representation, and
# the row index is written as an extra unnamed column; both need handling on reload.
preprocessed_data.to_csv('./processed_data')

In [3]:
# Reload the tokenized frame from the CSV cache.
preprocessed_data = pd.read_csv('./processed_data')

In [4]:
import ast

# After the CSV round trip `words` holds strings such as "['taco', 'good']",
# so len() would count characters rather than tokens. Parse the strings back
# into lists first; values that are already lists are left untouched, which
# keeps this cell safe to re-run.
preprocessed_data['words'] = preprocessed_data['words'].map(
    lambda w: ast.literal_eval(w) if isinstance(w, str) else w
)

# Number of tokens kept for each review.
preprocessed_data['doc_len'] = preprocessed_data['words'].map(len)

In [12]:
# Drop the index column picked up from the CSV round trip. Rebinding (instead
# of inplace=True) with errors='ignore' lets the cell be re-run without a
# KeyError once the column is already gone.
preprocessed_data = preprocessed_data.drop(columns='Unnamed: 0', errors='ignore')

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer

In [17]:
# TF-IDF over the raw review text.
#   min_df       - a term must occur in at least this many documents
#   max_features - cap on the number of distinct terms kept across the corpus
#   tokenizer    - reviews are tokenized with clean_up_spacy
#   ngram_range  - unigrams and bigrams
vectorizer = TfidfVectorizer(
    min_df=10,
    max_features=10000,
    tokenizer=clean_up_spacy,
    ngram_range=(1, 2),
)
review_texts = list(preprocessed_data['text'])
vz = vectorizer.fit_transform(review_texts)

In [18]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# One row per term, indexed by the term. Despite the column name, the value is
# the fitted inverse document frequency (vectorizer.idf_).
tfidf = pd.Series(vectorizer.idf_, index=feature_names, name='tfidf').to_frame()

In [19]:
# Terms ordered from the highest idf value to the lowest.
tfidf.sort_values('tfidf', ascending=False)

Unnamed: 0,tfidf
ozzy,9.927737
butterfly,9.011446
yes yes,8.962656
poké,8.962656
taiyaki,8.950822
shabu,8.939126
maggiano,8.850178
bouchon,8.818762
halo halo,8.818762
nora,8.808505


In [23]:
import lda
from sklearn.feature_extraction.text import CountVectorizer
import logging
# Raise the threshold of the "lda" logger so that only warnings and above are emitted.
lda_logger = logging.getLogger("lda")
lda_logger.setLevel(logging.WARNING)

In [57]:
# Raw term counts (unigrams and bigrams) as input for LDA: terms must occur in
# at least 4 documents, and at most 1000 terms are kept.
cvectorizer = CountVectorizer(
    min_df=4,
    max_features=1000,
    tokenizer=clean_up_spacy,
    ngram_range=(1, 2),
)
review_texts = list(preprocessed_data['text'])
cvz = cvectorizer.fit_transform(review_texts)

In [68]:
n_topics = 5
n_iter = 500
# The sampler is stochastic; a fixed seed makes the fitted topics reproducible
# across kernel restarts.
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=42)
# Fit on the count matrix and keep the per-document topic distribution.
X_topics = lda_model.fit_transform(cvz)

In [93]:
n_top_words = 5
topic_summaries = []

topic_word = lda_model.topic_word_  # one row of term weights per topic

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(cvectorizer, 'get_feature_names_out'):
    vocab = list(cvectorizer.get_feature_names_out())
else:
    vocab = list(cvectorizer.get_feature_names())

# Built once, outside the loop, instead of once per topic.
vocab_array = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # Indices of the n_top_words largest weights, heaviest first.
    top_indices = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    topic_words = vocab_array[top_indices]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' + '.join(topic_words)))

Topic 0: great + place + food + good + love
Topic 1: good + chicken + sauce + try + order
Topic 2: good + place + like + come + time
Topic 3: order + food + good + come + time
Topic 4: taco + chip + die + mexican + und


LDA Trial 2

In [20]:
# Rank the terms by idf, highest first, and expose the term as column 'index'.
useless_words = tfidf.sort_values('tfidf', ascending=False).reset_index()

# Walk the ranking from the bottom: the (up to) 100 lowest-idf terms,
# lowest first, become the custom stop words.
lowest_idf_terms = useless_words['index'].iloc[::-1].head(100)
custom_stopwords = lowest_idf_terms.values.tolist()

In [21]:
from spacy.lang.en.stop_words import STOP_WORDS

# Register the custom terms in spaCy's English stop-word set ...
STOP_WORDS.update(custom_stopwords)
# ... and flag the matching vocabulary entries of the loaded pipeline, so that
# token.is_stop reports them as stop words.
for term in custom_stopwords:
    nlp.vocab[term].is_stop = True

In [24]:
# Rebuild the count matrix (unigrams and bigrams, at least 4 documents per
# term), this time keeping up to 10000 terms.
cvectorizer = CountVectorizer(
    min_df=4,
    max_features=10000,
    tokenizer=clean_up_spacy,
    ngram_range=(1, 2),
)
review_texts = list(preprocessed_data['text'])
cvz = cvectorizer.fit_transform(review_texts)

In [35]:
# Use the public `pickle` module rather than the private `_pickle` accelerator
# (pickle already delegates to it when it is available). The `cPickle` alias is
# kept because a later cell refers to that name.
import pickle as cPickle

# Persist the fitted count vectorizer.
with open('cvectorizer.pk', 'wb') as fout:
    cPickle.dump(cvectorizer, fout)

In [37]:
# Persist the fitted TF-IDF vectorizer next to the count vectorizer.
with open('tfidfvectorizer.pk', 'wb') as tfidf_file:
    cPickle.dump(vectorizer, tfidf_file)

In [38]:
n_topics = 50
n_iter = 1500
# The sampler is stochastic; a fixed seed makes the fitted topics (and the
# topic listing printed from them) reproducible across kernel restarts.
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=42)
# Fit on the count matrix and keep the per-document topic distribution.
X_topics = lda_model.fit_transform(cvz)

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [53]:
# Write the topic-word matrix as real CSV, one row per topic.
# The previous version opened 'topicwords.csv' but never wrote to it (leaving
# an empty file) and dumped raw bytes into a separate file named 'topicwords'.
# Reading from lda_model directly also removes the dependency on `topic_word`,
# which is only assigned in a later cell.
np.savetxt('topicwords.csv', lda_model.topic_word_, delimiter=',')

In [41]:
n_top_words = 10
topic_summaries = []

topic_word = lda_model.topic_word_  # one row of term weights per topic

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(cvectorizer, 'get_feature_names_out'):
    vocab = list(cvectorizer.get_feature_names_out())
else:
    vocab = list(cvectorizer.get_feature_names())

# Built once, outside the loop, instead of once per topic.
vocab_array = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # Indices of the n_top_words largest weights, heaviest first.
    top_indices = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    topic_words = vocab_array[top_indices]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' + '.join(topic_words)))

Topic 0: item + offer + dining + restaurant + high + option + serve + make + include + thing
Topic 1: onion + pepper + green + tomato + sweet + salt + serve + red + add + bit
Topic 2: wing + pasta + italian + bread + garlic + order + meatball + good + spaghetti + olive
Topic 3: portion + huge + size + large + half + plate + order + come + generous + share
Topic 4: different + item + try + variety + dish + option + indian + choose + flavor + choice
Topic 5: music + fun + drink + game + play + loud + live + watch + cool + bartender
Topic 6: thank + attentive + care + take + come + helpful + good + wonderful + owner + make
Topic 7: say + ask + walk + go + guy + lady + girl + tell + counter + rude
Topic 8: breakfast + egg + pancake + toast + bacon + coffee + hash + french + brunch + potato
Topic 9: wine + course + dining + dessert + glass + view + chef + beautiful + dish + appetizer
Topic 10: hot + quick + fast + fish + dog + come + chip + clean + bite + stop
Topic 11: coffee + tea + ice +

In [42]:
def prepareLDAData():
    """Collect the keyword arguments for ``pyLDAvis.prepare``.

    Reads the notebook globals ``vocab``, ``lda_model``, ``preprocessed_data``
    and ``cvz``; takes no parameters.

    Returns:
        dict: with the keys ``vocab``, ``doc_topic_dists``, ``doc_lengths``,
        ``term_frequency`` and ``topic_term_dists``.
    """
    # Corpus-wide count of every term, in the column order of the count matrix.
    # `cvectorizer.vocabulary_` must not be used here: it maps each term to its
    # column INDEX, not to how often the term occurs.
    term_counts = np.asarray(cvz.sum(axis=0)).ravel()

    data = {
        'vocab': vocab,
        'doc_topic_dists': lda_model.doc_topic_,
        # NOTE(review): pyLDAvis expects token counts per document here —
        # confirm that `doc_len` holds token counts and not string lengths.
        'doc_lengths': list(preprocessed_data['doc_len']),
        'term_frequency': term_counts,
        'topic_term_dists': lda_model.components_
    }
    return data

In [43]:
# Build the dict of inputs that is later unpacked into pyLDAvis.prepare.
ldadata = prepareLDAData()

In [46]:
import pyLDAvis

In [47]:
# Switch pyLDAvis to notebook mode before preparing the visualization.
pyLDAvis.enable_notebook()

In [48]:
# Each key of `ldadata` is passed to pyLDAvis.prepare as a keyword argument.
prepared_data = pyLDAvis.prepare(**ldadata)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [49]:
# Export the prepared visualization to an HTML file in the current directory.
pyLDAvis.save_html(prepared_data,'./pyldadavis.html')