In [2]:
import re

import numpy as np
import pandas as pd
import nltk

# Download the NLTK resources this notebook relies on (tokenizer models,
# the stop-word corpus and WordNet), in the same order as before.
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import json

# Read the Yelp business file (one JSON object per line) and keep only the
# records whose "categories" value contains "Restaurants".

business_file_path = "../yelp_dataset/yelp_academic_dataset_business.json"

business_info_dict = {}
with open(business_file_path, 'rb') as business_data:
    for raw_line in business_data:
        record = json.loads(raw_line)

        bid = record['business_id']
        categories = record["categories"]
        # skip businesses without categories or that are not restaurants
        if categories is None or "Restaurants" not in categories:
            continue
        business_info_dict[bid] = record

# one row per restaurant, columns taken from the JSON keys
business_info_df = pd.DataFrame(business_info_dict.values())

In [4]:
# Collect the reviews that belong to the restaurants loaded above,
# grouped as {business_id: [review_dict, ...]}.

review_file_path = "../yelp_dataset/yelp_academic_dataset_review.json"
restaurant_ids = business_info_dict.keys()
review_info_dict = {}

with open(review_file_path, 'rb') as review_data:
    for raw_line in review_data:
        review = json.loads(raw_line)

        bid = review['business_id']
        if bid not in restaurant_ids:
            # review of a business that is not one of our restaurants
            continue
        # start a new list on first sight of this business, then append
        review_info_dict.setdefault(bid, []).append(review)

# number of restaurants that have at least one review
print(len(review_info_dict.values()))

50763


In [5]:
# Flatten the reviews of the first 100 restaurants in review_info_dict into
# one list and turn it into a dataframe (one row per review).
temp_list = []
for restaurant_reviews in list(review_info_dict.values())[:100]:
    temp_list.extend(restaurant_reviews)
review_info_df = pd.DataFrame(temp_list)

print(review_info_df.head())
print(len(review_info_df))

                review_id                 user_id             business_id  \
0  lWC-xP3rd6obsecCYsGZRg  ak0TdVmGKo4pwqdJSTLwWw  buF9druCkbuXLX526sGELQ   
1  hpcZLEzqD4_gPi6eSVi_Bg  Y-j2svl0M_5-jF1ehYuNPQ  buF9druCkbuXLX526sGELQ   
2  3FvY1Se8y2WXqTbaANOqMw  xUCX4GhBpeWxZB0l2lmt_w  buF9druCkbuXLX526sGELQ   
3  C1uQNP2ehBktS43ZRMEvkg  2M6KFsWIUXElqcQRz4A0Qg  buF9druCkbuXLX526sGELQ   
4  Cja8_35_kQDnF9g4voikzw  t5SRIRU6INiAyVkiMJhRPA  buF9druCkbuXLX526sGELQ   

   stars  useful  funny  cool  \
0    4.0       3      1     1   
1    2.0       1      1     1   
2    5.0       3      0     0   
3    5.0       1      2     0   
4    1.0       0      0     0   

                                                text                 date  
0  Apparently Prides Osteria had a rough summer a...  2014-10-11 03:34:02  
1  I was really disappointed to say the least. I ...  2014-11-30 00:24:42  
2  This is as close to dining in Italy as you'll ...  2014-04-19 16:48:28  
3  Great food and service! Again. 

In [6]:
# Break every review into sentences: one output row per sentence.

# columns that are not used after this point
unused_columns = ['user_id', 'stars', 'useful', 'funny', 'cool', 'date']
review_info_df = review_info_df.drop(columns=unused_columns)

# replace each review text by the list of its sentences ...
review_info_df['text'] = review_info_df['text'].apply(nltk.tokenize.sent_tokenize)
# ... and give every sentence its own row (the review's index label is repeated)
review_info_df = review_info_df.explode('text')

# running number that identifies each sentence row
review_info_df['sentence_id'] = list(range(len(review_info_df)))
review_info_df.head()

Unnamed: 0,review_id,business_id,text,sentence_id
0,lWC-xP3rd6obsecCYsGZRg,buF9druCkbuXLX526sGELQ,Apparently Prides Osteria had a rough summer a...,0
0,lWC-xP3rd6obsecCYsGZRg,buF9druCkbuXLX526sGELQ,However new blood in the kitchen seems to have...,1
0,lWC-xP3rd6obsecCYsGZRg,buF9druCkbuXLX526sGELQ,Waitstaff was warm but unobtrusive.,2
0,lWC-xP3rd6obsecCYsGZRg,buF9druCkbuXLX526sGELQ,By 8 pm or so when we left the bar was full an...,3
0,lWC-xP3rd6obsecCYsGZRg,buF9druCkbuXLX526sGELQ,Perhaps Beverly residents prefer a later seating.,4


In [7]:
# Pre Processing
# lower case; splitting and re-joining also collapses each whitespace run
# into a single space
print('converting to lowercase')
review_info_df['pre_process'] = review_info_df['text'].apply(
    lambda sentence: ' '.join(word.lower() for word in str(sentence).split())
)
print('done')

# remove html tags
from bs4 import BeautifulSoup
print('removing html')
# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed (and warns about it), so results could differ
# between machines. 'html.parser' ships with Python.
review_info_df['pre_process'] = review_info_df['pre_process'].apply(
    lambda sentence: BeautifulSoup(sentence, 'html.parser').get_text()
)
import re
# strip URLs: "http" followed by any run of non-whitespace characters
review_info_df['pre_process'] = review_info_df['pre_process'].apply(
    lambda sentence: re.sub(r'http\S+', '', sentence)
)
print('done')

# replace contractions
def contractions(s):
    """Expand common English contractions in ``s``.

    The patterns are case-sensitive and written in lower case, so the input
    is expected to be lower-cased already.

    A suffix such as ``'s`` or ``'t`` is only expanded when the apostrophe
    directly follows a word character and the suffix ends at a word boundary.
    This keeps quoted words intact: ``'special'`` is no longer turned into
    `` ispecial'``. Typographic apostrophes (U+2019) are treated like ``'``.

    Parameters
    ----------
    s : str
        Text to expand.

    Returns
    -------
    str
        ``s`` with the contractions replaced by their expanded forms.
    """
    # treat the typographic apostrophe like the ASCII one
    s = s.replace('\u2019', "'")

    # irregular forms must run before the generic "n't" rule
    s = re.sub(r"\bwon't\b", 'will not', s)
    s = re.sub(r"\bcan't\b", 'can not', s)

    # regular suffixes: apostrophe must be attached to the preceding word
    s = re.sub(r"(?<=\w)n't\b", ' not', s)   # wouldn't, couldn't, don't, ...
    s = re.sub(r"(?<=\w)'d\b", ' would', s)
    s = re.sub(r"(?<=\w)'re\b", ' are', s)
    s = re.sub(r"(?<=\w)'s\b", ' is', s)     # possessives are expanded as well
    s = re.sub(r"(?<=\w)'ll\b", ' will', s)
    s = re.sub(r"(?<=\w)'ve\b", ' have', s)
    s = re.sub(r"(?<=\w)'m\b", ' am', s)

    # misspelled negations with the "n" dropped, e.g. "would't", "could't"
    s = re.sub(r"(?<=\w)'t\b", ' not', s)
    return s

print('removing contractions')
review_info_df['pre_process'] = review_info_df['pre_process'].apply(contractions)
print('done')

# remove non alpha chars: strip everything but A-Z / a-z from each token
print('removing non alpha chars')
review_info_df['pre_process'] = review_info_df['pre_process'].apply(
    lambda sentence: ' '.join(
        [re.sub('[^A-Za-z]+', '', token) for token in nltk.word_tokenize(sentence)]
    )
)
print('done')

# remove extra spaces (tokens that became empty above leave double spaces)
print('removing extra whitespace')
review_info_df['pre_process'] = review_info_df['pre_process'].apply(
    lambda sentence: re.sub(' +', ' ', sentence)
)
print('done')

# remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership is tested once per word of every sentence; a set makes each test
# constant-time instead of scanning the whole list. `stop` stays a list.
stop_set = set(stop)
print('remove stop words')
review_info_df['pre_process'] = review_info_df['pre_process'].apply(
    lambda sentence: ' '.join([word for word in sentence.split() if word not in stop_set])
)
print('done')

# perform lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print('perform lemmatization')
review_info_df['pre_process'] = review_info_df['pre_process'].apply(
    lambda sentence: ' '.join(
        [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(sentence)]
    )
)
print('done')

converting to lowercase
done
removing html


  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


done
removing contractions
done
removing non alpha chars
done
removing extra whitespace
done
remove stop words
done
perform lemmatization
done


In [8]:
# preview the first rows, now including the cleaned 'pre_process' column
review_info_df.head()

Unnamed: 0,review_id,business_id,text,sentence_id,pre_process
0,lWC-xP3rd6obsecCYsGZRg,buF9druCkbuXLX526sGELQ,Apparently Prides Osteria had a rough summer a...,0,apparently pride osteria rough summer evidence...
0,lWC-xP3rd6obsecCYsGZRg,buF9druCkbuXLX526sGELQ,However new blood in the kitchen seems to have...,1,however new blood kitchen seems revitalized fo...
0,lWC-xP3rd6obsecCYsGZRg,buF9druCkbuXLX526sGELQ,Waitstaff was warm but unobtrusive.,2,waitstaff warm unobtrusive
0,lWC-xP3rd6obsecCYsGZRg,buF9druCkbuXLX526sGELQ,By 8 pm or so when we left the bar was full an...,3,pm left bar full dining room much lively
0,lWC-xP3rd6obsecCYsGZRg,buF9druCkbuXLX526sGELQ,Perhaps Beverly residents prefer a later seating.,4,perhaps beverly resident prefer later seating


In [16]:
# Split the sentences (and the ids that travel with them) into train and test parts

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test, Z_train, Z_test = train_test_split(
    review_info_df['pre_process'],   # X: cleaned sentence text
    review_info_df['review_id'],     # Y: review the sentence came from
    review_info_df['business_id'],   # Z: restaurant the review is about
    test_size=0.25,
    random_state=30,
)

test_shapes = (X_test.shape, Y_test.shape, Z_test.shape)
print('Train: ', X_train.shape, Y_train.shape, Z_train.shape, 'Test: ', test_shapes)

Train:  (338406,) (338406,) (338406,) Test:  ((112803,), (112803,), (112803,))


In [17]:
# Put the three test-split series side by side in one dataframe.
df_test = pd.DataFrame()
for column_name, column_values in (
    ('pre_process', X_test),
    ('review_id', Y_test),
    ('business_id', Z_test),
):
    df_test[column_name] = column_values
df_test.head(20)

Unnamed: 0,pre_process,review_id,business_id
5845,soo good,Vsp28fFj6Mr7aIQuaOA3ZA,KXCXaF5qimmtKKqnPc_LQA
3166,went flip burger ordered slider sampler go,hjIeMAv4Tqhiu5jypRxqVg,TA1KUSCu8GkWP9w0rmElxw
6191,walk burrard station really accessible,wCEwIk9voZQwKh3qECwkxw,KXCXaF5qimmtKKqnPc_LQA
4267,went last night dinner stay longer,8gZj00zVRZkm0xKsdn4vyQ,TA1KUSCu8GkWP9w0rmElxw
10124,definitely back,I1-MJRkhBxm2s1IkeTbBsQ,dWBKYjQ3q2v2dOjsfLLxDg
3674,say skip milkshake eat salty crunchy appetizer,_4K0O_eqE2Q7OKj2POQziQ,TA1KUSCu8GkWP9w0rmElxw
11620,pickle superb,_s01FQA9Ps7io0Upr6SYJw,p2BkIrOuIsxGqtV0lwOZUw
23256,like drink condiment selection well,Uhwg4gO_AoIcRRI_OPE3cw,bZiIIUcpgxh8mpKMDhdqbA
44405,really like green bean,sEwkxqddMFLqkzCk42088Q,YZs1gNSh_sN8JmN_nrpxeA
39881,got braised beef tortellini got angel hair fou...,WvCsKHj4_oFCHPFxE94Ldw,gHvqhb6TjIupKZyot0DIgA


In [31]:
# Map each business_id to the sub-frame of test sentences for that business.
dict_sentence_df = {}
for business_id, business_rows in df_test.groupby('business_id'):
    dict_sentence_df[business_id] = business_rows
dict_sentence_df.items()

dict_items([('-L69Ix0-xX4BlHA61fGvrQ',                                              pre_process  \
33361              recommended want something quick fast   
33347  word mouth advertising worst best depending cu...   
33312                           twice seem mix order lot   
33317                                                      
33309                         chicken express right year   
...                                                  ...   
33350                                       disorganized   
33314                                 chicken cross road   
33340                                   meal count fresh   
33323                                  told maybe minute   
33372  managerbossowner right left car would said som...   

                    review_id             business_id  
33361  f-Qwn-lnA6GkTz7TJLP3_A  -L69Ix0-xX4BlHA61fGvrQ  
33347  WDMPBupC_u587TCw-UZb1g  -L69Ix0-xX4BlHA61fGvrQ  
33312  2_-BZ_ciRtGO2qLGb_czYw  -L69Ix0-xX4BlHA61fGvrQ  
33317  aH4ERExvL

In [35]:
# Fit the TF-IDF vocabulary and weights on the training sentences,
# then encode the test sentences with that fitted model.

from sklearn.feature_extraction.text import TfidfVectorizer

print('TFIDF Vectorizer...')
vectorizer = TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)  # fit + encode training text
tf_x_test = vectorizer.transform(X_test)        # encode only, no re-fitting

TFIDF Vectorizer...


In [36]:
# Build two dicts keyed by business_id in a single pass:
#   dict_tf_x_test -> tfidf matrix of that business's test sentences
#   dict_sentences -> the 'pre_process' column (a Series) of those sentences
dict_tf_x_test = {}
dict_sentences = {}
for business_id, business_rows in dict_sentence_df.items():
    business_sentences = business_rows['pre_process']
    dict_tf_x_test[business_id] = vectorizer.transform(business_sentences)
    dict_sentences[business_id] = business_sentences

In [12]:
# Making the mask + visualization

# get_feature_names() was removed from scikit-learn in 1.2; use
# get_feature_names_out() and fall back to the old name only on releases
# that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# encode the taste-related keywords with the fitted vocabulary
taste = vectorizer.transform(["taste flavor flavour consistent tasty reliable food"])
# one row per vocabulary term, holding that term's weight in the keyword query
df_taste = pd.DataFrame(taste[0].T.toarray(), index=feature_names, columns=["TF-IDF"])
# binarize: 1 for terms present in the keyword query, 0 otherwise
df_taste['TF-IDF'] = (df_taste['TF-IDF'] > 0).astype('int64')
df_taste = df_taste.sort_values('TF-IDF', ascending=False)
df_taste.head(15)

Unnamed: 0,TF-IDF
flavour,1
food,1
reliable,1
taste,1
consistent,1
tasty,1
flavor,1
pioneering,0
pintish,0
pinto,0


In [44]:
# mask for price

# get_feature_names() was removed from scikit-learn in 1.2; use
# get_feature_names_out() and fall back to the old name only on releases
# that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# encode the price-related keywords with the fitted vocabulary
price = vectorizer.transform(["price bill check tab expensive cheap cost tab value worth"])
# one row per vocabulary term, holding that term's weight in the keyword query
price_mask = pd.DataFrame(price[0].T.toarray(), index=feature_names, columns=["TF-IDF"])
# binarize: 1 for terms present in the keyword query, 0 otherwise
price_mask['TF-IDF'] = (price_mask['TF-IDF'] > 0).astype('int64')
price_mask = price_mask.sort_values('TF-IDF', ascending=False)
price_mask.head(15)

Unnamed: 0,TF-IDF
cheap,1
cost,1
price,1
tab,1
expensive,1
check,1
value,1
bill,1
worth,1
pipe,0


In [15]:
# tf-idf weights of the first test sentence, for testing/visualization purposes

# get_feature_names() was removed from scikit-learn in 1.2; use
# get_feature_names_out() and fall back to the old name only on releases
# that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# row 0 of the test matrix as a column: one row per vocabulary term
df = pd.DataFrame(tf_x_test[0].T.toarray(), index=feature_names, columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
df.head(30)

Unnamed: 0,TF-IDF
soo,0.933086
good,0.359652
aa,0.0
pioneering,0.0
pinterest,0.0
pinthouse,0.0
pintish,0.0
pinto,0.0
pinwheel,0.0
pinxto,0.0


In [None]:
# Show the first test sentence, i.e. the text behind row 0 of tf_x_test.
# X_test keeps the shuffled, repeated index labels of review_info_df, so
# X_test[0] would look up *label* 0 rather than the first row; use .iloc.
X_test.iloc[0]

In [48]:
# export the keyword masks plus the per-restaurant tfidf matrices and sentences
import pickle

# The original built an unused `mask_arr` and then dumped `mask_dict`, a name
# this notebook never defines (NameError on a fresh kernel). Build the dict
# that is dumped from the two keyword vectors computed above.
# NOTE(review): the key names "price"/"taste" are an assumption - confirm
# against whatever script loads masks.pkl.
mask_dict = {"price": price, "taste": taste}
with open("../script_data/masks.pkl", "wb") as file:
    pickle.dump(mask_dict, file)

# The two payloads below used to be written to each other's files; each file
# now receives the object its name describes.
# NOTE(review): if a consumer script was written to compensate for the swap,
# update it together with this cell.
with open("../script_data/restaurant_tfidf_dict.pkl", "wb") as file:
    pickle.dump(dict_tf_x_test, file)   # {business_id: tfidf matrix}

with open("../script_data/restaurant_sentence_list_dict.pkl", "wb") as file:
    pickle.dump(dict_sentences, file)   # {business_id: pre-processed sentences}