In [None]:
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install keyphrase-vectorizers
!{sys.executable} -m pip install keybert
!{sys.executable} -m pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

In [2]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import pathlib 
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

In [3]:
# data_dir = pathlib.Path('train_data.json')
data_dir = pathlib.Path('kaggle/yelp_training_set_review.json')

original_df = pd.read_json(data_dir)
print(original_df.shape[0], original_df.columns)
original_df = original_df[["business_id", "text", "user_id", "stars"]]

# Group by 'user_id' and aggregate the 'business_id' and 'stars'
user_df = original_df.groupby('user_id', as_index=False)[['business_id', 'stars']].agg(list).reset_index()

print("count in user df", user_df.shape[0])
print(user_df.head())

229907 Index(['votes', 'user_id', 'review_id', 'stars', 'date', 'text', 'type',
       'business_id'],
      dtype='object')
count in user df 45981
   index                 user_id  \
0      0  --5hzxWLz5ozIg6OMo6tpQ   
1      1  --65q1FpAL_UQtVZ2PTGew   
2      2  --AqBuo717RyacADExYbSA   
3      3  --LzFD0UDbYE-Oho3AhsOg   
4      4  --M-cIkGnH1KhnLaCOmoPQ   

                                         business_id  \
0                           [Z3n01OZqARDv06QgBneOZw]   
1  [r-a-Cn9hxdEnYTtVTB5bMQ, cOUS79i4vltKIc_hy4OZB...   
2                           [yTFdhqH9eLzaP05xXQa2rw]   
3   [yZbPvIa_7nt7ekxi31SkUA, r7GXuj4rr2vGPU_pklxcQg]   
4                           [oCA2OZcd_Jo_ggVmUx3WVw]   

                                               stars  
0                                                [5]  
1  [4, 5, 4, 4, 5, 3, 4, 3, 5, 4, 4, 2, 3, 5, 5, ...  
2                                                [5]  
3                                             [4, 3]  
4                      

In [4]:
#pick a user who has reviewed the most restaurants
users_with_most_ratings = user_df[user_df['business_id'].apply(len) > 570]
print(users_with_most_ratings)
#user with most ratings: fczQCSmaWF78toLEmb0Zsw


       index                 user_id  \
31258  31258  fczQCSmaWF78toLEmb0Zsw   

                                             business_id  \
31258  [c7VgGP8xT25OSReok6fwcQ, h8rqIokh6EkM4flR9Cjxb...   

                                                   stars  
31258  [4, 4, 4, 5, 4, 5, 5, 5, 5, 4, 4, 4, 1, 4, 4, ...  


In [100]:
user_filtered_businesses = users_with_most_ratings['business_id'].iloc[0]

#filter businesses out that have been reviewed by current user
filtered_df = original_df[original_df['business_id'].isin(user_filtered_businesses)]

# Group by 'business_id' and aggregate 'text' and 'stars'
business_df = filtered_df.groupby('business_id', as_index=True).agg({'text': ','.join}).reset_index()

# drop duplicate data 
business_df = business_df.drop_duplicates() 

print("count in business df", business_df.shape[0])
print("count in business df", business_df)

count in business df 588
count in business df                 business_id                                               text
0    -3WVw1TNQbPBzaKCaQQ1AQ  Had lunch here today after hearing all the col...
1    -4A5xmN21zi_TXnUESauUQ  Came here and did drive-though to pick up a tu...
2    -8wyZkzfBmCFkMwCGcR4PQ  So after coming here for the first time the ot...
3    -AAig9FG0s8gYE4f8GfowQ  Maestro Chris Bianco whips up addictive sandwi...
4    -KF9RQPkmIOHfE0tzUu9bg  What was Dunkin Donuts thinking when they took...
..                      ...                                                ...
583  zmFc8M-hS4uuyY0hklIpoQ  A definite favorite in the neighborhood.\n\nTh...
584  zruUQvFySeXyEd7_rQixBg  We arrived at US Egg on St. Patrick's Day at 1...
585  zw2iJahOnSxlzmRlF4al6g  Nice spot, with a friendly staff.  The sandwic...
586  zxqvU415r_RtZRKDtdbIKQ  REAL NY STYLE CHINESE FOOD HERE IN ARIZONA!!! ...
587  zzucnDH33KlmbkI4rLyQTA  I needed to fill in holes in my ski wardrobe--...

[588 

In [9]:
# extract restaurant reviews as an array, these reviews are by all users
restaurant_reviews=business_df['text']
restaurant_reviews.head()

0    Had lunch here today after hearing all the col...
1    Came here and did drive-though to pick up a tu...
2    So after coming here for the first time the ot...
3    Maestro Chris Bianco whips up addictive sandwi...
4    What was Dunkin Donuts thinking when they took...
Name: text, dtype: object

In [10]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(restaurant_reviews, show_progress_bar=True)

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

In [11]:
print(embeddings)

[[ 0.04383374 -0.1057466   0.14801979 ...  0.15176639  0.22042583
  -0.05858767]
 [ 0.01372097 -0.38152567  0.5327189  ...  0.65647423  0.48541412
  -0.44912857]
 [ 0.36412263 -0.22436361  0.6352138  ...  0.2945958  -0.22073637
  -0.4541159 ]
 ...
 [ 0.0085351  -0.4191617   0.2085172  ...  0.5982428   0.27104366
  -0.25728327]
 [ 0.00707619  0.06160622  0.14750385 ... -0.04282828  0.12594736
  -0.4758514 ]
 [ 0.20037936 -0.08473375  0.11987599 ... -0.23582016 -0.2984317
  -0.37251252]]


In [46]:
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

In [128]:
cos_sim_data = pd.DataFrame(cosine_similarity(X))

def give_recommendations(index, print_recommendation=False):
    index_recomm = cos_sim_data.loc[index].sort_values(ascending=False).index.tolist()

    # Filter the index list based on cosine similarity > 0.5
    filtered_index_recomm = [i for i in index_recomm if cos_sim_data.loc[index, i] > 0.5]

    # Print the filtered index list and corresponding cosine similarity values
    filtered_cos_sim_values = cos_sim_data.loc[index, filtered_index_recomm].sort_values(ascending=False)
    bus_recomm = business_df['business_id'].loc[filtered_index_recomm].values
    
    # Create a DataFrame with the results
    result_df = pd.DataFrame({
        'business': bus_recomm,
        'filtered_index': filtered_index_recomm,
        'filtered_cos_sim_values': filtered_cos_sim_values.values
    })

    if print_recommendation:
        print(result_df)

    return result_df

In [130]:
# pick a random restaurant to predict rating by the user for
res = give_recommendations(2,True)

                   business  filtered_index  filtered_cos_sim_values
0    -8wyZkzfBmCFkMwCGcR4PQ               2                 1.000000
1    5-X03Zc0nN7U5eoe8uFUdw              54                 0.890782
2    mxrXVZWc6PWk81gvOVNOUw             468                 0.888999
3    755-2byE2zGMoMBR95UABg              73                 0.885118
4    sYufscrC4xb7JfI4Kdp2cg             510                 0.883153
..                      ...             ...                      ...
583  HW_QFzCZfF4yqN64F5Xx_Q             183                 0.567915
584  gxuVySgACHDqJlwmelFHLA             416                 0.556604
585  3F7Im__Cs1v2wTZhB2yo0A              37                 0.532934
586  _1QQZuf4zZOyFCvXc0o6Vg             351                 0.531611
587  rk4evtzXiaHF2W8U0eX3TQ             507                 0.502597

[588 rows x 3 columns]


In [131]:
#load details about businesses
business_detail_df = pd.read_json('kaggle/yelp_training_set_business.json',orient='records',lines=True)
business_detail_df.head()

Unnamed: 0,business_id,full_address,open,categories,city,review_count,name,neighborhoods,longitude,state,stars,latitude,type
0,rncjoVoEFUJGCUoC1JgnUA,"8466 W Peoria Ave\nSte 6\nPeoria, AZ 85345",True,"[Accountants, Professional Services, Tax Servi...",Peoria,3,Peoria Income Tax Service,[],-112.241596,AZ,5.0,33.581867,business
1,0FNFSzCFP_rGUoJx8W7tJg,"2149 W Wood Dr\nPhoenix, AZ 85029",True,"[Sporting Goods, Bikes, Shopping]",Phoenix,5,Bike Doctor,[],-112.105933,AZ,5.0,33.604054,business
2,3f_lyB6vFK48ukH6ScvLHg,"1134 N Central Ave\nPhoenix, AZ 85004",True,[],Phoenix,4,Valley Permaculture Alliance,[],-112.073933,AZ,5.0,33.460526,business
3,usAsSV36QmUej8--yvN-dg,"845 W Southern Ave\nPhoenix, AZ 85041",True,"[Food, Grocery]",Phoenix,5,Food City,[],-112.085377,AZ,3.5,33.39221,business
4,PzOqRohWw7F7YEPBz6AubA,"6520 W Happy Valley Rd\nSte 101\nGlendale Az, ...",True,"[Food, Bagels, Delis, Restaurants]",Glendale Az,14,Hot Bagels & Deli,[],-112.200264,AZ,3.5,33.712797,business


In [134]:
input_business_id = business_df['business_id'].loc[index]
input_business_name = business_detail_df.loc[(business_detail_df['business_id'] == input_business_id)][["name"]].iloc[0]['name']
print("Input restaturant is: ", input_business_name)

#get names of recommemded movies
# recommended_business = res['Businesses']
# business_names = []
# for bus in recommended_business:
#     rslt_df = business_detail_df.loc[(business_detail_df['business_id'] == bus)][["name"]]
#     business_names.append(rslt_df.iloc[0]['name'])
    
# print("Most similar restaturants are: ",business_names)
            

Input restaturant is:  Poppa Maize


In [None]:
# Now let's figure out how the user would rate this restaurant 
# based on it's most similar restaturants that it has already reviewed and rated
# we are assuming the cosine similarity is linear
print(res)
# find stars given by user
user_ratings = users_with_most_ratings['stars'].iloc[0]
print(user_ratings)
