In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from surprise import Dataset, Reader, NormalPredictor, KNNBasic, KNNWithZScore, KNNWithMeans, KNNWithZScore, SVD
from surprise.model_selection import train_test_split, cross_validate
from collections import defaultdict
from surprise import accuracy

In [2]:
df_dict = {}

for i in ['business', 'checkin', 'review', 'tip', 'user']:
    data = []
    with open(f'yelp/{i}.json', 'r') as file:
        for line in file:
            data.append(json.loads(line))

    df = pd.DataFrame(data)
    df_dict[f'df_{i}'] = df

In [3]:
df_business = df_dict['df_business']
df_user = df_dict['df_user']
df_review = df_dict['df_review']

In [4]:
df_review

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...,...
6990275,H0RIamZu0B0Ei0P4aeh3sQ,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5.0,1,2,1,Latest addition to services from ICCU is Apple...,2014-12-17 21:45:20
6990276,shTPgbgdwTHSuU67mGCmZQ,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5.0,2,1,2,"This spot offers a great, affordable east week...",2021-03-31 16:55:10
6990277,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4.0,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30
6990278,i-I4ZOhoX70Nw5H0FwrQUA,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5.0,1,0,0,For when I'm feeling like ignoring my calorie-...,2022-01-19 18:59:27


In [6]:
df_business

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3..."
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


In [5]:
print(f'Number of users: {df_review["user_id"].nunique()}')
print(f'Number of businesses: {df_review["business_id"].nunique()}')

Number of users: 1987929
Number of businesses: 150346


In [44]:
# Too big to be in matrix format (~300B cells)
df_pivot = df_review.pivot_table(index='user_id', columns='business_id', values='stars', aggfunc='mean')

  num_cells = num_rows * num_columns


ValueError: negative dimensions are not allowed

In [8]:
# Filtering the data just for Restaurants in Nashville
df_nashville = df_review.merge(df_business, how='left', on='business_id')
df_nashville = df_nashville[(df_nashville['city'] == 'Nashville') & (df_nashville['categories'].str.contains('Restaurants', na=False))]
df_nashville = df_nashville[['user_id', 'business_id', 'stars_x']].rename(columns={'stars_x':'stars'})
df_nashville

Unnamed: 0,user_id,business_id,stars
29,-sryo4gDYxbZ1T5Bz4l5Bw,ltBBYdNzkeKdCNPDAsxwAA,2.0
39,iYY5Ii1LGpZCpXFkHlMefw,Zx7n8mdt8OzLRXVzolXNhQ,5.0
43,RRTQpg8hutdimzAYuP_Hbw,eaJCpC6IhYphj7bwCDHTwQ,5.0
63,8YSxVaD203mE_9FR4nCEVA,oQ5CPRt0R3AzFvcjNOqB1w,3.0
82,qVYilGRmY6uemAy_tDpfuA,9OtFX3YzG5keaUMOWU1imA,5.0
...,...,...,...
6990129,ek0l1feRZO1PU4uzR1oBWA,yX0P0_JV1imBE46I1ZMEaA,2.0
6990146,lrO6hCPYV-j3IAxNSMCW2g,1b5mnK8bMnnju_cvU65GqQ,4.0
6990193,yVO4zAHlXDKN5oXiM-iI0w,yZHiutcbym-xLGkmFhP-9A,1.0
6990253,XJTO9x78TgWE94cmXqNduA,c3QxX3toWdqJnKQmmIliRQ,1.0


In [9]:
# Filtering out users with less than 10 ratings
user_counts = df_nashville['user_id'].value_counts()
user_to_keep = user_counts[user_counts >= 10].index
df_nashville = df_nashville[df_nashville['user_id'].isin(user_to_keep)]
df_nashville

Unnamed: 0,user_id,business_id,stars
63,8YSxVaD203mE_9FR4nCEVA,oQ5CPRt0R3AzFvcjNOqB1w,3.0
101,YjS6MDNwGbueb5WtALIJ2A,EBn3U4mpnIRLIy2lKuilRQ,1.0
115,X8XCFMZN8pFlWEZcKuKzZw,Zx7n8mdt8OzLRXVzolXNhQ,5.0
125,TTibuRAx2gxu-nVAymFijQ,-ikBycdroyTLDBHR9aC3HA,5.0
221,iwxNSCgu-lEXtLSdPI4EyA,EDjEVzmoQVHzboFqC-M6Ew,5.0
...,...,...,...
6989585,oVlVn5B9H16LHgdASH2PEg,5quqExjLtFNhykec_na8pg,5.0
6989904,ceFHavITrot4UAW5DG54tA,KQjND7KiZi3Bspb3CvhHzQ,5.0
6989941,148fr6_3lmBL13o8wJAEPw,M83ABiFSIE2Zw_ylFRgThA,4.0
6990019,ajzKfSd6cY0L3k-TD3wTeA,qY4YoWlEn8zB1Ho-DOTQVw,5.0


In [10]:
df_pivot = df_nashville.pivot_table(index='user_id', columns='business_id', values='stars', aggfunc='mean')
df_pivot

business_id,-1EGqUQFBmGEp76CE-Zk4Q,-3Xl8nSBSjaPpftsSNyrgg,-6nqH2tdNHQdqb8LvILlBw,-7CqBmK0zxwyjEqa3LBVSw,-951Q5if-JmCQbO40meVPw,-CLHHZzNmv1DYqyNQmlNtA,-C_5RtXY4iIoztzIYVqMSQ,-GvA6iekqOACGodgLYiMKA,-KNgMklrxgRX2-T6xXeTZA,-Nf2K8OmZ3OaApWamQHrbw,...,zqxmjaFnBxvRw-IDrEgPSw,zr3z36d9rRX68wKcHstOmA,ztI4UzV6YRNj2mWWr9xtzQ,zuP6BplIz4y6QGuq7Ftm1g,zwGzwkVeYXE-tRisb8if7A,zwTeKW4iQgjBklEi-kaKpQ,zxpJ4Frr8Zoru-YaNzOxzw,zyHP-oXgDkANEyQbJVKf8g,zyPz0xvE5XkeCVH2J7g2gw,zzfj1-iPfw0cwnOjY0yUgA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0H6Rm6dCi3pkFBC26HsoQ,,,,,,,,,,,...,,,,,,,,,,
-1awBy86Qgr3aN30_Z3xGw,,,,,,,,,,,...,,,,,,,,,,
-2jsGRpMtCv78L1pmEJoMQ,,,,,,,,,,,...,,,,,,,,,,
-3HYmxW_5Gsg4I0eH3ju-Q,,,,,,,,,,,...,,,,,,,,,,
-41S-0Fhh40dYKRyqCzyVQ,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zv7KFh3D5eQZc7Qi7dz5FA,,,,,,,,,,,...,,,,,,,,,,
zwOIf5QYQ-NgzkVuHqf0Dg,,,,,,,,,,,...,,,,,,,,4.0,,
zxQgPWnF1BptfO0hBzJtBg,,,,,,,,,,,...,,,,,,,,,,
zys3i72fm0-Hg1SfcpKTsA,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Creating dataset from the df_review
reader = Reader(rating_scale=(df_nashville.stars.min(), df_nashville.stars.max()))
data = Dataset.load_from_df(df_nashville[['user_id', 'business_id', 'stars']], reader)

# Train/test split
trainset, testset = train_test_split(data, test_size=0.2)

In [12]:
# Define evaluation function
def evaluate_algorithm(algo, trainset, testset):
    algo.fit(trainset)
    predictions = algo.test(testset)
    
    # Compute and return RMSE
    rmse = accuracy.rmse(predictions)
    return rmse

In [13]:
# Random Recommender
random_algo = NormalPredictor()
random_rmse = evaluate_algorithm(random_algo, trainset, testset)

RMSE: 1.5264


In [14]:
# User-Based Collaborative Filtering
ubcf_algo = KNNBasic(sim_options={'user_based': True})
ubcf_rmse = evaluate_algorithm(ubcf_algo, trainset, testset)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1111


In [15]:
# Item-Based Collaborative Filtering
ibcf_algo = KNNBasic(sim_options={'user_based': False})
ibcf_rmse = evaluate_algorithm(ibcf_algo, trainset, testset)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1102


In [16]:
# Singular Value Decomposition (SVD)
svd_algo = SVD()
svd_rmse = evaluate_algorithm(svd_algo, trainset, testset)

RMSE: 1.0344


In [17]:
print(f"Random RMSE: {random_rmse:.3f}")
print(f"User-Based CF RMSE: {ubcf_rmse:.3f}")
print(f"Item-Based CF RMSE: {ibcf_rmse:.3f}")
print(f"SVD RMSE: {svd_rmse:.3f}")

Random RMSE: 1.526
User-Based CF RMSE: 1.111
Item-Based CF RMSE: 1.110
SVD RMSE: 1.034
