## Yelp Challenge

Dataset Documentation: <br>
https://www.yelp.com/dataset/documentation/main

In [37]:
import pandas as pd
import numpy as np
from cmfrec import CMF
import pycmf

from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise import Dataset

import matplotlib.pyplot as plt
import tarfile
import json
from tqdm import tqdm

In [24]:
# load business.json
# 192609 unique businesses?
line_count = len(open("./yelp_dataset/business.json").readlines())
business_ids, cities, states, latitudes, longitudes, stars, review_counts, attributes, categories = [], [], [], [], [], [], [], [], []
with open("./yelp_dataset/business.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        business_ids += [blob["business_id"]]
        cities += [blob["city"]]
        states += [blob["state"]]
        latitudes += [blob["latitude"]]
        longitudes += [blob["longitude"]]
        stars += [blob["stars"]]
        review_counts += [blob["review_count"]]
        attributes += [blob["attributes"]]
        categories += [blob["categories"]]
        
businesses = pd.DataFrame(
    {"business_id": business_ids, "city": cities, "state": states, "latitude": latitudes, "longitude": longitudes, "stars": stars, "review_counts": review_counts, "attributes": attributes, "categories":categories }
)

100%|██████████| 192609/192609 [00:02<00:00, 72760.51it/s]


In [26]:
# load user.json
# 1637138 unique users?
line_count = len(open("./yelp_dataset/user.json").readlines())
users, review_counts, elites, average_stars, friends = [], [], [], [], []
with open("./yelp_dataset/user.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        users += [blob["user_id"]]
        review_counts += [blob["review_count"]]
        elites += [blob["elite"]]
        average_stars += [blob["average_stars"]]
        friends += [blob["friends"]]
        
users = pd.DataFrame(
    {"user_id": users, "review_count": review_counts,"elite": elites, "average_stars": average_stars, "friends": friends}
)

100%|██████████| 1637138/1637138 [00:20<00:00, 79512.69it/s] 


In [25]:
# load review.json
# 6685900 unique reviews?
line_count = len(open("./yelp_dataset/review.json").readlines())
user_ids, business_ids, stars, dates, texts = [], [], [], [], []
with open("./yelp_dataset/review.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        user_ids += [blob["user_id"]]
        business_ids += [blob["business_id"]]
        stars += [blob["stars"]]
        dates += [blob["date"]]
        texts += [blob["text"]]
reviews = pd.DataFrame(
    {"user_id": user_ids, "business_id": business_ids, "rating": stars, "date": dates, "text": texts}
)
user_counts = reviews["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()
reviews = reviews.loc[reviews.user_id.isin(active_users)]

100%|██████████| 6685900/6685900 [00:55<00:00, 120458.76it/s]


In [22]:
def process(df):
    df = df.drop(df.columns[0], axis =1)
    df['date']  = pd.to_datetime(df['date'])
    df['week_day'] = df['date'].dt.weekday
    df['month'] = df['date'].dt.month
    df['hour'] = df['date'].dt.hour
    df = df.merge(users, on = 'user_id')
    df = df.merge(businesses, on = 'business_id')
    return df

## Baseline: SVD as Matrix Factorization

As a CF algorithm. A matrix factorization technique that reduces the number of features of a data set by reducing space dimensions from N to K where K < N. Thus, in our context, we are finding 2 matrices whose product is the original matrix. 

### Loading and Transforming


In [72]:
# tunning sets
ratings_holdout_20 = pd.read_csv('data/ratings_sample_holdout_20.csv')
ratings_train_20 = pd.read_csv('data/ratings_sample_train_20.csv')
ratings_val_20 = pd.read_csv('data/ratings_sample_cv_20.csv')

ratings_holdout_50 = pd.read_csv('data/ratings_sample_holdout_50.csv')
ratings_val_50 = pd.read_csv('data/ratings_sample_val_50.csv')
ratings_train_50 = pd.read_csv('data/ratings_sample_train_50.csv')

ratings_holdout = pd.read_csv('data/ratings_sample_holdout.csv')
ratings_train = pd.read_csv('data/ratings_sample_train.csv')
ratings_val = pd.read_csv('data/ratings_sample_cv.csv')

In [64]:
ratings_train_20 = process(ratings_train_20.copy())
ratings_holdout_20 = process(ratings_holdout_20.copy())
ratings_val_20 = process(ratings_val_20.copy())

ratings_train_50 = process(ratings_train_50.copy())
ratings_holdout_50 = process(ratings_holdout_50.copy())
ratings_val_50 = process(ratings_val_50.copy())

ratings_val = process(ratings_val.copy())
ratings_train = process(ratings_train.copy())
ratings_holdout = process(ratings_holdout.copy())

In [66]:
ratings_test_20 = ratings_holdout_20.loc[ratings_holdout_20.business_id.isin(ratings_train_20.business_id)]
ratings_val_20 = ratings_val_20.loc[ratings_val_20.business_id.isin(ratings_train_20.business_id)]

ratings_test_50 = ratings_holdout_50.loc[ratings_holdout_50.business_id.isin(ratings_train_50.business_id)]
ratings_val_50 = ratings_val_50.loc[ratings_val_50.business_id.isin(ratings_train_50.business_id)]

ratings_test = ratings_holdout.loc[ratings_holdout.business_id.isin(ratings_train.business_id)]
ratings_val = ratings_val.loc[ratings_val.business_id.isin(ratings_train.business_id)]

In [68]:
trainset_20 = ratings_train_20.iloc[:,0:3]
trainset_20.columns = ['userID', 'itemID','rating']
valset_20 = ratings_val_20.iloc[:, 0:3]
valset_20.columns = ['userID', 'itemID','rating']
testset_20 = ratings_holdout_20.iloc[:, 0:3]
testset_20.columns = ['userID', 'itemID','rating']

trainset_50 = ratings_train_50.iloc[:,0:3]
trainset_50.columns = ['userID', 'itemID','rating']
valset_50 = ratings_val_50.iloc[:, 0:3]
valset_50.columns = ['userID', 'itemID','rating']
testset_50 = ratings_holdout_50.iloc[:, 0:3]
testset_50.columns = ['userID', 'itemID','rating']

trainset = ratings_train.iloc[:,0:3]
trainset.columns = ['userID', 'itemID','rating']
valset = ratings_val.iloc[:, 0:3]
valset.columns = ['userID', 'itemID','rating']
testset = ratings_holdout.iloc[:, 0:3]
testset.columns = ['userID', 'itemID','rating']

In [None]:
# transform to work with surprise, 20%, 50% and full dataset
reader = Reader(rating_scale = (0.0, 5.0))
train_data_20 = Dataset.load_from_df(trainset_20[['userID','itemID','rating']], reader)
val_data_20 = Dataset.load_from_df(valset_20[['userID','itemID','rating']], reader)
test_data_20 = Dataset.load_from_df(testset_20[['userID','itemID','rating']], reader)

train_data_50 = Dataset.load_from_df(trainset_50[['userID','itemID','rating']], reader)
val_data_50 = Dataset.load_from_df(valset_50[['userID','itemID','rating']], reader)
test_data_50 = Dataset.load_from_df(testset_50[['userID','itemID','rating']], reader)

train_data = Dataset.load_from_df(trainset[['userID','itemID','rating']], reader)
val_data = Dataset.load_from_df(valset[['userID','itemID','rating']], reader)
test_data = Dataset.load_from_df(testset[['userID','itemID','rating']], reader)

In [None]:
train_sr_20 = train_data_20.build_full_trainset()
val_sr_before_20 = val_data_20.build_full_trainset()
val_sr_20 = val_sr_before_20.build_testset()
test_sr_before_20 = test_data_20.build_full_trainset()
test_sr_20 = test_sr_before_20.build_testset()

train_sr_50 = train_data_50.build_full_trainset()
val_sr_before_50 = val_data_20.build_full_trainset()
val_sr_50 = val_sr_before_20.build_testset()
test_sr_before_50 = test_data_50.build_full_trainset()
test_sr_50 = test_sr_before_50.build_testset()

train_sr = train_data.build_full_trainset()
val_sr_before = val_data.build_full_trainset()
val_sr = val_sr_before.build_testset()
test_sr_before = test_data.build_full_trainset()
test_sr = test_sr_before.build_testset()

## Tuning

In [56]:
RMSE_tune = {}

In [51]:
n_epochs = [5, 7, 10]  # the number of iteration of the SGD procedure
lr_all = [0.002, 0.003, 0.005] # the learning rate for all parameters
reg_all =  [0.4, 0.5, 0.6] # the regularization term for all parameters

In [57]:
for n in n_epochs:
    for l in lr_all:
        for r in reg_all:
            algo = SVD(n_epochs = n, lr_all = l, reg_all = r)
            algo.fit(train_sr_20)
            predictions = algo.test(val_sr_20)
            RMSE_tune[n,l,r] = accuracy.rmse(predictions)

RMSE: 1.4159
RMSE: 1.4173
RMSE: 1.4174
RMSE: 1.4012
RMSE: 1.4016
RMSE: 1.4032
RMSE: 1.3798
RMSE: 1.3807
RMSE: 1.3821
RMSE: 1.4042
RMSE: 1.4047
RMSE: 1.4058
RMSE: 1.3873
RMSE: 1.3885
RMSE: 1.3900
RMSE: 1.3650
RMSE: 1.3665
RMSE: 1.3672
RMSE: 1.3895
RMSE: 1.3903
RMSE: 1.3918
RMSE: 1.3714
RMSE: 1.3735
RMSE: 1.3747
RMSE: 1.3497
RMSE: 1.3508
RMSE: 1.3527


In [58]:
RMSE_tune

{(5, 0.002, 0.4): 1.415933676012248,
 (5, 0.002, 0.5): 1.4173365973852405,
 (5, 0.002, 0.6): 1.417370198820145,
 (5, 0.003, 0.4): 1.4011758726577428,
 (5, 0.003, 0.5): 1.4016120313912452,
 (5, 0.003, 0.6): 1.4031618552800484,
 (5, 0.005, 0.4): 1.3798484285012471,
 (5, 0.005, 0.5): 1.3806636393067282,
 (5, 0.005, 0.6): 1.3821350022120953,
 (7, 0.002, 0.4): 1.4042009585091328,
 (7, 0.002, 0.5): 1.4047180140602467,
 (7, 0.002, 0.6): 1.405780300835095,
 (7, 0.003, 0.4): 1.3872838266494933,
 (7, 0.003, 0.5): 1.3885483890855905,
 (7, 0.003, 0.6): 1.3900448377490269,
 (7, 0.005, 0.4): 1.3649676001324043,
 (7, 0.005, 0.5): 1.3665228974538999,
 (7, 0.005, 0.6): 1.3671758260897764,
 (10, 0.002, 0.4): 1.3894940834924787,
 (10, 0.002, 0.5): 1.3903377177304177,
 (10, 0.002, 0.6): 1.3917867965697626,
 (10, 0.003, 0.4): 1.3714281812046991,
 (10, 0.003, 0.5): 1.3734950396997587,
 (10, 0.003, 0.6): 1.3746840315086657,
 (10, 0.005, 0.4): 1.3497144262970868,
 (10, 0.005, 0.5): 1.3507892432826967,
 (10, 0

## Results

In [None]:
%% time
# so the best is when n_epochs = 10, lr_all = 0.005, reg_all = 0.5,
# and the RMSE score is 1.3497
# train and test on the optimal parameter
start_time = time.time()
algo_real = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.5)
algo.fit(train_sr_20)
predictions = algo.test(test_sr_20)
accuracy.rmse(predictions)

In [None]:
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
%% time
# so the best is when n_epochs = 10, lr_all = 0.005, reg_all = 0.5,
# and the RMSE score is 1.3497
# train and test on the optimal parameter
start_time = time.time()
algo_real = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.5)
algo.fit(train_sr_50)
predictions = algo.test(test_sr_50)
accuracy.rmse(predictions)

In [None]:
print("--- %s seconds ---" % (time.time() - start_time))

In [60]:
%% time
# so the best is when n_epochs = 10, lr_all = 0.005, reg_all = 0.5,
# and the RMSE score is 1.3497
# train and test on the optimal parameter
start_time = time.time()
algo_real = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.5)
algo.fit(train_sr)
predictions = algo.test(test_sr)
accuracy.rmse(predictions)

RMSE: 1.4168


1.4167680096618858

In [None]:
print("--- %s seconds ---" % (time.time() - start_time))