## Yelp Challenge

Dataset Documentation: <br>
https://www.yelp.com/dataset/documentation/main

In [175]:
import pandas as pd
import numpy as np
from cmfrec import CMF
import pycmf

import time
from copy import deepcopy

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise import Dataset
from surprise import BaselineOnly

import matplotlib.pyplot as plt
import tarfile
import json
from tqdm import tqdm

In [2]:
# load business.json
# 192609 unique businesses?
line_count = len(open("./yelp_dataset/business.json").readlines())
business_ids, cities, states, latitudes, longitudes, stars, review_counts, attributes, categories = [], [], [], [], [], [], [], [], []
with open("./yelp_dataset/business.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        business_ids += [blob["business_id"]]
        cities += [blob["city"]]
        states += [blob["state"]]
        latitudes += [blob["latitude"]]
        longitudes += [blob["longitude"]]
        stars += [blob["stars"]]
        review_counts += [blob["review_count"]]
        attributes += [blob["attributes"]]
        categories += [blob["categories"]]
        
businesses = pd.DataFrame(
    {"business_id": business_ids, "city": cities, "state": states, "latitude": latitudes, "longitude": longitudes, "stars": stars, "review_counts": review_counts, "attributes": attributes, "categories":categories }
)

100%|██████████| 192609/192609 [00:02<00:00, 72168.99it/s]


In [3]:
# load user.json
# 1637138 unique users?
line_count = len(open("./yelp_dataset/user.json").readlines())
users, review_counts, elites, average_stars, friends = [], [], [], [], []
with open("./yelp_dataset/user.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        users += [blob["user_id"]]
        review_counts += [blob["review_count"]]
        elites += [blob["elite"]]
        average_stars += [blob["average_stars"]]
        friends += [blob["friends"]]
        
users = pd.DataFrame(
    {"user_id": users, "review_count": review_counts,"elite": elites, "average_stars": average_stars, "friends": friends}
)

100%|██████████| 1637138/1637138 [00:21<00:00, 75685.86it/s]


In [4]:
# load review.json
# 6685900 unique reviews?
line_count = len(open("./yelp_dataset/review.json").readlines())
user_ids, business_ids, stars, dates, texts = [], [], [], [], []
with open("./yelp_dataset/review.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        user_ids += [blob["user_id"]]
        business_ids += [blob["business_id"]]
        stars += [blob["stars"]]
        dates += [blob["date"]]
        texts += [blob["text"]]
reviews = pd.DataFrame(
    {"user_id": user_ids, "business_id": business_ids, "rating": stars, "date": dates, "text": texts}
)
user_counts = reviews["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()
reviews = reviews.loc[reviews.user_id.isin(active_users)]

100%|██████████| 6685900/6685900 [00:58<00:00, 113366.86it/s]


In [5]:
def process(df):
    df = df.drop(df.columns[0], axis =1)
    df['date']  = pd.to_datetime(df['date'])
    df['week_day'] = df['date'].dt.weekday
    df['month'] = df['date'].dt.month
    df['hour'] = df['date'].dt.hour
    df = df.merge(users, on = 'user_id')
    df = df.merge(businesses, on = 'business_id')
    return df

## Loading Data: 20%, 50%, 100%


In [122]:
ratings_holdout_20 = pd.read_csv('data/ratings_sample_holdout_20.csv')
ratings_train_20 = pd.read_csv('data/ratings_sample_train_20.csv')
ratings_val_20 = pd.read_csv('data/ratings_sample_cv_20.csv')

ratings_holdout_50 = pd.read_csv('data/ratings_sample_holdout_50.csv')
ratings_val_50 = pd.read_csv('data/ratings_sample_cv_50.csv')
ratings_train_50 = pd.read_csv('data/ratings_sample_train_50.csv')

ratings_holdout_100 = pd.read_csv('data/ratings_sample_holdout_100.csv')
ratings_train_100 = pd.read_csv('data/ratings_sample_train_100.csv')
ratings_val_100 = pd.read_csv('data/ratings_sample_cv_100.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
ratings_train_20 = process(ratings_train_20.copy())
ratings_holdout_20 = process(ratings_holdout_20.copy())
ratings_val_20 = process(ratings_val_20.copy())

ratings_train_50 = process(ratings_train_50.copy())
ratings_holdout_50 = process(ratings_holdout_50.copy())
ratings_val_50 = process(ratings_val_50.copy())

ratings_val_100 = process(ratings_val_100.copy())
ratings_train_100 = process(ratings_train_100.copy())
ratings_holdout_100 = process(ratings_holdout_100.copy())

In [124]:
ratings_test_20 = ratings_holdout_20.loc[ratings_holdout_20.business_id.isin(ratings_train_20.business_id)]
ratings_val_20 = ratings_val_20.loc[ratings_val_20.business_id.isin(ratings_train_20.business_id)]

ratings_test_50 = ratings_holdout_50.loc[ratings_holdout_50.business_id.isin(ratings_train_50.business_id)]
ratings_val_50 = ratings_val_50.loc[ratings_val_50.business_id.isin(ratings_train_50.business_id)]

ratings_test_100 = ratings_holdout_100.loc[ratings_holdout_100.business_id.isin(ratings_train_100.business_id)]
ratings_val_100 = ratings_val_100.loc[ratings_val_100.business_id.isin(ratings_train_100.business_id)]

In [128]:
trainset_20 = ratings_train_20.iloc[:,0:3]
trainset_20.columns = ['userID', 'itemID','rating']
valset_20 = ratings_val_20.iloc[:, 0:3]
valset_20.columns = ['userID', 'itemID','rating']
testset_20 = ratings_holdout_20.iloc[:, 0:3]
testset_20.columns = ['userID', 'itemID','rating']

trainset_50 = ratings_train_50.iloc[:,0:3]
trainset_50.columns = ['userID', 'itemID','rating']
valset_50 = ratings_val_50.iloc[:, 0:3]
valset_50.columns = ['userID', 'itemID','rating']
testset_50 = ratings_holdout_50.iloc[:, 0:3]
testset_50.columns = ['userID', 'itemID','rating']

trainset_100 = ratings_train_100.iloc[:,0:3]
trainset_100.columns = ['userID', 'itemID','rating']
valset_100 = ratings_val_100.iloc[:, 0:3]
valset_100.columns = ['userID', 'itemID','rating']
testset_100 = ratings_holdout_100.iloc[:, 0:3]
testset_100.columns = ['userID', 'itemID','rating']

In [None]:
# transform to work with surprise, 20%, 50% and full dataset
reader = Reader(rating_scale = (0.0, 5.0))
train_data_20 = Dataset.load_from_df(trainset_20[['userID','itemID','rating']], reader)
val_data_20 = Dataset.load_from_df(valset_20[['userID','itemID','rating']], reader)
test_data_20 = Dataset.load_from_df(testset_20[['userID','itemID','rating']], reader)

train_data_50 = Dataset.load_from_df(trainset_50[['userID','itemID','rating']], reader)
val_data_50 = Dataset.load_from_df(valset_50[['userID','itemID','rating']], reader)
test_data_50 = Dataset.load_from_df(testset_50[['userID','itemID','rating']], reader)

train_data_100 = Dataset.load_from_df(trainset_100[['userID','itemID','rating']], reader)
val_data_100 = Dataset.load_from_df(valset_100[['userID','itemID','rating']], reader)
test_data_100 = Dataset.load_from_df(testset_100[['userID','itemID','rating']], reader)

In [None]:
train_sr_20 = train_data_20.build_full_trainset()
val_sr_before_20 = val_data_20.build_full_trainset()
val_sr_20 = val_sr_before_20.build_testset()
test_sr_before_20 = test_data_20.build_full_trainset()
test_sr_20 = test_sr_before_20.build_testset()

train_sr_50 = train_data_50.build_full_trainset()
val_sr_before_50 = val_data_20.build_full_trainset()
val_sr_50 = val_sr_before_20.build_testset()
test_sr_before_50 = test_data_50.build_full_trainset()
test_sr_50 = test_sr_before_50.build_testset()

train_sr_100 = train_data_100.build_full_trainset()
val_sr_before_100 = val_data_100.build_full_trainset()
val_sr_100 = val_sr_before_100.build_testset()
test_sr_before_100 = test_data_100.build_full_trainset()
test_sr_100 = test_sr_before_100.build_testset()

#### evaluation set up

In [13]:
ratings_train_final_20 = ratings_train_20.append(ratings_val_20)
ratings_train_final_50 = ratings_train_50.append(ratings_val_50)
ratings_train_final_100 = ratings_train_100.append(ratings_val_100)

In [14]:
ratings_entire_df_20 = ratings_train_20.append(ratings_val_20).append(ratings_holdout_20)
ratings_entire_df_50 = ratings_train_50.append(ratings_val_50).append(ratings_holdout_50)
ratings_entire_df_100 = ratings_train_100.append(ratings_val_100).append(ratings_holdout_100)

In [18]:
unique_city_businesses_20 = ratings_entire_df_20[['city','business_id']].drop_duplicates()
unique_cities_20 = unique_city_businesses_20.groupby('city').count()['business_id']
unique_cities_20 = unique_cities_20[unique_cities_20 > 100]
out_20 = pd.DataFrame()
for city in unique_cities_20.index:
    tmp = ratings_holdout_20[(ratings_holdout_20['city'] ==city) &
                              (ratings_holdout_20['rating'] >ratings_holdout_20['average_stars'])]
    if len(tmp['user_id'].unique())>4:
        
        ###this weird sampling technique is to ensure we dont' sample the same user twice in a same city
        five_users = np.random.choice(tmp['user_id'].unique(),5, replace = False)
        row = tmp[tmp['user_id'].isin(five_users)].groupby('user_id', group_keys=False).apply(lambda df: df.sample(1))
        out_20 = out_20.append(row)
        
unique_city_businesses_50 = ratings_entire_df_50[['city','business_id']].drop_duplicates()
unique_cities_50 = unique_city_businesses_50.groupby('city').count()['business_id']
unique_cities_50 = unique_cities_50[unique_cities_50 > 100]
out_50 = pd.DataFrame()
for city in unique_cities_50.index:
    tmp = ratings_holdout_50[(ratings_holdout_50['city'] ==city) &
                              (ratings_holdout_50['rating'] >ratings_holdout_50['average_stars'])]
    if len(tmp['user_id'].unique())>4:
        
        ###this weird sampling technique is to ensure we dont' sample the same user twice in a same city
        five_users = np.random.choice(tmp['user_id'].unique(),5, replace = False)
        row = tmp[tmp['user_id'].isin(five_users)].groupby('user_id', group_keys=False).apply(lambda df: df.sample(1))
        out_50 = out_50.append(row)
        
unique_city_businesses_100 = ratings_entire_df_100[['city','business_id']].drop_duplicates()
unique_cities_100 = unique_city_businesses_100.groupby('city').count()['business_id']
unique_cities_100 = unique_cities_100[unique_cities_100 > 100]
out_100 = pd.DataFrame()
for city in unique_cities_100.index:
    tmp = ratings_holdout_100[(ratings_holdout_100['city'] ==city) &
                              (ratings_holdout_100['rating'] >ratings_holdout_100['average_stars'])]
    if len(tmp['user_id'].unique())>4:
        
        ###this weird sampling technique is to ensure we dont' sample the same user twice in a same city
        five_users = np.random.choice(tmp['user_id'].unique(),5, replace = False)
        row = tmp[tmp['user_id'].isin(five_users)].groupby('user_id', group_keys=False).apply(lambda df: df.sample(1))
        out_100 = out_100.append(row)

In [19]:
predict_df_20 = out_20[['user_id','city','state']]
predict_df_20 = predict_df_20.merge(unique_city_businesses_20, on = 'city')
predict_df_20['predictions'] = 25

predict_df_50 = out_50[['user_id','city','state']]
predict_df_50 = predict_df_50.merge(unique_city_businesses_50, on = 'city')
predict_df_50['predictions'] = 25

predict_df_100 = out_50[['user_id','city','state']]
predict_df_100 = predict_df_100.merge(unique_city_businesses_100, on = 'city')
predict_df_100['predictions'] = 25

In [20]:
eval_20 = Dataset.load_from_df(predict_df_20[['user_id','business_id','predictions']], reader)
eval_50 = Dataset.load_from_df(predict_df_50[['user_id','business_id','predictions']], reader)
eval_100 = Dataset.load_from_df(predict_df_100[['user_id','business_id','predictions']], reader)

In [None]:
eval_before_20 = eval_20.build_full_trainset()
eval_sr_20 = eval_before_20.build_testset()
eval_pred_20 = algo.test(eval_sr_20)
#accuracy.rmse(eval_pred_20)
baseline_20 = pd.DataFrame(eval_pred_20, columns = ['userId','itemId','rating','pred_rating','x'])
predict_df_20['predictions'] = baseline_20.pred_rating

## Bias Baseline

$\sum_{r_{ui} \in R_{train}} \left(r_{ui} - (\mu + b_u + b_i)\right)^2 +
\lambda \left(b_u^2 + b_i^2 \right)$.

### Hyperparameter Tuning

In [197]:
bsl_options = {'method': 'als', 'n_epochs':3}
bias_baseline = BaselineOnly(bsl_options)
algo.fit(train_sr_20)
predictions = algo.test(val_sr_20)
accuracy.rmse(predictions)

RMSE: 1.3492


1.3491711762452891

In [198]:
bsl_options = {'method': 'als', 'n_epochs':5}
bias_baseline = BaselineOnly(bsl_options)
algo.fit(train_sr_20)
predictions = algo.test(val_sr_20)
accuracy.rmse(predictions)

RMSE: 1.3492


1.3491711762452891

In [199]:
bsl_options = {'method': 'als', 'n_epochs':7}
bias_baseline = BaselineOnly(bsl_options)
algo.fit(train_sr_20)
predictions = algo.test(val_sr_20)
accuracy.rmse(predictions)

RMSE: 1.3492


1.3491711762452891

In [200]:
bsl_options = {'method': 'als', 'n_epochs':9}
bias_baseline = BaselineOnly(bsl_options)
algo.fit(train_sr_20)
predictions = algo.test(val_sr_20)
accuracy.rmse(predictions)

RMSE: 1.3492


1.3491711762452891

### Thus, they are all the same; we will just use default

### Results

In [180]:
# 20%
start_time = time.time()
bias_baseline.fit(train_sr_20)
print("--- %s seconds ---" % (time.time() - start_time))

Estimating biases using als...
--- 2.0564420223236084 seconds ---


In [188]:
# 20%
bbase_p = bias_baseline.test(test_sr_20)
start_time = time.time()
bbase_20_df = pd.DataFrame(bbase_p, columns = ['userId','itemId','rating','pred_rating','x'])
accuracy.rmse(bbase_p)
print('R^2 (with 20% data): ', r2_score(bbase_20_df.rating , bbase_20_df.pred_rating))
print('MAE (with 20% data): ', mean_absolute_error(bbase_20_df.rating, bbase_20_df.pred_rating))
print("--- %s seconds ---" % (time.time() - start_time))

RMSE: 1.3545
R^2 (with 20% data):  0.19799189125456051
MAE (with 20% data):  1.127068744947832
--- 0.08617496490478516 seconds ---


In [186]:
# 50%
start_time = time.time()
bias_baseline.fit(train_sr_50)
print("--- %s seconds ---" % (time.time() - start_time))

Estimating biases using als...
--- 9.076530933380127 seconds ---


In [189]:
bbase_p_50 = bias_baseline.test(test_sr_50)
start_time = time.time()
bbase_50_df = pd.DataFrame(bbase_p_50, columns = ['userId','itemId','rating','pred_rating','x'])
accuracy.rmse(bbase_p_50)
print('R^2 (with 20% data): ', r2_score(bbase_50_df.rating , bbase_50_df.pred_rating))
print('MAE (with 20% data): ', mean_absolute_error(bbase_50_df.rating, bbase_50_df.pred_rating))
print("--- %s seconds ---" % (time.time() - start_time))

RMSE: 1.3607
R^2 (with 20% data):  0.19814212228616024
MAE (with 20% data):  1.133596632127378
--- 0.2215869426727295 seconds ---


In [191]:
# 100%
start_time = time.time()
bias_baseline.fit(train_sr_100)
print("--- %s seconds ---" % (time.time() - start_time))

Estimating biases using als...
--- 14.368994951248169 seconds ---


In [192]:
bbase_p_100 = bias_baseline.test(test_sr_100)
start_time = time.time()
bbase_100_df = pd.DataFrame(bbase_p_100, columns = ['userId','itemId','rating','pred_rating','x'])
accuracy.rmse(bbase_p_100)
print('R^2 (with 20% data): ', r2_score(bbase_100_df.rating , bbase_100_df.pred_rating))
print('MAE (with 20% data): ', mean_absolute_error(bbase_100_df.rating, bbase_100_df.pred_rating))
print("--- %s seconds ---" % (time.time() - start_time))

RMSE: 1.4015
R^2 (with 20% data):  0.1507091690943303
MAE (with 20% data):  1.1735826826580384
--- 0.412855863571167 seconds ---


### Evaluate

In [204]:
algo = BaselineOnly()
eval_before_50 = eval_20.build_full_trainset()
eval_sr_20 = eval_before_20.build_testset()
algo.fit(train_sr_20)
eval_pred_20 = algo.test(eval_sr_20)
#accuracy.rmse(predictions_50)
baseline_20 = pd.DataFrame(eval_pred_20, columns = ['userId','itemId','rating','pred_rating','x'])

algo = BaselineOnly()
eval_before_50 = eval_50.build_full_trainset()
eval_sr_50 = eval_before_50.build_testset()
algo.fit(train_sr_50)
eval_pred_50 = algo.test(eval_sr_50)
#accuracy.rmse(predictions_50)
baseline_50 = pd.DataFrame(eval_pred_50, columns = ['userId','itemId','rating','pred_rating','x'])

algo = BaselineOnly()
eval_before_100 = eval_100.build_full_trainset()
eval_sr_100 = eval_before_100.build_testset()
algo.fit(train_sr_100)
eval_pred_100 = algo.test(eval_sr_100)
#accuracy.rmse(predictions_100)
baseline_100 = pd.DataFrame(eval_pred_100, columns = ['userId','itemId','rating','pred_rating','x'])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [205]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_20, out_20, ratings_train_final_20)

0.10714285714285714 0.503095238095238 0.9697619047619042 528.3333333333334


In [206]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_50, out_50, ratings_train_final_50)

0.18035714285714285 0.20553571428571435 0.9794642857142847 440.3589285714286


In [212]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_100, out_100, ratings_train_final_100)

0.014516129032258065 0.2 0.9983870967741933 6.670967741935484


## Matrix Factorization (Baseline)

Matrix factorization is a class of collaborative filtering algorithms. The general idea behind matrix factorization is that there can exist a lower dimensional latent space of features in which users and items can be represented such that the interaction between them can be obtained by simply dot producing the corresponding dense vectors in that space. In short, it decomposes a m*n user-item interaction matrix into two m*k and k*n matrices, sharing a joint latent vector space, where m represents the number of users, and n represents the number of items. In terms of its outcome, we are likely to observe that close users in terms of preferences as well as close items in terms of characteristics can have close representations in the latent space.

The mathematical overview is as follows:
Given a n*m matrix, such that . X is the user matrix where rows represent the n users and Y is the item matrix where rows represent the m items. We want to search for the dot product of matrices X and Y that best approximate the existing interactions; i.e., we want to find X and Y that minimize the “rating reconstruction error”:

$$ (X,Y) = argmin_{X,Y} \sum_{(i,j) \in E} [(X_i)(Y_j)^T − M_{ij}]^2$$

Adding a regularization term, we can also get:

$$(X,Y) = argmin_{X,Y} ½ \sum_{(i,j) \in E} [(X_i)(Y_j)^T − M_{ij}]^2 + \lambda/2(\sum_{i,k}(X_{ik})^2 + \sum_{j,k}(Y_{jk})^2)$$

In general, we obtain the matrices X and Y following a gradient descent optimization process. And once the matrices are obtained, we can predict the ratings simply by multiplying the user vector by any item vector.

In this Yelp Rating Challenge, we used the python surprise package to implement MF. The MF algorithm there uses the SVD approach, which is essentially 

$$ P_{m * n} = U_{m * m} \sum_{m * n} V_{n * n}$$

There, the prediction is
 $$\hat(r_{ui}) = \mu + b_u + b_i + (q_i)^T p_u $$
 
and the regularized squared error that needs to be minimized is 

$$\sum_{r_{ui} \in R_{train}} (r_{ui} − \hat(r_{ui}))^2 + \lambda(b^2_{i} + b^2_{u} + ||q_i||^2 + ||p_u||^2)$$

As the way the package is designed, we tuned on n_epochs, lr_all and leg_all to get an optimal hyperparameter set, where n_epochs is the number of iterations of the SGD (stochastic gradient descent) procedure, lr_all is the learning rate for all parameters, and reg_all is the regularization term for all parameters.  


In [129]:
# transform to work with surprise, 20%, 50% and full dataset
reader = Reader(rating_scale = (0.0, 5.0))
train_data_20 = Dataset.load_from_df(trainset_20[['userID','itemID','rating']], reader)
val_data_20 = Dataset.load_from_df(valset_20[['userID','itemID','rating']], reader)
test_data_20 = Dataset.load_from_df(testset_20[['userID','itemID','rating']], reader)

train_data_50 = Dataset.load_from_df(trainset_50[['userID','itemID','rating']], reader)
val_data_50 = Dataset.load_from_df(valset_50[['userID','itemID','rating']], reader)
test_data_50 = Dataset.load_from_df(testset_50[['userID','itemID','rating']], reader)

train_data_100 = Dataset.load_from_df(trainset_100[['userID','itemID','rating']], reader)
val_data_100 = Dataset.load_from_df(valset_100[['userID','itemID','rating']], reader)
test_data_100 = Dataset.load_from_df(testset_100[['userID','itemID','rating']], reader)

In [130]:
train_sr_20 = train_data_20.build_full_trainset()
val_sr_before_20 = val_data_20.build_full_trainset()
val_sr_20 = val_sr_before_20.build_testset()
test_sr_before_20 = test_data_20.build_full_trainset()
test_sr_20 = test_sr_before_20.build_testset()

train_sr_50 = train_data_50.build_full_trainset()
val_sr_before_50 = val_data_20.build_full_trainset()
val_sr_50 = val_sr_before_20.build_testset()
test_sr_before_50 = test_data_50.build_full_trainset()
test_sr_50 = test_sr_before_50.build_testset()

train_sr_100 = train_data_100.build_full_trainset()
val_sr_before_100 = val_data_100.build_full_trainset()
val_sr_100 = val_sr_before_100.build_testset()
test_sr_before_100 = test_data_100.build_full_trainset()
test_sr_100 = test_sr_before_100.build_testset()

## Tuning

In [56]:
RMSE_tune = {}

In [51]:
n_epochs = [5, 7, 10]  # the number of iteration of the SGD procedure
lr_all = [0.002, 0.003, 0.005] # the learning rate for all parameters
reg_all =  [0.4, 0.5, 0.6] # the regularization term for all parameters

In [57]:
for n in n_epochs:
    for l in lr_all:
        for r in reg_all:
            algo = SVD(n_epochs = n, lr_all = l, reg_all = r)
            algo.fit(train_sr_20)
            predictions = algo.test(val_sr_20)
            RMSE_tune[n,l,r] = accuracy.rmse(predictions)

RMSE: 1.4159
RMSE: 1.4173
RMSE: 1.4174
RMSE: 1.4012
RMSE: 1.4016
RMSE: 1.4032
RMSE: 1.3798
RMSE: 1.3807
RMSE: 1.3821
RMSE: 1.4042
RMSE: 1.4047
RMSE: 1.4058
RMSE: 1.3873
RMSE: 1.3885
RMSE: 1.3900
RMSE: 1.3650
RMSE: 1.3665
RMSE: 1.3672
RMSE: 1.3895
RMSE: 1.3903
RMSE: 1.3918
RMSE: 1.3714
RMSE: 1.3735
RMSE: 1.3747
RMSE: 1.3497
RMSE: 1.3508
RMSE: 1.3527


In [58]:
RMSE_tune

{(5, 0.002, 0.4): 1.415933676012248,
 (5, 0.002, 0.5): 1.4173365973852405,
 (5, 0.002, 0.6): 1.417370198820145,
 (5, 0.003, 0.4): 1.4011758726577428,
 (5, 0.003, 0.5): 1.4016120313912452,
 (5, 0.003, 0.6): 1.4031618552800484,
 (5, 0.005, 0.4): 1.3798484285012471,
 (5, 0.005, 0.5): 1.3806636393067282,
 (5, 0.005, 0.6): 1.3821350022120953,
 (7, 0.002, 0.4): 1.4042009585091328,
 (7, 0.002, 0.5): 1.4047180140602467,
 (7, 0.002, 0.6): 1.405780300835095,
 (7, 0.003, 0.4): 1.3872838266494933,
 (7, 0.003, 0.5): 1.3885483890855905,
 (7, 0.003, 0.6): 1.3900448377490269,
 (7, 0.005, 0.4): 1.3649676001324043,
 (7, 0.005, 0.5): 1.3665228974538999,
 (7, 0.005, 0.6): 1.3671758260897764,
 (10, 0.002, 0.4): 1.3894940834924787,
 (10, 0.002, 0.5): 1.3903377177304177,
 (10, 0.002, 0.6): 1.3917867965697626,
 (10, 0.003, 0.4): 1.3714281812046991,
 (10, 0.003, 0.5): 1.3734950396997587,
 (10, 0.003, 0.6): 1.3746840315086657,
 (10, 0.005, 0.4): 1.3497144262970868,
 (10, 0.005, 0.5): 1.3507892432826967,
 (10, 0

## Results

In [80]:
# so the best is when n_epochs = 10, lr_all = 0.005, reg_all = 0.4,

In [131]:
algo = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4, random_state = 1)
start_time = time.time()
algo.fit(train_sr_20)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c1a0f4b70>

In [132]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 22.90842628479004 seconds ---


In [174]:
prediction_20 = algo.test(test_sr_20)
prediction_20_df = pd.DataFrame(prediction_20, columns = ['userId','itemId','rating','pred_rating','x'])
start_time = time.time()
accuracy.rmse(predictions_20)
print('R^2 (with 50% data): ', r2_score(prediction_20_df.rating , prediction_20_df.pred_rating))
print('MAE (with 50% data): ', mean_absolute_error(prediction_20_df.rating, prediction_20_df.pred_rating))

RMSE: 1.3985
R^2 (with 50% data):  -0.0005168104941091212
MAE (with 50% data):  1.3130548262648378
--- 0.058664798736572266 seconds ---


In [136]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.38172292709350586 seconds ---


In [140]:
algo = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4, random_state = 1)
start_time = time.time()
algo.fit(train_sr_50)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1be7d1b390>

In [141]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 60.414530754089355 seconds ---


In [156]:
prediction_50 = algo.test(test_sr_50)
start_time = time.time()
prediction_50_df = pd.DataFrame(prediction_50, columns = ['userId','itemId','rating','pred_rating','x'])
accuracy.rmse(predictions_50)
print('R^2 (with 50% data): ', r2_score(prediction_50_df.rating , prediction_50_df.pred_rating))
print('MAE (with 50% data): ', mean_absolute_error(prediction_50_df.rating, prediction_50_df.pred_rating))
print("--- %s seconds ---" % (time.time() - start_time))

RMSE: 1.3795
R^2 (with 50% data):  0.17579827157194183
MAE (with 50% data):  1.1636053411757143
--- 0.22292780876159668 seconds ---


In [157]:
algo = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4, random_state = 1)
start_time = time.time()
algo.fit(train_sr_100)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1be7d1bb00>

In [158]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 112.2750883102417 seconds ---


In [160]:
prediction_100 = algo.test(test_sr_100)
start_time = time.time()
prediction_100_df = pd.DataFrame(prediction_100, columns = ['userId','itemId','rating','pred_rating','x'])
accuracy.rmse(prediction_100)
print('R^2 (with 100% data): ', r2_score(prediction_100_df.rating , prediction_100_df.pred_rating))
print('MAE (with 100% data): ', mean_absolute_error(prediction_100_df.rating, prediction_100_df.pred_rating))
print("--- %s seconds ---" % (time.time() - start_time))

RMSE: 1.4097
R^2 (with 100% data):  0.14071587483799064
MAE (with 100% data):  1.1899410508462416
--- 0.39083409309387207 seconds ---


## Evaluate

In [24]:
algo = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4, random_state = 1)
eval_before_50 = eval_50.build_full_trainset()
eval_sr_50 = eval_before_50.build_testset()
algo.fit(train_sr_50)
eval_pred_50 = algo.test(eval_sr_50)
#accuracy.rmse(predictions_50)
baseline_50 = pd.DataFrame(eval_pred_50, columns = ['userId','itemId','rating','pred_rating','x'])

algo = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4, random_state = 1)
eval_before_100 = eval_100.build_full_trainset()
eval_sr_100 = eval_before_100.build_testset()
algo.fit(train_sr_100)
eval_pred_100 = algo.test(eval_sr_100)
#accuracy.rmse(predictions_100)
baseline_100 = pd.DataFrame(eval_pred_100, columns = ['userId','itemId','rating','pred_rating','x'])

In [210]:
def get_all_metrics(predict_df, validation_subsample, ratings_train_final):
    top_10_recs = predict_df.groupby(['user_id','city'])['predictions'].nlargest(10).reset_index()
    out = validation_subsample
    cnt =0
    serendipity = 0
    
    
    for row in out.iterrows():
        row_values = row[1]
        top_10 = predict_df.loc[top_10_recs[top_10_recs['user_id'] == row_values['user_id']].level_2]['business_id']
        ###In top 10
        if row_values['business_id'] in top_10.values:
            cnt+=1
        user_history = ratings_train_final[ratings_train_final['user_id'] == row_values['user_id']]    
        been_there = [i for i in top_10.values if i in  user_history.business_id.values]
        serendipity += 1-len(been_there)/10
    
    top_10 = cnt/len(out)
    serendipity = serendipity/len(out)
    
    predict_df = predict_df.reset_index()
    
    analysis_df = predict_df.merge(top_10_recs, left_on = ['user_id','city','index'], right_on = ['user_id','city','level_2'])
    
    coverage = (analysis_df.groupby('city')['business_id'].nunique()/50).values.mean()
    
    predict_df['rankings']=predict_df.groupby(['city','user_id'])['predictions'].rank("first",ascending = False)
    running_rankings =0
    for row in out.iterrows():
        row_values = row[1]
        user_recs = predict_df[(predict_df['user_id']==row_values['user_id'])
                            &(predict_df['city']==row_values['city'])
                             & (predict_df['business_id']==row_values['business_id'])
                              ]
        #assert len(user_recs)==1
        running_rankings += user_recs['rankings'].sum()

    avg_rank = running_rankings / len(out)
    print(top_10, coverage, serendipity, avg_rank)
    
    return top_10, coverage, serendipity, avg_rank

In [139]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_20, out_20, ratings_train_final_20)

0.10714285714285714 0.503095238095238 0.9697619047619042 528.3333333333334


In [27]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_50, out_50, ratings_train_final_50)

0.12678571428571428 0.2 0.9799999999999989 422.8767857142857


In [209]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_100, out_100, ratings_train_final_100)

0.014516129032258065 0.2 0.9983870967741933 6.670967741935484


## Collective Matrix Factorization

To best the baseline (which in our case, is Matrix Factorization), we first tried the Collective Matrix Factorization technique. This means that, in addition to the ratings, we also want to include additional features on either the items or the users, to see how the model perform. Before we discussed what additional features we included, we might say a few words on the method and its mathematical intuition.

Similar to Matrix Factorization, Collective Matrix Factorization also aims to decompose, but in this time, we are decomposing two matrices $X$ and $Y$ into three matrices $U$, $V$, and $Z$, such that $X \approx f(UV^T)$ and $Y \approx f(VZ^T)$. Here, X can be the same matrix we see in the case of Matrix Factorization, i.e., a simple user-item matrix filled by ratings. And matrix Y is the matrix of additional features. It can be on user or on item. And the features can be one-hot encoded (i.e., categorical) or numerical. For example, in our case, we both involved state location for businesses (which is a categorical feature) and the average rating (which is a numerical feature).

Collective Matrix Factorization also has a function to minimize, and we will explain the function through the lens of the python cmfrec package, which is the package that we used to implement Collective Matrix Factorization. The function to minimize is as follows:

$ argmin_{A, B, C, D, U_b, I_b} \lVert (X − \mu − U_b − I_b − AB^T)I_{x} \lVert^2 + \lVert U − AC^T \lVert^2 + \lVert I − BD^T \lVert^2 + \lambda (\lVert A\lVert^2 + \lVert B \lVert^2 + \lVert C \lVert^2 + \lVert D \lVert^2  + \lVert U_b \lVert^2 + \lVert I_b \lVert^2)$

Where X is the ratings matrix, I is the item-attribute matrix, U is the user-attribute matrix, and A,B,C,D are lower-dimensional matrices. $ |X| $, $|I|$, $|U|$ are the number of non-missing entries in each matrix. And $ Ubias_{u}$ and $Ibias_{i}$ are user and item biases. Thus, for such a reason, when tuning, we tune w_main and w_item when we are including additional item features and we tune w_main and w_user when we are including additional user features.

Now, with CMF, we implemented three approaches in total- state average rating, state location, and user average rating. The idea is as follows:
We first included state average rating as an item additional feature. Since our objective here is to predict the last rating for each active user, we wanted to see if location can be a useful means to bring closer the predicted ratings to the actual ones. To better utilize the location feature though, we came up with two approaches. One is to calculate the state average rating, and the other is to use state location as a categorical feature. With the state average rating, we wanted to observe if some states can have a higher average rating than others. This means that either the restaurants in that state are significantly better or the users in that state are more lenient and friendly. From EDA alone, the average rating seems to make some sense, as we observed that California has the highest state average rating (though with much fewer data observations) and New York has the lowest (same, with much fewer data observations than AZ and NV). This might suggest that, even though New York (especially manhattan area) is known as a food hub, the yelp users here can be a little picky and critical, whereas the users in CA can be more friendly. By all means though, we wanted to see if this “secondary” information that we generated for ourselves can help better predict the result.

With the second approach on the location feature, we simply fed in the one-hot encoded columns. This time, we just want to see if the location itself, as a categorical feature, can bring us any closer to the actual value.

Lastly, we also did a user average rating for the user info. This is similar to the state average rating approach, but we just wanted to see if by directly analyzing the users’ behaviors, that normally how critical they are with the restaurants, can help us better understand them and have a better predicted result than other approaches. To notice here though, we didn’t use the given average rating, since that would include the last rating that we were trying to predict; thus, we had to hand-calculated the average rating again on the training set. We also wanted to include the yelp spending time, that is the time they been using yelp and see if this can help us better predict the result. But due to the time constraint, we are not able to implement. 



In [74]:
# for cmfrec package
# universal X_train_20, 50, 100
X_train_20 = trainset_20
X_train_20.columns = ['UserId','ItemId','Rating']

X_train_50 = trainset_50
X_train_50.columns = ['UserId','ItemId','Rating']

X_train_100 = trainset_100
X_train_100.columns = ['UserId','ItemId','Rating']

In [75]:
# universal X_test_20, 50, 100
X_test_20 = testset_20
X_test_20.columns = ['UserId','ItemId','Rating']

X_test_50 = testset_50
X_test_50.columns = ['UserId','ItemId','Rating']

X_test_100 = testset_100
X_test_100.columns = ['UserId','ItemId','Rating']

### Approach 1: State Average Rating

In [76]:
# get state average rating
state_avg_20 = pd.DataFrame(ratings_train_20.groupby("state").rating.mean())
state_avg_20.columns = ['state_avg']
train_state_avg_20 = ratings_train_20.merge(state_avg_20, on = "state")

state_avg_50 = pd.DataFrame(ratings_train_50.groupby("state").rating.mean())
state_avg_50.columns = ['state_avg']
train_state_avg_50 = ratings_train_50.merge(state_avg_50, on = "state")

state_avg_100 = pd.DataFrame(ratings_train_100.groupby("state").rating.mean())
state_avg_100.columns = ['state_avg']
train_state_avg_100 = ratings_train_100.merge(state_avg_100, on = "state")

In [77]:
# item additional info: state average
item_avg_20 = train_state_avg_20.loc[:,['business_id','state_avg']]
item_avg_20.columns = ['ItemId','state_avg']

item_avg_50 = train_state_avg_50.loc[:,['business_id','state_avg']]
item_avg_50.columns = ['ItemId','state_avg']

item_avg_100 = train_state_avg_100.loc[:,['business_id','state_avg']]
item_avg_100.columns = ['ItemId','state_avg']

### 1.1 Tuning

In [117]:
tune = {}

In [118]:
w_main = [0.5, 5.0, 10.0] # weight assign to the MRSE in factorization of the ratings matrix
w_item = [0.5, 5.0, 10.0][::-1] # weight assign to the MRSE in factorization of the item attributes matrix

In [121]:
# tuning
for m in w_main:
    for i in w_item:
        model = CMF(w_main = m, w_item = i, random_seed = 1)
        model.fit(ratings = deepcopy(X_train_20), item_info = deepcopy(item_avg_20))
        prediction = model.predict(X_val_20.UserId, X_val_20.ItemId)
        X_val_20['pred_rating'] = prediction
        tune[m,i] = np.sqrt(np.mean((X_val_20.pred_rating - X_val_20.Rating)**2))

INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 0.951550
  Number of iterations: 378
  Number of functions evaluations: 429
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 0.951519
  Number of iterations: 307
  Number of functions evaluations: 351
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 0.950972
  Number of iterations: 99
  Number of functions evaluations: 110
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.175318
  Number of iterations: 1000
  Number of functions evaluations: 1077
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.163224
  Number of i

In [123]:
tune

{(0.5, 10.0): 1.4095315793715344,
 (0.5, 5.0): 1.4095280852831702,
 (0.5, 0.5): 1.4095288030679354,
 (5.0, 10.0): 1.325081092000988,
 (5.0, 5.0): 1.3249653813051294,
 (5.0, 0.5): 1.325319757480712,
 (10.0, 10.0): 1.3303194326966257,
 (10.0, 5.0): 1.3291943801778212,
 (10.0, 0.5): 1.329444185787565}

### 1.2 Results

In [None]:
### Best param: w_main = 5.0, w_item = 5.0

In [125]:
# 20%
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
start_time = time.time()
model.fit(ratings = deepcopy(X_train_20), item_info = deepcopy(item_avg_20))

INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.163018
  Number of iterations: 1000
  Number of functions evaluations: 1048


<cmfrec.CMF at 0x1b80676f60>

In [126]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 824.5971691608429 seconds ---


In [153]:
X_test_20 = X_test_20.loc[X_test_20.ItemId.isin(X_train_20.ItemId)]
X_test_50 = X_test_50.loc[X_test_50.ItemId.isin(X_train_50.ItemId)]
X_test_100 = X_test_100.loc[X_test_100.ItemId.isin(X_train_100.ItemId)]

In [137]:
state_prediction_20 = model.predict(X_test_20.UserId, X_test_20.ItemId)
X_test_20['pred_rating'] = state_prediction_20
print('RMSE (with 20% data): ', np.sqrt(np.mean((X_test_20.pred_rating - X_test_20.Rating)**2)))
print('R^2 (with 20% data): ', r2_score(X_test_20.Rating ,X_test_20.pred_rating))
print('MAE (with 20% data): ', mean_absolute_error(X_test_20.Rating, X_test_20.pred_rating))

RMSE (with 20% data):  1.3495188130540272
R^2 (with 20% data):  0.18677513147304292
MAE (with 20% data):  1.1202827761356042


In [154]:
# 50%
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
start_time = time.time()
model.fit(ratings = deepcopy(X_train_50), item_info = deepcopy(item_avg_50))

INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 7.329828
  Number of iterations: 1000
  Number of functions evaluations: 1070


<cmfrec.CMF at 0x2089df7a90>

In [155]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 1569.4463019371033 seconds ---


In [159]:
state_prediction_50 = model.predict(X_test_50.UserId, X_test_50.ItemId)
X_test_50['pred_rating'] = state_prediction_50
print('RMSE (with 50% data): ', np.sqrt(np.mean((X_test_50.pred_rating - X_test_50.Rating)**2)))
print('R^2 (with 50% data): ', r2_score(X_test_50.Rating, X_test_50.pred_rating))
print('MAE (with 50% data): ', mean_absolute_error(X_test_50.Rating,X_test_50.pred_rating))

RMSE (with 50% data):  1.3721222242704587
R^2 (with 50% data):  0.1782666934054964
MAE (with 50% data):  1.1524687846675643


In [None]:
# 100%
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
start_time = time.time()
model.fit(ratings = deepcopy(X_train_100), item_info = deepcopy(item_avg_100))

  num_elements)


In [None]:
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
state_prediction_100 = model.predict(X_test_100.UserId, X_test_100.ItemId)
X_test_100['pred_rating'] = state_prediction_100
print('RMSE (with 100% data): ', np.sqrt(np.mean((X_test_100.pred_rating - X_test_100.Rating)**2)))
print('R^2 (with 100% data): ', r2_score(X_test_100.Rating , X_test_100.pred_rating))
print('MAE (with 100% data): ', mean_absolute_error(X_test_100.Rating , X_test_100.pred_rating))

### 1.3 Evaluate

In [37]:
predict_df_20.head()

Unnamed: 0,user_id,city,state,business_id,predictions
0,4_rUho9z3p91M1r9hqA7Bg,Ajax,ON,8AW0koYMDa1PlJMOE-b2-g,3.237104
1,4_rUho9z3p91M1r9hqA7Bg,Ajax,ON,-YGQwikbX2fXUIjyegR7pw,3.680566
2,4_rUho9z3p91M1r9hqA7Bg,Ajax,ON,5Kh5i4VhXj-Leg8gujIzjQ,3.642935
3,4_rUho9z3p91M1r9hqA7Bg,Ajax,ON,Wl1oOVbtK4I9vRKoaSKYiQ,3.359295
4,4_rUho9z3p91M1r9hqA7Bg,Ajax,ON,OxSaGGTmIujsjDpDqwyGPQ,3.437118


In [42]:
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
model.fit(ratings = deepcopy(X_train_20), item_info = deepcopy(item_avg_20))
predict_df_20['predictions'] = model.predict(predict_df_20.user_id, predict_df_20.business_id)

model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
model.fit(ratings = deepcopy(X_train_50), item_info = deepcopy(item_avg_50))
predict_df_50['predictions'] = model.predict(predict_df_50.user_id, predict_df_50.business_id)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.163224
  Number of iterations: 1000
  Number of functions evaluations: 1067
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 7.329798
  Number of iterations: 1000
  Number of functions evaluations: 1069


In [None]:
model = CMF(w_main = m, w_item = i, random_seed = 1)
model.fit(ratings = deepcopy(X_train_100), item_info = deepcopy(item_avg_100))
predict_df_100['predictions'] = model.predict(predict_df_100.user_id, predict_df_100.business_id)

In [43]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_20, out_20, ratings_train_final_20)


0.13095238095238096 0.2609523809523809 0.9695238095238087 484.93809523809523


In [44]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_50, out_50, ratings_train_final_50)

0.18035714285714285 0.2041071428571429 0.9792857142857132 448.2767857142857


In [211]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_100, out_100, ratings_train_final_100)

KeyboardInterrupt: 

## Approach 2: State Location

### 2.1 Transforming Data and Tunning for Surprise

In [78]:
# one-hot encode
df_state_20 = pd.get_dummies(ratings_train_20.state)
df_state_50 = pd.get_dummies(ratings_train_50.state)
df_state_100 = pd.get_dummies(ratings_train_100.state)

In [79]:
state_loc_20 = ratings_train_20.loc[:,['business_id']]
state_loc_20.columns = ['ItemId']
state_loc_20 = state_loc_20.join(df_state_20)

state_loc_50 = ratings_train_50.loc[:,['business_id']]
state_loc_50.columns = ['ItemId']
state_loc_50 = state_loc_50.join(df_state_50)

state_loc_100 = ratings_train_100.loc[:,['business_id']]
state_loc_100.columns = ['ItemId']
state_loc_100 = state_loc_100.join(df_state_100)

In [141]:
tune_2 = {}

In [142]:
w_main = [0.5, 5.0, 10.0] # weight assign to the MRSE in factorization of the ratings matrix
w_item = [0.5, 5.0, 10.0][::-1] # weight assign to the MRSE in factorization of the item attributes matrix

In [143]:
# tuning
for m in w_main:
    for i in w_item:
        model = CMF(w_main = m, w_item = i, random_seed = 1)
        model.fit(ratings = deepcopy(X_train_20), item_info = deepcopy(state_loc_20),\
            cols_bin_item=[cl for cl in state_loc_20.columns if cl != 'ItemId'])
        prediction = model.predict(X_val_20.UserId, X_val_20.ItemId)
        X_val_20['pred_rating'] = prediction
        tune_2[m,i] = np.sqrt(np.mean((X_val_20.pred_rating - X_val_20.Rating)**2))
        

INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 1.502233
  Number of iterations: 48
  Number of functions evaluations: 59
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 1.345407
  Number of iterations: 49
  Number of functions evaluations: 61
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 0.946406
  Number of iterations: 46
  Number of functions evaluations: 58
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.707548
  Number of iterations: 1000
  Number of functions evaluations: 1050
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.544841
  Number of iter

### 2.2 Results

In [150]:
# get the best param: w_main = 5.0, w_item = 5.0
tune_2

{(0.5, 10.0): 1.4091453521706512,
 (0.5, 5.0): 1.4093103607696387,
 (0.5, 0.5): 1.4096837868418028,
 (5.0, 10.0): 1.3255848781332549,
 (5.0, 5.0): 1.3253454951331827,
 (5.0, 0.5): 1.3255074066479042,
 (10.0, 10.0): 1.3287372061500575,
 (10.0, 5.0): 1.3285188529615481,
 (10.0, 0.5): 1.3288867514741107}

In [83]:
X_test_20 = X_test_20.loc[X_test_20.ItemId.isin(X_train_20.ItemId)]
X_test_50 = X_test_50.loc[X_test_50.ItemId.isin(X_train_50.ItemId)]
X_test_100 = X_test_100.loc[X_test_100.ItemId.isin(X_train_100.ItemId)]

In [172]:
state_loc_50.head()

Unnamed: 0,ItemId,AB,AL,AR,AZ,BC,CA,CT,FL,GA,...,NY,OH,ON,PA,QC,SC,TX,VA,WA,WI
0,WTqjgwHlXbSFevF32_DJVw,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,WTqjgwHlXbSFevF32_DJVw,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,WTqjgwHlXbSFevF32_DJVw,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,WTqjgwHlXbSFevF32_DJVw,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,WTqjgwHlXbSFevF32_DJVw,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [162]:
# 20%
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
start_time = time.time()
model.fit(ratings = deepcopy(X_train_20), item_info = deepcopy(state_loc_20),\
            cols_bin_item=[cl for cl in state_loc_20.columns if cl != 'ItemId'])
print("--- %s seconds ---" % (time.time() - start_time))

INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.545013
  Number of iterations: 1000
  Number of functions evaluations: 1050
--- 846.5123789310455 seconds ---


In [84]:
start_time = time.time()
loc_prediction_20 = model.predict(X_test_20.UserId, X_test_20.ItemId)
X_test_20['pred_rating'] = loc_prediction_20
print('RMSE (with 20% data): ', np.sqrt(np.mean((X_test_20.pred_rating - X_test_20.Rating)**2)))
print('R^2 (with 20% data): ', r2_score(X_test_20.Rating , X_test_20.pred_rating))
print('MAE (with 20% data): ', mean_absolute_error(X_test_20.Rating , X_test_20.pred_rating))
print("--- %s seconds ---" % (time.time() - start_time))

RMSE (with 20% data):  1.3496651740914456
R^2 (with 20% data):  0.18659872653728782
MAE (with 20% data):  1.1201697481001782


In [161]:
# 50%
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
start_time = time.time()
model.fit(ratings = deepcopy(X_train_50), item_info = deepcopy(state_loc_50),\
            cols_bin_item=[cl for cl in state_loc_50.columns if cl != 'ItemId'])
print("--- %s seconds ---" % (time.time() - start_time))

INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 7.754635
  Number of iterations: 424
  Number of functions evaluations: 457
--- 890.4706201553345 seconds ---


In [91]:
start_time = time.time()
loc_prediction_50 = model.predict(X_test_50.UserId, X_test_50.ItemId)
X_test_50['pred_rating'] = loc_prediction_50
print('RMSE (with 50% data): ', np.sqrt(np.mean((X_test_50.pred_rating - X_test_50.Rating)**2)))
print('R^2 (with 50% data): ', r2_score(X_test_50.Rating , X_test_50.pred_rating))
print('MAE (with 50% data): ', mean_absolute_error(X_test_50.Rating , X_test_50.pred_rating))

RMSE (with 50% data):  1.3718431111368456
R^2 (with 50% data):  0.17860096862767105
MAE (with 50% data):  1.152111041520463


In [92]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.23462414741516113 seconds ---


In [None]:
# 100%
model = CMF(w_main = m, w_item = i, random_seed = 1)
start_time = time.time()
model.fit(ratings = deepcopy(X_train_100), item_info = deepcopy(item_state_avg_100))

In [None]:
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
loc_prediction_100 = model.predict(X_test_100.user_id, X_test_100.business_id)
X_test_100['pred_rating'] = loc_prediction_100
print('RMSE (with 100% data): ', np.sqrt(np.mean((X_test_100.pred_rating - X_test_100.rating)**2)))
print('R^2 (with 100% data): ', r2_score(X_test_100.rating - X_test_100.pred_rating))
print('MAE (with 100% data): ', mean_absolute_error((X_test_100.rating - X_test_100.pred_rating))

### 2.3 Evaluate

In [94]:
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
model.fit(ratings = deepcopy(X_train_20), item_info = deepcopy(state_loc_20),\
            cols_bin_item=[cl for cl in state_loc_20.columns if cl != 'ItemId'])
predict_df_20['predictions'] = model.predict(predict_df_20.user_id, predict_df_20.business_id)

model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
model.fit(ratings = deepcopy(X_train_50), item_info = deepcopy(state_loc_50),\
            cols_bin_item=[cl for cl in state_loc_50.columns if cl != 'ItemId'])
predict_df_50['predictions'] = model.predict(predict_df_50.user_id, predict_df_50.business_id)

INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.545013
  Number of iterations: 1000
  Number of functions evaluations: 1050
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 7.754635
  Number of iterations: 424
  Number of functions evaluations: 457


In [97]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_20, out_20, ratings_train_final_20)


0.13095238095238096 0.2602380952380952 0.9692857142857138 473.73571428571427


In [98]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_50, out_50, ratings_train_final_50)

0.18035714285714285 0.20500000000000004 0.979285714285713 444.30357142857144


In [None]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_100, out_100, ratings_train_final_100)

### Approach 3: User Average Rating

In [99]:
# get state average rating
user_avg_20 = pd.DataFrame(ratings_train_20.groupby("user_id").rating.mean())
user_avg_20.columns = ['user_avg']
user_avg_20 = ratings_train_20.merge(user_avg_20, on = "user_id")

user_avg_50 = pd.DataFrame(ratings_train_50.groupby("user_id").rating.mean())
user_avg_50.columns = ['user_avg']
user_avg_50 = ratings_train_50.merge(user_avg_50, on = "user_id")

user_avg_100 = pd.DataFrame(ratings_train_100.groupby("user_id").rating.mean())
user_avg_100.columns = ['user_avg']
user_avg_100 = ratings_train_100.merge(user_avg_100, on = "user_id")

In [100]:
# user additional info: user average
user_info_20 = user_avg_20.loc[:,['user_id','user_avg']]
user_info_20.columns = ['UserId','state_avg']

user_info_50 = user_avg_50.loc[:,['user_id','user_avg']]
user_info_50.columns = ['UserId','state_avg']

user_info_100 = user_avg_100.loc[:,['user_id','user_avg']]
user_info_100.columns = ['UserId','state_avg']

### 3.1 Tuning

In [146]:
tune_3 = {}

In [147]:
w_main = [0.5, 5.0, 10.0] # weight assign to the MRSE in factorization of the ratings matrix
w_user = [0.5, 5.0, 10.0][::-1] # weight assign to the MRSE in factorization of the user attributes matrix

In [151]:
# tuning
for m in w_main:
    for i in w_user:
        model = CMF(w_main = m, w_user = i, random_seed = 1)
        model.fit(ratings = deepcopy(X_train_20), user_info = deepcopy(user_info_20))
        prediction = model.predict(X_val_20.UserId, X_val_20.ItemId)
        X_val_20['pred_rating'] = prediction
        tune_3[m,i] = np.sqrt(np.mean((X_val_20.pred_rating - X_val_20.Rating)**2))

INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 0.912842
  Number of iterations: 463
  Number of functions evaluations: 530
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 0.912821
  Number of iterations: 378
  Number of functions evaluations: 436
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 0.912564
  Number of iterations: 131
  Number of functions evaluations: 147
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.144692
  Number of iterations: 1000
  Number of functions evaluations: 1085
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.126369
  Number of 

### 3.2 Results

In [152]:
# get the best param: w_main = 5.0, w_user = 10.0
tune_3

{(0.5, 10.0): 1.4093796110241372,
 (0.5, 5.0): 1.4093675138007682,
 (0.5, 0.5): 1.409379510028695,
 (5.0, 10.0): 1.3243942755866573,
 (5.0, 5.0): 1.3248028150484914,
 (5.0, 0.5): 1.3249520550165859,
 (10.0, 10.0): 1.3290478291921621,
 (10.0, 5.0): 1.3281606768811247,
 (10.0, 0.5): 1.327849756600441}

In [109]:
testset_50 = ratings_holdout_50.iloc[:, 0:3]
testset_50.columns = ['userID', 'itemID','rating']

X_test_50 = testset_50
X_test_50.columns = ['UserId','ItemId','Rating']

In [None]:
X_test_20 = X_test_20.loc[X_test_20.ItemId.isin(X_train_20.ItemId)]
X_test_50 = X_test_50.loc[X_test_50.ItemId.isin(X_train_50.ItemId)]
X_test_100 = X_test_100.loc[X_test_100.ItemId.isin(X_train_100.ItemId)]

In [48]:
# 20%
model = CMF(w_main = 5.0, w_user = 10.0, random_seed = 1)
start_time = time.time()
model.fit(ratings = deepcopy(X_train_20), user_info = deepcopy(user_info_20))

INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.150842
  Number of iterations: 1000
  Number of functions evaluations: 1098


<cmfrec.CMF at 0x1bf451cac8>

In [49]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 825.6145222187042 seconds ---


In [55]:
start_time = time.time()
user_prediction_20 = model.predict(X_test_20.UserId, X_test_20.ItemId)
X_test_20['pred_rating'] = user_prediction_20
print('RMSE (with 20% data): ', np.sqrt(np.mean((X_test_20.pred_rating - X_test_20.Rating)**2)))
print('R^2 (with 20% data): ', r2_score(X_test_20.Rating , X_test_20.pred_rating))
print('MAE (with 20% data): ', mean_absolute_error(X_test_20.Rating , X_test_20.pred_rating))

RMSE (with 20% data):  1.349372260486216
R^2 (with 20% data):  0.1869517480867452
MAE (with 20% data):  1.1194434259908603


In [56]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.562385082244873 seconds ---


In [101]:
# 50%
model = CMF(w_main = 5.0, w_user = 10.0, random_seed = 1)
start_time = time.time()
model.fit(ratings = deepcopy(X_train_50), user_info = deepcopy(user_info_50))

INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 7.326553
  Number of iterations: 1000
  Number of functions evaluations: 1096


<cmfrec.CMF at 0x1b85f566a0>

In [102]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 1636.16885304451 seconds ---


In [110]:
X_test_50 = X_test_50.loc[X_test_50.ItemId.isin(X_train_50.ItemId)]
X_test_100 = X_test_100.loc[X_test_100.ItemId.isin(X_train_100.ItemId)]

In [115]:
user_prediction_50 = model.predict(X_test_50.UserId, X_test_50.ItemId)
start_time = time.time()
X_test_50['pred_rating'] = user_prediction_50
print('RMSE (with 50% data): ', np.sqrt(np.mean((X_test_50.pred_rating - X_test_50.Rating)**2)))
print('R^2 (with 50% data): ', r2_score(X_test_50.Rating , X_test_50.pred_rating))
print('MAE (with 50% data): ', mean_absolute_error(X_test_50.Rating , X_test_50.pred_rating))

RMSE (with 50% data):  1.3721383633420057
R^2 (with 50% data):  0.17824736263395613
MAE (with 50% data):  1.152321186693606


In [116]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.28719615936279297 seconds ---


In [None]:
# 100%
model = CMF(w_main = 5.0, w_item = 10.0, random_seed = 1)
start_time = time.time()
model.fit(ratings = deepcopy(X_train_100), user_info = deepcopy(user_info_100))

In [None]:
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
user_prediction_100 = model.predict(X_test_100.user_id, X_test_100.business_id)
X_test_100['pred_rating'] = user_prediction_100
print('RMSE (with 100% data): ', np.sqrt(np.mean((X_test_100.pred_rating - X_test_100.rating)**2)))
print('R^2 (with 100% data): ', r2_score(X_test_100.rating - X_test_100.pred_rating))
print('MAE (with 100% data): ', mean_absolute_error((X_test_100.rating - X_test_100.pred_rating))

### 3.3 Evaluate

In [60]:
model = CMF(w_main = 5.0, w_user = 10.0, random_seed = 1)
model.fit(ratings = deepcopy(X_train_20), user_info = deepcopy(user_info_20))
predict_df_20['predictions'] = model.predict(predict_df_20.user_id, predict_df_20.business_id)

INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.141467
  Number of iterations: 1000
  Number of functions evaluations: 1066


In [118]:
model = CMF(w_main = 5.0, w_user = 10.0, random_seed = 1)
model.fit(ratings = deepcopy(X_train_50), user_info = deepcopy(user_info_50))
predict_df_50['predictions'] = model.predict(predict_df_50.user_id, predict_df_50.business_id)

INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 7.326606
  Number of iterations: 1000
  Number of functions evaluations: 1080


In [61]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_20, out_20, ratings_train_final_20)


0.1357142857142857 0.2580952380952381 0.9699999999999993 470.53333333333336


In [119]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_50, out_50, ratings_train_final_50)

0.18035714285714285 0.20553571428571435 0.9794642857142847 440.3589285714286
