## Yelp Challenge

Dataset Documentation: <br>
https://www.yelp.com/dataset/documentation/main

In [1]:
import pandas as pd
import numpy as np
from cmfrec import CMF
import pycmf

import time
from copy import deepcopy

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise import Dataset

import matplotlib.pyplot as plt
import tarfile
import json
from tqdm import tqdm

In [2]:
# load business.json
# 192609 unique businesses?
# Count the lines first so tqdm can display a total. Streaming the handle inside a
# `with` block closes the file and avoids holding every line in memory at once.
with open("./yelp_dataset/business.json") as f:
    line_count = sum(1 for _ in f)
business_ids, cities, states, latitudes, longitudes, stars, review_counts, attributes, categories = [], [], [], [], [], [], [], [], []
with open("./yelp_dataset/business.json") as f:
    for line in tqdm(f, total=line_count):
        # Each line is parsed as its own JSON object; only the fields below are kept.
        blob = json.loads(line)
        business_ids.append(blob["business_id"])
        cities.append(blob["city"])
        states.append(blob["state"])
        latitudes.append(blob["latitude"])
        longitudes.append(blob["longitude"])
        stars.append(blob["stars"])
        review_counts.append(blob["review_count"])
        attributes.append(blob["attributes"])
        categories.append(blob["categories"])

# One row per parsed line. Note the column is named "review_counts" (plural) here.
businesses = pd.DataFrame(
    {"business_id": business_ids, "city": cities, "state": states, "latitude": latitudes, "longitude": longitudes, "stars": stars, "review_counts": review_counts, "attributes": attributes, "categories":categories }
)

100%|██████████| 192609/192609 [00:02<00:00, 72168.99it/s]


In [3]:
# load user.json
# 1637138 unique users?
# Count the lines first so tqdm can display a total. Streaming the handle inside a
# `with` block closes the file and avoids holding every line in memory at once.
with open("./yelp_dataset/user.json") as f:
    line_count = sum(1 for _ in f)
# `user_ids` holds the raw ids; `users` is only ever bound to the final DataFrame.
user_ids, review_counts, elites, average_stars, friends = [], [], [], [], []
with open("./yelp_dataset/user.json") as f:
    for line in tqdm(f, total=line_count):
        # Each line is parsed as its own JSON object; only the fields below are kept.
        blob = json.loads(line)
        user_ids.append(blob["user_id"])
        review_counts.append(blob["review_count"])
        elites.append(blob["elite"])
        average_stars.append(blob["average_stars"])
        friends.append(blob["friends"])

# One row per parsed line of user.json.
users = pd.DataFrame(
    {"user_id": user_ids, "review_count": review_counts,"elite": elites, "average_stars": average_stars, "friends": friends}
)

100%|██████████| 1637138/1637138 [00:21<00:00, 75685.86it/s]


In [4]:
# load review.json
# 6685900 unique reviews?
# Count the lines first so tqdm can display a total. Streaming the handle inside a
# `with` block closes the file and avoids holding every line in memory at once.
with open("./yelp_dataset/review.json") as f:
    line_count = sum(1 for _ in f)
user_ids, business_ids, stars, dates, texts = [], [], [], [], []
with open("./yelp_dataset/review.json") as f:
    for line in tqdm(f, total=line_count):
        # Each line is parsed as its own JSON object; only the fields below are kept.
        blob = json.loads(line)
        user_ids.append(blob["user_id"])
        business_ids.append(blob["business_id"])
        stars.append(blob["stars"])
        dates.append(blob["date"])
        texts.append(blob["text"])
# The "stars" field of each review is stored under the column name "rating".
reviews = pd.DataFrame(
    {"user_id": user_ids, "business_id": business_ids, "rating": stars, "date": dates, "text": texts}
)
# Keep only reviews written by users who have at least 5 reviews in the file.
user_counts = reviews["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()
reviews = reviews.loc[reviews.user_id.isin(active_users)]

100%|██████████| 6685900/6685900 [00:58<00:00, 113366.86it/s]


In [5]:
def process(df):
    """Prepare a ratings frame for modelling.

    Drops the first column of ``df``, parses ``date`` into datetimes, derives
    ``week_day``, ``month`` and ``hour`` from it, then merges the notebook-level
    ``users`` frame on ``user_id`` and the ``businesses`` frame on ``business_id``.

    Returns the merged frame; ``df`` itself is not modified.
    """
    # NOTE(review): the first column is presumably a saved index column — confirm.
    first_column = df.columns[0]
    enriched = df.drop(first_column, axis=1)

    enriched['date'] = pd.to_datetime(enriched['date'])
    parsed = enriched['date'].dt
    enriched['week_day'] = parsed.weekday
    enriched['month'] = parsed.month
    enriched['hour'] = parsed.hour

    with_users = enriched.merge(users, on='user_id')
    return with_users.merge(businesses, on='business_id')

## Loading Data: 20%, 50%, 100%


In [7]:
# Read the pre-built holdout / train / cv CSVs for the 20%, 50% and 100% samples.
# Files are read in the order listed; each tuple unpacks onto the matching names.
ratings_holdout_20, ratings_train_20, ratings_val_20 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ratings_sample_holdout_20.csv',
        'data/ratings_sample_train_20.csv',
        'data/ratings_sample_cv_20.csv',
    )
)

ratings_holdout_50, ratings_val_50, ratings_train_50 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ratings_sample_holdout_50.csv',
        'data/ratings_sample_cv_50.csv',
        'data/ratings_sample_train_50.csv',
    )
)

ratings_holdout_100, ratings_train_100, ratings_val_100 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ratings_sample_holdout_100.csv',
        'data/ratings_sample_train_100.csv',
        'data/ratings_sample_cv_100.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# Run process() on a copy of every split. Each name is rebound to the processed frame.
ratings_train_20, ratings_holdout_20, ratings_val_20 = (
    process(raw_split.copy())
    for raw_split in (ratings_train_20, ratings_holdout_20, ratings_val_20)
)

ratings_train_50, ratings_holdout_50, ratings_val_50 = (
    process(raw_split.copy())
    for raw_split in (ratings_train_50, ratings_holdout_50, ratings_val_50)
)

ratings_val_100, ratings_train_100, ratings_holdout_100 = (
    process(raw_split.copy())
    for raw_split in (ratings_val_100, ratings_train_100, ratings_holdout_100)
)

In [9]:
# Keep only holdout / validation rows whose business also appears in the matching
# training split. The holdout frames are left untouched; the filtered holdout rows
# are stored as ratings_test_*, while ratings_val_* is rebound to its filtered version.
ratings_test_20 = ratings_holdout_20[ratings_holdout_20['business_id'].isin(ratings_train_20['business_id'])]
ratings_val_20 = ratings_val_20[ratings_val_20['business_id'].isin(ratings_train_20['business_id'])]

ratings_test_50 = ratings_holdout_50[ratings_holdout_50['business_id'].isin(ratings_train_50['business_id'])]
ratings_val_50 = ratings_val_50[ratings_val_50['business_id'].isin(ratings_train_50['business_id'])]

ratings_test_100 = ratings_holdout_100[ratings_holdout_100['business_id'].isin(ratings_train_100['business_id'])]
ratings_val_100 = ratings_val_100[ratings_val_100['business_id'].isin(ratings_train_100['business_id'])]

In [10]:
def _as_rating_triplet(frame):
    """Return an independent copy of the first three columns of ``frame``,
    relabelled ``userID`` / ``itemID`` / ``rating``.

    The explicit ``.copy()`` means columns added to the result later do not
    trigger chained-assignment warnings against the source frame.
    """
    # assumes the first three columns are user id, business id, rating — TODO confirm
    triplet = frame.iloc[:, 0:3].copy()
    triplet.columns = ['userID', 'itemID', 'rating']
    return triplet


# The test triplets are built from ratings_test_* (holdout rows restricted to
# businesses seen in training), mirroring how ratings_val_* is filtered. The
# original code used the unfiltered ratings_holdout_* and never read ratings_test_*.
trainset_20 = _as_rating_triplet(ratings_train_20)
valset_20 = _as_rating_triplet(ratings_val_20)
testset_20 = _as_rating_triplet(ratings_test_20)

trainset_50 = _as_rating_triplet(ratings_train_50)
valset_50 = _as_rating_triplet(ratings_val_50)
testset_50 = _as_rating_triplet(ratings_test_50)

trainset_100 = _as_rating_triplet(ratings_train_100)
valset_100 = _as_rating_triplet(ratings_val_100)
testset_100 = _as_rating_triplet(ratings_test_100)

In [None]:
# Training + validation rows stacked into one frame per sample size.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0;
# row order and index labels are the same as append produced.
ratings_train_final_20 = pd.concat([ratings_train_20, ratings_val_20])
ratings_train_final_50 = pd.concat([ratings_train_50, ratings_val_50])
ratings_train_final_100 = pd.concat([ratings_train_100, ratings_val_100])

In [None]:
# Train + validation + holdout rows stacked into one frame per sample size.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0;
# row order and index labels are the same as the chained appends produced.
ratings_entire_df_20 = pd.concat([ratings_train_20, ratings_val_20, ratings_holdout_20])
ratings_entire_df_50 = pd.concat([ratings_train_50, ratings_val_50, ratings_holdout_50])
ratings_entire_df_100 = pd.concat([ratings_train_100, ratings_val_100, ratings_holdout_100])

In [None]:
def _sample_liked_reviews_by_city(entire_df, holdout_df, min_businesses=100, users_per_city=5):
    """Pick one above-average holdout review for each of a few users per large city.

    Parameters
    ----------
    entire_df : DataFrame with ``city`` and ``business_id`` columns, used to count
        the distinct businesses in each city.
    holdout_df : DataFrame with ``city``, ``user_id``, ``rating`` and
        ``average_stars`` columns, from which the reviews are sampled.
    min_businesses : a city qualifies only with strictly more distinct businesses.
    users_per_city : number of distinct users drawn per qualifying city; cities
        with fewer candidate users are skipped.

    Returns
    -------
    (city_businesses, large_cities, sampled)
        The distinct (city, business_id) pairs, the per-city business counts of the
        qualifying cities, and the sampled holdout rows (an empty DataFrame when no
        city qualifies).

    Sampling uses numpy's global random state, so results change between runs
    unless the caller seeds it.
    """
    city_businesses = entire_df[['city', 'business_id']].drop_duplicates()
    businesses_per_city = city_businesses.groupby('city').count()['business_id']
    large_cities = businesses_per_city[businesses_per_city > min_businesses]

    # Loop-invariant filter: reviews rated above the reviewer's own average_stars.
    liked = holdout_df[holdout_df['rating'] > holdout_df['average_stars']]

    sampled_rows = []
    for city in large_cities.index:
        city_reviews = liked[liked['city'] == city]
        candidates = city_reviews['user_id'].unique()
        if len(candidates) < users_per_city:
            continue
        # Draw users first, then one review per user, so that the same user is
        # never sampled twice within a city.
        chosen_users = np.random.choice(candidates, users_per_city, replace=False)
        sampled_rows.append(
            city_reviews[city_reviews['user_id'].isin(chosen_users)]
            .groupby('user_id', group_keys=False)
            .apply(lambda df: df.sample(1))
        )

    # Concatenate once instead of growing a DataFrame inside the loop.
    sampled = pd.concat(sampled_rows) if sampled_rows else pd.DataFrame()
    return city_businesses, large_cities, sampled


unique_city_businesses_20, unique_cities_20, out_20 = _sample_liked_reviews_by_city(
    ratings_entire_df_20, ratings_holdout_20)

unique_city_businesses_50, unique_cities_50, out_50 = _sample_liked_reviews_by_city(
    ratings_entire_df_50, ratings_holdout_50)

unique_city_businesses_100, unique_cities_100, out_100 = _sample_liked_reviews_by_city(
    ratings_entire_df_100, ratings_holdout_100)

In [None]:
# For every sampled (user, city) pair, list every business known in that city.
# 'predictions' starts as the placeholder value 25 and is overwritten by each
# model's predictions in the evaluation cells.
predict_df_20 = out_20[['user_id','city','state']]
predict_df_20 = predict_df_20.merge(unique_city_businesses_20, on = 'city')
predict_df_20['predictions'] = 25

predict_df_50 = out_50[['user_id','city','state']]
predict_df_50 = predict_df_50.merge(unique_city_businesses_50, on = 'city')
predict_df_50['predictions'] = 25

# Fixed copy-paste slip: the 100% frame is built from out_100, not out_50.
predict_df_100 = out_100[['user_id','city','state']]
predict_df_100 = predict_df_100.merge(unique_city_businesses_100, on = 'city')
predict_df_100['predictions'] = 25

## CMF

In [39]:
# for cmfrec package
# universal X_train_20, 50, 100
# X_train_* are aliases of trainset_* (same objects), so the rename below also
# relabels the trainset_* frames.
X_train_20, X_train_50, X_train_100 = trainset_20, trainset_50, trainset_100
for ratings_frame in (X_train_20, X_train_50, X_train_100):
    ratings_frame.columns = ['UserId', 'ItemId', 'Rating']

In [40]:
# universal X_test_20, 50, 100
# X_test_* are aliases of testset_*; each one is relabelled with the
# UserId / ItemId / Rating column names that the prediction cells read.
X_test_20 = testset_20
X_test_20.columns = ['UserId','ItemId','Rating']

# Fixed: the 50% and 100% renames previously targeted X_train_* (already renamed),
# leaving X_test_50 / X_test_100 without UserId / ItemId / Rating columns.
X_test_50 = testset_50
X_test_50.columns = ['UserId','ItemId','Rating']

X_test_100 = testset_100
X_test_100.columns = ['UserId','ItemId','Rating']

In [41]:
# universal X_val_20
# Relabel the 20% validation triplet and expose it as X_val_20 (same object as valset_20).
valset_20.columns = ['UserId', 'ItemId', 'Rating']
X_val_20 = valset_20

### 1. State Average Rating

In [34]:
# Mean training rating per state (index: state, single column 'state_avg'),
# then attached to every training row through a merge on "state".
state_avg_20 = ratings_train_20.groupby("state")["rating"].mean().to_frame("state_avg")
train_state_avg_20 = ratings_train_20.merge(state_avg_20, on="state")

state_avg_50 = ratings_train_50.groupby("state")["rating"].mean().to_frame("state_avg")
train_state_avg_50 = ratings_train_50.merge(state_avg_50, on="state")

state_avg_100 = ratings_train_100.groupby("state")["rating"].mean().to_frame("state_avg")
train_state_avg_100 = ratings_train_100.merge(state_avg_100, on="state")

In [35]:
# Item side information: business id (as ItemId) plus its state's average rating.
# NOTE(review): there is one row per training rating, so an ItemId can repeat —
# confirm this is what CMF expects for item_info.
item_avg_20 = train_state_avg_20[['business_id', 'state_avg']].rename(columns={'business_id': 'ItemId'})

item_avg_50 = train_state_avg_50[['business_id', 'state_avg']].rename(columns={'business_id': 'ItemId'})

item_avg_100 = train_state_avg_100[['business_id', 'state_avg']].rename(columns={'business_id': 'ItemId'})

In [117]:
# Validation RMSE per (w_main, w_item) pair; filled in by the tuning loop.
tune = {}

In [118]:
# Candidate weights for the grid search.
# w_main: weight on the error term for the ratings-matrix factorization.
# w_item: weight on the error term for the item-attributes factorization,
#         listed from largest to smallest.
w_main = [0.5, 5.0, 10.0]
w_item = [10.0, 5.0, 0.5]

In [121]:
# Grid search on the 20% sample: fit one CMF model per (w_main, w_item) pair with
# the state-average item info, then record its RMSE on the validation triplet.
# The 'pred_rating' column of X_val_20 is overwritten on every iteration.
for m in w_main:
    for i in w_item:
        model = CMF(w_main=m, w_item=i, random_seed=1)
        model.fit(ratings=deepcopy(X_train_20), item_info=deepcopy(item_avg_20))
        X_val_20['pred_rating'] = model.predict(X_val_20.UserId, X_val_20.ItemId)
        squared_error = (X_val_20.pred_rating - X_val_20.Rating) ** 2
        tune[m, i] = np.sqrt(np.mean(squared_error))

INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 0.951550
  Number of iterations: 378
  Number of functions evaluations: 429
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 0.951519
  Number of iterations: 307
  Number of functions evaluations: 351
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 0.950972
  Number of iterations: 99
  Number of functions evaluations: 110
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.175318
  Number of iterations: 1000
  Number of functions evaluations: 1077
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.163224
  Number of i

In [123]:
# Display the validation RMSE recorded for every (w_main, w_item) pair.
tune

{(0.5, 10.0): 1.4095315793715344,
 (0.5, 5.0): 1.4095280852831702,
 (0.5, 0.5): 1.4095288030679354,
 (5.0, 10.0): 1.325081092000988,
 (5.0, 5.0): 1.3249653813051294,
 (5.0, 0.5): 1.325319757480712,
 (10.0, 10.0): 1.3303194326966257,
 (10.0, 5.0): 1.3291943801778212,
 (10.0, 0.5): 1.329444185787565}

### 1.2 Results

In [None]:
### Best param: w_main = 5.0, w_item = 5.0

In [None]:
# 100%
# Refit on the full training sample with the setting that had the lowest
# validation RMSE in `tune` (w_main = 5.0, w_item = 5.0).
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
# start_time is read by the timing cell that follows, so it is taken right before fit().
start_time = time.time()
model.fit(ratings = deepcopy(X_train_100), item_info = deepcopy(item_avg_100))

  num_elements)


In [None]:
# Report the wall-clock time elapsed since start_time was recorded.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Predict the 100% test triplet with the fitted model, store the predictions in
# X_test_100['pred_rating'], and report RMSE, R^2 and MAE against the true ratings.
state_prediction_100 = model.predict(X_test_100.UserId, X_test_100.ItemId)
X_test_100['pred_rating'] = state_prediction_100

actual_100 = X_test_100.Rating
predicted_100 = X_test_100.pred_rating
rmse_100 = np.sqrt(np.mean((predicted_100 - actual_100)**2))
print('RMSE (with 100% data): ', rmse_100)
print('R^2 (with 100% data): ', r2_score(actual_100, predicted_100))
print('MAE (with 100% data): ', mean_absolute_error(actual_100, predicted_100))

### 1.3 Evaluate

In [None]:
# Refit the state-average model on the 100% training sample and score every
# (user, business) pair in predict_df_100.
# Fixed: the weights were read from the leaked tuning-loop variables `m` and `i`
# (whatever the last loop run left behind); use the selected best setting instead.
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
model.fit(ratings = deepcopy(X_train_100), item_info = deepcopy(item_avg_100))
predict_df_100['predictions'] = model.predict(predict_df_100.user_id, predict_df_100.business_id)

In [None]:
# NOTE(review): get_all_metrics is not defined or imported in the visible part of
# this notebook — confirm where it comes from before running on a fresh kernel.
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_100, out_100, ratings_train_final_100)

## 2. State Location

### 2.1 Transforming Data and Tuning

In [139]:
# One-hot encode the state of every training row (one indicator column per state).
df_state_20 = pd.get_dummies(ratings_train_20['state'])
df_state_50 = pd.get_dummies(ratings_train_50['state'])
df_state_100 = pd.get_dummies(ratings_train_100['state'])

In [140]:
# Item side information: business id (as ItemId) joined by row index to the
# one-hot state columns of the same training row.
state_loc_20 = (
    ratings_train_20[['business_id']]
    .rename(columns={'business_id': 'ItemId'})
    .join(df_state_20)
)

state_loc_50 = (
    ratings_train_50[['business_id']]
    .rename(columns={'business_id': 'ItemId'})
    .join(df_state_50)
)

state_loc_100 = (
    ratings_train_100[['business_id']]
    .rename(columns={'business_id': 'ItemId'})
    .join(df_state_100)
)

In [141]:
# Validation RMSE per (w_main, w_item) pair for the state one-hot model.
tune_2 = {}

In [142]:
# Candidate weights for the grid search.
# w_main: weight on the error term for the ratings-matrix factorization.
# w_item: weight on the error term for the item-attributes factorization,
#         listed from largest to smallest.
w_main = [0.5, 5.0, 10.0]
w_item = [10.0, 5.0, 0.5]

In [143]:
# Grid search on the 20% sample: fit one CMF model per (w_main, w_item) pair with
# the state one-hot item info, then record its RMSE on the validation triplet.
# Every column except 'ItemId' is passed through cols_bin_item.
# The 'pred_rating' column of X_val_20 is overwritten on every iteration.
for m in w_main:
    for i in w_item:
        model = CMF(w_main=m, w_item=i, random_seed=1)
        model.fit(
            ratings=deepcopy(X_train_20),
            item_info=deepcopy(state_loc_20),
            cols_bin_item=[cl for cl in state_loc_20.columns if cl != 'ItemId'],
        )
        X_val_20['pred_rating'] = model.predict(X_val_20.UserId, X_val_20.ItemId)
        squared_error = (X_val_20.pred_rating - X_val_20.Rating) ** 2
        tune_2[m, i] = np.sqrt(np.mean(squared_error))
        

INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 1.502233
  Number of iterations: 48
  Number of functions evaluations: 59
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 1.345407
  Number of iterations: 49
  Number of functions evaluations: 61
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 0.946406
  Number of iterations: 46
  Number of functions evaluations: 58
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.707548
  Number of iterations: 1000
  Number of functions evaluations: 1050
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.544841
  Number of iter

### 2.2 Results

In [150]:
# Display the grid-search results; the lowest validation RMSE is at
# w_main = 5.0, w_item = 5.0.
tune_2

{(0.5, 10.0): 1.4091453521706512,
 (0.5, 5.0): 1.4093103607696387,
 (0.5, 0.5): 1.4096837868418028,
 (5.0, 10.0): 1.3255848781332549,
 (5.0, 5.0): 1.3253454951331827,
 (5.0, 0.5): 1.3255074066479042,
 (10.0, 10.0): 1.3287372061500575,
 (10.0, 5.0): 1.3285188529615481,
 (10.0, 0.5): 1.3288867514741107}

In [None]:
# 100%
# Refit on the full training sample with the setting that had the lowest
# validation RMSE in `tune_2` (w_main = 5.0, w_item = 5.0).
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
# start_time is read by the timing cell that follows, so it is taken right before fit().
start_time = time.time()
# Every column of state_loc_100 except 'ItemId' is passed through cols_bin_item.
model.fit(ratings = deepcopy(X_train_100), item_info = deepcopy(state_loc_100),\
            cols_bin_item=[cl for cl in state_loc_100.columns if cl != 'ItemId'])


In [None]:
# Report the wall-clock time elapsed since start_time was recorded.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Predict the 100% test triplet with the fitted model, store the predictions in
# X_test_100['pred_rating'], and report RMSE, R^2 and MAE against the true ratings.
loc_prediction_100 = model.predict(X_test_100.UserId, X_test_100.ItemId)
X_test_100['pred_rating'] = loc_prediction_100

actual_100 = X_test_100.Rating
predicted_100 = X_test_100.pred_rating
rmse_100 = np.sqrt(np.mean((predicted_100 - actual_100)**2))
print('RMSE (with 100% data): ', rmse_100)
print('R^2 (with 100% data): ', r2_score(actual_100, predicted_100))
print('MAE (with 100% data): ', mean_absolute_error(actual_100, predicted_100))

### 2.3 Evaluate

In [32]:
# Refit the state one-hot model on the 100% training sample and score every
# (user, business) pair in predict_df_100; the result overwrites its
# 'predictions' column.
model = CMF(w_main = 5.0, w_item = 5.0, random_seed = 1)
# Every column of state_loc_100 except 'ItemId' is passed through cols_bin_item.
model.fit(ratings = deepcopy(X_train_100), item_info = deepcopy(state_loc_100),\
            cols_bin_item=[cl for cl in state_loc_100.columns if cl != 'ItemId'])
predict_df_100['predictions'] = model.predict(predict_df_100.user_id, predict_df_100.business_id)

NameError: name 'm' is not defined

In [None]:
# NOTE(review): get_all_metrics is not defined or imported in the visible part of
# this notebook — confirm where it comes from before running on a fresh kernel.
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_100, out_100, ratings_train_final_100)

### 3. User Average Rating

In [45]:
# Mean training rating per user (index: user_id, single column 'user_avg'),
# then attached to every training row through a merge on "user_id".
# Each user_avg_* name ends up bound to the merged, row-level frame.
user_avg_20 = ratings_train_20.merge(
    ratings_train_20.groupby("user_id")["rating"].mean().to_frame("user_avg"),
    on="user_id",
)

user_avg_50 = ratings_train_50.merge(
    ratings_train_50.groupby("user_id")["rating"].mean().to_frame("user_avg"),
    on="user_id",
)

user_avg_100 = ratings_train_100.merge(
    ratings_train_100.groupby("user_id")["rating"].mean().to_frame("user_avg"),
    on="user_id",
)

In [46]:
# user additional info: user average
# One row per training rating: the reviewer id (as UserId) and that reviewer's
# mean training rating. The attribute column is labelled 'user_avg'; it was
# previously mislabelled 'state_avg', a leftover from the state-average section.
# NOTE(review): a UserId can repeat across rows — confirm this is what CMF
# expects for user_info.
user_info_20 = user_avg_20.loc[:,['user_id','user_avg']]
user_info_20.columns = ['UserId','user_avg']

user_info_50 = user_avg_50.loc[:,['user_id','user_avg']]
user_info_50.columns = ['UserId','user_avg']

user_info_100 = user_avg_100.loc[:,['user_id','user_avg']]
user_info_100.columns = ['UserId','user_avg']

In [146]:
# Validation RMSE per (w_main, w_user) pair for the user-average model.
tune_3 = {}

In [147]:
# Candidate weights for the grid search.
# w_main: weight on the error term for the ratings-matrix factorization.
# w_user: weight on the error term for the user-attributes factorization,
#         listed from largest to smallest.
w_main = [0.5, 5.0, 10.0]
w_user = [10.0, 5.0, 0.5]

In [151]:
# Grid search on the 20% sample: fit one CMF model per (w_main, w_user) pair with
# the user-average side info, then record its RMSE on the validation triplet.
# The 'pred_rating' column of X_val_20 is overwritten on every iteration.
for m in w_main:
    for i in w_user:
        model = CMF(w_main=m, w_user=i, random_seed=1)
        model.fit(ratings=deepcopy(X_train_20), user_info=deepcopy(user_info_20))
        X_val_20['pred_rating'] = model.predict(X_val_20.UserId, X_val_20.ItemId)
        squared_error = (X_val_20.pred_rating - X_val_20.Rating) ** 2
        tune_3[m, i] = np.sqrt(np.mean(squared_error))

INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 0.912842
  Number of iterations: 463
  Number of functions evaluations: 530
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 0.912821
  Number of iterations: 378
  Number of functions evaluations: 436
INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 0.912564
  Number of iterations: 131
  Number of functions evaluations: 147
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.144692
  Number of iterations: 1000
  Number of functions evaluations: 1085
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 6.126369
  Number of 

### 3.2 Results

In [152]:
# Display the grid-search results; the lowest validation RMSE is at
# w_main = 5.0, w_user = 10.0.
tune_3

{(0.5, 10.0): 1.4093796110241372,
 (0.5, 5.0): 1.4093675138007682,
 (0.5, 0.5): 1.409379510028695,
 (5.0, 10.0): 1.3243942755866573,
 (5.0, 5.0): 1.3248028150484914,
 (5.0, 0.5): 1.3249520550165859,
 (10.0, 10.0): 1.3290478291921621,
 (10.0, 5.0): 1.3281606768811247,
 (10.0, 0.5): 1.327849756600441}

In [None]:
# 50%
# Refit on the 50% training sample with the setting that had the lowest
# validation RMSE in `tune_3` (w_main = 5.0, w_user = 10.0).
model = CMF(w_main = 5.0, w_user = 10.0, random_seed = 1)
# start_time is read by the timing cell that follows, so it is taken right before fit().
start_time = time.time()
# Fixed: user_info_50 is keyed by UserId and must be passed as user_info, matching
# the tuning loop and the w_user weight; it was previously passed as item_info.
model.fit(ratings = deepcopy(X_train_50), user_info = deepcopy(user_info_50))

In [None]:
# Report the wall-clock time elapsed since start_time was recorded.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Restrict the 50% and 100% test triplets to items that also occur in the
# matching training triplet.
X_test_50 = X_test_50[X_test_50['ItemId'].isin(X_train_50['ItemId'])]
X_test_100 = X_test_100[X_test_100['ItemId'].isin(X_train_100['ItemId'])]

In [None]:
# Predict the 50% test triplet, store the predictions in X_test_50['pred_rating'],
# and report RMSE, R^2 and MAE against the true ratings.
# Fixed: predict with `model`, the estimator fitted on the 50% sample
# (`model_1_tune` is not defined anywhere in the visible notebook), and read the
# target from `Rating`, the same column the R^2 / MAE lines use.
user_prediction_50 = model.predict(X_test_50.UserId, X_test_50.ItemId)
X_test_50['pred_rating'] = user_prediction_50
print('RMSE (with 50% data): ', np.sqrt(np.mean((X_test_50.pred_rating - X_test_50.Rating)**2)))
print('R^2 (with 50% data): ', r2_score(X_test_50.Rating , X_test_50.pred_rating))
print('MAE (with 50% data): ', mean_absolute_error(X_test_50.Rating , X_test_50.pred_rating))

In [None]:
# 100%
# Refit on the full training sample with the setting that had the lowest
# validation RMSE in `tune_3` (w_main = 5.0, w_user = 10.0).
model = CMF(w_main = 5.0, w_user = 10.0, random_seed = 1)
# start_time is read by the timing cell that follows, so it is taken right before fit().
start_time = time.time()
# Fixed: user_info_100 is keyed by UserId and must be passed as user_info, matching
# the tuning loop and the w_user weight; it was previously passed as item_info.
model.fit(ratings = deepcopy(X_train_100), user_info = deepcopy(user_info_100))

In [None]:
# Report the wall-clock time elapsed since start_time was recorded.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Predict the 100% test triplet, store the predictions in X_test_100['pred_rating'],
# and report RMSE, R^2 and MAE against the true ratings.
# Fixed: the RMSE line read `X_test_100.rating`; the target column is `Rating`,
# as used by the R^2 / MAE lines.
user_prediction_100 = model.predict(X_test_100.UserId, X_test_100.ItemId)
X_test_100['pred_rating'] = user_prediction_100
print('RMSE (with 100% data): ', np.sqrt(np.mean((X_test_100.pred_rating - X_test_100.Rating)**2)))
print('R^2 (with 100% data): ', r2_score(X_test_100.Rating , X_test_100.pred_rating))
print('MAE (with 100% data): ', mean_absolute_error(X_test_100.Rating , X_test_100.pred_rating))

### 3.3 Evaluate

In [None]:
# Refit the user-average model on the 50% and then the 100% training sample and
# score every (user, business) pair of the matching predict frame in place.
# After the loop `model` is the estimator fitted on the 100% sample.
for ratings_frame, side_info, predict_frame in (
    (X_train_50, user_info_50, predict_df_50),
    (X_train_100, user_info_100, predict_df_100),
):
    model = CMF(w_main=5.0, w_user=10.0, random_seed=1)
    model.fit(ratings=deepcopy(ratings_frame), user_info=deepcopy(side_info))
    predict_frame['predictions'] = model.predict(predict_frame.user_id, predict_frame.business_id)

In [61]:
# NOTE(review): get_all_metrics is not defined or imported in the visible part of
# this notebook — confirm where it comes from before running on a fresh kernel.
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_50, out_50, ratings_train_final_50)


0.1357142857142857 0.2580952380952381 0.9699999999999993 470.53333333333336


In [None]:
# NOTE(review): get_all_metrics is not defined or imported in the visible part of
# this notebook — confirm where it comes from before running on a fresh kernel.
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_100, out_100, ratings_train_final_100)