## Yelp Challenge

Dataset Documentation: <br>
https://www.yelp.com/dataset/documentation/main

In [1]:
import pandas as pd
import numpy as np
from cmfrec import CMF
import pycmf

import time

from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise import Dataset

import matplotlib.pyplot as plt
import tarfile
import json
from tqdm import tqdm

In [2]:
# load business.json
# 192609 unique businesses?
line_count = len(open("./yelp_dataset/business.json").readlines())
business_ids, cities, states, latitudes, longitudes, stars, review_counts, attributes, categories = [], [], [], [], [], [], [], [], []
with open("./yelp_dataset/business.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        business_ids += [blob["business_id"]]
        cities += [blob["city"]]
        states += [blob["state"]]
        latitudes += [blob["latitude"]]
        longitudes += [blob["longitude"]]
        stars += [blob["stars"]]
        review_counts += [blob["review_count"]]
        attributes += [blob["attributes"]]
        categories += [blob["categories"]]
        
businesses = pd.DataFrame(
    {"business_id": business_ids, "city": cities, "state": states, "latitude": latitudes, "longitude": longitudes, "stars": stars, "review_counts": review_counts, "attributes": attributes, "categories":categories }
)

100%|██████████| 192609/192609 [00:02<00:00, 77157.18it/s]


In [3]:
# load user.json
# 1637138 unique users?
line_count = len(open("./yelp_dataset/user.json").readlines())
users, review_counts, elites, average_stars, friends = [], [], [], [], []
with open("./yelp_dataset/user.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        users += [blob["user_id"]]
        review_counts += [blob["review_count"]]
        elites += [blob["elite"]]
        average_stars += [blob["average_stars"]]
        friends += [blob["friends"]]
        
users = pd.DataFrame(
    {"user_id": users, "review_count": review_counts,"elite": elites, "average_stars": average_stars, "friends": friends}
)

100%|██████████| 1637138/1637138 [00:20<00:00, 81318.74it/s] 


In [4]:
# load review.json
# 6685900 unique reviews?
line_count = len(open("./yelp_dataset/review.json").readlines())
user_ids, business_ids, stars, dates, texts = [], [], [], [], []
with open("./yelp_dataset/review.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        user_ids += [blob["user_id"]]
        business_ids += [blob["business_id"]]
        stars += [blob["stars"]]
        dates += [blob["date"]]
        texts += [blob["text"]]
reviews = pd.DataFrame(
    {"user_id": user_ids, "business_id": business_ids, "rating": stars, "date": dates, "text": texts}
)
user_counts = reviews["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()
reviews = reviews.loc[reviews.user_id.isin(active_users)]

100%|██████████| 6685900/6685900 [00:57<00:00, 115327.70it/s]


In [5]:
def process(df):
    df = df.drop(df.columns[0], axis =1)
    df['date']  = pd.to_datetime(df['date'])
    df['week_day'] = df['date'].dt.weekday
    df['month'] = df['date'].dt.month
    df['hour'] = df['date'].dt.hour
    df = df.merge(users, on = 'user_id')
    df = df.merge(businesses, on = 'business_id')
    return df

## Loading Data: 20%, 50%, 100%


In [78]:
ratings_train_20.shape

(803897, 20)

In [79]:
ratings_train_50.shape

(1997023, 20)

In [80]:
ratings_train_100.shape

(1208638, 20)

In [6]:
ratings_holdout_20 = pd.read_csv('data/ratings_sample_holdout_20.csv')
ratings_train_20 = pd.read_csv('data/ratings_sample_train_20.csv')
ratings_val_20 = pd.read_csv('data/ratings_sample_cv_20.csv')

ratings_holdout_50 = pd.read_csv('data/ratings_sample_holdout_50.csv')
ratings_val_50 = pd.read_csv('data/ratings_sample_cv_50.csv')
ratings_train_50 = pd.read_csv('data/ratings_sample_train_50.csv')

ratings_holdout_100 = pd.read_csv('data/ratings_sample_holdout_100.csv')
ratings_train_100 = pd.read_csv('data/ratings_sample_train_100.csv')
ratings_val_100 = pd.read_csv('data/ratings_sample_cv_100.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
ratings_train_20 = process(ratings_train_20.copy())
ratings_holdout_20 = process(ratings_holdout_20.copy())
ratings_val_20 = process(ratings_val_20.copy())

ratings_train_50 = process(ratings_train_50.copy())
ratings_holdout_50 = process(ratings_holdout_50.copy())
ratings_val_50 = process(ratings_val_50.copy())

ratings_val_100 = process(ratings_val_100.copy())
ratings_train_100 = process(ratings_train_100.copy())
ratings_holdout_100 = process(ratings_holdout_100.copy())

In [8]:
ratings_test_20 = ratings_holdout_20.loc[ratings_holdout_20.business_id.isin(ratings_train_20.business_id)]
ratings_val_20 = ratings_val_20.loc[ratings_val_20.business_id.isin(ratings_train_20.business_id)]

ratings_test_50 = ratings_holdout_50.loc[ratings_holdout_50.business_id.isin(ratings_train_50.business_id)]
ratings_val_50 = ratings_val_50.loc[ratings_val_50.business_id.isin(ratings_train_50.business_id)]

ratings_test_100 = ratings_holdout_100.loc[ratings_holdout_100.business_id.isin(ratings_train_100.business_id)]
ratings_val_100 = ratings_val_100.loc[ratings_val_100.business_id.isin(ratings_train_100.business_id)]

In [9]:
trainset_20 = ratings_train_20.iloc[:,0:3]
trainset_20.columns = ['userID', 'itemID','rating']
valset_20 = ratings_val_20.iloc[:, 0:3]
valset_20.columns = ['userID', 'itemID','rating']
testset_20 = ratings_holdout_20.iloc[:, 0:3]
testset_20.columns = ['userID', 'itemID','rating']

trainset_50 = ratings_train_50.iloc[:,0:3]
trainset_50.columns = ['userID', 'itemID','rating']
valset_50 = ratings_val_50.iloc[:, 0:3]
valset_50.columns = ['userID', 'itemID','rating']
testset_50 = ratings_holdout_50.iloc[:, 0:3]
testset_50.columns = ['userID', 'itemID','rating']

trainset_100 = ratings_train_100.iloc[:,0:3]
trainset_100.columns = ['userID', 'itemID','rating']
valset_100 = ratings_val_100.iloc[:, 0:3]
valset_100.columns = ['userID', 'itemID','rating']
testset_100 = ratings_holdout_100.iloc[:, 0:3]
testset_100.columns = ['userID', 'itemID','rating']

## Baseline: SVD as Matrix Factorization

As a CF algorithm. A matrix factorization technique that reduces the number of features of a data set by reducing space dimensions from N to K where K < N. Thus, in our context, we are finding 2 matrices whose product is the original matrix. 

In [75]:
trainset_20.shape

(803897, 3)

In [76]:
trainset_50.shape

(1997023, 3)

In [62]:
trainset_100.shape

(3965887, 3)

In [10]:
# transform to work with surprise, 20%, 50% and full dataset
reader = Reader(rating_scale = (0.0, 5.0))
train_data_20 = Dataset.load_from_df(trainset_20[['userID','itemID','rating']], reader)
val_data_20 = Dataset.load_from_df(valset_20[['userID','itemID','rating']], reader)
test_data_20 = Dataset.load_from_df(testset_20[['userID','itemID','rating']], reader)

train_data_50 = Dataset.load_from_df(trainset_50[['userID','itemID','rating']], reader)
val_data_50 = Dataset.load_from_df(valset_50[['userID','itemID','rating']], reader)
test_data_50 = Dataset.load_from_df(testset_50[['userID','itemID','rating']], reader)

train_data_100 = Dataset.load_from_df(trainset_100[['userID','itemID','rating']], reader)
val_data_100 = Dataset.load_from_df(valset_100[['userID','itemID','rating']], reader)
test_data_100 = Dataset.load_from_df(testset_100[['userID','itemID','rating']], reader)

In [11]:
train_sr_20 = train_data_20.build_full_trainset()
val_sr_before_20 = val_data_20.build_full_trainset()
val_sr_20 = val_sr_before_20.build_testset()
test_sr_before_20 = test_data_20.build_full_trainset()
test_sr_20 = test_sr_before_20.build_testset()

train_sr_50 = train_data_50.build_full_trainset()
val_sr_before_50 = val_data_20.build_full_trainset()
val_sr_50 = val_sr_before_20.build_testset()
test_sr_before_50 = test_data_50.build_full_trainset()
test_sr_50 = test_sr_before_50.build_testset()

train_sr_100 = train_data_100.build_full_trainset()
val_sr_before_100 = val_data_100.build_full_trainset()
val_sr_100 = val_sr_before_100.build_testset()
test_sr_before_100 = test_data_100.build_full_trainset()
test_sr_100 = test_sr_before_100.build_testset()

## Tuning

In [56]:
RMSE_tune = {}

In [51]:
n_epochs = [5, 7, 10]  # the number of iteration of the SGD procedure
lr_all = [0.002, 0.003, 0.005] # the learning rate for all parameters
reg_all =  [0.4, 0.5, 0.6] # the regularization term for all parameters

In [57]:
for n in n_epochs:
    for l in lr_all:
        for r in reg_all:
            algo = SVD(n_epochs = n, lr_all = l, reg_all = r)
            algo.fit(train_sr_20)
            predictions = algo.test(val_sr_20)
            RMSE_tune[n,l,r] = accuracy.rmse(predictions)

RMSE: 1.4159
RMSE: 1.4173
RMSE: 1.4174
RMSE: 1.4012
RMSE: 1.4016
RMSE: 1.4032
RMSE: 1.3798
RMSE: 1.3807
RMSE: 1.3821
RMSE: 1.4042
RMSE: 1.4047
RMSE: 1.4058
RMSE: 1.3873
RMSE: 1.3885
RMSE: 1.3900
RMSE: 1.3650
RMSE: 1.3665
RMSE: 1.3672
RMSE: 1.3895
RMSE: 1.3903
RMSE: 1.3918
RMSE: 1.3714
RMSE: 1.3735
RMSE: 1.3747
RMSE: 1.3497
RMSE: 1.3508
RMSE: 1.3527


In [58]:
RMSE_tune

{(5, 0.002, 0.4): 1.415933676012248,
 (5, 0.002, 0.5): 1.4173365973852405,
 (5, 0.002, 0.6): 1.417370198820145,
 (5, 0.003, 0.4): 1.4011758726577428,
 (5, 0.003, 0.5): 1.4016120313912452,
 (5, 0.003, 0.6): 1.4031618552800484,
 (5, 0.005, 0.4): 1.3798484285012471,
 (5, 0.005, 0.5): 1.3806636393067282,
 (5, 0.005, 0.6): 1.3821350022120953,
 (7, 0.002, 0.4): 1.4042009585091328,
 (7, 0.002, 0.5): 1.4047180140602467,
 (7, 0.002, 0.6): 1.405780300835095,
 (7, 0.003, 0.4): 1.3872838266494933,
 (7, 0.003, 0.5): 1.3885483890855905,
 (7, 0.003, 0.6): 1.3900448377490269,
 (7, 0.005, 0.4): 1.3649676001324043,
 (7, 0.005, 0.5): 1.3665228974538999,
 (7, 0.005, 0.6): 1.3671758260897764,
 (10, 0.002, 0.4): 1.3894940834924787,
 (10, 0.002, 0.5): 1.3903377177304177,
 (10, 0.002, 0.6): 1.3917867965697626,
 (10, 0.003, 0.4): 1.3714281812046991,
 (10, 0.003, 0.5): 1.3734950396997587,
 (10, 0.003, 0.6): 1.3746840315086657,
 (10, 0.005, 0.4): 1.3497144262970868,
 (10, 0.005, 0.5): 1.3507892432826967,
 (10, 0

## Results

In [80]:
# so the best is when n_epochs = 10, lr_all = 0.005, reg_all = 0.4,
# and the RMSE score is 1.3497
# train and test on the optimal parameter
algo = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4, random_state = 1)

In [81]:
start_time = time.time()
algo.fit(train_sr_20)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b810dc748>

In [82]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 23.70190405845642 seconds ---


In [83]:
predictions_20 = algo.test(test_sr_20)
accuracy.rmse(predictions_20)

RMSE: 1.3985


1.3985173359167238

In [84]:
start_time = time.time()
algo.fit(train_sr_50)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b810dc748>

In [85]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 58.85739994049072 seconds ---


In [86]:
predictions_50 = algo.test(test_sr_50)
accuracy.rmse(predictions_50)

RMSE: 1.3795


1.3794980281447085

In [87]:
start_time = time.time()
algo.fit(train_sr_100)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b810dc748>

In [88]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 122.12709093093872 seconds ---


In [89]:
predictions_100 = algo.test(test_sr_100)
accuracy.rmse(predictions_100)

RMSE: 1.4097


1.4097218976062662

## Evaluate

In [54]:
ratings_train_final_20 = ratings_train_20.append(ratings_val_20)
ratings_train_final_50 = ratings_train_50.append(ratings_val_50)
ratings_train_final_100 = ratings_train_100.append(ratings_val_100)

In [38]:
ratings_entire_df_20 = ratings_train_20.append(ratings_val_20).append(ratings_holdout_20)
ratings_entire_df_50 = ratings_train_50.append(ratings_val_50).append(ratings_holdout_50)
ratings_entire_df_100 = ratings_train_100.append(ratings_val_100).append(ratings_holdout_100)

In [41]:
unique_city_businesses_20 = ratings_entire_df_20[['city','business_id']].drop_duplicates()
unique_cities_20 = unique_city_businesses_20.groupby('city').count()['business_id']
unique_cities_20 = unique_cities_20[unique_cities > 100]
out_20 = pd.DataFrame()
for city in unique_cities_20.index:
    tmp = ratings_holdout_20[(ratings_holdout_20['city'] ==city) &
                              (ratings_holdout_20['rating'] >ratings_holdout_20['average_stars'])]
    if len(tmp['user_id'].unique())>4:
        
        ###this weird sampling technique is to ensure we dont' sample the same user twice in a same city
        five_users = np.random.choice(tmp['user_id'].unique(),5, replace = False)
        row = tmp[tmp['user_id'].isin(five_users)].groupby('user_id', group_keys=False).apply(lambda df: df.sample(1))
        out_20 = out_20.append(row)
        
unique_city_businesses_50 = ratings_entire_df_50[['city','business_id']].drop_duplicates()
unique_cities_50 = unique_city_businesses_50.groupby('city').count()['business_id']
unique_cities_50 = unique_cities_50[unique_cities > 100]
out_50 = pd.DataFrame()
for city in unique_cities.index:
    tmp = ratings_holdout_50[(ratings_holdout_50['city'] ==city) &
                              (ratings_holdout_50['rating'] >ratings_holdout_50['average_stars'])]
    if len(tmp['user_id'].unique())>4:
        
        ###this weird sampling technique is to ensure we dont' sample the same user twice in a same city
        five_users = np.random.choice(tmp['user_id'].unique(),5, replace = False)
        row = tmp[tmp['user_id'].isin(five_users)].groupby('user_id', group_keys=False).apply(lambda df: df.sample(1))
        out_50 = out_50.append(row)
        
unique_city_businesses_100 = ratings_entire_df_100[['city','business_id']].drop_duplicates()
unique_cities_100 = unique_city_businesses_100.groupby('city').count()['business_id']
unique_cities_100 = unique_cities_100[unique_cities > 100]
out_100 = pd.DataFrame()
for city in unique_cities.index:
    tmp = ratings_holdout_100[(ratings_holdout_100['city'] ==city) &
                              (ratings_holdout_100['rating'] >ratings_holdout_100['average_stars'])]
    if len(tmp['user_id'].unique())>4:
        
        ###this weird sampling technique is to ensure we dont' sample the same user twice in a same city
        five_users = np.random.choice(tmp['user_id'].unique(),5, replace = False)
        row = tmp[tmp['user_id'].isin(five_users)].groupby('user_id', group_keys=False).apply(lambda df: df.sample(1))
        out_100 = out_100.append(row)

In [43]:
predict_df_20 = out_20[['user_id','city','state']]
predict_df_20 = predict_df_20.merge(unique_city_businesses_20, on = 'city')
predict_df_20['predictions'] = 25

predict_df_50 = out_50[['user_id','city','state']]
predict_df_50 = predict_df_50.merge(unique_city_businesses_50, on = 'city')
predict_df_50['predictions'] = 25

predict_df_100 = out_50[['user_id','city','state']]
predict_df_100 = predict_df_100.merge(unique_city_businesses_100, on = 'city')
predict_df_100['predictions'] = 25

In [47]:
eval_20 = Dataset.load_from_df(predict_df_20[['user_id','business_id','predictions']], reader)
eval_50 = Dataset.load_from_df(predict_df_50[['user_id','business_id','predictions']], reader)
eval_100 = Dataset.load_from_df(predict_df_100[['user_id','business_id','predictions']], reader)

In [48]:
eval_before_20 = eval_20.build_full_trainset()
eval_sr_20 = eval_before_20.build_testset()
algo.fit(train_sr_20)
eval_pred_20 = algo.test(eval_sr_20)
#accuracy.rmse(predictions_20)
baseline_20 = pd.DataFrame(eval_20, columns = ['userId','itemId','rating','pred_rating','x'])
predict_df_20['predictions'] = baseline_20.pred_rating

eval_before_50 = eval_50.build_full_trainset()
eval_sr_50 = eval_before_50.build_testset()
algo.fit(train_sr_50)
eval_pred_50 = algo.test(eval_sr_50)
#accuracy.rmse(predictions_50)
baseline_50 = pd.DataFrame(eval_50, columns = ['userId','itemId','rating','pred_rating','x'])

eval_before_100 = eval_100.build_full_trainset()
eval_sr_100 = eval_before_100.build_testset()
algo.fit(train_sr_100)
eval_pred_100 = algo.test(eval_sr_100)
#accuracy.rmse(predictions_100)
baseline_100 = pd.DataFrame(eval_100, columns = ['userId','itemId','rating','pred_rating','x'])

In [56]:
def get_all_metrics(predict_df, validation_subsample, ratings_train_final):
    top_10_recs = predict_df.groupby(['user_id','city'])['predictions'].nlargest(10).reset_index()
    out = validation_subsample
    cnt =0
    serendipity = 0
    
    
    for row in out.iterrows():
        row_values = row[1]
        top_10 = predict_df.loc[top_10_recs[top_10_recs['user_id'] == row_values['user_id']].level_2]['business_id']
        ###In top 10
        if row_values['business_id'] in top_10.values:
            cnt+=1
        user_history = ratings_train_final[ratings_train_final['user_id'] == row_values['user_id']]    
        been_there = [i for i in top_10.values if i in  user_history.business_id.values]
        serendipity += 1-len(been_there)/10
    
    top_10 = cnt/len(out)
    serendipity = serendipity/len(out)
    
    predict_df = predict_df.reset_index()
    
    analysis_df = predict_df.merge(top_10_recs, left_on = ['user_id','city','index'], right_on = ['user_id','city','level_2'])
    
    coverage = (analysis_df.groupby('city')['business_id'].nunique()/50).values.mean()
    
    predict_df['rankings']=predict_df.groupby(['city','user_id'])['predictions'].rank("first",ascending = False)
    running_rankings =0
    for row in out.iterrows():
        row_values = row[1]
        user_recs = predict_df[(predict_df['user_id']==row_values['user_id'])
                            &(predict_df['city']==row_values['city'])
                             & (predict_df['business_id']==row_values['business_id'])
                              ]
        assert len(user_recs)==1
        running_rankings += user_recs['rankings'].sum()

    avg_rank = running_rankings / len(out)
    print(top_10, coverage, serendipity, avg_rank)
    
    return top_10, coverage, serendipity, avg_rank

In [57]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_20, out_20, ratings_train_final_20)


0.13095238095238096 0.48214285714285715 0.9723809523809518 513.2166666666667


In [None]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_50, out_50, ratings_train_final_50)

In [None]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df_100, out_100, ratings_train_final_100)

## CMF

In [68]:
# universal X_train_20, 50, 100
X_train_20 = ratings_train_20.iloc[:, 0:3]
X_train_20.columns = ['UserId','ItemId','Rating']

X_train_50 = ratings_train_50.iloc[:, 0:3]
X_train_50.columns = ['UserId','ItemId','Rating']

X_train_100 = ratings_train_100.iloc[:, 0:3]
X_train_100.columns = ['UserId','ItemId','Rating']

In [69]:
# universal X_test_20, 50, 100
X_test_20 = ratings_holdout_20.iloc[:, 0:3]
X_test_20.columns = ['UserId','ItemId','Rating']

X_test_50 = ratings_holdout_50.iloc[:, 0:3]
X_test_50.columns = ['UserId','ItemId','Rating']

X_test_100 = ratings_holdout_100.iloc[:, 0:3]
X_test_100.columns = ['UserId','ItemId','Rating']

In [70]:
# universal X_val_20
X_val_20 = ratings_val_20.iloc[:,0:3]
X_val_20.columns = ['UserId','ItemId','Rating']

### 1. State Average Rating

In [72]:
# get state average rating
state_avg_20 = pd.DataFrame(ratings_train_20.groupby("state").rating.mean())
state_avg_20.columns = ['state_avg']
train_state_avg_20 = ratings_train_20.merge(state_avg_20, on = "state")

state_avg_50 = pd.DataFrame(ratings_train_50.groupby("state").rating.mean())
state_avg_50.columns = ['state_avg']
train_state_avg_50 = ratings_train_50.merge(state_avg_50, on = "state")

state_avg_100 = pd.DataFrame(ratings_train_100.groupby("state").rating.mean())
state_avg_100.columns = ['state_avg']
train_state_avg_100 = ratings_train_100.merge(state_avg_100, on = "state")

In [73]:
# item additional info: state average
item_avg_20 = train_state_avg_20.loc[:,['business_id','state_avg']]
item_avg_20.columns = ['ItemId','state_avg']

item_avg_50 = train_state_avg_50.loc[:,['business_id','state_avg']]
item_avg_50.columns = ['ItemId','state_avg']

item_avg_100 = train_state_avg_100.loc[:,['business_id','state_avg']]
item_avg_100.columns = ['ItemId','state_avg']

In [74]:
tune = {}

In [75]:
w_main = [0.5, 5.0, 10.0] # weight assign to the MRSE in factorization of the ratings matrix
w_item = [0.5, 5.0, 10.0][::-1] # weight assign to the MRSE in factorization of the item attributes matrix

In [None]:
# tuning
for m in w_main:
    for i in w_item:
        model = CMF(w_main = m, w_item = i, random_seed = 1)
        model.fit(ratings = deepcopy(X_train_20), item_info = deepcopy(item_state_avg_20))
        prediction = model_1_tune.predict(X_val_20.user_id, X_val_20.business_id)
        X_val_20['pred_rating'] = prediction
        tune[m,i] = np.sqrt(np.mean((X_val_20.pred_rating - X_val_20.rating)**2))

### 1.2 Results

In [None]:
# get the best param
model = CMF(w_main = m, w_item = i, random_seed = 1)

In [None]:
# 20%
start_time = time.time()
model.fit(ratings = deepcopy(X_train_20), item_info = deepcopy(item_state_avg_20))

In [None]:
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
state_prediction_20 = model.predict(X_test_20.user_id, X_test_20.business_id)
X_test_20['pred_rating'] = prediction
print('RMSE (with 20% data): ', np.sqrt(np.mean((X_test_20.pred_rating - X_test_20.rating)**2)))

In [None]:
# 50%
start_time = time.time()
model.fit(ratings = deepcopy(X_train_50), item_info = deepcopy(item_state_avg_50))

In [None]:
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
state_prediction_50 = model_1_tune.predict(X_test_50.user_id, X_test_50.business_id)
X_test_50['pred_rating'] = prediction
print('RMSE (with 50% data): ', np.sqrt(np.mean((X_test_50.pred_rating - X_test_50.rating)**2)))

In [None]:
# 100%
start_time = time.time()
model.fit(ratings = deepcopy(X_train_100), item_info = deepcopy(item_state_avg_100))

In [None]:
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
state_prediction_100 = model.predict(X_test_100.user_id, X_test_100.business_id)
X_test_100['pred_rating'] = prediction
print('RMSE (with 100% data): ', np.sqrt(np.mean((X_test_100.pred_rating - X_test_100.rating)**2)))

### 2. State Location

In [None]:
# get state average rating
state_avg_20 = pd.DataFrame(ratings_train_20.groupby("state").rating.mean())
state_avg_20.columns = ['state_avg']
ratings_train_20 = ratings_train_20.merge(state_avg, on = "state")

state_avg_50 = pd.DataFrame(ratings_train_50.groupby("state").rating.mean())
state_avg_50.columns = ['state_avg']
ratings_train_50 = ratings_train_50.merge(state_avg, on = "state")

state_avg_100 = pd.DataFrame(ratings_train_100.groupby("state").rating.mean())
state_avg_100.columns = ['state_avg']
ratings_train_100 = ratings_train_100.merge(state_avg, on = "state")

In [None]:
# make X_train
X_train_20 = ratings_train_20.iloc[:, 0:3]
X_train_20.columns = ['UserId','ItemId','Rating']

In [None]:
# item additional info: state average
item_state_avg_20 = ratings_train.loc[:,['business_id','state_avg']]
item_state_avg_20.columns = ['ItemId','state_avg']

In [None]:
# validation set
X_val_20 = ratings_val_20.iloc[:,0:3]

In [None]:
tune_1 = {}

In [None]:
w_main = [0.5, 5.0, 10.0] # weight assign to the MRSE in factorization of the ratings matrix
w_item = [0.5, 5.0, 10.0][::-1] # weight assign to the MRSE in factorization of the item attributes matrix

In [None]:
# tuning
for m in w_main:
    for i in w_item:
        model = CMF(w_main = m, w_item = i, random_seed = 1)
        model.fit(ratings = deepcopy(X_train_20), item_info = deepcopy(item_state_avg_20))
        prediction = model_1_tune.predict(X_val_20.user_id, X_val_20.business_id)
        X_val_20['pred_rating'] = prediction
        tune_1[m,i] = np.sqrt(np.mean((X_val_20.pred_rating - X_val_20.rating)**2))

In [None]:
model_2.fit(ratings = deepcopy(X_train), item_info = deepcopy(Y_train),\
            cols_bin_item=[cl for cl in Y_train.columns if cl != 'ItemId'])

### 3. User Average Rating

In [None]:
# get state average rating
state_avg_20 = pd.DataFrame(ratings_train_20.groupby("state").rating.mean())
state_avg_20.columns = ['state_avg']
ratings_train_20 = ratings_train_20.merge(state_avg, on = "state")

state_avg_50 = pd.DataFrame(ratings_train_50.groupby("state").rating.mean())
state_avg_50.columns = ['state_avg']
ratings_train_50 = ratings_train_50.merge(state_avg, on = "state")

state_avg_100 = pd.DataFrame(ratings_train_100.groupby("state").rating.mean())
state_avg_100.columns = ['state_avg']
ratings_train_100 = ratings_train_100.merge(state_avg, on = "state")

In [None]:
# make X_train
X_train_20 = ratings_train_20.iloc[:, 0:3]
X_train_20.columns = ['UserId','ItemId','Rating']

In [None]:
# item additional info: state average
item_state_avg_20 = ratings_train.loc[:,['business_id','state_avg']]
item_state_avg_20.columns = ['ItemId','state_avg']

In [None]:
# validation set
X_val_20 = ratings_val_20.iloc[:,0:3]

In [None]:
tune_1 = {}

In [None]:
w_main = [0.5, 5.0, 10.0] # weight assign to the MRSE in factorization of the ratings matrix
w_item = [0.5, 5.0, 10.0][::-1] # weight assign to the MRSE in factorization of the item attributes matrix

In [None]:
# tuning
for m in w_main:
    for i in w_item:
        model = CMF(w_main = m, w_item = i, random_seed = 1)
        model.fit(ratings = deepcopy(X_train_20), item_info = deepcopy(item_state_avg_20))
        prediction = model_1_tune.predict(X_val_20.user_id, X_val_20.business_id)
        X_val_20['pred_rating'] = prediction
        tune_1[m,i] = np.sqrt(np.mean((X_val_20.pred_rating - X_val_20.rating)**2))

## test

In [None]:
# 100% data
user_id_unique = reviews.user_id.unique()
user_id_sample = pd.DataFrame(user_id_unique, columns=['unique_user_id'])
ratings_sample = reviews.merge(user_id_sample, left_on = 'user_id', right_on = 'unique_user_id').drop(['unique_user_id'], axis = 1)
print(ratings_sample.head())
print(ratings_sample.shape)