## Factorization Model


This notebook covers feature selection and hyperparameter tuning for the factorization model (`FMRecommender`), then fits the final model and saves its test-set predictions.


In [1]:
# The nb_black extension is currently disabled.
#%load_ext nb_black
# autoreload mode 2: edits to imported project modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [15]:
import pandas as pd
import numpy as np
import scipy
from source.fm import FMRecommender
from source.utils import train_test_split_feature, rmse
from sklearn.metrics import mean_absolute_error
import pickle
from timeit import default_timer as timer

In [3]:
# Load the full feature table from CSV; the cell displays its (rows, columns) shape.
feature = pd.read_csv("data/feature.csv")
feature.shape

(1814134, 85)

In [4]:
# helper functions 

def create_x_y(train, test):
    """Split train/test frames into feature matrices and target series.

    The target is the ``review_stars`` column. ``review_date`` is removed from
    the features as well because it is not numeric.

    Args:
        train: DataFrame containing ``review_stars`` and ``review_date``.
        test: DataFrame containing the same two columns.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: if either frame lacks ``review_stars`` or ``review_date``.
    """
    target = "review_stars"
    excluded = [target, "review_date"]

    def _split(frame):
        # Features first, then the target, for one frame.
        return frame.drop(columns=excluded), frame[target]

    train_X, train_y = _split(train)
    test_X, test_y = _split(test)
    return train_X, test_X, train_y, test_y

def run_fm_model(model, param, y_test, return_pred=False):
    """Fit ``model`` with ``param``, predict, and score against ``y_test``.

    Args:
        model: Object exposing ``fit(param)`` and a no-argument ``predict()``.
        param: Parameter object handed unchanged to ``model.fit``.
        y_test: True target values the predictions are compared with.
        return_pred: When true, the predictions are appended to the result.

    Returns:
        ``(rmse, mae, fit_runtime, predict_runtime)``; the predictions are
        added as a fifth element when ``return_pred`` is true. Runtimes are
        differences of ``timer()`` readings.
    """
    # Time the fit on its own.
    tick = timer()
    model.fit(param)
    runtime_fit = timer() - tick

    # Time the prediction separately from the fit.
    tick = timer()
    y_pred = model.predict()
    runtime_test = timer() - tick

    scores = (
        rmse(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        runtime_fit,
        runtime_test,
    )
    return (scores + (y_pred,)) if return_pred else scores

In [5]:
# Accumulators for the feature-selection experiment: every run of the
# feature-selection cell appends one entry to each list.
rsmes = []
maes = []
fit_times = []
test_times = []
features = []

In [13]:
# Run this cell repeatedly, editing feat_cols between runs, to compare
# different feature combinations. Each run appends its metrics to the
# accumulator lists (rsmes, maes, fit_times, test_times, features).

# Candidate feature columns taken from the `feature` table; comment entries
# in or out to change the combination being evaluated.
feat_cols = [
        # review- and user-level columns
        'review_props',
        "user_review_count",
        "user_elite",
        'user_fans',
        "user_average_stars",
        'user_compliment',
        'user_yelping_years',
# business-level columns, excluded from the current combination
#        "business_stars",
#        "business_review_count",
        # business category columns (presumably one indicator per category — confirm)
        "American (New)",
        "American (Traditional)",
        "Arts & Entertainment",
        "Asian Fusion",
        "Bagels",
        "Bakeries",
        "Barbeque",
        "Bars",
        "Beer",
        "Beer Bar",
        "Breakfast & Brunch",
        "Breweries",
        "Buffets",
        "Burgers",
        "Cafes",
        "Caterers",
        "Chicken Wings",
        "Chinese",
        "Cocktail Bars",
        "Coffee & Tea",
        "Comfort Food",
        "Delis",
        "Desserts",
        "Diners",
        "Ethnic Food",
        "Event Planning & Services",
        "Fast Food",
        "Food",
        "Food Delivery Services",
        "French",
        "Gastropubs",
        "Gluten-Free",
        "Greek",
        "Hawaiian",
        "Hot Dogs",
        "Hotels",
        "Hotels & Travel",
        "Ice Cream & Frozen Yogurt",
        "Indian",
        "Italian",
        "Japanese",
        "Juice Bars & Smoothies",
        "Korean",
        "Latin American",
        "Lounges",
        "Mediterranean",
        "Mexican",
        "Middle Eastern",
        "Music Venues",
        "Nightlife",
        "Noodles",
        "Pizza",
        "Pubs",
        "Salad",
        "Sandwiches",
        "Seafood",
        "Soup",
        "Southern",
        "Specialty Food",
        "Sports Bars",
        "Steakhouses",
        "Sushi Bars",
        "Tacos",
        "Tapas/Small Plates",
        "Tex-Mex",
        "Thai",
        "Vegan",
        "Vegetarian",
        "Venues & Event Spaces",
        "Vietnamese",
        "Wine & Spirits",
        "Wine Bars",
]

# Columns every run needs regardless of the feature combination: the two id
# columns, the target (review_stars) and review_date.
base_cols = ["user_id", "business_id", "review_stars", "review_date"]

# Restrict the table to the chosen columns, then build the train/test inputs.
selected_feature = feature[[*base_cols, *feat_cols]].copy()
train, test = train_test_split_feature(selected_feature)
X_train, X_test, y_train, y_test = create_x_y(train, test)
# Textual label of the columns used, stored alongside the metrics of this run.
feature_cols = str(list(X_train.columns))

fm = FMRecommender(
    X_train, X_test, y_train, y_test, ["user_id", "business_id"]
)
# defaults
param = {
    "task": "reg",
    "lr": 0.2,
    "lambda": 0.00002,
    "metric": "rmse",
    "k": 20,
}

res_rsme, res_mae, fit_time, test_time = run_fm_model(fm, param, y_test)

# Record this run in the experiment-wide accumulators.
rsmes.append(res_rsme)
maes.append(res_mae)
fit_times.append(fit_time)
test_times.append(test_time)
features.append(feature_cols)

# display current run
pd.DataFrame(
    {
        "test_rmse": res_rsme,
        "test_mae": res_mae,
        "fit_time": fit_time,
        "test_time": test_time,
        "feature": feature_cols,
    },
    index=[0],
)

Unnamed: 0,test_rmse,test_mae,fit_time,test_time,feature
0,1.300622,1.064521,12.563575,0.196977,"['user_id', 'business_id', 'review_props', 'us..."


In [53]:
# Gather the metrics accumulated over all feature-selection runs into one
# table, one row per run.
df_feature_result = pd.DataFrame(
    {
        "test_rmse": rsmes,
        "test_mae": maes,
        "fit_time": fit_times,
        "test_time": test_times,
        "feature": features,
    }
)

# Persist the comparison table to disk as a pickle.
result_pickle = "result/fm_feature_selection.pkl"
with open(result_pickle, "wb") as handle:
    pickle.dump(df_feature_result, handle)

## Hyperparameter Tuning

In [11]:
# Hyperparameter sweep: vary one setting at a time around the default
# configuration and record test metrics and timings for every candidate.
#
# Results are collected in `hyper_records`, not in the feature-selection
# accumulators (rsmes, maes, fit_times, test_times). Rebinding those here
# wiped the feature-selection results and left them out of step with
# `features`, so the feature-selection "collect" cell could fail or mix
# experiments depending on execution order.

base_param = {'task': 'reg', 'lr': 0.2, 'lambda': 0.00002, 'metric': 'rmse', 'k': 20}  # defaults

# (parameter name, values to try); every other setting stays at its default.
param_sweeps = [
    ('k', [10, 20, 30, 40, 50]),                      # test k
    ('lr', [0.01, 0.05, 0.1, 0.2, 0.5]),              # test lr
    ('lambda', [0, 0.00002, 0.0002, 0.002, 0.02]),    # test lambda
    ('epoch', [5, 10, 20, 50, 100]),                  # test epoch
]

# One independent dict per candidate, in sweep order. Overriding an existing
# key keeps its position; 'epoch' is appended after 'k'.
params = [
    {**base_param, name: value}
    for name, values in param_sweeps
    for value in values
]

# X_train / X_test / y_train / y_test are whatever the most recent
# feature-selection run left in the kernel.
fm = FMRecommender(X_train, X_test, y_train, y_test, ['user_id', 'business_id']) # uses the final feature fm_model
# NOTE(review): one FMRecommender instance is refit for every candidate —
# confirm that fit() starts from scratch and carries no state between calls.
# NOTE(review): candidates are scored on the test split, which is also used to
# report the final model — consider a separate validation split.

hyper_records = []
for param in params:
    res_rmse, res_mae, fit_time, test_time = run_fm_model(fm, param, y_test)
    hyper_records.append({
        "test_rmse": res_rmse,
        "test_mae": res_mae,
        "fit_time": fit_time,
        "test_time": test_time,
        "param": param,
    })

# Explicit `columns` keeps the column order fixed, and the frame well-formed
# even when no candidate was run.
df_hyper_result = pd.DataFrame(
    hyper_records,
    columns=["test_rmse", "test_mae", "fit_time", "test_time", "param"],
)
df_hyper_result

Unnamed: 0,test_rmse,test_mae,fit_time,test_time,param
0,1.300469,1.062513,8.456074,0.122145,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
1,1.300818,1.065048,12.096625,0.131176,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
2,1.300754,1.065014,15.89668,0.134656,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
3,1.300929,1.065146,18.932358,0.158445,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
4,1.30052,1.063261,21.826286,0.156667,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
5,1.346648,1.114708,12.048062,0.134017,"{'task': 'reg', 'lr': 0.01, 'lambda': 2e-05, '..."
6,1.300736,1.065127,12.480548,0.144755,"{'task': 'reg', 'lr': 0.05, 'lambda': 2e-05, '..."
7,1.298897,1.062603,12.592768,0.128174,"{'task': 'reg', 'lr': 0.1, 'lambda': 2e-05, 'm..."
8,1.300625,1.064856,12.295545,0.137128,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
9,1.307612,1.071484,12.032161,0.124518,"{'task': 'reg', 'lr': 0.5, 'lambda': 2e-05, 'm..."


In [24]:
# Persist the hyperparameter-tuning table to disk as a pickle.
result_pickle = "result/fm_hyperparameter_tuning.pkl"
with open(result_pickle, "wb") as handle:
    pickle.dump(df_hyper_result, handle)

## Final Model

In [30]:
# Fit the final model on the train/test inputs currently in the kernel
# (uses the final feature fm_model).
fm = FMRecommender(
    X_train, X_test, y_train, y_test, ["user_id", "business_id"]
)
# from above
param = {
    "task": "reg",
    "lr": 0.2,
    "lambda": 0,
    "metric": "rmse",
    "k": 20,
}

# return_pred=True also hands back the test-set predictions for export.
res_rsme, res_mae, fit_time, test_time, y_pred = run_fm_model(
    fm, param, y_test, return_pred=True
)

# display current run
pd.DataFrame(
    {
        "test_rmse": res_rsme,
        "test_mae": res_mae,
        "fit_time": fit_time,
        "test_time": test_time,
        "feature": feature_cols,
    },
    index=[0],
)

Unnamed: 0,test_rmse,test_mae,fit_time,test_time,feature
0,1.293612,1.053238,12.435787,0.151073,"['user_id', 'business_id', 'review_props', 'us..."


In [31]:
# pickling final rating result: one row per test review with its ids and
# the predicted rating
df_fm_pred = pd.DataFrame(
    {
        "user_id": X_test["user_id"],
        "business_id": X_test["business_id"],
        "y_pred": y_pred,
    }
)

result_pickle = "result/fm_pred.pkl"
with open(result_pickle, "wb") as handle:
    pickle.dump(df_fm_pred, handle)