# Factorization Model


This notebook covers feature selection and hyperparameter tuning for the factorization machine (FM) model.


## Load dependencies

In [1]:
#%load_ext nb_black
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import scipy
from source.fm import FMRecommender
from source.utils import train_test_split_feature, rmse, generate_combinations
from sklearn.metrics import mean_absolute_error
import pickle
from timeit import default_timer as timer

In [3]:
# Read the feature table from CSV and show its (rows, columns) shape.
feature_csv_path = "data/feature.csv"
feature = pd.read_csv(feature_csv_path)
feature.shape

(1814134, 85)

In [4]:
# helper functions 

def create_x_y(train, test):
    """Split a train frame and a test frame into model inputs and targets.

    The target is the ``review_stars`` column. ``review_date`` is removed
    from the inputs as well because it is not numeric.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain ``review_stars`` and ``review_date``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    target = "review_stars"
    excluded = [target, "review_date"]

    def _inputs_and_target(frame):
        # Inputs are every column except the excluded ones; the target is a Series.
        return frame.drop(columns=excluded), frame[target]

    X_train, y_train = _inputs_and_target(train)
    X_test, y_test = _inputs_and_target(test)
    return X_train, X_test, y_train, y_test


def run_fm_model(model, param, X_test, y_test, return_pred=False):
    """Fit ``model`` with ``param``, predict on the test data, and score it.

    Parameters
    ----------
    model
        Object exposing ``fit(param)`` and ``predict(X_test, y_test)``.
    param
        Passed straight through to ``model.fit``.
    X_test, y_test
        Test inputs and true targets. Both are handed to ``model.predict``;
        ``y_test`` is also used to compute the error metrics.
    return_pred : bool, default False
        When truthy, the predictions are appended to the returned tuple.

    Returns
    -------
    tuple
        ``(rmse, mae, fit_seconds, predict_seconds)``, followed by ``y_pred``
        when ``return_pred`` is truthy.
    """
    fit_started = timer()
    model.fit(param)
    runtime_fit = timer() - fit_started

    pred_started = timer()
    y_pred = model.predict(X_test, y_test)
    runtime_test = timer() - pred_started

    scores = (
        rmse(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        runtime_fit,
        runtime_test,
    )
    return scores + (y_pred,) if return_pred else scores

## Feature Selection

In [5]:
# Accumulators for the feature-selection runs: every execution of the
# feature-selection cell appends one entry to each of these lists.
rsmes = []
maes = []
fit_times = []
test_times = []
feature_cols_list = []

In [6]:
# Categorical identifier columns handed to the factorization model.
cat_cols = ['user_id', 'business_id']

# Sorted array of every distinct value that appears in any categorical column.
cat_universe = np.unique(feature[cat_cols].values.ravel())

In [13]:
# run this section repeatedly for different feature combinations

# user-level features
user_feat_cols = [
    "user_review_count", "user_elite", "user_fans",
    "user_average_stars", "user_compliment", "user_yelping_years",
]

# business-level aggregates, left out of this run:
#     "business_stars",
#     "business_review_count",

# business category columns
category_feat_cols = [
    "American (New)", "American (Traditional)", "Arts & Entertainment", "Asian Fusion",
    "Bagels", "Bakeries", "Barbeque", "Bars",
    "Beer", "Beer Bar", "Breakfast & Brunch", "Breweries",
    "Buffets", "Burgers", "Cafes", "Caterers",
    "Chicken Wings", "Chinese", "Cocktail Bars", "Coffee & Tea",
    "Comfort Food", "Delis", "Desserts", "Diners",
    "Ethnic Food", "Event Planning & Services", "Fast Food", "Food",
    "Food Delivery Services", "French", "Gastropubs", "Gluten-Free",
    "Greek", "Hawaiian", "Hot Dogs", "Hotels",
    "Hotels & Travel", "Ice Cream & Frozen Yogurt", "Indian", "Italian",
    "Japanese", "Juice Bars & Smoothies", "Korean", "Latin American",
    "Lounges", "Mediterranean", "Mexican", "Middle Eastern",
    "Music Venues", "Nightlife", "Noodles", "Pizza",
    "Pubs", "Salad", "Sandwiches", "Seafood",
    "Soup", "Southern", "Specialty Food", "Sports Bars",
    "Steakhouses", "Sushi Bars", "Tacos", "Tapas/Small Plates",
    "Tex-Mex", "Thai", "Vegan", "Vegetarian",
    "Venues & Event Spaces", "Vietnamese", "Wine & Spirits", "Wine Bars",
]

feat_cols = user_feat_cols + category_feat_cols

# identifiers, target and date are always carried along with the chosen features
base_cols = ["user_id", "business_id", "review_stars", "review_date"]

selected_feature = feature[base_cols + feat_cols].copy()
train, test = train_test_split_feature(selected_feature)
X_train, X_test, y_train, y_test = create_x_y(train, test)

# string form of the model's input columns, stored alongside the scores
feature_cols = str(list(X_train.columns))

param = {'task': 'reg', 'lr': 0.2, 'lambda': 0.00002, 'metric': 'rmse', 'k': 20}  # defaults
fm = FMRecommender(X_train, y_train, cat_cols, cat_universe)

res_rsme, res_mae, fit_time, test_time = run_fm_model(fm, param, X_test, y_test)

# record this run in the notebook-level accumulators
run_log = (
    (rsmes, res_rsme),
    (maes, res_mae),
    (fit_times, fit_time),
    (test_times, test_time),
    (feature_cols_list, feature_cols),
)
for history, value in run_log:
    history.append(value)

# display current run
current_run = {
    "test_rmse": res_rsme,
    "test_mae": res_mae,
    "fit_time": fit_time,
    "test_time": test_time,
    "feature": feature_cols,
}
pd.DataFrame(current_run, index=[0])

Unnamed: 0,test_rmse,test_mae,fit_time,test_time,feature
0,1.280347,1.047539,19.998795,8.515026,"['user_id', 'business_id', 'user_review_count'..."


In [14]:
# Gather every feature-selection run recorded so far into a single table
feature_selection_columns = {
    "test_rmse": rsmes,
    "test_mae": maes,
    "fit_time": fit_times,
    "test_time": test_times,
    "feature": feature_cols_list,
}
df_feature_result = pd.DataFrame(feature_selection_columns)

# Persist the table with pickle
result_pickle = "result/fm_feature_selection.pkl"
with open(result_pickle, 'wb') as out_file:
    pickle.dump(df_feature_result, out_file)

## Hyperparameter Tuning

In [17]:
# Fresh accumulators for the hyperparameter sweep
rsmes, maes, fit_times, test_times = [], [], [], []

# Settings shared by every run; each sweep entry overrides one or more of them.
base_param = {'task': 'reg', 'lr': 0.2, 'lambda': 0.00002, 'metric': 'rmse', 'k': 20}
epoch_grid = (10, 20, 50, 100, 250)

param_overrides = (
    # test k
    [{'k': k} for k in (10, 20, 30, 40, 50)]
    # test lr
    + [{'lr': lr} for lr in (0.1, 0.2, 0.5, 0.75, 1.0)]
    # test lambda
    + [{'lambda': lam} for lam in (0, 0.00002, 0.0002, 0.002, 0.02)]
    # test epoch
    + [{'epoch': epoch} for epoch in epoch_grid]
    # test combo
    + [{'lr': 0.5, 'lambda': 0, 'epoch': epoch} for epoch in epoch_grid]
)

# Overridden keys keep their position from base_param; 'epoch' lands at the end.
params = [{**base_param, **override} for override in param_overrides]

# NOTE(review): every configuration is scored on X_test / y_test, so the "best"
# setting is chosen on the same data used for the reported test error.
for param in params:
    res_rmse, res_mae, fit_time, test_time = run_fm_model(fm, param, X_test, y_test)
    for history, value in (
        (rsmes, res_rmse),
        (maes, res_mae),
        (fit_times, fit_time),
        (test_times, test_time),
    ):
        history.append(value)

hyper_columns = {
    "test_rmse": rsmes,
    "test_mae": maes,
    "fit_time": fit_times,
    "test_time": test_times,
    "param": params,
}
df_hyper_result = pd.DataFrame(hyper_columns)
df_hyper_result

Unnamed: 0,test_rmse,test_mae,fit_time,test_time,param
0,1.259262,1.016648,62.682214,7.6431,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
1,1.259316,1.017994,86.361038,7.888054,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
2,1.259315,1.01735,113.995706,7.976557,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
3,1.259316,1.016759,130.555321,7.795728,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
4,1.259416,1.018464,150.388042,7.834874,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
5,1.267125,1.030667,86.209928,7.737456,"{'task': 'reg', 'lr': 0.1, 'lambda': 2e-05, 'm..."
6,1.259254,1.016405,86.954927,7.736577,"{'task': 'reg', 'lr': 0.2, 'lambda': 2e-05, 'm..."
7,1.267567,1.010567,87.517413,7.827638,"{'task': 'reg', 'lr': 0.5, 'lambda': 2e-05, 'm..."
8,1.280925,1.014828,87.487292,7.806414,"{'task': 'reg', 'lr': 0.75, 'lambda': 2e-05, '..."
9,1.294549,1.021358,87.822787,7.877346,"{'task': 'reg', 'lr': 1.0, 'lambda': 2e-05, 'm..."


In [18]:
# pickle
result_pickle = "result/fm_hyperparameter_tuning.pkl"
with open(result_pickle, 'wb') as handle:
    pickle.dump(df_hyper_result, handle)

## Output Final Model Predictions

In [20]:
# best from above
param = {
    'task': 'reg',
    'lr': 0.5,
    'lambda': 0,
    'metric': 'rmse',
    'k': 20,
    'epoch': 20,
}

# Refit with the chosen settings and keep the test-set predictions this time.
final_run = run_fm_model(fm, param, X_test, y_test, return_pred=True)
res_rsme, res_mae, fit_time, test_time, y_pred = final_run

# display current run
final_summary = {
    "test_rmse": res_rsme,
    "test_mae": res_mae,
    "fit_time": fit_time,
    "test_time": test_time,
    "feature": feature_cols,
}
pd.DataFrame(final_summary, index=[0])

Unnamed: 0,test_rmse,test_mae,fit_time,test_time,feature
0,1.257405,1.011706,23.195233,7.530269,"['user_id', 'business_id', 'user_review_count'..."


In [21]:
# pickle the final rating predictions, keyed by user and business
pred_columns = {
    "user_id": X_test["user_id"],
    "business_id": X_test["business_id"],
    "y_pred": y_pred,
}
df_fm_pred = pd.DataFrame(pred_columns)

result_pickle = "result/fm_pred.pkl"
with open(result_pickle, 'wb') as out_file:
    pickle.dump(df_fm_pred, out_file)

## Generate Top 20 Business Recommendations by Rating Per User

In [22]:
# Columns describing a user (key: user_id)
user_features = [
    'user_id',
    'user_review_count',
    'user_elite',
    'user_fans',
    'user_average_stars',
    'user_compliment',
    'user_yelping_years',
]

# Columns describing a business (key: business_id)
biz_features = [
    'business_id',
    "American (New)",
    "American (Traditional)",
    "Arts & Entertainment",
    "Asian Fusion",
    "Bagels",
    "Bakeries",
    "Barbeque",
    "Bars",
    "Beer",
    "Beer Bar",
    "Breakfast & Brunch",
    "Breweries",
    "Buffets",
    "Burgers",
    "Cafes",
    "Caterers",
    "Chicken Wings",
    "Chinese",
    "Cocktail Bars",
    "Coffee & Tea",
    "Comfort Food",
    "Delis",
    "Desserts",
    "Diners",
    "Ethnic Food",
    "Event Planning & Services",
    "Fast Food",
    "Food",
    "Food Delivery Services",
    "French",
    "Gastropubs",
    "Gluten-Free",
    "Greek",
    "Hawaiian",
    "Hot Dogs",
    "Hotels",
    "Hotels & Travel",
    "Ice Cream & Frozen Yogurt",
    "Indian",
    "Italian",
    "Japanese",
    "Juice Bars & Smoothies",
    "Korean",
    "Latin American",
    "Lounges",
    "Mediterranean",
    "Mexican",
    "Middle Eastern",
    "Music Venues",
    "Nightlife",
    "Noodles",
    "Pizza",
    "Pubs",
    "Salad",
    "Sandwiches",
    "Seafood",
    "Soup",
    "Southern",
    "Specialty Food",
    "Sports Bars",
    "Steakhouses",
    "Sushi Bars",
    "Tacos",
    "Tapas/Small Plates",
    "Tex-Mex",
    "Thai",
    "Vegan",
    "Vegetarian",
    "Venues & Event Spaces",
    "Vietnamese",
    "Wine & Spirits",
    "Wine Bars",
]


def _first_per_key(frame, columns, key):
    """Return one row per ``key`` holding the first value of each column in ``columns``."""
    grouped = frame[columns].groupby(key)
    return grouped.first().reset_index()


# One-row-per-id lookup tables, used to attach features to (user, business) pairs.
user_map = _first_per_key(feature, user_features, 'user_id')
biz_map = _first_per_key(feature, biz_features, 'business_id')

In [23]:
# generate top 20 recommendations per user batch
#
# Each batch's result is collected in a list and concatenated once at the end;
# concatenating inside the loop would re-copy the accumulated frame every time.
# Batch-local names are used so the held-out X_test / y_test / y_pred from the
# evaluation cells are not overwritten by this cell.

top_20_batches = []

for sample in generate_combinations(feature):

    # enrich the candidate pairs with features (FM model only)
    batch_X = sample.merge(user_map, how='left', on='user_id')
    batch_X = batch_X.merge(biz_map, how='left', on='business_id')

    # generate dummy target values (not assessing accuracy)
    batch_y = np.zeros(len(batch_X))

    # generate predictions
    batch_pred = pd.Series(fm.predict(batch_X, batch_y), name='y_pred')

    # add pred to dataset (aligned on the positional index)
    user_pred_ratings = batch_X.merge(batch_pred, how='inner', left_index=True, right_index=True)

    # keep the 20 highest predicted ratings for every user
    users_top_20 = (
        user_pred_ratings
        .sort_values('y_pred', ascending=False)
        .groupby('user_id')
        .head(20)
        .reset_index(drop=True)
    )
    users_top_20_output = users_top_20[['user_id', 'business_id']]
    print(users_top_20_output.head(1)) # debug print first recommendation result

    # save batch
    top_20_batches.append(users_top_20_output)

# pd.concat raises on an empty list, so fall back to an empty frame when no batch was produced
df_fm_top_20 = pd.concat(top_20_batches) if top_20_batches else pd.DataFrame()


                  user_id             business_id
0  pa7aFqUw5c9IByZ_8oUUQQ  NCFwm2-TDb-oBQ2medmYDg
                  user_id             business_id
0  Z0KhMsGNb6xYXhVKfPjP0w  VG0nWxGsPixYLsyi49gyxQ
                  user_id             business_id
0  cgxiTbbkNg2BBuKfQwvXOw  VG0nWxGsPixYLsyi49gyxQ
                  user_id             business_id
0  WxCd4ylcND1BpTKTVBiu1Q  NCFwm2-TDb-oBQ2medmYDg
                  user_id             business_id
0  Wkb8b9QJ35XTp-KYO0ojBQ  VG0nWxGsPixYLsyi49gyxQ
                  user_id             business_id
0  YB7JmVufE4A5y9eOfCzGXg  VG0nWxGsPixYLsyi49gyxQ
                  user_id             business_id
0  QnZxPYjsjsfUNHs7gIdKBw  NCFwm2-TDb-oBQ2medmYDg
                  user_id             business_id
0  RLF-23SkGFD56rn8HGo-wQ  NCFwm2-TDb-oBQ2medmYDg
                  user_id             business_id
0  1bMk965fGtZkqewtHGkTdA  NCFwm2-TDb-oBQ2medmYDg
                  user_id             business_id
0  _gyixAjqnoJ2wlJjh-_fFg  NCFwm2-TDb-oBQ2medmYDg


In [24]:
# pickle top 20 coverage set
result_pickle = "result/fm_top_20.pkl"
with open(result_pickle, 'wb') as out_file:
    pickle.dump(df_fm_top_20, out_file)