In [1]:
# nb_black: IPython extension that reformats each executed cell with Black.
%load_ext nb_black
# autoreload mode 2: reload all imported modules before running each cell, so
# edits to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import scipy
from source.fm import FMRecommender
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV
from source.utils import train_test_split_feature, rmse_scorer, rmse
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Read the feature table from disk; every later cell works on `feature`.
feature_csv_path = "data/feature.csv"
feature = pd.read_csv(feature_csv_path)

# Show (rows, columns) as the cell output.
feature.shape

(1814134, 85)

In [26]:
# Keep only the rows of a fixed-seed random subset of users so that the
# experiments below train quickly.
sample_n_users = 5000
users = pd.Series(feature["user_id"].unique())
sampled_users = users.sample(n=sample_n_users, random_state=1)

# Boolean row mask: True where the row belongs to one of the sampled users.
is_sampled_user = feature["user_id"].isin(sampled_users)
feature = feature[is_sampled_user].copy().reset_index(drop=True)

In [5]:
# One-hot encode user_id and business_id and append the indicator columns to
# `feature`. `user_cols` / `bus_cols` hold the generated column names (the
# distinct ids, in the encoder's category order) for later column selection.
id_cols = ["user_id", "business_id"]

try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    onehot = OneHotEncoder(sparse=False)

cat_wide = onehot.fit_transform(feature[id_cols])

user_cols = onehot.categories_[0].tolist()
bus_cols = onehot.categories_[1].tolist()

# Reuse feature's own index so the index-based merge below pairs every row
# with its own encoding even if `feature` does not carry a 0..n-1 index.
cat_wide = pd.DataFrame(cat_wide, columns=user_cols + bus_cols, index=feature.index)

# Make the cell safe to re-run: remove indicator columns left by a previous
# execution, otherwise the merge would emit `_x` / `_y` suffixed duplicates
# and selecting by `user_cols` / `bus_cols` would fail afterwards.
feature = feature.drop(columns=user_cols + bus_cols, errors="ignore")
feature = pd.merge(feature, cat_wide, left_index=True, right_index=True)

In [6]:
def create_x_y(train, test):
    """Separate the target column from the model inputs for both splits.

    The target is ``review_stars``. The inputs are every remaining column
    except ``user_id`` and ``review_date``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that contain ``review_stars``, ``user_id`` and ``review_date``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    target = "review_stars"
    excluded = [target, "user_id", "review_date"]

    def split(frame):
        # Inputs first, then the target, for a single frame.
        return frame.drop(columns=excluded), frame[target]

    X_train, y_train = split(train)
    X_test, y_test = split(test)

    return X_train, X_test, y_train, y_test

In [21]:
# Review- and user-level columns fed to the model.
# "business_stars" and "business_review_count" are intentionally left out.
review_user_cols = [
    "review_props",
    "user_review_count",
    "user_elite",
    "user_fans",
    "user_average_stars",
    "user_compliment",
    "user_yelping_years",
]

# Columns named after business categories.
category_cols = [
    "American (New)",
    "American (Traditional)",
    "Arts & Entertainment",
    "Asian Fusion",
    "Bagels",
    "Bakeries",
    "Barbeque",
    "Bars",
    "Beer",
    "Beer Bar",
    "Breakfast & Brunch",
    "Breweries",
    "Buffets",
    "Burgers",
    "Cafes",
    "Caterers",
    "Chicken Wings",
    "Chinese",
    "Cocktail Bars",
    "Coffee & Tea",
    "Comfort Food",
    "Delis",
    "Desserts",
    "Diners",
    "Ethnic Food",
    "Event Planning & Services",
    "Fast Food",
    "Food",
    "Food Delivery Services",
    "French",
    "Gastropubs",
    "Gluten-Free",
    "Greek",
    "Hawaiian",
    "Hot Dogs",
    "Hotels",
    "Hotels & Travel",
    "Ice Cream & Frozen Yogurt",
    "Indian",
    "Italian",
    "Japanese",
    "Juice Bars & Smoothies",
    "Korean",
    "Latin American",
    "Lounges",
    "Mediterranean",
    "Mexican",
    "Middle Eastern",
    "Music Venues",
    "Nightlife",
    "Noodles",
    "Pizza",
    "Pubs",
    "Salad",
    "Sandwiches",
    "Seafood",
    "Soup",
    "Southern",
    "Specialty Food",
    "Sports Bars",
    "Steakhouses",
    "Sushi Bars",
    "Tacos",
    "Tapas/Small Plates",
    "Tex-Mex",
    "Thai",
    "Vegan",
    "Vegetarian",
    "Venues & Event Spaces",
    "Vietnamese",
    "Wine & Spirits",
    "Wine Bars",
]

# Full, ordered list of hand-picked feature columns.
feat_cols = review_user_cols + category_cols

# Columns needed for splitting and for the target, in addition to the inputs.
base_cols = ["user_id", "review_stars", "review_date"]

# Assemble the modelling frame: bookkeeping columns, the one-hot id
# indicators, then the hand-picked features.
model_cols = base_cols + user_cols + bus_cols + feat_cols
selected_feature = feature[model_cols].copy()

train, test = train_test_split_feature(selected_feature)
X_train, X_test, y_train, y_test = create_x_y(train, test)

fm = FMRecommender(X_train, X_test, y_train, y_test)

# Default training settings.
param = {
    "task": "reg",
    "lr": 0.2,
    "lambda": 0.002,
    "metric": "rmse",
    "k": 20,
}

fm.fit(param)
y_pred = fm.predict()
rmse_val = rmse(y_test, y_pred)

# Append one row (feature list, score) to the running feature-selection log.
log_row = {"Features": feat_cols, "RMSE": rmse_val}
result = pd.DataFrame([log_row])
result.to_csv("data/feature_selection.csv", mode="a", header=False, index=False)

## Hyperparameter tuning

In [23]:
def fit_param(model, param):
    """Train ``model`` with ``param`` and return its RMSE on the test targets.

    ``model`` must expose the ``fit(param)`` / ``predict()`` interface that
    ``FMRecommender`` is used with earlier in this notebook. The previous
    version called ``setTest(xdm_test)`` and passed a model file path, which
    relied on ``xdm_test`` -- a name that is never defined in this notebook.

    Parameters
    ----------
    model : object
        Recommender with ``fit(param)`` and ``predict()`` methods.
    param : dict
        Training settings forwarded unchanged to ``model.fit``.

    Returns
    -------
    RMSE between the notebook-level ``y_test`` and the model's predictions,
    as computed by ``rmse``.
    """
    model.fit(param)
    # NOTE(review): presumably predict() scores the X_test handed to the
    # FMRecommender constructor, so it lines up with y_test -- confirm.
    y_pred = model.predict()
    return rmse(y_test, y_pred)


# Baseline configuration; each sweep below varies exactly one entry of it.
base_param = {"task": "reg", "lr": 0.2, "lambda": 0.002, "metric": "rmse", "k": 20}

# One-at-a-time sweeps. The baseline value is part of every sweep, so the
# baseline configuration is evaluated once per sweep (three times in total),
# which keeps each sweep self-contained in the results file.
sweeps = {
    "k": [10, 20, 30, 40],
    "lr": [0.1, 0.2, 0.3, 0.5],
    "lambda": [0.0002, 0.002, 0.02, 0.2],
}

# Overriding an existing key keeps its position, so every dict has the same
# key order as `base_param`.
params = [
    {**base_param, name: value}
    for name, values in sweeps.items()
    for value in values
]

# `fm` is the FMRecommender built in the training cell above; the previous
# code referenced `fm_model`, which does not exist on a fresh kernel.
# NOTE(review): assumes fm.fit() retrains from scratch on every call -- confirm.
# The loop variable is not called `param`, so the default settings dict of
# that name is left untouched. The `rsmes` spelling is kept in case other
# cells read that name.
rsmes = []
for candidate in params:
    rsmes.append(fit_param(fm, candidate))

results = pd.DataFrame({"Params": params, "RMSE": rsmes})
results.to_csv("data/param_selection.csv", index=False)