In [2]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<IPython.core.display.Javascript object>

In [3]:
import pandas as pd
import numpy as np
import scipy
from fastFM import als
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV
from source.utils import train_test_split_feature, rmse_scorer, rmse
from sklearn.preprocessing import OneHotEncoder

<IPython.core.display.Javascript object>

In [4]:
# Load the feature table from disk; the cell displays its (rows, columns) shape.
feature_path = "data/feature.csv"
feature = pd.read_csv(feature_path)
# feature = feature.sample(10000)  # disabled row-level subsample
feature.shape

(1814134, 85)

<IPython.core.display.Javascript object>

In [22]:
# Keep only the rows of a random subset of users so training runs quickly.
sample_n_users = 5000
users = pd.Series(feature["user_id"].unique())
# Fixed random_state so the same users are drawn on every run.
sampled_users = users.sample(n=sample_n_users, random_state=1)
is_sampled_user = feature["user_id"].isin(sampled_users)
feature = feature.loc[is_sampled_user].copy().reset_index(drop=True)

<IPython.core.display.Javascript object>

In [24]:
# One-hot encode user_id and business_id into dense indicator columns and
# attach them to `feature`. The generated column names are the raw id values,
# collected in `user_cols` / `bus_cols` for later column selection.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2;
# switch the keyword when upgrading scikit-learn.
onehot = OneHotEncoder(sparse=False)
cat_wide = onehot.fit_transform(feature[["user_id", "business_id"]])

user_cols = onehot.categories_[0].tolist()
bus_cols = onehot.categories_[1].tolist()
# Reuse feature's own index so the rows line up on merge even when `feature`
# does not carry a fresh 0..n-1 RangeIndex.
cat_wide = pd.DataFrame(cat_wide, columns=user_cols + bus_cols, index=feature.index)

# Make the cell safe to re-run: indicator columns left over from a previous
# execution would otherwise collide on merge and come back with _x/_y suffixes,
# so that user_cols / bus_cols no longer name real columns.
feature = feature.drop(columns=user_cols + bus_cols, errors="ignore")
feature = pd.merge(feature, cat_wide, left_index=True, right_index=True)

<IPython.core.display.Javascript object>

In [None]:
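# Disabled alternative to the OneHotEncoder cell: builds prefixed user/business
# indicator columns with pd.get_dummies and defines user_cols / bus_cols from them.
# user_wide = pd.get_dummies(feature.user_id, prefix="user_id")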
# bus_wide = pd.get_dummies(feature.business_id, prefix="business_id")

# feature = feature.join(user_wide)
# feature = feature.join(bus_wide)

# user_cols = user_wide.columns.to_list()
# bus_cols = bus_wide.columns.to_list()

In [38]:
def create_x_y(train, test):
    """Split train/test frames into sparse design matrices and target series.

    The target is the ``review_stars`` column. The columns ``review_stars``,
    ``user_id`` and ``review_date`` are removed from the inputs; every
    remaining column becomes a model feature.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain ``review_stars``, ``user_id`` and
        ``review_date`` plus the feature columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` parts are
        ``scipy.sparse.csc_matrix`` objects and the ``y`` parts are the
        ``review_stars`` series of the corresponding frame.
    """
    target = "review_stars"
    non_feature_cols = [target, "user_id", "review_date"]

    def _as_csc(frame):
        # Drop target/bookkeeping columns, then convert the values to CSC format.
        return scipy.sparse.csc_matrix(frame.drop(columns=non_feature_cols).values)

    return _as_csc(train), _as_csc(test), train[target], test[target]

<IPython.core.display.Javascript object>

In [39]:
# Optional extra model inputs used on top of the one-hot user/business columns.
# Uncomment entries to include them; the chosen list and the resulting RMSE are
# appended to data/feature_selection.csv further down in this cell.
# With every entry commented out, feat_cols is an empty list.
feat_cols = [
    #  'review_props',
    #     "user_review_count",
    #     "user_elite",
    #  'user_fans',
    #     "user_average_stars",
    #  'user_compliment',
    #  'user_yelping_years',
    #     "business_stars",
    #     "business_review_count",
    #     "American (New)",
    #     "American (Traditional)",
    #     "Arts & Entertainment",
    #     "Asian Fusion",
    #     "Bagels",
    #     "Bakeries",
    #     "Barbeque",
    #     "Bars",
    #     "Beer",
    #     "Beer Bar",
    #     "Breakfast & Brunch",
    #     "Breweries",
    #     "Buffets",
    #     "Burgers",
    #     "Cafes",
    #     "Caterers",
    #     "Chicken Wings",
    #     "Chinese",
    #     "Cocktail Bars",
    #     "Coffee & Tea",
    #     "Comfort Food",
    #     "Delis",
    #     "Desserts",
    #     "Diners",
    #     "Ethnic Food",
    #     "Event Planning & Services",
    #     "Fast Food",
    #     "Food",
    #     "Food Delivery Services",
    #     "French",
    #     "Gastropubs",
    #     "Gluten-Free",
    #     "Greek",
    #     "Hawaiian",
    #     "Hot Dogs",
    #     "Hotels",
    #     "Hotels & Travel",
    #     "Ice Cream & Frozen Yogurt",
    #     "Indian",
    #     "Italian",
    #     "Japanese",
    #     "Juice Bars & Smoothies",
    #     "Korean",
    #     "Latin American",
    #     "Lounges",
    #     "Mediterranean",
    #     "Mexican",
    #     "Middle Eastern",
    #     "Music Venues",
    #     "Nightlife",
    #     "Noodles",
    #     "Pizza",
    #     "Pubs",
    #     "Salad",
    #     "Sandwiches",
    #     "Seafood",
    #     "Soup",
    #     "Southern",
    #     "Specialty Food",
    #     "Sports Bars",
    #     "Steakhouses",
    #     "Sushi Bars",
    #     "Tacos",
    #     "Tapas/Small Plates",
    #     "Tex-Mex",
    #     "Thai",
    #     "Vegan",
    #     "Vegetarian",
    #     "Venues & Event Spaces",
    #     "Vietnamese",
    #     "Wine & Spirits",
    #     "Wine Bars",
]

# Bookkeeping/target columns that must travel with the model inputs.
base_cols = ["user_id", "review_stars", "review_date"]
model_cols = base_cols + user_cols + bus_cols + feat_cols

# Work on a copy restricted to the selected columns, then split and vectorise.
selected_feature = feature[model_cols].copy()
train, test = train_test_split_feature(selected_feature)
X_train, X_test, y_train, y_test = create_x_y(train, test)

# Fit a factorization machine with fixed hyperparameters and score the test set.
fm_settings = dict(n_iter=1000, init_stdev=0.1, rank=20, l2_reg_w=0.1, l2_reg_V=0.5)
fm = als.FMRegression(**fm_settings)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

rmse_val = rmse(y_test, y_pred)

# Append this run (feature list + RMSE) to the running feature-selection log.
log_row = {"Features": feat_cols, "RMSE": rmse_val}
result = pd.DataFrame([log_row])
result.to_csv("data/feature_selection.csv", mode="a", header=False, index=False)

"rmse:", rmse_val

('rmse:', 1.455719774152857)

<IPython.core.display.Javascript object>

# Hyperparameter Tuning

In [None]:
# Grid-search the two L2 regularisation strengths (3 values each) while
# keeping the iteration count, init scale and rank fixed.
params = dict(
    l2_reg_w=np.linspace(0.01, 0.3, num=3),
    l2_reg_V=np.linspace(0.1, 0.5, num=3),
)

fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=20)
grid = GridSearchCV(
    estimator=fm,
    param_grid=params,
    scoring=rmse_scorer,
    n_jobs=-1,
    verbose=3,
)
grid.fit(X_train, y_train)

# Evaluate the refitted search object on the held-out test set.
y_pred = grid.predict(X_test)

"rmse:", rmse(y_test, y_pred)

In [None]:
# Parameter combination selected by the grid search above.
grid.best_params_