https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b

https://gist.github.com/susanli2016/e0cdcf1bca69a2b144fd8c04f30b522f#file-benchmark-py

In [1]:
# import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate

In [4]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')

In [3]:
# first do robustscaler to minimize outliers
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].array.reshape(-1,1))

In [5]:
# use StandardScaler to scale user playtimes
scaler = StandardScaler()
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [6]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [7]:
# import surprise algos for benchmark
from surprise import SVD, SlopeOne, SVDpp, KNNBasic, KNNBaseline, NormalPredictor, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

In [None]:
benchmark = []

In [14]:
# Iterate over all algorithms
# for algorithm in [SVD(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
for algorithm in [SVD(), SVDpp(), SlopeOne(), NormalPredictor()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 

In [13]:
second_batch_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
second_batch_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,1.381702,2.285744,4.085708
KNNBaseline,1.383627,62.850965,329.816368
KNNWithMeans,1.386997,60.903173,322.992544
KNNBasic,1.389609,61.683086,323.258707
CoClustering,1.404148,22.748989,4.350266
KNNWithZScore,1.419576,63.487562,324.48522
