https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b

https://gist.github.com/susanli2016/e0cdcf1bca69a2b144fd8c04f30b522f#file-benchmark-py

In [31]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate

In [32]:
# import our processed datasets
playcounts = pd.read_pickle('playcounts.pkl')

In [33]:
# subset of the data
playcounts = playcounts.sample(frac=0.05, random_state=1)

In [34]:
# max and min of playcount if you want to use it instead of scaled playcount
min_pc = playcounts['playcount'].max()
max_pc = playcounts['playcount'].min()
playcounts['playcount_scale_2'] = playcounts['playcount'].apply(lambda x: (x-min_pc)/(max_pc-min_pc))


In [35]:
# change playcount_scale to float16
playcounts['playcount_scale_2'] = playcounts['playcount_scale_2'].astype('float16')


In [36]:
playcounts = playcounts[['user_id', 'track_id', 'playcount_scale_2']]

In [37]:
playcounts.dtypes

user_id                int64
track_id               int64
playcount_scale_2    float16
dtype: object

In [38]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(playcounts[['user_id', 'track_id', 'playcount_scale_2']], reader)

In [39]:
# import surprise algos for benchmark
from surprise import SVD, SlopeOne, SVDpp, KNNBasic, KNNBaseline, NormalPredictor, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

In [42]:
benchmark = pd.DataFrame(columns=['Algorithm', 'test_rmse_mean', 'test_rmse_std_dev'])
# Iterate over all algorithms
# for algorithm in [SVD(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
for algorithm in [SVD(), SVDpp(), KNNBaseline()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    mean_rmse = results['test_rmse'].mean()
    std_rmse = results['test_rmse'].std()

    algo_df = pd.DataFrame([[algorithm.__class__.__name__, mean_rmse, std_rmse]],
                             columns=['Algorithm', 'test_rmse_mean', 'test_rmse_std_dev'])
    
    benchmark = pd.concat([benchmark, algo_df], ignore_index=True)
    
    
resultset = benchmark.sort_values(by='test_rmse_mean').set_index('Algorithm')

<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f53c2485d10>


  benchmark = pd.concat([benchmark, algo_df], ignore_index=True)


<surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7f53d0d5e1d0>
<surprise.prediction_algorithms.knns.KNNBaseline object at 0x7f53d0d5d090>
Estimating biases using als...
Computing the msd similarity matrix...


MemoryError: Unable to allocate 376. GiB for an array with shape (224792, 224792) and data type float64

In [41]:
resultset

Unnamed: 0_level_0,test_rmse_mean,test_rmse_std_dev
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
SVDpp,0.153274,0.000745
SVD,0.156274,0.000411
