# Load Data

In [1]:
import random

import numpy as np

from recom_system.algorithms.preprocessing import get_ratings_datasets


# Seed the global RNGs once, before any data handling or model fitting, so that
# the cross-validation splits and model initialisations further down are
# repeatable under "Restart Kernel -> Run All". This follows the recipe in the
# Surprise FAQ for reproducible experiments.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Load the ratings used for model selection (trainset) and the held-out
# ratings (testset).
# NOTE(review): drop_zero=True presumably discards zero-valued ratings -- confirm
# against get_ratings_datasets.
trainset, testset = get_ratings_datasets(drop_zero=True)
print(f'{len(trainset.raw_ratings)=}, {len(testset)=}')


len(trainset.raw_ratings)=203789, len(testset)=159296


# Test Different Algorithms

In [2]:
def get_best_rmse(gs, measure='rmse'):
    """Return the per-fold test scores of the best parameter combination.

    Args:
        gs: A fitted grid-search object exposing ``cv_results`` (dict with a
            ``'params'`` list and ``'split{i}_test_{measure}'`` score lists)
            and ``best_params`` (dict keyed by measure name).
        measure: Name of the accuracy measure to read, e.g. ``'rmse'``.
            Defaults to ``'rmse'`` so existing calls keep their behaviour.

    Returns:
        list: One test score per cross-validation split, in split order, for
        the parameter combination stored in ``gs.best_params[measure]``.

    Raises:
        KeyError: If ``measure`` is not present in ``gs.best_params``.
        ValueError: If the best parameters are not found in
            ``gs.cv_results['params']``.
    """
    results = gs.cv_results
    best_idx = results['params'].index(gs.best_params[measure])

    # Derive the number of folds from the recorded results rather than from
    # ``gs.cv``: that attribute is only an int when the caller passed one, and
    # may also be None or a cross-validation iterator object.
    suffix = f'_test_{measure}'
    n_splits = sum(
        1 for key in results
        if key.startswith('split') and key.endswith(suffix)
    )

    return [results[f'split{i}{suffix}'][best_idx] for i in range(n_splits)]
    

## Baseline

In [22]:
from surprise import BaselineOnly
from surprise.model_selection import cross_validate

# Reference model: baseline-only predictor with the library's default settings.
bl = BaselineOnly()

# 5-fold cross-validation on the training data; only the per-fold test RMSE
# is kept so it can be compared against the tuned models later on.
bl_cv_scores = cross_validate(bl, trainset, measures=['rmse'], cv=5)
rmse_bl = bl_cv_scores['test_rmse']

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [23]:
# Per-fold test RMSE of the baseline, one entry per cross-validation split.
rmse_bl

array([0.86033128, 0.86676564, 0.8668429 , 0.86708573, 0.85935786])

## SVD++

In [2]:
from surprise import SVDpp
from surprise.model_selection import GridSearchCV

# SVD++ search space: 4 latent-factor sizes x 2 epoch counts = 8 candidates.
params = {'n_factors': [10, 20, 50, 100], 'n_epochs': [20, 50]}

# Exhaustive search, scoring each candidate by 5-fold cross-validated RMSE.
gs_svd = GridSearchCV(
    SVDpp,
    params,
    measures=['rmse'],
    cv=5,
)
gs_svd.fit(trainset)

# Report the winning combination and its mean RMSE.
print(f'{gs_svd.best_params=}, {gs_svd.best_score=}')

gs_svd.best_params={'rmse': {'n_factors': 10, 'n_epochs': 20}}, gs_svd.best_score={'rmse': 0.8447353864996305}


In [14]:
# Estimator that the grid search exposes for the best-RMSE parameter combination.
svd = gs_svd.best_estimator['rmse']
# Per-fold test RMSE of that best combination, to compare with the baseline folds.
rmse_svd = get_best_rmse(gs_svd)
rmse_svd

[0.8442899720894412,
 0.8445154797507701,
 0.8456591974789509,
 0.8480926083100451,
 0.8411196748689452]

In [24]:
from scipy.stats import ttest_ind
# Two-sample t-test on the per-fold RMSE values: baseline vs. tuned SVD++.
# NOTE(review): ttest_ind treats both samples as independent and, with its
# default settings, assumes equal variances. The two sets of folds come from
# separate cross-validation runs on the same data -- confirm this is the
# intended test (Welch's variant is available via equal_var=False).
ttest_ind(rmse_bl, rmse_svd)

TtestResult(statistic=9.344784099032337, pvalue=1.4044851104745064e-05, df=8.0)

## Item-Based KNN

In [26]:
from surprise import KNNWithMeans


# Similarity settings to try: item-item similarities only (user_based=False),
# computed with either MSD or cosine.
sim_options_grid = {'user_based': [False], 'name': ['msd', 'cosine']}

# Full KNNWithMeans search space: neighbourhood size, minimum neighbours and
# similarity settings; verbose is pinned to False to keep the output quiet.
params = {
    'k': [10, 20, 50, 100],
    'min_k': [1, 3, 5, 10],
    'sim_options': sim_options_grid,
    'verbose': [False],
}

# Exhaustive search, scoring each candidate by 5-fold cross-validated RMSE.
gs_knn = GridSearchCV(
    KNNWithMeans,
    params,
    measures=['rmse'],
    cv=5,
)
gs_knn.fit(trainset)

# Report the winning combination and its mean RMSE.
print(f'{gs_knn.best_params=}, {gs_knn.best_score=}')

gs_knn.best_params={'rmse': {'k': 50, 'min_k': 3, 'sim_options': {'user_based': False, 'name': 'cosine'}, 'verbose': False}}, gs_knn.best_score={'rmse': 0.8881769686063841}


In [27]:
# Estimator that the grid search exposes for the best-RMSE parameter combination.
knn = gs_knn.best_estimator['rmse']
# Per-fold test RMSE of that best combination, to compare with the baseline folds.
rmse_knn = get_best_rmse(gs_knn)
rmse_knn

[0.8812840700925273,
 0.8886634986669966,
 0.8943996467965902,
 0.8860823510734597,
 0.8904552764023467]

In [28]:
# Two-sample t-test on the per-fold RMSE values: baseline vs. tuned item-based KNN.
# The statistic's sign follows mean(rmse_bl) - mean(rmse_knn), so a negative
# value means the baseline has the lower (better) mean RMSE.
ttest_ind(rmse_bl, rmse_knn)

TtestResult(statistic=-8.621531274956766, pvalue=2.5385653406704257e-05, df=8.0)

## Uniform-Weighted Hybrid

In [4]:
from surprise import SVDpp, KNNWithMeans
from recom_system.algorithms.models.weighted_model import WeightedModel
from surprise.model_selection import cross_validate


# Component models, configured with the hyperparameters hard-coded here
# (they match the best_params printed by the two grid searches).
knn_component = KNNWithMeans(
    k=50,
    min_k=3,
    sim_options={'user_based': False, 'name': 'cosine'},
    verbose=False,
)
svdpp_component = SVDpp(n_factors=10, n_epochs=20)

# NOTE(review): auto_weight=0 presumably turns weight learning off so both
# components contribute equally (per the section title) -- confirm against
# WeightedModel.
uwm = WeightedModel([knn_component, svdpp_component], auto_weight=0)

# 5-fold cross-validation; keep and display the per-fold test RMSE.
uwm_cv_scores = cross_validate(uwm, trainset, measures=['rmse'], cv=5)
rmse_uwm = uwm_cv_scores['test_rmse']
rmse_uwm

array([0.8539115 , 0.84764434, 0.84759155, 0.85218755, 0.84825958])

## Auto-Weighted Hybrid

In [None]:
# Same two component models as the uniform-weighted hybrid, built fresh here.
awm_members = [
    KNNWithMeans(
        k=50,
        min_k=3,
        sim_options={'user_based': False, 'name': 'cosine'},
        verbose=False,
    ),
    SVDpp(n_factors=10, n_epochs=20),
]

# NOTE(review): auto_weight=1000 presumably enables learned component weights
# (per the section title); the meaning of the value 1000 is not visible here
# -- confirm against WeightedModel.
awm = WeightedModel(awm_members, auto_weight=1000)

# 5-fold cross-validation; keep and display the per-fold test RMSE.
awm_cv_scores = cross_validate(awm, trainset, measures=['rmse'], cv=5)
rmse_awm = awm_cv_scores['test_rmse']
rmse_awm