# Load Data

In [1]:
from recom_system.algorithms.preprocessing import get_ratings_datasets


# train_size=1 keeps every rating in the training data; the recorded output
# confirms the resulting test split is empty (len(testset)=0).
# NOTE(review): drop_zero presumably filters out zero-valued ratings — confirm
# against get_ratings_datasets.
trainset, testset = get_ratings_datasets(train_size=1, drop_zero=True)
print(f'{len(trainset.raw_ratings)=}, {len(testset)=}')


len(trainset.raw_ratings)=363099, len(testset)=0


In [2]:
# Despite its name, the recorded output shows this is a surprise Dataset
# (DatasetAutoFolds), not a built Trainset.
type(trainset)

surprise.dataset.DatasetAutoFolds

In [3]:
# The test split comes back as a NumPy array (empty in the recorded run).
type(testset)

numpy.ndarray

# Test Collaborative Filtering

In [4]:
def get_best_rmse(gs):
    """Return the per-fold test RMSE of the best-RMSE parameter combination.

    Args:
        gs: A fitted grid-search object exposing ``cv_results`` (a mapping
            holding a ``'params'`` list plus one ``'split{i}_test_rmse'``
            sequence per fold) and ``best_params['rmse']``.

    Returns:
        list: One RMSE value per cross-validation fold, in fold order.

    Raises:
        ValueError: If ``best_params['rmse']`` is not found in
            ``cv_results['params']``.
    """
    ind = gs.cv_results['params'].index(gs.best_params['rmse'])
    # Discover the folds from the result keys instead of ``range(gs.cv)``:
    # ``gs.cv`` is only an int when the search was constructed with one; it
    # may also be None or a splitter object, which ``range`` rejects.
    rmse_list = []
    fold = 0
    while f'split{fold}_test_rmse' in gs.cv_results:
        rmse_list.append(gs.cv_results[f'split{fold}_test_rmse'][ind])
        fold += 1
    return rmse_list
    

## Baseline

In [5]:
from surprise import BaselineOnly
from surprise.model_selection import cross_validate

# Baseline: BaselineOnly with default settings, scored with 5-fold cross-validation.
# Only the per-fold test RMSE array is kept, for the significance tests further down.
bl = BaselineOnly()
rmse_bl = cross_validate(bl, trainset, ['rmse'], cv=5)['test_rmse']

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [6]:
# Per-fold test RMSE of the baseline.
rmse_bl

array([0.83660741, 0.84195114, 0.84396108, 0.84140521, 0.84390871])

## SVD++

In [7]:
from surprise import SVDpp
from surprise.model_selection import GridSearchCV

# SVD++ hyper-parameter grid: 4 n_factors values x 2 n_epochs values.
params = {
    'n_factors': [10, 20, 50, 100],
    'n_epochs': [20,50]
}

# Every combination is scored on RMSE with 5-fold cross-validation.
# NOTE(review): in the recorded run the winner (n_factors=10, n_epochs=20) sits on
# the lower edge of both ranges — consider extending the grid downwards.
gs_svd = GridSearchCV(SVDpp, params, ['rmse'], cv=5)
gs_svd.fit(trainset)
print(f'{gs_svd.best_params=}, {gs_svd.best_score=}')

gs_svd.best_params={'rmse': {'n_factors': 10, 'n_epochs': 20}}, gs_svd.best_score={'rmse': 0.8132215061738538}


In [8]:
# Algorithm instance associated with the best RMSE in the grid search.
# NOTE(review): presumably not yet fitted — call .fit() before predicting; confirm
# against the Surprise GridSearchCV documentation.
svd = gs_svd.best_estimator['rmse']
# Per-fold RMSE of the winning combination, for comparison against the baseline.
rmse_svd = get_best_rmse(gs_svd)
rmse_svd

[0.8146200027489624,
 0.8172127646868833,
 0.808631607487773,
 0.8114388992765034,
 0.8142042566691474]

In [9]:
from scipy.stats import ttest_ind
# Independent two-sample t-test on the per-fold RMSEs (baseline vs. SVD++).
# The two sets of folds come from separate cross-validation runs, so the
# samples are treated as unpaired.
ttest_ind(rmse_bl, rmse_svd)

TtestResult(statistic=14.254526522254828, pvalue=5.717430717204586e-07, df=8.0)

## Item-Based KNN

In [10]:
from surprise import KNNWithMeans


# Item-based KNN grid (user_based is fixed to False): k and min_k are searched
# together with two similarity measures, 'msd' and 'cosine'.
params = {
    "k": [10, 20, 50, 100],
    "min_k": [1, 3, 5, 10],
    "sim_options": {
        "user_based": [False],
        "name": ["msd", "cosine"],
    },
    "verbose": [False],
}

# Every combination is scored on RMSE with 5-fold cross-validation.
gs_knn = GridSearchCV(KNNWithMeans, params, ['rmse'], cv=5)
gs_knn.fit(trainset)
print(f'{gs_knn.best_params=}, {gs_knn.best_score=}')

gs_knn.best_params={'rmse': {'k': 50, 'min_k': 3, 'sim_options': {'user_based': False, 'name': 'msd'}, 'verbose': False}}, gs_knn.best_score={'rmse': 0.8425375086483211}


In [11]:
# Algorithm instance associated with the best RMSE in the KNN grid search.
knn = gs_knn.best_estimator['rmse']
# Per-fold RMSE of the winning combination, for comparison against the baseline.
rmse_knn = get_best_rmse(gs_knn)
rmse_knn

[0.8396249518778547,
 0.8430932263728895,
 0.8435954958025824,
 0.8460033611640593,
 0.8403705080242191]

In [12]:
# Independent two-sample t-test on the per-fold RMSEs (baseline vs. item-based KNN).
# ttest_ind is the scipy.stats function imported in the SVD++ comparison cell.
ttest_ind(rmse_bl, rmse_knn)

TtestResult(statistic=-0.548702366692953, pvalue=0.5981852303434684, df=8.0)

## Uniform-Weighted Hybrid

In [2]:
from surprise import SVDpp, KNNWithMeans
from recom_system.algorithms.models.weighted_model import WeightedModel
from recom_system.algorithms.metrics import cross_validate


# Hybrid of item-based KNN and SVD++. The hyper-parameters are hard-coded copies of
# the grid-search winners recorded earlier (k=50, min_k=3; n_factors=10, n_epochs=20).
# NOTE(review): the recorded KNN grid search selected 'msd' as the best similarity,
# but 'cosine' is used here — confirm this is intentional.
# NOTE(review): auto_weight=0 presumably keeps the component weights uniform (per the
# section heading) — confirm against WeightedModel.
uwm = WeightedModel([
    KNNWithMeans(k=50, min_k=3, verbose=False, sim_options={'user_based': False, 'name': 'cosine'}),
    SVDpp(n_factors=10, n_epochs=20),
], auto_weight=0)
# cross_validate here is the recom_system.algorithms.metrics version imported in this
# cell (not surprise's); the recorded result is a dict of per-fold lists keyed by metric.
results_uwm = cross_validate(uwm, trainset, ['rmse', 'ndcg', 'precision', 'recall'], cv=5)
results_uwm


{'rmse': [0.8116279355789171,
  0.8155381790913532,
  0.8175472324546061,
  0.8120552477066912,
  0.8159643986699098],
 'ndcg': [0.9864696716016154,
  0.9861374214588686,
  0.986326994957316,
  0.9863317694677463,
  0.9862839438422476],
 'precision': [0.7534726589972348,
  0.7525880351110188,
  0.7496458604635187,
  0.7516870769788034,
  0.7482616783513326],
 'recall': [0.8627822120048141,
  0.8615380704208155,
  0.8596899890200173,
  0.8604084109770549,
  0.8576410455287731]}

## Auto-Weighted Hybrid

In [3]:
from surprise import SVDpp, KNNWithMeans
from recom_system.algorithms.models.weighted_model import WeightedModel
from recom_system.algorithms.metrics import cross_validate


# Hybrid of item-based KNN and SVD++. The hyper-parameters are hard-coded copies of
# the grid-search winners recorded earlier (k=50, min_k=3; n_factors=10, n_epochs=20).
# NOTE(review): the recorded KNN grid search selected 'msd' as the best similarity,
# but 'cosine' is used here — confirm this is intentional.
# NOTE(review): auto_weight=10000 with fixed=True presumably learns the component
# weights once and then keeps them fixed — confirm against WeightedModel.
awm = WeightedModel([
    KNNWithMeans(k=50, min_k=3, verbose=False, sim_options={'user_based': False, 'name': 'cosine'}),
    SVDpp(n_factors=10, n_epochs=20),
], auto_weight=10000, fixed=True)
# cross_validate here is the recom_system.algorithms.metrics version imported in this
# cell (not surprise's); the recorded result is a dict of per-fold lists keyed by metric.
results_awm = cross_validate(awm, trainset, ['rmse', 'ndcg', 'precision', 'recall'], cv=5)
results_awm


{'rmse': [0.8153002721536519,
  0.8196694829611489,
  0.8104597306297214,
  0.8106200597272974,
  0.8127296926242688],
 'ndcg': [0.9863718048688914,
  0.9861065180173668,
  0.9864163466030706,
  0.9863246726012378,
  0.9865553639363059],
 'precision': [0.7477900136036049,
  0.7483615141809986,
  0.7534534197882525,
  0.7540491273949885,
  0.7532174084895978],
 'recall': [0.8562262287667164,
  0.857213946621636,
  0.8614071089484295,
  0.8632261452751331,
  0.8615843021130591]}

## Dynamic-Weighted Hybrid

In [None]:
from surprise import SVDpp, KNNWithMeans
from recom_system.algorithms.models.weighted_model import WeightedModel
from recom_system.algorithms.metrics import cross_validate


# Hybrid of item-based KNN and SVD++. The hyper-parameters are hard-coded copies of
# the grid-search winners recorded earlier (k=50, min_k=3; n_factors=10, n_epochs=20).
# NOTE(review): the recorded KNN grid search selected 'msd' as the best similarity,
# but 'cosine' is used here — confirm this is intentional.
# NOTE(review): auto_weight=1000 with fixed=False presumably lets the component
# weights keep adapting — confirm against WeightedModel.
# NOTE(review): this cell has no recorded execution count or output, so results_dwm
# has not been produced in the saved notebook.
dwm = WeightedModel([
    KNNWithMeans(k=50, min_k=3, verbose=False, sim_options={'user_based': False, 'name': 'cosine'}),
    SVDpp(n_factors=10, n_epochs=20),
], auto_weight=1000, fixed=False)
# cross_validate here is the recom_system.algorithms.metrics version imported in this
# cell (not surprise's).
results_dwm = cross_validate(dwm, trainset, ['rmse', 'ndcg', 'precision', 'recall'], cv=5)
results_dwm


# Test Content Based Algorithms

## Sentence Embeddings from Title and Description

In [1]:
from recom_system.algorithms.io import get_books

# Load the books data; it is passed to build_item_matrix in the next cell.
books = get_books()

In [11]:
from recom_system.algorithms.models.vector_based.sentence_embeddings import build_item_matrix
from recom_system.algorithms.models.vector_based import VectorBasedModel
from recom_system.algorithms.metrics import precision_recall, ndcg

# Build the item matrix from `books` and wrap it in a VectorBasedModel; %time reports the cost.
# NOTE(review): the recorded output of this cell ends with KeyError: 'UKN__197899' —
# the failure needs investigating before relying on this section.
%time vbm = VectorBasedModel(build_item_matrix(books))

CPU times: user 3.32 s, sys: 257 ms, total: 3.58 s
Wall time: 3.58 s


KeyError: 'UKN__197899'