# Training RecTools models using cross-validation

Here, I train and evaluate different models on each split, using the [RecTools models and metrics implementation](https://rectools.readthedocs.io/en/stable/features.html).

In [71]:
import warnings
# NOTE(review): the "ignore" action is installed without any category or module
# restriction, so every warning raised by later cells is hidden — consider
# narrowing the filter once the noisy source is known.
warnings.filterwarnings("ignore")

## Data reading

In [72]:
# Cut-off passed as `k` when requesting recommendations (and forwarded to LightFM) later on.
K = 10
# Relative path to the directory that holds the pickled interim splits.
data_interim_dir = '../data/interim/'
# Splits u1..u5, each with a 'base' and a 'test' part, listed fold by fold.
data_filenames = [
    f'u{t}.{split}'
    for t in range(1, 6)
    for split in ('base', 'test')
]
data_filenames

['u1.base',
 'u1.test',
 'u2.base',
 'u2.test',
 'u3.base',
 'u3.test',
 'u4.base',
 'u4.test',
 'u5.base',
 'u5.test']

In [73]:
import pickle


# Build {split title: (base, test, base_df, test_df)} from the interim pickles.
# `data_filenames` alternates base/test entries, so walk it pairwise.
data = {}
for base_filename, test_filename in zip(data_filenames[::2], data_filenames[1::2]):
    fold_paths = (
        data_interim_dir + base_filename + '.pickle',
        data_interim_dir + test_filename + '.pickle',
        data_interim_dir + base_filename + '.df.pickle',
        data_interim_dir + test_filename + '.df.pickle',
    )
    fold_objects = []
    for fold_path in fold_paths:
        with open(fold_path, 'rb') as fold_file:
            fold_objects.append(pickle.load(fold_file))
    # The key is the part of the file name before the first dot, e.g. 'u1'.
    data[base_filename.split('.')[0]] = tuple(fold_objects)

data.keys()

dict_keys(['u1', 'u2', 'u3', 'u4', 'u5'])

## Performing 5-fold cross-validation on `u1-u5` splits

In [74]:
def load_data(fold):
    """Return the tuple stored in the module-level ``data`` dict for fold ``fold``.

    The lookup key is ``'u'`` followed by the fold value, so ``load_data(1)``
    reads ``data['u1']``. A missing fold raises ``KeyError``.
    """
    fold_key = f'u{fold}'
    return data[fold_key]

In [75]:
from rectools import Columns
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from torch.utils.data import Dataset


def train_and_validate(model, fold_data):
    """Fit ``model`` on one fold and score its top-``K`` recommendations.

    Parameters
    ----------
    model
        Object exposing ``fit(dataset)`` and
        ``recommend(users=..., dataset=..., k=..., filter_viewed=...)``.
    fold_data
        4-tuple ``(train, test, train_df, test_df)``. ``train`` is passed to
        ``fit`` and ``recommend``; ``train_df`` supplies the user ids to
        recommend for; ``test_df`` holds the held-out rows with user, item and
        weight columns. The second element is not used here.

    Returns
    -------
    float
        RMSE between the recommendation scores and the test weights, computed
        only over the (user, item) pairs present in both the recommendations
        and ``test_df``.

    Raises
    ------
    ValueError
        If none of the recommended (user, item) pairs occurs in ``test_df``,
        in which case there is nothing to compute the error on.
    """
    train, _, train_df, test_df = fold_data

    model.fit(train)

    recos = model.recommend(
        users=train_df[Columns.User].unique(),
        dataset=train,
        k=K,
        filter_viewed=True,
    )
    # Give the score column the same name as the test weights so the merge
    # suffixes below tell predicted and true values apart.
    predicted = recos.rename(columns={Columns.Score: Columns.Weight})

    # Inner join: only recommended pairs that were actually held out are kept.
    merged_data = pd.merge(
        predicted,
        test_df,
        how='inner',
        on=[Columns.User, Columns.Item],
        suffixes=('_predicted', '_test'),
    )
    if merged_data.empty:
        raise ValueError(
            "No recommended (user, item) pair is present in the test data; "
            "RMSE cannot be computed."
        )

    rmse = np.sqrt(
        mean_squared_error(
            merged_data[Columns.Weight + '_test'],
            merged_data[Columns.Weight + '_predicted'],
        )
    )

    return rmse

In [76]:
def cross_validation(model, model_name, num_folds=5):
    """Evaluate ``model`` on folds ``1..num_folds`` and print the RMSE report.

    Prints ``model_name``, then one RMSE line per fold (each fold is obtained
    with ``load_data`` and scored with ``train_and_validate``), and finally the
    mean of the per-fold values. Nothing is returned.
    """
    print(model_name)

    fold_numbers = range(1, num_folds + 1)
    rmse_values = []
    for fold in fold_numbers:
        fold_rmse = train_and_validate(model, load_data(fold))
        print(f"RMSE (Fold {fold}): {fold_rmse}")
        rmse_values.append(fold_rmse)

    average_rmse = np.mean(rmse_values)
    print(f"Average RMSE (across all folds): {average_rmse}\n")

In [77]:
from lightfm import LightFM
from rectools.models import PureSVDModel, LightFMWrapperModel, ImplicitALSWrapperModel
from implicit.als import AlternatingLeastSquares


# Latent dimension shared by the ALS and LightFM models below.
factors = 10  # Fine-tuned

# PureSVD is created with the library defaults; `factors` is not passed to it.
model_svd = PureSVDModel()

als_core = AlternatingLeastSquares(factors=factors)
model_als = ImplicitALSWrapperModel(als_core)

# NOTE(review): per the LightFM docs `k` is a k-OS loss parameter rather than a
# recommendation cut-off — confirm that passing K here is intended.
lightfm_core = LightFM(no_components=factors, k=K)
model_lfm = LightFMWrapperModel(
    model=lightfm_core,
    epochs=1,  # Fine-tuned
)

In [78]:
# Prints the per-fold and average RMSE for the PureSVD model.
cross_validation(model_svd, 'PureSVDModel')

PureSVDModel
RMSE (Fold 1): 2.4658047475626756
RMSE (Fold 2): 2.1509806944901597
RMSE (Fold 3): 2.0662228478092013
RMSE (Fold 4): 2.1081674185791384
RMSE (Fold 5): 2.1526314937317834
Average RMSE (across all folds): 2.188761440434592



In [79]:
# Prints the per-fold and average RMSE for the wrapped implicit ALS model.
cross_validation(model_als, 'AlternatingLeastSquares')

AlternatingLeastSquares


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

RMSE (Fold 1): 2.5455352082405946


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

RMSE (Fold 2): 2.41550356258027


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

RMSE (Fold 3): 2.34581772317709


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

RMSE (Fold 4): 2.355701457569609


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

RMSE (Fold 5): 2.366631073473502
Average RMSE (across all folds): 2.4058378050082134



In [80]:
# Prints the per-fold and average RMSE for the wrapped LightFM model.
cross_validation(model_lfm, 'LightFM')

LightFM
RMSE (Fold 1): 29.742396830999827
RMSE (Fold 2): 29.072428425994143
RMSE (Fold 3): 29.66901760596288
RMSE (Fold 4): 29.43798959140094
RMSE (Fold 5): 29.456258651151142
Average RMSE (across all folds): 29.475618221101787



### Choosing the best model

According to the average RMSE, `PureSVD` outperforms the other models, with the smallest RMSE value of `2.19`. Its simplicity might be the key.

In [81]:
# NOTE(review): this is a brand-new PureSVDModel, not the `model_svd` instance
# that was fitted during cross-validation, and no `fit` call follows before it
# is pickled — confirm that whoever loads the file trains it, or fit it here.
best_model = PureSVDModel()

### Model saving

In [82]:
import os

model_path = '../models/best_model.pickle'

# Create the target directory up front so the dump does not fail with
# FileNotFoundError when '../models/' does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Serialise the selected model object to disk.
with open(model_path, 'wb') as pickle_file:
    pickle.dump(best_model, pickle_file)