# Evaluation of Best Model

In the evaluation step, I calculate metrics for the `PureSVD` model on the A and B splits of the dataset, and then average the metrics across the splits.

In [84]:
import warnings
# Install a blanket "ignore" filter so no warnings are shown by later cells.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Load test data

In [85]:
# Cut-off used for every top-K metric and for the number of recommendations per user.
K = 10
# Directory that holds the pickled benchmark splits.
data_interim_dir = '../benchmark/data/'
# File names laid out as consecutive (base, test) pairs, one pair per split letter.
data_filenames = [
    f'u{letter}.{part}'
    for letter in ('a', 'b')
    for part in ('base', 'test')
]
data_filenames

['ua.base', 'ua.test', 'ub.base', 'ub.test']

In [86]:
import pickle


def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only point this at files
    # produced by this project's own preprocessing.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Maps a split title (the part of the base file name before the first dot) to
# the tuple (base, test, base_df, test_df) loaded from the matching pickle files.
data = {}
for i in range(0, len(data_filenames), 2):
    # data_filenames is consumed as consecutive (base, test) pairs.
    base_filename, test_filename = data_filenames[i:i+2]
    data_title = base_filename.split('.')[0]
    # Outer loop over suffixes keeps the tuple order:
    # base.pickle, test.pickle, base.df.pickle, test.df.pickle.
    data[data_title] = tuple(
        _load_pickle(data_interim_dir + filename + suffix)
        for suffix in ('.pickle', '.df.pickle')
        for filename in (base_filename, test_filename)
    )

data.keys()

dict_keys(['ua', 'ub'])

## Load best model 

In [87]:
# Path of the pickled model to evaluate (plain literal: no interpolation is needed).
model_path = '../models/best_model.pickle'

# NOTE: unpickling can execute arbitrary code; load only model files produced by this project.
with open(model_path, 'rb') as pickle_file:
    best_model = pickle.load(pickle_file)

## Test on A and B splits

I compute classification and ranking metrics [implemented in RecTools](https://rectools.readthedocs.io/en/stable/api/rectools.metrics.html), namely:
- F1Beta
- Normalized DCG
- Mean Average Precision
- Mean Reciprocal Rank
- Serendipity (added out of interest)

For all metrics, $k=10$ is used.

In [88]:
from rectools.metrics.scoring import calc_metrics
from rectools.metrics import F1Beta, NDCG, MAP, MRR, Serendipity


# Metric objects keyed by display name; every metric uses the same cut-off K.
# Insertion order determines the row order of the averaged table printed later.
metrics = {}
# Classification quality.
metrics[f'F1Beta@{K}'] = F1Beta(k=K)
# Ranking quality.
# NOTE(review): log_base=3 is passed explicitly — confirm this base is intended.
metrics[f'NDCG@{K}'] = NDCG(k=K, log_base=3)
metrics[f'MAP@{K}'] = MAP(k=K)
metrics[f'MRR@{K}'] = MRR(k=K)
# Serendipity: novelty and relevance together.
metrics[f'Serendipity@{K}'] = Serendipity(k=K)

In [89]:
def load_data(fold):
    """Look up the preloaded data for one split.

    Args:
        fold: Split letter; it is prefixed with 'u' to form the key into `data`.

    Returns:
        The entry stored in the module-level `data` dict under that key.

    Raises:
        KeyError: If no entry exists for the requested split.
    """
    split_key = f'u{fold}'
    return data[split_key]

In [90]:
from rectools import Columns


def train_and_evaluate(model, fold_data):
    """Fit `model` on one fold and score its top-K recommendations.

    Args:
        model: Object exposing `fit(dataset)` and
            `recommend(users, dataset, k, filter_viewed)`.
        fold_data: 4-tuple `(train, test, train_df, test_df)`. `train` is handed
            to `fit` and `recommend`; the two frames supply the interactions
            used by the metric calculation.

    Returns:
        Whatever `calc_metrics` returns for the metrics configured in `metrics`.
    """
    # The second element of the tuple is not needed for evaluation.
    train, _unused_test, train_df, test_df = fold_data
    model.fit(train)

    # Recommend for every distinct user present in the training interactions.
    train_users = train_df[Columns.User].unique()
    recos = model.recommend(users=train_users, dataset=train, k=K, filter_viewed=True)

    # Distinct training items are passed as the catalog argument.
    train_items = train_df[Columns.Item].unique()
    return calc_metrics(
        metrics,
        reco=recos,
        interactions=test_df,
        prev_interactions=train_df,
        catalog=train_items,
    )

In [91]:
from tabulate import tabulate


def print_metrics_table(metrics_dict):
    """Print a two-column table of metric names and values.

    Args:
        metrics_dict: Mapping from metric name to metric value; rows are
            emitted in the mapping's iteration order.
    """
    # One [name, value] row per metric, formatted by tabulate.
    rows = [[name, value] for name, value in metrics_dict.items()]
    print(tabulate(rows))


def test_model(model, folds=['a', 'b']):
    total_metrics = {k: 0 for k in metrics.keys()}
    for fold in folds:
        fold_data = load_data(fold)
        fold_metrics = train_and_evaluate(model, fold_data)
        for metric_name, metric_value in fold_metrics.items():
            total_metrics[metric_name] += metric_value
        
        print(f"Fold {fold}:")
        print_metrics_table(fold_metrics)
        print()
    
    average_metrics = {metric_name: total_value / len(folds) for metric_name, total_value in total_metrics.items()}
    print(f"Average across test folds:")
    print_metrics_table(average_metrics)

In [92]:
# Evaluate the loaded model on the default folds; per-fold and averaged metric tables are printed.
test_model(best_model)

Fold a:
--------------  ----------
F1Beta@10       0.242842
NDCG@10         0.285174
MRR@10          0.613322
MAP@10          0.149509
Serendipity@10  0.00599713
--------------  ----------

Fold b:
--------------  ----------
F1Beta@10       0.231601
NDCG@10         0.273182
MRR@10          0.592713
MAP@10          0.141989
Serendipity@10  0.00599345
--------------  ----------

Average across test folds:
--------------  ----------
F1Beta@10       0.237222
NDCG@10         0.279178
MAP@10          0.145749
MRR@10          0.603017
Serendipity@10  0.00599529
--------------  ----------
