# Building recommendations with RecTools


- Building simple model
- Visual recommendations checking

In [None]:
!pip install implicit
!pip install rectools
!pip install rectools[lightfm]
!pip install rectools[torch]

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.7.2
Collecting rectools
  Downloading rectools-0.5.0-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.0.1 (from rectools)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, rectools
Successfully installed rectools-0.5.0 typeguard-2.13.3
Collecting lightfm<=1.17,>=1.16 (from rectools[lightfm])
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels f

In [None]:
import numpy as np
import pandas as pd
import os
import threadpoolctl
import torch
from tqdm import tqdm
import operator
from collections import Counter

from rectools import Columns
from rectools.dataset import Dataset
from implicit.als import AlternatingLeastSquares
from rectools.models import (
    PopularModel,
    ImplicitItemKNNWrapperModel,
    ImplicitALSWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
    DSSMModel
)
from rectools.metrics import (
    IntraListDiversity,
    Serendipity,
    Accuracy,
    Precision,
    Recall,
    MRR,
    NDCG,
    calc_metrics
)
from rectools.models.dssm import DSSM
from rectools.dataset.torch_datasets import DSSMDataset

from implicit.nearest_neighbours import TFIDFRecommender
from lightfm import LightFM
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from rectools.metrics.distances import PairwiseHammingDistanceCalculator


# For implicit ALS: set OPENBLAS_NUM_THREADS to "1" and apply a threadpoolctl
# limit of 1 to the "blas" thread pools.
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas")

<threadpoolctl.threadpool_limits at 0x7b1d62d0a9e0>

# Load data

In [None]:
 ! mkdir ~/.kaggle
 ! cp kaggle.json ~/.kaggle/
 ! chmod 600 ~/.kaggle/kaggle.json
 ! kaggle datasets download -d dschettler8845/recsys-2020-ecommerce-dataset

Downloading recsys-2020-ecommerce-dataset.zip to /content
 98% 617M/629M [00:07<00:00, 96.9MB/s]
100% 629M/629M [00:07<00:00, 83.9MB/s]


In [None]:
import zipfile
# Name of the archive fetched by the Kaggle download cell.
zip_file_path = 'recsys-2020-ecommerce-dataset.zip'

# Unpack every archive member into the current working directory.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# Load the train / validation / test frames from parquet files in the working directory.
data_train = pd.read_parquet('train.parquet')
data_val = pd.read_parquet('val.parquet')
data_test = pd.read_parquet('test.parquet')

In [None]:
RANDOM_STATE = 32

# Data prep

### Useful functions

In [None]:
def interactions_prep(df):
    """Turn raw event rows into an interactions table.

    Keeps only rows where ``target == 1``, sums ``target`` for every distinct
    (user_id, product_id, timestamp) triple, and returns the result with the
    columns renamed to ``user_id``, ``item_id``, ``datetime``, ``weight``.
    """
    positive_rows = df.loc[df['target'] == 1,
                           ['user_id', 'product_id', 'target', 'timestamp']]
    summed = positive_rows.groupby(['user_id', 'product_id', 'timestamp'],
                                   as_index=False).agg({'target': 'sum'})
    column_names = {'product_id': 'item_id',
                    'target': 'weight',
                    'timestamp': 'datetime'}
    return summed.rename(columns=column_names)


def full_category(cat0_cat1_cat2_cat3):
    """Build a dotted category path from four category levels.

    The argument is unpacked into exactly four levels. Starting from the
    deepest level, the first one that is not the string 'NA' decides how many
    levels are joined with '.'; if levels 1-3 are all 'NA', the top-level
    value is returned unchanged.
    """
    cat0, cat1, cat2, cat3 = cat0_cat1_cat2_cat3
    levels = (cat0, cat1, cat2, cat3)
    # Probe the deepest level first, then progressively shallower ones.
    for depth in (4, 3, 2):
        if levels[depth - 1] != 'NA':
            return '.'.join(f'{level}' for level in levels[:depth])
    return cat0

### Prepare user-item interactions for the analysis

In [None]:
train_interactions = interactions_prep(data_train)
val_interactions = interactions_prep(data_val)
test_interactions = interactions_prep(data_test)

train_interactions.head()

Unnamed: 0,user_id,item_id,datetime,weight
0,101875240,100019252,2020-01-16 18:18:22,1
1,107620212,20500418,2020-01-30 15:10:12,1
2,128968633,15800015,2019-12-31 11:30:56,1
3,128968633,2900516,2019-12-31 10:09:41,1
4,136662675,1005008,2019-12-13 09:00:26,1


In [None]:
# Distinct user ids seen in each split.
train_users, val_users, test_users = (
    set(frame[Columns.User].unique())
    for frame in (train_interactions, val_interactions, test_interactions)
)
# Users for metrics calculation: anyone from validation or test who also
# appears in train.
test_users_in_train = test_users & train_users
users = (val_users & train_users) | test_users_in_train

In [None]:
val_interactions_warm = val_interactions[val_interactions.user_id.isin(users)].copy()

### Prepare sparse feature matrix

- Can be used with iALS, LightFM and DSSM

- ***Item features***: brand, category
- ***User features***: average purchase price and total number of purchases (converted to categorical features using 10 quantiles; only the average purchase price is built in the cells below)

In [None]:
# Collapse the four category columns into one dotted path per row (see full_category).
data_train['full_cat'] = data_train[['cat_0', 'cat_1', 'cat_2', 'cat_3']]\
                                      .apply(full_category, axis=1)
# Convert every price value with float(); the per-user mean price is computed from this column later.
data_train['price'] = data_train['price'].apply(float)

#### Sparse matrix for items by brand and full category

In [None]:
item_info = data_train[data_train['target'] == 1]\
                 .groupby(['product_id'], as_index=False)\
                 .aggregate({'brand': 'last', 'cat_0': 'last',
                             'full_cat': 'last'})\
                 .rename(columns={'product_id': 'item_id'})
item_info.head()

Unnamed: 0,item_id,brand,cat_0,full_cat
0,100000000,xlmedia,computers,computers.components.cooler
1,100000002,istarikomiks,computers,computers.components.cooler
2,100000003,xlmedia,,
3,100000010,,furniture,furniture.universal.light
4,100000016,,apparel,apparel.scarf


In [None]:
# Let's prepare a flatten dataframe with 2 item features

def flatten_df(df: pd.DataFrame, id_name: str, features: list):
    """Reshape a wide feature table into long (id, value, feature) rows.

    For each name in ``features`` the ``id_name`` column and that feature
    column are taken from ``df``, relabelled to ``id`` / ``value`` and tagged
    with the feature name in a ``feature`` column. The per-feature frames are
    concatenated in the order given, keeping their original index labels.
    """
    def _single_feature(feature_name):
        # Two-column slice: the identifier plus one feature.
        pair = df.reindex(columns=[id_name, feature_name])
        pair.columns = ['id', 'value']
        return pair.assign(feature=feature_name)

    return pd.concat([_single_feature(feature_name) for feature_name in features])

# Long-format item features: one (id, value, feature) row per item and feature.
item_features = flatten_df(item_info, id_name='item_id', features=['full_cat', 'brand'])
# Preview the rows produced for two example item ids.
item_features.query("id in ['100000000', '100000002']").sort_values("id")

Unnamed: 0,id,value,feature
0,100000000,computers.components.cooler,full_cat
0,100000000,xlmedia,brand
1,100000002,computers.components.cooler,full_cat
1,100000002,istarikomiks,brand


#### Sparse matrix for users by average purchase price

In [None]:
# Per-user mean price over positive (target == 1) rows, bucketed into ten
# quantile bins; only the bucket label is kept as the user feature.
user_info = (
    data_train.loc[data_train['target'] == 1]
    .groupby('user_id', as_index=False)
    .agg({'price': 'mean'})
    .rename(columns={'price': 'avg_purch_price'})
    .assign(
        avg_purch_price_cat=lambda frame: pd.qcut(frame['avg_purch_price'],
                                                  q=10, precision=0)
    )
    .drop(columns=['avg_purch_price'])
)
user_info.head()

Unnamed: 0,user_id,avg_purch_price_cat
0,101875240,"(181.0, 230.0]"
1,107620212,"(230.0, 292.0]"
2,128968633,"(80.0, 122.0]"
3,136662675,"(80.0, 122.0]"
4,145611266,"(0.0, 43.0]"


In [None]:
user_features = flatten_df(user_info, id_name='user_id', features=['avg_purch_price_cat'])
user_features.head(5)

Unnamed: 0,id,value,feature
0,101875240,"(181.0, 230.0]",avg_purch_price_cat
1,107620212,"(230.0, 292.0]",avg_purch_price_cat
2,128968633,"(80.0, 122.0]",avg_purch_price_cat
3,136662675,"(80.0, 122.0]",avg_purch_price_cat
4,145611266,"(0.0, 43.0]",avg_purch_price_cat


## Create datasets

In [None]:
dataset = Dataset.construct(train_interactions)

In [None]:
# Dataset built from the train interactions plus the long-format user and item feature frames.
sparse_features_dataset = Dataset.construct(
    train_interactions,
    user_features_df=user_features,  # long-format frame produced by flatten_df
    item_features_df=item_features,
    cat_user_features=['avg_purch_price_cat'], # these will be one-hot-encoded
    cat_item_features=['full_cat', 'brand'],  # both item features are treated as categorical
    make_dense_user_features=False  # for `sparse` format
)

## Metrics

- For metric Intra list diversity

In [None]:
cat_dummies = pd.get_dummies(item_info[["item_id", "full_cat", "brand"]].set_index("item_id"),
                             prefix="", prefix_sep="").groupby("item_id").sum()
distance_calculator = PairwiseHammingDistanceCalculator(cat_dummies)
ild_5 = IntraListDiversity(k=5, distance_calculator=distance_calculator)

In [None]:
# Catalog passed to calc_metrics: item ids with at least one positive
# interaction in train (item_info is built from target == 1 rows).
catalog = item_info.item_id.unique()

# Metric name -> metric object. The "@k" suffix of every key must equal the
# metric's `k`, because results are read back by key. "Recall@1" and
# "IntraListDiversity@10" were previously built with k=5, so they reported
# the @5 values under the wrong label.
metrics = {
    "Precision@10": Precision(k=10),
    "Precision@5": Precision(k=5),
    "Precision@1": Precision(k=1),
    "Recall@10": Recall(k=10),
    "Recall@5": Recall(k=5),
    "Recall@1": Recall(k=1),
    "MRR@10": MRR(k=10),
    "MRR@5": MRR(k=5),
    "NDCG@10": NDCG(k=10),
    "NDCG@5": NDCG(k=5),
    "Serendipity@10": Serendipity(k=10),
    "Serendipity@5": Serendipity(k=5),
    "Serendipity@1": Serendipity(k=1),
    "IntraListDiversity@10": IntraListDiversity(k=10,
                              distance_calculator=distance_calculator)
}

In [None]:
# Parameters
K_RECOS = 10
NUM_THREADS = 32
RANDOM_STATE = 32

# Baseline

## Popular Model

In [None]:
%%time
popular_model = PopularModel()
popular_model.fit(dataset)
popular_recos = popular_model.recommend(
    users=users,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True
)

# popular_recos.to_csv('popular_recos.csv', index=False)

CPU times: user 17.8 s, sys: 453 ms, total: 18.2 s
Wall time: 18.1 s


#### Validation

In [None]:
popular_metrics = calc_metrics(
    metrics,
    reco=popular_recos,
    interactions=val_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

popular_metrics

In [None]:
popular_metrics_warm = calc_metrics(
    metrics,
    reco=popular_recos,
    interactions=val_interactions_warm,
    prev_interactions=train_interactions,
    catalog=catalog
)

popular_metrics_warm

CPU times: user 11.4 s, sys: 522 ms, total: 11.9 s
Wall time: 13 s


#### Test

In [None]:
popular_metrics_test = calc_metrics(
    metrics,
    reco=popular_recos,
    interactions=test_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

popular_metrics_test

# Collaborative filtering

## KNN Model

- Nearest neighbor: this type of recommender system makes recommendations based on nearest neighbors; the nearest-neighbor approach is used to find either similar users or similar products

In [None]:
%%time
# Fit an item-kNN model (implicit's TFIDFRecommender) and generate recommendations
# for the evaluation users.
# NOTE(review): TFIDFRecommender's K is the neighbourhood size; it is tied here to
# K_RECOS (the number of recommendations) — confirm this coupling is intended.
knn_model = ImplicitItemKNNWrapperModel(TFIDFRecommender(K=K_RECOS))
knn_model.fit(dataset)
knn_recos = knn_model.recommend(
    users=users,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

# knn_recos.to_csv('knn_recos.csv', index=False)

CPU times: user 1min, sys: 437 ms, total: 1min
Wall time: 1min 4s


#### Validation

In [None]:
knn_metrics = calc_metrics(
    metrics,
    reco=knn_recos,
    interactions=val_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

knn_metrics

In [None]:
knn_metrics_warm = calc_metrics(
    metrics,
    reco=knn_recos,
    interactions=val_interactions_warm,
    prev_interactions=train_interactions,
    catalog=catalog
)

knn_metrics_warm

#### Test

In [None]:
knn_metrics_test = calc_metrics(
    metrics,
    reco=knn_recos,
    interactions=test_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

knn_metrics_test

## SVD Model

- SVD is used to decompose the user-item interaction matrix into lower-dimensional representations. By using SVD to perform matrix factorization, collaborative filtering recommender systems can reduce the dimensionality of the user-item matrix and improve the accuracy of the recommendations. SVD is often used in conjunction with other techniques, such as neighborhood-based collaborative filtering or matrix factorization with additional regularization terms, to further improve the performance of the recommender system.

In [None]:
%%time
svd_model = PureSVDModel(factors=32)
svd_model.fit(dataset)
svd_recos = svd_model.recommend(
    users=users,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

CPU times: user 2min 31s, sys: 22.6 s, total: 2min 54s
Wall time: 2min 59s


#### Validation

In [None]:
svd_metrics = calc_metrics(
    metrics,
    reco=svd_recos,
    interactions=val_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

svd_metrics

In [None]:
svd_metrics_warm = calc_metrics(
    metrics,
    reco=svd_recos,
    interactions=val_interactions_warm,
    prev_interactions=train_interactions,
    catalog=catalog
)

svd_metrics_warm

#### Test

In [None]:
svd_metrics_test = calc_metrics(
    metrics,
    reco=svd_recos,
    interactions=test_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

svd_metrics_test

## iALS Model (with and without user features)

- Matrix factorization: iALS (implicit alternating least squares) is a model-based collaborative filtering method that factorizes the user-item interaction matrix
- Can be used with user features



### Find best hyperparameters

In [None]:
# K_RECOS = 10
# NUM_THREADS = 32
# RANDOM_STATE = 32
# ITERATIONS = 10

# def make_base_model(factors: int, regularization: float, alpha: float, fit_features_together: bool=False):
#     return ImplicitALSWrapperModel(
#         AlternatingLeastSquares(
#             factors=factors,
#             regularization=regularization,
#             alpha=alpha,
#             random_state=RANDOM_STATE,
#             use_gpu=False,
#             num_threads = NUM_THREADS,
#             iterations=ITERATIONS),
#         fit_features_together = fit_features_together,
#         )

In [None]:
# alphas = [1, 10, 100]
# regularizations = [0.01, 0.1, 0.5]
# factors = [32, 64, 128]

# results = []

# for alpha in alphas:
#     for regularization in regularizations:
#         for n_factors in tqdm(factors):
#             model_name = f"no_features_factors_{n_factors}_alpha_{alpha}_reg_{regularization}"
#             model = make_base_model(factors=n_factors, regularization=regularization, alpha=alpha)
#             model.fit(dataset)
#             recos = model.recommend(
#                 users=users,
#                 dataset=dataset,
#                 k=K_RECOS,
#                 filter_viewed=True,
#             )
#             metric_values = calc_metrics(metrics, recos, val_interactions,
#                                          train_interactions, catalog)
#             metric_values["model"] = model_name
#             results.append(metric_values)

100%|██████████| 3/3 [15:37<00:00, 312.44s/it]
100%|██████████| 3/3 [14:59<00:00, 299.77s/it]
100%|██████████| 3/3 [14:59<00:00, 299.89s/it]
100%|██████████| 3/3 [14:59<00:00, 299.91s/it]
100%|██████████| 3/3 [14:56<00:00, 298.81s/it]
100%|██████████| 3/3 [15:00<00:00, 300.15s/it]
100%|██████████| 3/3 [14:56<00:00, 298.71s/it]
100%|██████████| 3/3 [14:54<00:00, 298.23s/it]
100%|██████████| 3/3 [14:46<00:00, 295.43s/it]


In [None]:
# import operator
# from collections import Counter

# # results = pd.read_csv('ials_models_gs_res.csv')
# # results.set_index('model', inplace=True)

# max_metrics = []
# for metric in metrics.keys():
#   metric_dict = dict(results[metric])
#   max_metrics.append(max(metric_dict.items(), key=operator.itemgetter(1))[0])

# c = Counter(max_metrics)
# c.most_common(3)

[('no_features_factors_32_alpha_10_reg_0.01', 10),
 ('no_features_factors_128_alpha_10_reg_0.5', 3)]

### Fitting model with the best hyperparameters

In [None]:
ials_model = ImplicitALSWrapperModel(AlternatingLeastSquares(
            factors=32,
            regularization=0.01,
            alpha=10,
            random_state=RANDOM_STATE,
            use_gpu=False,
            num_threads = NUM_THREADS,
            iterations=10))

%time
ials_model.fit(sparse_features_dataset)

CPU times: user 0 ns, sys: 4 µs, total: 4 µs
Wall time: 7.15 µs




  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
%%time
ials_recos = ials_model.recommend(
    users=users,
    dataset=sparse_features_dataset,
    k=K_RECOS,
    filter_viewed=True,
)

#### Validation

In [None]:
ials_metrics = calc_metrics(
    metrics,
    reco=ials_recos,
    interactions=val_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

ials_metrics

In [None]:
ials_metrics_warm = calc_metrics(
    metrics,
    reco=ials_recos,
    interactions=val_interactions_warm,
    prev_interactions=train_interactions,
    catalog=catalog
)

ials_metrics_warm

#### Test


In [None]:
ials_metrics_test = calc_metrics(
    metrics,
    reco=ials_recos,
    interactions=test_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

ials_metrics_test

# Hybrid model

## LightFM model

- LightFM is a hybrid recommender system library that combines collaborative filtering and content-based filtering techniques. It belongs to the category of hybrid recommender systems.

- Hybrid recommender systems aim to leverage the strengths of both collaborative filtering and content-based filtering approaches to provide more accurate and diverse recommendations. LightFM incorporates both user-item interactions and additional item or user features to generate recommendations.

- Can be used with user features

### Find best hyperparameters

In [None]:
# factors = [32, 64, 128]

# results = []

# for n_factors in tqdm(factors):
#   model_name = f"no_features_factors_{n_factors}"
#   model = LightFMWrapperModel(LightFM(no_components=n_factors, loss="bpr"),
#                                     num_threads=NUM_THREADS,
#                                     epochs=10)
#   model.fit(dataset)
#   recos = model.recommend(
#                 users=users,
#                 dataset=dataset,
#                 k=K_RECOS,
#                 filter_viewed=True,
#             )
#   metric_values = calc_metrics(metrics,
#                                recos,
#                                val_interactions,
#                                train_interactions,
#                                catalog)
#   metric_values["model"] = model_name
#   results.append(metric_values)

#   # pd.DataFrame(results).to_csv('lightfm_models_gr_res.csv')

100%|██████████| 3/3 [15:38<00:00, 312.81s/it]


In [None]:
# import operator
# from collections import Counter

# results = pd.read_csv('lightfm_models_gr_res.csv')
# results.set_index('model', inplace=True)

# max_metrics = []
# for metric in metrics.keys():
#   metric_dict = dict(results[metric])
#   max_metrics.append(max(metric_dict.items(), key=operator.itemgetter(1))[0])

# c = Counter(max_metrics)
# c.most_common(3)

[('no_features_factors_32', 12), ('no_features_factors_64', 1)]

### Fitting model with the best hyperparameters

In [None]:
%%time
# Fit model and generate recommendations for all users
lightfm_model = LightFMWrapperModel(LightFM(no_components=32, loss="bpr"),
                                    num_threads=NUM_THREADS,
                                    epochs=10)
lightfm_model.fit(sparse_features_dataset)
lightfm_recos = lightfm_model.recommend(
    users=train_interactions[Columns.User].unique(),
    dataset=sparse_features_dataset,
    k=K_RECOS,
    filter_viewed=True,
)

CPU times: user 22min 30s, sys: 3min 18s, total: 25min 48s
Wall time: 15min 21s


#### Validation

In [None]:
lightfm_metrics = calc_metrics(
    metrics,
    reco=lightfm_recos,
    interactions=val_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

lightfm_metrics

In [None]:
lightfm_metrics_warm = calc_metrics(
    metrics,
    reco=lightfm_recos,
    interactions=val_interactions_warm,
    prev_interactions=train_interactions,
    catalog=catalog
)

lightfm_metrics_warm

#### Test

In [None]:
lightfm_metrics_test = calc_metrics(
    metrics,
    reco=lightfm_recos,
    interactions=test_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

lightfm_metrics_test

# Deep Learning Model

## DSSM Model

In [None]:
user_features_short = user_features.head(100000).copy()
train_interactions_short = train_interactions[
    train_interactions['user_id'].isin(user_features_short.id.unique())].copy()
item_features_short = item_features[item_features.id.isin(train_interactions_short.item_id.unique())]

In [None]:
# Dataset for DSSM, built from the shortened interactions / feature frames.
# NOTE(review): this rebinds `sparse_features_dataset`, replacing the full-train
# dataset built earlier in the notebook. Any earlier cell that uses this name
# (e.g. the iALS and LightFM cells) will see the reduced dataset if it is re-run
# after this cell — consider a distinct name.
sparse_features_dataset = Dataset.construct(
    train_interactions_short,
    user_features_df=user_features_short,  # long-format user feature frame (shortened)
    item_features_df=item_features_short,
    cat_user_features=['avg_purch_price_cat'], # these will be one-hot-encoded
    cat_item_features=['full_cat', 'brand'],  # both item features are treated as categorical
    make_dense_user_features=False  # for `sparse` format
)

In [None]:
dssm_dataset = DSSMDataset.from_dataset(sparse_features_dataset)
dssm_model_parametrized = DSSM(
    n_factors_user=32,
    n_factors_item=32,
    dim_input_user=sparse_features_dataset.user_features.get_sparse().shape[1],
    dim_input_item=sparse_features_dataset.item_features.get_sparse().shape[1],
    dim_interactions=sparse_features_dataset.get_user_item_matrix().shape[1])

In [None]:
# Train on the GPU when torch can see one; otherwise fall back to the CPU so
# the cell still runs on CPU-only runtimes instead of failing at trainer setup.
dssm_accelerator = 'cuda' if torch.cuda.is_available() else 'cpu'

# Wrap the parametrised DSSM network and fit it on the DSSM feature dataset.
dssm_model = DSSMModel(dssm_dataset,
                       max_epochs=10,
                       model=dssm_model_parametrized,
                       batch_size=2048,
                       dataloader_num_workers=2,
                       trainer_accelerator=dssm_accelerator
                 )
dssm_model.fit(sparse_features_dataset)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type    | Params
-------------------------------------
0 | user_net | UserNet | 388 K 
1 | item_net | ItemNet | 1.6 K 
-------------------------------------
389 K     Trainable params
0         Non-trainable params
389 K     Total params
1.559     Total estimated model params size (MB)
  self.pid = os.fork()
/usr/local/lib/python3.10/dist-packages/py

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


<rectools.models.dssm.DSSMModel at 0x7f9442e5a2c0>

In [None]:
dssm_recos = dssm_model.recommend(
    users=user_features_short.id.unique(),#train_interactions[Columns.User].unique(),
    dataset=sparse_features_dataset,
    k=K_RECOS,
    filter_viewed=True,
)

  self.pid = os.fork()


#### Validation

In [None]:
dssm_metrics = calc_metrics(
    metrics,
    reco=dssm_recos,
    interactions=val_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

dssm_metrics

In [None]:
dssm_metrics_warm = calc_metrics(
    metrics,
    reco=dssm_recos,
    interactions=val_interactions_warm,
    prev_interactions=train_interactions,
    catalog=catalog
)

dssm_metrics_warm

#### Test

In [None]:
dssm_metrics_test = calc_metrics(
    metrics,
    reco=dssm_recos,
    interactions=test_interactions,
    prev_interactions=train_interactions,
    catalog=catalog
)

dssm_metrics_test

# Models comparison

#### Saved metrics

#### Metrics (warm + cold)

In [None]:
popular_metrics = {'Precision@10': 0.01083355292835545,
 'Recall@10': 0.03989178612310402,
 'Precision@5': 0.013763918230257662,
 'Recall@5': 0.02527102300125967,
 'Recall@1': 0.02527102300125967,
 'Precision@1': 0.013461547050652755,
 'NDCG@10': 0.011630629840708024,
 'NDCG@5': 0.013576053998800207,
 'MRR@10': 0.02298609033022626,
 'MRR@5': 0.02022697192607911,
 'Serendipity@10': 1.0288058940786565e-07,
 'Serendipity@5': 1.0776679589328814e-07,
 'Serendipity@1': 4.737245772147375e-08}

knn_metrics = {'Precision@10': 0.007914130158063753,
 'Recall@10': 0.02696032886562852,
 'Precision@5': 0.011103766464043737,
 'Recall@5': 0.01917646261743177,
 'Recall@1': 0.01917646261743177,
 'Precision@1': 0.018414717481699174,
 'NDCG@10': 0.009881828159168417,
 'NDCG@5': 0.012648070421590515,
 'MRR@10': 0.02123217301379581,
 'MRR@5': 0.019666074132301904,
 'Serendipity@10': 1.447017171764362e-06,
 'Serendipity@5': 1.6141398280136516e-06,
 'Serendipity@1': 1.8092728693097457e-06}

svd_metrics = {'Precision@10': 0.005285242767879874,
 'Recall@10': 0.01953424221618535,
 'Precision@5': 0.006432868685154333,
 'Recall@5': 0.011828492359200755,
 'Recall@1': 0.011828492359200755,
 'Precision@1': 0.008760724797563166,
 'NDCG@10': 0.005938706805271021,
 'NDCG@5': 0.006910321797568006,
 'MRR@10': 0.012629180892489466,
 'MRR@5': 0.010991959100241037,
 'Serendipity@10': 3.1128306544993442e-06,
 'Serendipity@5': 3.269463160047886e-06,
 'Serendipity@1': 2.993879315378035e-06}

ials_metrics = {'Precision@10': 0.009764534585099401,
 'Recall@10': 0.0352460449607855,
 'Precision@5': 0.012723261142400301,
 'Recall@5': 0.02330070509958561,
 'Recall@1': 0.02330070509958561,
 'Precision@1': 0.019569266225094573,
 'NDCG@10': 0.011578806733467223,
 'NDCG@5': 0.014131758065412479,
 'MRR@10': 0.024757594744352053,
 'MRR@5': 0.022389871235434154,
 'Serendipity@10': 3.0470541213719317e-06,
 'Serendipity@5': 2.864307828809799e-06,
 'Serendipity@1': 2.069384986069282e-06}

lightfm_metrics = {'Precision@10': 0.0006306471279204276,
 'Recall@10': 0.0018364191089365676,
 'Precision@5': 0.0006386865388995834,
 'Recall@5': 0.0009160548355744934,
 'Recall@1': 0.0009160548355744934,
 'Precision@1': 0.00018088674703100086,
 'NDCG@10': 0.0005644653990216057,
 'NDCG@5': 0.0005314044970234617,
 'MRR@10': 0.0008979690632973167,
 'MRR@5': 0.0006599388707009623,
 'Serendipity@10': 1.0221965400543671e-07,
 'Serendipity@5': 7.944524001174221e-08,
 'Serendipity@1': 2.697239352258225e-08}

#### Metrics warm

In [None]:
popular_metrics = {'Precision@10': 0.02686841609712329,
 'Recall@10': 0.09893606608111281,
 'Precision@5': 0.034136047941336116,
 'Recall@5': 0.062674947516123,
 'Recall@1': 0.062674947516123,
 'Precision@1': 0.03338613378822956,
 'NDCG@10': 0.02884525548528422,
 'NDCG@5': 0.033670123754326924,
 'MRR@10': 0.05700806037713555,
 'MRR@5': 0.050165139884282076,
 'Serendipity@10': 1.0288058940786565e-07,
 'Serendipity@5': 1.0776679589328812e-07,
 'Serendipity@1': 4.737245772147375e-08}

knn_metrics = {'Precision@10': 0.019627922948259248,
 'Recall@10': 0.06686461393297993,
 'Precision@5': 0.027538575716960022,
 'Recall@5': 0.04755975996826567,
 'Recall@1': 0.04755975996826567,
 'Precision@1': 0.04567054731548456,
 'NDCG@10': 0.024508032825119694,
 'NDCG@5': 0.03136862127876261,
 'MRR@10': 0.0526581503735153,
 'MRR@5': 0.04877405097643917,
 'Serendipity@10': 7.672680546277769e-06,
 'Serendipity@5': 8.558833646923215e-06,
 'Serendipity@1': 9.593509336405881e-06}

svd_metrics = {'Precision@10': 0.013116851468258804,
 'Recall@10': 0.04848161502574382,
 'Precision@5': 0.015985245411345085,
 'Recall@5': 0.029409196067996413,
 'Recall@1': 0.029409196067996413,
 'Precision@1': 0.021744187334537035,
 'NDCG@10': 0.014744481570243885,
 'NDCG@5': 0.017169806218445494,
 'MRR@10': 0.031362714157889014,
 'MRR@5': 0.02731103529507331,
 'Serendipity@10': 3.111374439311056e-06,
 'Serendipity@5': 3.2689112583234985e-06,
 'Serendipity@1': 2.993879315378035e-06}

ials_metrics = {'Precision@10': 0.0242171317168271,
 'Recall@10': 0.08741411133051598,
 'Precision@5': 0.03155510262857649,
 'Recall@5': 0.05778833999448272,
 'Recall@1': 0.05778833999448272,
 'Precision@1': 0.04853395660024148,
 'NDCG@10': 0.028716728415910043,
 'NDCG@5': 0.03504833164118943,
 'MRR@10': 0.061401588338958875,
 'MRR@5': 0.05552937071457847,
 'Serendipity@10': 3.0470541213719312e-06,
 'Serendipity@5': 2.864307828809798e-06,
 'Serendipity@1': 2.069384986069282e-06}

lightfm_metrics = {'Precision@10': 0.0015640750135693475,
 'Recall@10': 0.004554523624329239,
 'Precision@5': 0.0015840136468868043,
 'Recall@5': 0.002271917869674694,
 'Recall@1': 0.002271917869674694,
 'Precision@1': 0.00044861924964276613,
 'NDCG@10': 0.0013999369656140588,
 'NDCG@5': 0.0013179422518477797,
 'MRR@10': 0.0022270631430494963,
 'MRR@5': 0.0016367217932954502,
 'Serendipity@10': 1.0221965400543671e-07,
 'Serendipity@5': 7.944524001174221e-08,
 'Serendipity@1': 2.697239352258225e-08}

#### Metrics comparison table

In [None]:
# Tag every metrics dict with its model label and collect them in display order.
labelled_runs = (
    ('Popular', popular_metrics),
    ('KNN', knn_metrics),
    ('SVD', svd_metrics),
    ('iALS', ials_metrics),
    ('LightFM', lightfm_metrics),
)
experiments = []
for model_label, model_metrics in labelled_runs:
    model_metrics['model'] = model_label
    experiments.append(model_metrics)

# One row per metric, one column per model.
experiments_df = pd.DataFrame.from_records(experiments, index=['model']).T
experiments_df.round(6)

model,Popular,KNN,SVD,iALS,LightFM
Precision@10,0.026868,0.019628,0.013117,0.024217,0.001564
Recall@10,0.098936,0.066865,0.048482,0.087414,0.004555
Precision@5,0.034136,0.027539,0.015985,0.031555,0.001584
Recall@5,0.062675,0.04756,0.029409,0.057788,0.002272
Recall@1,0.062675,0.04756,0.029409,0.057788,0.002272
Precision@1,0.033386,0.045671,0.021744,0.048534,0.000449
NDCG@10,0.028845,0.024508,0.014744,0.028717,0.0014
NDCG@5,0.03367,0.031369,0.01717,0.035048,0.001318
MRR@10,0.057008,0.052658,0.031363,0.061402,0.002227
MRR@5,0.050165,0.048774,0.027311,0.055529,0.001637


In [None]:
experiments_df.reset_index()

In [None]:

# Long-form view of the comparison table: one (metric, model, value) row per cell.
# experiments_df has metric names on its index and one column per model; the
# placeholder columns 'A' / 'B' / 'C' do not exist in it.
pd.melt(experiments_df.rename_axis(index="metric").reset_index(),
        id_vars=["metric"], var_name="model", value_name="value")

In [None]:
import plotly.express as px

# Plotly Express needs long-form data. experiments_df has metric names on the
# index and one column per model, so reshape it to (metric, model, value) rows.
# The earlier "nation" / "count" / "medal" arguments are not columns of this frame.
metrics_long = (
    experiments_df
    .rename_axis(index="metric")
    .reset_index()
    .melt(id_vars="metric", var_name="model", value_name="value")
)

# One marker per (metric, model) pair; colour and symbol both encode the model.
fig = px.scatter(metrics_long, y="metric", x="value",
                 color="model", symbol="model")
fig.update_traces(marker_size=10)
fig.show()

## Check recommendations

In [None]:
# Pick one user and compare their training history with a model's recommendations.
user_id = '136662675'
user_viewed = train_interactions.query("user_id == @user_id").merge(item_info, on="item_id")
# `recos` is only assigned inside the commented-out grid-search cells, so it is
# undefined on a fresh kernel; inspect the fitted iALS recommendations instead.
# NOTE(review): swap in another *_recos frame to inspect a different model.
user_recos = ials_recos.query("user_id == @user_id").merge(item_info, on="item_id")

In [None]:
# History: the user's train interactions (built from target == 1 rows only), joined with item info
user_viewed

Unnamed: 0,user_id,item_id,datetime,weight,brand,cat_0,cat_1,price
0,136662675,1005008,2019-12-13 09:00:26,1,xiaomi,construction,tools,94.98


In [None]:
# Recommendations
user_recos.sort_values("rank")

Unnamed: 0,user_id,item_id,score,rank,brand,cat_0,cat_1,price
0,136662675,1005007,0.092387,1,xiaomi,construction,tools,96.85
1,136662675,1005006,0.091551,2,xiaomi,construction,tools,90.74
2,136662675,1005203,0.073311,3,xiaomi,construction,tools,101.26
3,136662675,1005009,0.063981,4,xiaomi,construction,tools,89.81
4,136662675,17700646,0.05766,5,,apparel,shoes,2.96
5,136662675,1004903,0.039563,6,huawei,construction,tools,114.04
6,136662675,1005195,0.039132,7,xiaomi,construction,tools,102.67
7,136662675,58100195,0.03774,8,organicshop,accessories,bag,2.65
8,136662675,38700061,0.03774,9,lumene,furniture,bedroom,6.14
