In [1]:
import time
import warnings
from copy import deepcopy
from pathlib import Path

import numpy as np
import pandas as pd
import rectools
from rectools import Columns
from rectools.dataset import Dataset
from rectools.dataset import Interactions
from rectools.metrics import Precision, Recall, MAP, NDCG, calc_metrics, Serendipity, MeanInvUserFreq
from rectools.model_selection import TimeRangeSplitter
from rectools.models import RandomModel, PopularModel
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Read data

In [2]:
interactions_path = Path("../data/interactions.csv")
df_interactions = pd.read_csv(interactions_path)
df_interactions.rename(
    columns={"last_watch_dt": rectools.Columns.Datetime, "total_dur": rectools.Columns.Weight}, inplace=True
)
interactions = Interactions(df_interactions)

In [3]:
items_path = Path("../data/items.csv")
items = pd.read_csv(items_path)
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


# Evaluator object

In [4]:
class Evaluator:
    def __init__(self, models: dict, metrics: dict, splitter: TimeRangeSplitter, k: int):
        self.models = models
        self.metrics = metrics
        self.splitter = splitter
        self.k = k
        self.results = pd.DataFrame(columns=["fold", "model", "time"] + list(self.metrics.keys()))

    def evaluate_model(self, model, model_name, dataset, df_train, df_test, test_users, catalog, fold_i):
        model_copy = deepcopy(model)
        start = time.time()
        model_copy.fit(dataset)
        end = time.time()
        recommendations = model_copy.recommend(
            users=test_users,
            dataset=dataset,
            k=self.k,
            filter_viewed=True,
        )

        metric_values = calc_metrics(
            self.metrics,
            reco=recommendations,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
        res = {"fold": fold_i, "model": model_name, "time": end - start}
        res.update(metric_values)

        tmp_df = pd.DataFrame([res])
        self.results = pd.concat([self.results, tmp_df], ignore_index=True)

    def evaluate_fold(self, train_ids, test_ids, fold_info):
        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()

        for model_name, model in self.models.items():
            self.evaluate_model(
                model, model_name, dataset, df_train, df_test, test_users, catalog, fold_info["i_split"]
            )

    def run(self, interactions):
        n_splits = self.splitter.n_splits

        fold_iterator = self.splitter.split(interactions, collect_fold_stats=True)
        pbar = tqdm(fold_iterator, total=n_splits)

        for train_ids, test_ids, fold_info in pbar:
            pbar.set_description(f"Evaluating fold {fold_info['i_split']}")
            self.evaluate_fold(train_ids, test_ids, fold_info)

    def display(self):
        pivot = self.results.drop("fold", axis=1).groupby(["model"]).mean()
        metric_columns = [
            (col.split("@")[0], int(col.split("@")[1])) if "@" in col else (col, "") for col in pivot.columns
        ]
        pivot.columns = pd.MultiIndex.from_tuples(metric_columns, names=["Metric", "Value"])

        display(pivot)
        return pivot

# Models

In [5]:
SEED: int = 42
N_SPLITS: int = 3
K: int = 10

models = {"RandomModel": RandomModel(random_state=SEED), "PopularModel": PopularModel()}

# Metrics

## All metrics

In [6]:
thresholds: tuple[int, ...] = (1, 5, 10)

precision_metrics = {f"Precision@{k}": Precision(k=k) for k in thresholds}

recall_metrics = {f"Recall@{k}": Recall(k=k) for k in thresholds}

map_metrics = {f"MAP@{k}": MAP(k=k, divide_by_k=False) for k in thresholds}

ndcg_metrics = {f"NDCG@{k}": NDCG(k=k, log_base=3) for k in thresholds}

miuf_metrics = {f"MIUF@{k}": MeanInvUserFreq(k=k) for k in thresholds}

serendipity_metrics = {f"Serendipity@{k}": Serendipity(k=k) for k in thresholds}

## Combined metrics

In [7]:
metrics = {**precision_metrics, **recall_metrics, **map_metrics, **ndcg_metrics, **miuf_metrics, **serendipity_metrics}

# Splitter

In [8]:
splitter = TimeRangeSplitter(
    "7D",
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

# Evaluation

In [9]:
evaluator = Evaluator(models, metrics, splitter, K)
evaluator.run(interactions)

Evaluating fold 2: 100%|██████████| 3/3 [01:08<00:00, 22.94s/it]


In [10]:
pivot = evaluator.display()

Metric,time,Precision,Precision,Precision,Recall,Recall,Recall,MAP,MAP,MAP,NDCG,NDCG,NDCG,MIUF,MIUF,MIUF,Serendipity,Serendipity,Serendipity
Value,Unnamed: 1_level_1,1,5,10,1,5,10,1,5,10,1,5,10,1,5,10,1,5,10
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
PopularModel,4.501238,0.076432,0.052402,0.033903,0.04272,0.137413,0.173492,0.04272,0.078295,0.084109,0.076432,0.057932,0.043084,2.377055,3.066979,3.71339,2e-06,3e-06,2e-06
RandomModel,2.4e-05,0.000169,0.000176,0.000176,6.9e-05,0.000345,0.000687,6.9e-05,0.000155,0.0002,0.000169,0.000173,0.000175,15.616147,15.611575,15.610409,4e-06,6e-06,6e-06


# Visual analysis

In [11]:
def print_in_a_frame(*words):
    size = max(len(word) for word in words)
    print("*" * (size + 4))
    for word in words:
        print("* {:<{}} *".format(word, size))
    print("*" * (size + 4))


def visual_analysis(model, dataset, user_ids):
    items_df = items[["item_id", "title", "genres"]]

    for user_id in user_ids:
        mask = dataset.interactions.df.user_id == user_id
        user_history_df = dataset.interactions.df.loc[mask]
        user_recommendations = model.recommend(users=[user_id], dataset=dataset, k=K, filter_viewed=True)
        user_history_df = (
            user_history_df.copy().reset_index().rename({"index": "action_id"}, axis=1).drop("weight", axis=1)
        )
        user_recommendations = user_recommendations.copy()
        user_history = (
            user_history_df.merge(
                items_df,
                left_on="item_id",
                right_on="item_id",
            )
            .sort_values(by="datetime")
            .tail(10)
        )

        print_in_a_frame(f"User {user_id}", "History")
        display(user_history)

        if not user_recommendations.empty:
            user_recos = user_recommendations.merge(
                items_df,
                left_on="item_id",
                right_on="item_id",
            )
            print_in_a_frame(f"User {user_id}", "Recommendations")
            display(user_recos[items_df.columns])
        print("\n", 50 * "=", "\n")

In [12]:
user_ids = (666262, 672861, 955527)
random_state = 42

model = RandomModel(random_state=random_state)
dataset = Dataset.construct(df_interactions)
model.fit(dataset)

visual_analysis(model, dataset, user_ids)

***************
* User 666262 *
* History     *
***************


Unnamed: 0,action_id,user_id,item_id,datetime,title,genres
0,2233832,666262,93,2021-07-21,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы"


*******************
* User 666262     *
* Recommendations *
*******************


Unnamed: 0,item_id,title,genres
0,7419,Ода радости,комедии
1,9109,Последняя битва,"драмы, военные"
2,13917,Преисподняя,"драмы, детективы, триллеры, вестерн"
3,13332,Лихорадка,ужасы
4,1331,Вечность,драмы
5,15448,Леший,"триллеры, криминал, детективы"
6,1123,Богема,"драмы, мюзиклы, мелодрамы"
7,9933,Шедевр,"драмы, комедии"
8,3287,Единоборства для детей (3-6 лет) Школа героев ...,"единоборства, фитнес, для детей"
9,8478,Американский пирог: Все в сборе,комедии



***************
* User 672861 *
* History     *
***************


Unnamed: 0,action_id,user_id,item_id,datetime,title,genres
0,2284513,672861,25,2021-07-26,Медвежонок Винни и его друзья,"мюзиклы, мультфильм, приключения, комедии"
1,4681450,672861,32,2021-08-01,В ритме сердца,"драмы, мюзиклы, мелодрамы"


*******************
* User 672861     *
* Recommendations *
*******************


Unnamed: 0,item_id,title,genres
0,7419,Ода радости,комедии
1,9109,Последняя битва,"драмы, военные"
2,13917,Преисподняя,"драмы, детективы, триллеры, вестерн"
3,13332,Лихорадка,ужасы
4,1331,Вечность,драмы
5,15448,Леший,"триллеры, криминал, детективы"
6,1123,Богема,"драмы, мюзиклы, мелодрамы"
7,9933,Шедевр,"драмы, комедии"
8,3287,Единоборства для детей (3-6 лет) Школа героев ...,"единоборства, фитнес, для детей"
9,8478,Американский пирог: Все в сборе,комедии



***************
* User 955527 *
* History     *
***************


Unnamed: 0,action_id,user_id,item_id,datetime,title,genres
0,5376325,955527,21,2021-07-20,Признание 5,для взрослых


*******************
* User 955527     *
* Recommendations *
*******************


Unnamed: 0,item_id,title,genres
0,7419,Ода радости,комедии
1,9109,Последняя битва,"драмы, военные"
2,13917,Преисподняя,"драмы, детективы, триллеры, вестерн"
3,13332,Лихорадка,ужасы
4,1331,Вечность,драмы
5,15448,Леший,"триллеры, криминал, детективы"
6,1123,Богема,"драмы, мюзиклы, мелодрамы"
7,9933,Шедевр,"драмы, комедии"
8,3287,Единоборства для детей (3-6 лет) Школа героев ...,"единоборства, фитнес, для детей"
9,8478,Американский пирог: Все в сборе,комедии



