In [1]:
import pickle
import warnings
from pprint import pprint

import pandas as pd
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from rectools import Columns
from rectools.dataset import Interactions
from rectools.metrics import MAP, calc_metrics
from rectools.metrics import Precision, Recall, NDCG, Serendipity
from rectools.model_selection import TimeRangeSplitter

from models.userknn import UserKnn

# Suppress all Python warnings for the rest of the session
# (note: this also hides deprecation warnings from the libraries used below).
warnings.filterwarnings("ignore")
# Display every DataFrame column and up to 200 characters per cell.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 200)

# Read data

In [2]:
# Load the raw tables (relative paths).
interactions_df = pd.read_csv("../data/interactions.csv")
users = pd.read_csv("../data/users.csv")
items = pd.read_csv("../data/items.csv")

# Rename the raw columns to the names referenced via `Columns.Datetime` / `Columns.Weight`.
# Re-assigning (instead of `inplace=True`) avoids mutating the frame in place.
interactions_df = interactions_df.rename(
    columns={"last_watch_dt": Columns.Datetime, "total_dur": Columns.Weight}
)

# `test = True` runs the notebook on a 1% sample for quick iteration;
# set it to False to use the full dataset.
# The sample is seeded so that the folds, the metrics and the saved model are
# reproducible across "Restart Kernel -> Run All".
SAMPLE_SEED = 42
test = True
if test:
    interactions = Interactions(interactions_df.sample(frac=0.01, random_state=SAMPLE_SEED))
else:
    interactions = Interactions(interactions_df)

interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
3729402,467027,6209,2021-08-01,6660.0,100.0
2440373,933494,3734,2021-07-18,5854.0,100.0
907207,201382,10440,2021-06-09,121364.0,90.0
3003571,292510,16135,2021-05-08,400.0,5.0
2260007,1071515,12463,2021-04-07,3290.0,53.0


# Hyperparameter tuning

In [3]:
# Number of cross-validation folds passed to the splitter.
N_SPLITS = 4
# Size of each fold's test window, passed to the splitter as `test_size` ("7D" = 7 days).
TEST_SIZE = "7D"

In [4]:
# Time-based splitter used for cross-validation: N_SPLITS folds, each with a
# test window of size TEST_SIZE.
cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    # The three filters below are all enabled; see the RecTools TimeRangeSplitter
    # documentation for their exact semantics (presumably they drop already-seen
    # pairs and cold users/items from each test part — TODO confirm).
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [5]:
# Cut-off values (k) at which every metric is evaluated.
thresholds: tuple[int, ...] = (10,)

# One dict per metric family, keyed "<Metric>@<k>".
precision_metrics = {}
recall_metrics = {}
map_metrics = {}
ndcg_metrics = {}
serendipity_metrics = {}

for k in thresholds:
    precision_metrics[f"Precision@{k}"] = Precision(k=k)
    recall_metrics[f"Recall@{k}"] = Recall(k=k)
    map_metrics[f"MAP@{k}"] = MAP(k=k, divide_by_k=False)
    ndcg_metrics[f"NDCG@{k}"] = NDCG(k=k, log_base=3)
    serendipity_metrics[f"Serendipity@{k}"] = Serendipity(k=k)

# Merge the families into the single mapping that is handed to `calc_metrics`;
# the order (precision, recall, MAP, NDCG, serendipity) is preserved.
metrics = precision_metrics | recall_metrics | map_metrics | ndcg_metrics | serendipity_metrics

In [6]:
# Candidate nearest-neighbour recommenders from `implicit`, keyed by the label
# that identifies them in the results table.
models = {
    "cosine_userknn": CosineRecommender(),
    "tfidf_userknn": TFIDFRecommender(),
    "BM25_userknn": BM25Recommender(),
}

In [7]:
def print_in_a_frame(*words):
    """Print the given values inside a rectangular frame of asterisks.

    Each argument becomes one left-aligned line; the frame is as wide as the
    longest line. Arguments are converted with ``str`` so non-string values
    (e.g. a fold number) can be passed directly. When called without
    arguments, an empty frame (just the two borders) is printed instead of
    raising ``ValueError``.

    Args:
        *words: Values to print, one per line.

    Returns:
        None. The frame is written to standard output.
    """
    lines = [str(word) for word in words]
    # `default=0` keeps max() from failing when there is nothing to print.
    width = max((len(line) for line in lines), default=0)
    border = "*" * (width + 4)

    print(border)
    for line in lines:
        print(f"* {line:<{width}} *")
    print(border)

In [8]:
# Cross-validate every candidate model on every time-based fold and collect
# one row of metric values per (fold, model) pair.
results = []

for i_fold, (train_ids, test_ids, fold_info) in enumerate(
    cv.split(interactions, collect_fold_stats=True)
):
    print_in_a_frame(f"Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.df.iloc[train_ids].copy()
    # Only the user/item columns of the test part are kept.
    df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()
    # Items seen in the train part of this fold; passed to `calc_metrics` as `catalog`.
    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        knn = UserKnn(model=model, N_users=30)
        knn.fit(df_train)
        reco = knn.predict(df_test)

        metric_values = calc_metrics(
            metrics,
            reco=reco,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        # Fold index and model label first, then the metric columns.
        results.append({"fold": i_fold, "model": model_name, **metric_values})

**********
* Fold 0 *
**********
{'end': Timestamp('2021-08-02 00:00:00'),
 'i_split': 0,
 'start': Timestamp('2021-07-26 00:00:00'),
 'test': 301,
 'test_items': 251,
 'test_users': 291,
 'train': 38888,
 'train_items': 4472,
 'train_users': 35343}


  0%|          | 0/35343 [00:00<?, ?it/s]

  0%|          | 0/35343 [00:00<?, ?it/s]

  0%|          | 0/35343 [00:00<?, ?it/s]

**********
* Fold 1 *
**********
{'end': Timestamp('2021-08-09 00:00:00'),
 'i_split': 1,
 'start': Timestamp('2021-08-02 00:00:00'),
 'test': 374,
 'test_items': 295,
 'test_users': 363,
 'train': 42718,
 'train_items': 4624,
 'train_users': 38747}


  0%|          | 0/38747 [00:00<?, ?it/s]

  0%|          | 0/38747 [00:00<?, ?it/s]

  0%|          | 0/38747 [00:00<?, ?it/s]

**********
* Fold 2 *
**********
{'end': Timestamp('2021-08-16 00:00:00'),
 'i_split': 2,
 'start': Timestamp('2021-08-09 00:00:00'),
 'test': 387,
 'test_items': 296,
 'test_users': 374,
 'train': 46596,
 'train_items': 4807,
 'train_users': 42128}


  0%|          | 0/42128 [00:00<?, ?it/s]

  0%|          | 0/42128 [00:00<?, ?it/s]

  0%|          | 0/42128 [00:00<?, ?it/s]

**********
* Fold 3 *
**********
{'end': Timestamp('2021-08-23 00:00:00'),
 'i_split': 3,
 'start': Timestamp('2021-08-16 00:00:00'),
 'test': 432,
 'test_items': 340,
 'test_users': 426,
 'train': 50587,
 'train_items': 4952,
 'train_users': 45609}


  0%|          | 0/45609 [00:00<?, ?it/s]

  0%|          | 0/45609 [00:00<?, ?it/s]

  0%|          | 0/45609 [00:00<?, ?it/s]

In [9]:
# One row per (fold, model) pair with the metric values collected during cross-validation.
metrics_df = pd.DataFrame(results)
metrics_df

Unnamed: 0,fold,model,Precision@10,Recall@10,NDCG@10,MAP@10,Serendipity@10
0,0,cosine_userknn,0.000344,0.003436,0.000326,0.000859,5e-06
1,0,tfidf_userknn,0.000344,0.003436,0.000326,0.000859,5e-06
2,0,BM25_userknn,0.000344,0.003436,0.000326,0.000859,5e-06
3,1,cosine_userknn,0.0,0.0,0.0,0.0,0.0
4,1,tfidf_userknn,0.0,0.0,0.0,0.0,0.0
5,1,BM25_userknn,0.0,0.0,0.0,0.0,0.0
6,2,cosine_userknn,0.0,0.0,0.0,0.0,0.0
7,2,tfidf_userknn,0.0,0.0,0.0,0.0,0.0
8,2,BM25_userknn,0.0,0.0,0.0,0.0,0.0
9,3,cosine_userknn,0.0,0.0,0.0,0.0,0.0


In [10]:
# Average every metric over the folds, per model. Selecting the metric columns
# before aggregating leaves the `fold` column out of the mean.
metrics_df.groupby("model")[list(metrics)].mean()

Unnamed: 0_level_0,Precision@10,Recall@10,MAP@10,NDCG@10,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BM25_userknn,8.6e-05,0.000859,0.000215,8.1e-05,1e-06
cosine_userknn,8.6e-05,0.000859,0.000215,8.1e-05,1e-06
tfidf_userknn,8.6e-05,0.000859,0.000215,8.1e-05,1e-06


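As we can see, the TF-IDF model produces the best results. (Note: on the 1% sample shown above, all three models score identically, so this conclusion presumably comes from a run on the full dataset with `test = False`.)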

# Training the best model

In [11]:
# Build the final model with the TF-IDF recommender and the same N_users=30
# that was used during cross-validation.
userknn = UserKnn(TFIDFRecommender(), N_users=30)

In [12]:
# Fit on the whole `interactions` table, with no hold-out
# (this is the 1% sample when `test` is True).
userknn.fit(interactions.df)

  0%|          | 0/49240 [00:00<?, ?it/s]

# Inference

In [13]:
# Recommendations for a user taken from the training data, at two list lengths.
sample_user_id = interactions.df.user_id.values[0]
for k_recs in (10, 5):
    print(userknn.recommend(sample_user_id, k_recs=k_recs))

[6209, 3558, 2848, 3018, 12324, 10440, 15297, 9728, 13865, 4151]
[6209, 3558, 2848, 3018, 12324]


In [14]:
# Request recommendations for user id 1e10 — presumably an id that does not occur
# in the data, to show that `recommend` still returns a list for an unknown user.
# TODO confirm the fallback logic in `UserKnn.recommend`.
print(userknn.recommend(1e10, k_recs=5))

[10440, 15297, 9728, 13865, 4151]


# Save model

In [15]:
# Persist the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling raises.
# NOTE: only unpickle this file from a trusted location — `pickle.load` can
# execute arbitrary code.
with open("../model_weights/userknn_model.pkl", "wb") as model_file:
    pickle.dump(userknn, model_file)