In [2]:
import os

# Limit OpenBLAS to a single thread; this is set before the numeric libraries below are imported.
os.environ.update(OPENBLAS_NUM_THREADS="1")

In [3]:
import warnings

warnings.filterwarnings("ignore")

In [21]:
import pandas as pd
import numpy as np
import pickle

from implicit.als import AlternatingLeastSquares

from rectools.metrics import MAP, calc_metrics
from rectools.models import PopularModel, RandomModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.model_selection import TimeRangeSplitter

from pathlib import Path
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

# Read and Process Data

In [5]:
# Directory holding the raw CSV files (relative path).
DATA_PATH = Path("..") / "data"
# Override the datetime column name exposed by rectools' `Columns` so that
# `Columns.Datetime` refers to the "last_watch_dt" column used in the cells below.
Columns.Datetime = "last_watch_dt"

In [6]:
# Load the three raw tables from DATA_PATH in one pass.
users, items, interactions = (
    pd.read_csv(DATA_PATH / file_name) for file_name in ("users.csv", "items.csv", "interactions.csv")
)

In [7]:
# Parse the date column (stored as "YYYY-MM-DD" text) into datetimes.
parsed_watch_dates = pd.to_datetime(interactions[Columns.Datetime], format="%Y-%m-%d")
interactions[Columns.Datetime] = parsed_watch_dates

# Interaction weight: 3 when watched_pct is above 10, otherwise 1.
watched_over_10_pct = interactions["watched_pct"] > 10
interactions[Columns.Weight] = np.where(watched_over_10_pct, 3, 1)

In [8]:
# Latest interaction date in the data; the train/test cut-off is derived from it.
last_watch_dates = interactions[Columns.Datetime]
max_date = last_watch_dates.max()

In [9]:
# Time-based split: everything from (max_date - 7 days) onwards goes to test,
# everything strictly before that date goes to train.
split_date = max_date - pd.Timedelta(days=7)
interaction_dates = interactions[Columns.Datetime]

train = interactions.loc[interaction_dates < split_date].copy()
test = interactions.loc[interaction_dates >= split_date].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [10]:
# Remove train interactions whose total_dur is below 300.
short_watch_index = train.query("total_dur < 300").index
train = train.drop(index=short_watch_index)

In [11]:
# Users present in test that have no interactions in the (filtered) train set.
cold_users = set(test[Columns.User]).difference(train[Columns.User])

In [12]:
# Drop the test rows that belong to cold users.
is_cold_user_row = test[Columns.User].isin(cold_users)
test = test.drop(index=test.loc[is_cold_user_row].index)

In [13]:
# Build the rectools Dataset from the filtered train interactions only
# (no user or item features are passed at this point).
dataset = Dataset.construct(interactions_df=train)

# Fixed hyperparameters

In [14]:
K_RECOS = 10  # number of recommendations requested per user in `recommend`
RANDOM_STATE = 42  # seed shared by the models that take a `random_state`
NUM_THREADS = 8  # thread count passed to the implicit and LightFM models
N_FACTORS = (4,)  # latent dimensions iterated over when the factor models are built

In [15]:
# Cut-off used both in the metric name and in the metric itself.
k = 10

# Metric classes to evaluate, keyed by their display name.
metrics_name = {"MAP": MAP}

# Instantiate every metric at cut-off `k`, keyed as "<name>@<k>".
metrics = {}
for metric_label, metric_cls in metrics_name.items():
    metrics[f"{metric_label}@{k}"] = metric_cls(k=k)

# Models

In [16]:
# Baseline models, keyed by the name shown in the results table.
# The "most_raited" key spelling is kept as-is because it is the label reported in the results.
models = dict(
    random=RandomModel(random_state=RANDOM_STATE),
    popular=PopularModel(),
    most_raited=PopularModel(popularity="sum_weight"),
)

In [17]:
# Matrix-factorisation models from `implicit`, keyed by a short name.
implicit_models = {
    "ALS": AlternatingLeastSquares,
    "BPR": BayesianPersonalizedRanking,
}

# Register one wrapped model per (algorithm, factor count) pair as "<name>_<factors>".
# NOTE(review): BPR is wrapped in ImplicitALSWrapperModel as well — confirm the wrapper
# is meant to be used with non-ALS implicit models.
models.update(
    {
        f"{implicit_name}_{n_factors}": ImplicitALSWrapperModel(
            model=implicit_model(factors=n_factors, random_state=RANDOM_STATE, num_threads=NUM_THREADS)
        )
        for implicit_name, implicit_model in implicit_models.items()
        for n_factors in N_FACTORS
    }
)

In [18]:
# LightFM loss functions to compare.
lightfm_losses = ("logistic", "bpr", "warp")

# Register one wrapped LightFM model per (loss, factor count) pair
# as "LightFM_<loss>_<factors>", each trained for 10 epochs.
models.update(
    {
        f"LightFM_{loss}_{n_factors}": LightFMWrapperModel(
            LightFM(no_components=n_factors, loss=loss, random_state=RANDOM_STATE),
            epochs=10,
            num_threads=NUM_THREADS,
        )
        for loss in lightfm_losses
        for n_factors in N_FACTORS
    }
)

# Brute-force

In [None]:
# Fit every candidate model on the train dataset, recommend for the test users
# and collect one {"model": name, "<metric>": value, ...} record per model.
results = []
test_users = test[Columns.User].unique()

for model_name, model in tqdm(models.items()):
    model.fit(dataset)
    recs = model.recommend(users=test_users, dataset=dataset, k=K_RECOS, filter_viewed=True)

    # `train` is passed as the previous-interactions frame for calc_metrics.
    metric_values = calc_metrics(metrics, recs, test, train)
    results.append({"model": model_name, **metric_values})

In [38]:
# Transpose so that models become columns and metrics become rows,
# then promote the "model" row to the column header and remove it from the body.
quality_by_model = pd.DataFrame(results).T
quality_by_model.columns = quality_by_model.iloc[0]
df_quality = quality_by_model.drop(index="model")

df_quality

model,random,popular,most_raited,ALS_4,BPR_4,LightFM_logistic_4,LightFM_bpr_4,LightFM_warp_4
MAP@10,0.000201,0.073836,0.074008,0.060967,0.037111,0.074663,0.038852,0.077091


# Save best models

In [61]:
# Names of the models selected from the results table above.
best_models_names = {"LightFM_warp_4", "LightFM_logistic_4", "ALS_4"}
# Keep the selected models, preserving their order in `models`.
best_models = {name: models[name] for name in models if name in best_models_names}
best_models

{'ALS_4': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x1076d3bb0>,
 'LightFM_logistic_4': <rectools.models.lightfm.LightFMWrapperModel at 0x10f238d00>,
 'LightFM_warp_4': <rectools.models.lightfm.LightFMWrapperModel at 0x10f2397e0>}

In [62]:
# Refit each selected model on `dataset` and persist it as "<model_name>.pkl".
weights_dir = Path("../model_weights")
weights_dir.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists

for model_name, model in tqdm(best_models.items()):
    model.fit(dataset)
    # The context manager closes the file handle even if pickling fails.
    with open(weights_dir / f"{model_name}.pkl", "wb") as weights_file:
        pickle.dump(model, weights_file)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 3/3 [02:09<00:00, 43.18s/it]


# Inference example

In [None]:
# Project-local wrapper, imported here rather than in the import cell at the top.
from models.online_models import DatasetEmpoweredRecommendationModel

# Point the wrapper at one of the pickles written by the saving cell and at the data directory.
# NOTE(review): presumably the wrapper unpickles `model_path` and rebuilds a dataset from
# `dataset_base_path` — confirm against models/online_models.
model_tmp = DatasetEmpoweredRecommendationModel(
    model_path="../model_weights/LightFM_logistic_4.pkl", dataset_base_path="../data/"
)
# Ask for 5 recommendations for user 0; as the last expression, the result is displayed.
model_tmp.get_reco(user_id=0, k_recs=5)

# Dataset with features


In [57]:
# Keep only users that appear in the interactions.
# `interactions` is a plain pandas DataFrame (loaded with pd.read_csv), so its columns
# are accessed directly; it has no `.df` attribute.
users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()

# Build a long-format feature table with columns (id, value, feature):
# one frame per user attribute, stacked together.
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

In [58]:
# Keep only items that appear in the interactions.
# `interactions` is a plain pandas DataFrame (loaded with pd.read_csv), so its columns
# are accessed directly; it has no `.df` attribute.
items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

# Genres: lower-case the comma-separated string, normalise ", " to ",", split into a list
# and explode so that each (item, genre) pair becomes its own row.
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

# Content type: one row per item.
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

# Long-format item feature table with columns (id, value, feature).
item_features = pd.concat((genre_feature, content_feature))

In [59]:
# Rebuild `dataset`, this time from ALL interactions and with categorical user/item features.
# `interactions` is a plain pandas DataFrame (loaded with pd.read_csv), so it is passed
# directly; it has no `.df` attribute.
# Note: this rebinds `dataset`, so every later cell sees the feature-enriched dataset.
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

# ANN with one of the best models

In [72]:
# Pick the LightFM (logistic loss, 4 components) model from the selected models
# as the source of embeddings for the ANN index.
model = best_models["LightFM_logistic_4"]

In [73]:
from rectools.tools import UserToItemAnnRecommender

# Extract user and item embeddings from the chosen model for the current `dataset`.
# NOTE(review): in the cell order of this notebook, `dataset` was rebuilt with features
# after `model` was fitted — confirm the model was fitted on the same dataset passed here.
user_vectors, item_vectors = model.get_vectors(dataset)

# The ANN recommender needs the vectors plus the id maps of the dataset they came from.
ann_params = dict(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)
model_ann = UserToItemAnnRecommender(**ann_params)
model_ann.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x11186e260>

In [74]:
# Sanity check: top-10 item list for user 0 from the ANN recommender, shown as a plain list.
model_ann.get_item_list_for_user(0, top_n=10).tolist()

[7724, 3112, 4407, 7534, 9899, 9708, 1619, 9165, 12152, 428]

In [75]:
# Persist the fitted ANN recommender; the context manager closes the file handle
# even if pickling fails.
with open("../model_weights/LightFM_logistic_4_ANN.pkl", "wb") as ann_file:
    pickle.dump(model_ann, ann_file)