# Домашнее задание

Домашнее задание состоит из нескольких блоков.


## Эксперименты в ipynb ноутбуках (15 баллов)
- Необходимо будет перебрать $N$ моделей $(N \geq 2)$ матричной факторизации и перебрать у них $K$ гиперпараметров $(K \geq 2)$ **(6 баллов)**
    - Для перебора гиперпараметров можно использовать [`Optuna`](https://github.com/optuna/optuna), [`Hyperopt`](https://github.com/hyperopt/hyperopt)
- Воспользоваться методом приближенного поиска соседей для выдачи рекомендаций. **(3 балла)**
    - Можно использовать любые удобные библиотеки: [`Annoy`](https://github.com/spotify/annoy), [`nmslib`](https://github.com/nmslib/nmslib) и т. д.
- Добавить 3 "аватара" (искусственных пользователей) и посмотреть рекомендации итоговой модели для них. Объяснить, почему добавили именно таких пользователей. **(3 балла)**
- Придумать, как можно обработать рекомендации для холодных пользователей. **(3 балла)**

Примечание: за невоспроизводимый код в ноутбуках (например, нарушен порядок выполнения ячеек, используются переменные, которые нигде не были объявлены ранее, и т. п.) будут штрафы на усмотрение проверяющего.


## Реализация итоговой модели в сервисе (10 баллов)
- Пробитие бейзлайна $MAP@10 \geq 0.074921$ **(6 баллов)**
- Код сервиса соответствует критериям читаемости и воспроизводимости **(4 балла)**

In [3]:
import os

In [4]:
# Configure the native numeric libraries in one call: one thread each for
# OpenBLAS and MKL, plus KMP_DUPLICATE_LIB_OK (presumably to tolerate a
# duplicate OpenMP runtime — TODO confirm it is still needed).
os.environ.update(
    {
        "OPENBLAS_NUM_THREADS": "1",
        "KMP_DUPLICATE_LIB_OK": "True",
        "MKL_NUM_THREADS": "1",
    }
)

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
import pandas as pd
import numpy as np
import random
import time

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import (PopularModel, RandomModel, ImplicitALSWrapperModel, 
                             LightFMWrapperModel)
from rectools import Columns
from rectools.dataset import Dataset

import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking
from implicit.als import AlternatingLeastSquares
from implicit.lmf import LogisticMatrixFactorization

import optuna
import hnswlib

In [8]:
# Seed both NumPy's and the standard library's global RNG with one value.
RANDOM_STATE = 42
for seed_rng in (np.random.seed, random.seed):
    seed_rng(RANDOM_STATE)

In [9]:
# Directory with users.csv / items.csv / interactions.csv.
# The original absolute path only exists on the author's machine, so it can be
# overridden with the RECO_DATA_PATH environment variable; the old value stays
# the default to keep existing runs unchanged.
DATA_PATH = Path(
    os.environ.get(
        "RECO_DATA_PATH",
        "/Users/admin_i/Desktop/itmo/RecoServiceEdu/notebooks/data_original",
    )
)

# Загрузка данных

In [10]:
%%time
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

CPU times: user 4.07 s, sys: 262 ms, total: 4.33 s
Wall time: 4.33 s


# Preprocess

In [11]:
# NOTE(review): this rebinds an attribute on the shared rectools `Columns`
# class, so every later `Columns.Datetime` lookup in this session resolves to
# 'last_watch_dt' (the date column of the raw interactions table).
Columns.Datetime = 'last_watch_dt'

In [12]:
# Drop, in place, every row whose date string is not exactly 10 characters
# long (the length of a 'YYYY-MM-DD' value).
has_malformed_date = interactions[Columns.Datetime].str.len() != 10
interactions.drop(interactions.loc[has_malformed_date].index, inplace=True)

In [13]:
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

In [14]:
max_date = interactions[Columns.Datetime].max()

In [15]:
# Interaction weight: 3 when `watched_pct` is above 10, otherwise 1.
# Rows with a missing `watched_pct` also get 1, because NaN > 10 is False.
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [16]:
# Time-based split relative to the latest date in the data:
#   train: strictly before max_date - 14 days
#   valid: from max_date - 14 days through max_date - 7 days (inclusive)
#   test:  strictly after max_date - 7 days
one_week = pd.Timedelta(days=7)
valid_start = max_date - 2 * one_week
valid_end = max_date - one_week
watch_dates = interactions[Columns.Datetime]

train = interactions[watch_dates < valid_start].copy()
valid = interactions[(valid_start <= watch_dates) & (watch_dates <= valid_end)].copy()
test = interactions[watch_dates > valid_end].copy()

for split_name, split_frame in (("train", train), ("valid", valid), ("test", test)):
    print(f"{split_name}: {split_frame.shape}")
# Sanity check: the three splits together contain as many rows as the source.
print(f"{len(train) + len(valid) + len(test) == len(interactions)}")

train: (4587708, 6)
valid: (464107, 6)
test: (424436, 6)
True


In [17]:
# Remove training interactions with total_dur below 300 (presumably seconds of
# watch time — TODO confirm the unit). Only the train split is filtered.
train.drop(train.query("total_dur < 300").index, inplace=True)

In [18]:
# Remove cold users from validation, i.e. users with no interactions in train.
train_user_ids = set(train[Columns.User])
cold_users = set(valid[Columns.User]) - train_user_ids
is_cold_row = valid[Columns.User].isin(cold_users)
valid.drop(valid[is_cold_row].index, inplace=True)

# Prepare features

## User features

In [19]:
users.isnull().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [20]:
users.fillna('Unknown', inplace=True)

In [21]:
users.nunique()

user_id     840197
age              7
income           7
sex              3
kids_flg         2
dtype: int64

In [22]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
...,...,...,...,...,...
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,Unknown,Unknown,Unknown,0
840195,590706,Unknown,Unknown,Ж,0


In [23]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [24]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0
...,...,...,...,...,...
840184,529394,age_25_34,income_40_60,Ж,0
840186,80113,age_25_34,income_40_60,Ж,0
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1


In [25]:
# Reshape the wide users table into the long (id, value, feature) layout:
# one frame per categorical feature, stacked on top of each other.
user_features_frames = [
    users[[Columns.User, feature]]
    .rename(columns={Columns.User: "id", feature: "value"})
    .assign(feature=feature)
    for feature in ["sex", "age", "income"]
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [26]:
user_features.query(f"id == 973171")

Unnamed: 0,id,value,feature
0,973171,М,sex
0,973171,age_25_34,age
0,973171,income_60_90,income


## Item features

In [27]:
items.isnull().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [28]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [29]:
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [30]:
items.nunique()

item_id         13865
content_type        2
title           13310
title_orig       9638
release_year      104
genres           2544
countries         665
for_kids            2
age_rating          6
studios            38
directors        7359
actors          11710
description     13638
keywords        13438
dtype: int64

### Genre

In [32]:
# Turn the comma-separated `genres` string into a list column, then explode it
# into one (id, value, feature) row per item/genre pair.
genre_lists = (
    items["genres"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
items["genre"] = genre_lists
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [33]:
# Long-format (id, value, feature) table for the item content type.
content_feature = (
    items[[Columns.Item, "content_type"]]
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)

In [34]:
content_feature

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [35]:
item_features = pd.concat((genre_feature, content_feature))

In [36]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [37]:
%%time
# Build the rectools Dataset from the training interactions and the
# long-format user/item feature tables; every feature named in the cat_*
# arguments is declared as categorical.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 945 ms, sys: 79.9 ms, total: 1.02 s
Wall time: 1.02 s


In [38]:
TEST_USERS = test[Columns.User].unique()
VALID_USERS = valid[Columns.User].unique()

In [39]:
TEST_USERS.shape,VALID_USERS.shape

((167348,), (107027,))

# Metrics

In [40]:
# Metric classes to evaluate; Precision and Recall are currently switched off.
metrics_name = {
    # 'Precision': Precision,
    # 'Recall': Recall,
    'MAP': MAP,
}

# Instantiate every metric at every cutoff k, keyed as '<name>@<k>'.
metrics = {}
for metric_name, metric in metrics_name.items():
    # NOTE(review): `k` is a module-level loop variable and stays bound to 10
    # after the loop finishes, so later cells can see it.
    for k in [10]:
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [41]:
metrics

{'MAP@10': MAP(k=10, divide_by_k=False)}

# My models

## Подбор гиперпараметров

In [42]:
K_RECOS = 10  # number of recommendations requested per user
NUM_THREADS = 16  # passed as num_threads to the ALS / LightFM models
N_EPOCHS = 10  # training epochs for the LightFM wrapper

In [43]:
# Maps the model name accepted by `opt_metric` to the underlying model class.
models = {
    'ALS': AlternatingLeastSquares,
    'LightFM': LightFM
}

In [1]:
# Objective function for the Optuna hyperparameter search.
def opt_metric(trial, dataset, valid_user_ids, top_k, metric, model,
               valid_interactions=None):
    """Train one candidate model and return its MAP@10 on the validation data.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        dataset: rectools ``Dataset`` the candidate model is fitted on.
        valid_user_ids: external ids of the users to build recommendations for.
        top_k: number of recommendations requested per user.
        metric: mapping ``name -> metric object`` handed to ``calc_metrics``;
            it must contain the key ``'MAP@10'``.
        model: name of the model family, ``'ALS'`` or ``'LightFM'`` (a key of
            the module-level ``models`` dict).
        valid_interactions: ground-truth interactions used for scoring.
            Defaults to the module-level ``valid`` frame, which is what the
            function always used before this parameter existed.

    Returns:
        The ``'MAP@10'`` value computed by ``calc_metrics``.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    if model == 'ALS':
        params = {
            'num_threads': NUM_THREADS,
            'random_state': RANDOM_STATE,
            # suggest_float(..., log=True) replaces the deprecated
            # suggest_loguniform and samples from the same distribution.
            'regularization': trial.suggest_float('regularization', 1e-5, 1.0, log=True),
            'factors': trial.suggest_categorical('factors', [32, 64, 100]),
            'iterations': trial.suggest_categorical('iterations', [20, 30, 40]),
        }
        rec_model = ImplicitALSWrapperModel(models[model](**params),
                                            fit_features_together=True)

    elif model == 'LightFM':
        params = {
            'random_state': RANDOM_STATE,
            'loss': trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp']),
            'no_components': trial.suggest_categorical('no_components', [32, 64, 100]),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 0.99, log=True),
            'user_alpha': trial.suggest_float('user_alpha', 1e-5, 0.99, log=True),
            'item_alpha': trial.suggest_float('item_alpha', 1e-5, 0.99, log=True),
        }
        rec_model = LightFMWrapperModel(models[model](**params),
                                        epochs=N_EPOCHS, num_threads=NUM_THREADS)

    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on `rec_model`.
        raise ValueError(
            f"Unsupported model {model!r}; expected 'ALS' or 'LightFM'"
        )

    if valid_interactions is None:
        valid_interactions = valid

    rec_model.fit(dataset)
    recos = rec_model.recommend(
        users=valid_user_ids,
        dataset=dataset,
        k=top_k,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metric, recos, valid_interactions)
    return metric_values['MAP@10']

In [53]:
# Seed the (default) TPE sampler so that the search can be repeated.
# NOTE: with parallel trials (n_jobs != 1) the order in which trials finish
# can still vary between runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)

[32m[I 2022-12-12 08:04:02,464][0m A new study created in memory with name: no-name-90f21dd0-ea05-40e7-b8fc-3f6d87a6ca27[0m


In [54]:
func = lambda trial: opt_metric(trial, dataset = dataset, valid_user_ids = VALID_USERS,
                                top_k = K_RECOS, metric = metrics, model = 'ALS')

In [55]:
# NOTE(review): n_jobs=-1 runs trials in parallel while every ALS model is
# itself created with num_threads=NUM_THREADS, which may oversubscribe the CPU.
study.optimize(func, n_jobs = -1, n_trials = 10, show_progress_bar = True)

  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2022-12-12 08:24:47,385][0m Trial 5 finished with value: 0.0590923908737808 and parameters: {'regularization': 3.329587228423343e-05, 'factors': 32, 'iterations': 30}. Best is trial 5 with value: 0.0590923908737808.[0m
[32m[I 2022-12-12 08:29:29,499][0m Trial 6 finished with value: 0.07581063939210497 and parameters: {'regularization': 0.013894886460722865, 'factors': 100, 'iterations': 20}. Best is trial 6 with value: 0.07581063939210497.[0m
[32m[I 2022-12-12 09:09:23,664][0m Trial 7 finished with value: 0.04594552410686065 and parameters: {'regularization': 0.3816171362260296, 'factors': 32, 'iterations': 40}. Best is trial 6 with value: 0.07581063939210497.[0m
[32m[I 2022-12-12 09:21:21,813][0m Trial 2 finished with value: 0.06651980569881531 and parameters: {'regularization': 0.00016150714553188474, 'factors': 64, 'iterations': 30}. Best is trial 6 with value: 0.07581063939210497.[0m
[32m[I 2022-12-12 09:21:26,622][0m Trial 0 finished with value: 0.065394622662

In [56]:
study.best_params

{'regularization': 0.013894886460722865, 'factors': 100, 'iterations': 20}

In [58]:
# Seed the (default) TPE sampler so that the search can be repeated.
# NOTE: with parallel trials (n_jobs != 1) the order in which trials finish
# can still vary between runs.
study2 = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)

[32m[I 2022-12-12 09:39:49,211][0m A new study created in memory with name: no-name-08c53d71-c176-4d39-9da8-2f1060711335[0m


In [60]:
func2 = lambda trial: opt_metric(trial, dataset = dataset, valid_user_ids = VALID_USERS,
                                top_k = K_RECOS, metric = metrics, model = 'LightFM')

In [61]:
# NOTE(review): n_jobs=-1 runs trials in parallel while every LightFM model is
# itself trained with num_threads=NUM_THREADS, which may oversubscribe the CPU.
study2.optimize(func2, n_jobs = -1, n_trials = 10, show_progress_bar = True)

  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2022-12-12 09:45:57,143][0m Trial 2 finished with value: 0.0002695756042956092 and parameters: {'loss': 'logistic', 'no_components': 32, 'learning_rate': 0.3557749875105895, 'user_alpha': 0.0010805487628487438, 'item_alpha': 5.382014498159319e-05}. Best is trial 2 with value: 0.0002695756042956092.[0m
[32m[I 2022-12-12 09:48:21,896][0m Trial 7 finished with value: 0.0360780822112548 and parameters: {'loss': 'bpr', 'no_components': 32, 'learning_rate': 0.0068816381292813675, 'user_alpha': 0.30461309307876794, 'item_alpha': 0.015224294808610872}. Best is trial 7 with value: 0.0360780822112548.[0m
[32m[I 2022-12-12 09:53:22,894][0m Trial 3 finished with value: 0.00024031997081875686 and parameters: {'loss': 'logistic', 'no_components': 64, 'learning_rate': 0.00028844174584927385, 'user_alpha': 1.6936208016966782e-05, 'item_alpha': 0.00010452688726192654}. Best is trial 7 with value: 0.0360780822112548.[0m
[32m[I 2022-12-12 10:02:46,191][0m Trial 0 finished with value: 4.

In [62]:
study2.best_params

{'loss': 'warp',
 'no_components': 100,
 'learning_rate': 0.0066594231278431434,
 'user_alpha': 0.0005203512950757777,
 'item_alpha': 0.0003116105897809789}

In [64]:
study2.best_value

0.08261434202079407

In [65]:
# Keep the best hyperparameters found by the LightFM study: the values of
# study2.best_params plus the fixed random_state.
best_light_fm_params = {
    'loss': 'warp',
    'no_components': 100,
    'learning_rate': 0.0066594231278431434,
    'user_alpha': 0.0005203512950757777,
    'item_alpha': 0.0003116105897809789,
    'random_state': 42,
}

In [453]:
best_model = LightFMWrapperModel(LightFM(**best_light_fm_params),
                                        epochs=N_EPOCHS, num_threads=NUM_THREADS)

In [454]:
best_model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f91484ae790>

# Добавление аватаров

In [455]:
users['user_id'].max()

1097557

In [456]:
def show_rows(title):
    """Return the rows of the module-level `items` whose title contains `title`.

    The pattern is passed to ``Series.str.contains`` unchanged, so pandas
    interprets it as a regular expression by default. Rows with a missing
    title never match (``na=False``).
    """
    title_matches = items['title'].str.contains(title, na=False)
    return items[title_matches]

In [457]:
show_rows('Наруто')

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords,genre
2965,4065,film,Наруто 8: Кровавая тюрьма,Naruto Shippuden the Movie: Blood Prison,2011.0,"аниме, приключения",Япония,,12.0,,Масахико Мурата,"Дзюнко Такэути, Тиэ Накамура, Рикия Кояма, Кад...",По сюжету Наруто сажают в тюрьму под названием...,"аниме, наруто, Тюрьма, 2011, япония, кровавая,...","[аниме, приключения]"
7558,11864,film,Наруто 7: Потерянная башня,Naruto Shippûden: The Lost Tower,2010.0,"аниме, приключения",Япония,,12.0,,Масахико Мурата,"Дзюнко Такэути, Тиэ Накамура, Сатоси Хино, Рик...",На Наруто возложена миссия захватить Мукаде (М...,"аниме, наруто, 2010, япония, потерянная, башня","[аниме, приключения]"
14949,13583,film,Наруто 9: Путь ниндзя,Road to Ninja: Naruto the Movie,2012.0,"аниме, фэнтези, приключения",Япония,,12.0,,Хаято Датэ,"Дзюнко Такэути, Тиэ Накамура, Тосиюки Морикава...","Человек в маске, представленный зрителю как Уч...","япония, деревня, любовь, ниндзя, наруто ураган...","[аниме, фэнтези, приключения]"
15580,13202,film,Наруто: Последний фильм,The Last: Naruto the Movie,2014.0,"боевики, аниме, приключения, комедии",Япония,,12.0,,Цунэо Кобаяси,"Дзюнко Такэути, Нана Мидзуки, Дзюн Фукуяма, Ти...",Маленькая Ханаби похищена инопланетным злодеем...,"Наруто ураганные хроники, аниме, наруто, Бой, ...","[боевики, аниме, приключения, комедии]"


Было создано 3 аватара, каждый из которых представляет один из трёх архетипов:

1. Молодой человек, смотрящий аниме
2. Женщина с ребенком, увлекающаяся просмотром сериалов
3. Аватар неизвестного возраста (предполагается, что ребенок), у которого в просмотрах доминируют мультфильмы

В идеале, в топе рекомендаций должны присутствовать произведения соответствующих жанров

In [458]:
# Avatar 1: anime fan
user1 = [1097558, 'age_18_24', 'income_0_20', 'М', 0]
# Avatar 2: young housewife who loves series
user2 = [1097559, 'age_25_34', 'income_20_40', 'Ж', 1]
# Avatar 3: child who loves cartoons
user3 = [1097560, 'Unknown', 'income_0_20', 'Ж', 0]
new_users = pd.DataFrame(
    [user1, user2, user3],
    columns=["user_id", "age", "income", "sex", "kids_flg"],
)

# Three hand-picked items per avatar, watched on three consecutive days.
avatar_item_ids = {
    user1[0]: [13202, 8477, 3787],
    user2[0]: [14879, 6781, 16179],
    user3[0]: [5315, 12988, 14177],
}
avatar_watch_dates = ['2021-04-13', '2021-04-14', '2021-04-15']

# Every synthetic view uses total_dur=1500, watched_pct=100.0 and weight=3.
new_interactions_list = [
    [avatar_id, item_id, watch_date, 1500, 100.0, 3]
    for avatar_id, watched_item_ids in avatar_item_ids.items()
    for item_id, watch_date in zip(watched_item_ids, avatar_watch_dates)
]

new_interactions = pd.DataFrame(
    new_interactions_list,
    columns=["user_id", "item_id", "last_watch_dt",
             "total_dur", "watched_pct", "weight"],
)

In [459]:
# Long-format (id, value, feature) table for the avatars, built the same way
# as the feature table of the real users.
new_user_features_frames = [
    new_users[[Columns.User, feature]]
    .rename(columns={Columns.User: "id", feature: "value"})
    .assign(feature=feature)
    for feature in ["sex", "age", "income"]
]
new_user_features = pd.concat(new_user_features_frames)
new_user_features.head()

Unnamed: 0,id,value,feature
0,1097558,М,sex
1,1097559,Ж,sex
2,1097560,Ж,sex
0,1097558,age_18_24,age
1,1097559,age_25_34,age


In [460]:
# The avatar rows store their watch dates as strings while `interactions`
# already holds datetime64 values; parse them first so that the concatenated
# date column has a single dtype.
avatar_interactions = new_interactions.assign(
    last_watch_dt=pd.to_datetime(new_interactions["last_watch_dt"],
                                 format="%Y-%m-%d")
)
# NOTE(review): the avatars are appended to the full `interactions` table
# (not to the filtered `train` split) — confirm this is intended.
new_train = pd.concat([interactions, avatar_interactions], ignore_index=True)
new_all_user_features = pd.concat([user_features, new_user_features],
                                  ignore_index=True)

In [461]:
new_dataset = Dataset.construct(
    interactions_df=new_train,
    user_features_df=new_all_user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [462]:
best_new_model = LightFMWrapperModel(LightFM(**best_light_fm_params),
                                        epochs=N_EPOCHS, num_threads=NUM_THREADS)

In [463]:
best_new_model.fit(new_dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f9172ec5dc0>

In [464]:
new_recos = best_new_model.recommend(
    users=[user1[0],user2[0],user3[0]],
    dataset=new_dataset,
    k=10,
    filter_viewed=True,
)

In [465]:
items[['item_id','title','genres']].merge(new_recos, how = 'right', on = 'item_id')

Unnamed: 0,item_id,title,genres,user_id,score,rank
0,4151,Секреты семейной жизни,комедии,1097558,-22.607544,1
1,7571,100% волк,"мультфильм, приключения, семейное, фэнтези, ко...",1097558,-22.702321,2
2,15297,Клиника счастья,"драмы, мелодрамы",1097558,-22.708507,3
3,4880,Афера,комедии,1097558,-22.783512,4
4,3734,Прабабушка легкого поведения,комедии,1097558,-22.832664,5
5,10440,Хрустальный,"триллеры, детективы",1097558,-22.85398,6
6,13865,Девятаев,"драмы, военные, приключения",1097558,-23.046195,7
7,9728,Гнев человеческий,"боевики, триллеры",1097558,-23.094686,8
8,2657,Подслушано,"драмы, триллеры",1097558,-23.305981,9
9,11237,День города,комедии,1097558,-23.325321,10


Как видим из результатов, больше всего совпадений по жанрам у 3-го аватара:
в рекомендациях присутствуют 3 мультфильма

# Approximate Nearest Neighbors 

In [480]:
user_embeddings, item_embeddings = best_new_model.get_vectors(new_dataset)

In [481]:
def augment_inner_product(factors):
    """Pad every row of `factors` so that all rows share the same L2 norm.

    One extra coordinate ``sqrt(max_norm**2 - ||row||**2)`` is appended to each
    row, where ``max_norm`` is the largest row norm in `factors`.

    Args:
        factors: 2-D array with one vector per row.

    Returns:
        Tuple ``(max_norm, augmented_factors)``; the augmented array has one
        more column than `factors`.
    """
    row_norms = np.linalg.norm(factors, axis=1)
    max_norm = row_norms.max()

    padding = np.sqrt(max_norm ** 2 - row_norms ** 2)
    augmented_factors = np.concatenate(
        (factors, padding[:, np.newaxis]), axis=1
    )
    return max_norm, augmented_factors

In [482]:
print('pre shape: ', item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (15706, 102)


(15706, 103)

In [483]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape

(962182, 103)

In [484]:
# HNSW parameters
M = 48  # passed to init_index as M
efC = 100  # passed to init_index as the construction-time ef
efS = 100  # passed to set_ef (query-time ef)

In [485]:
%%time
# Build an inner-product HNSW index holding every augmented item vector.
max_elements, dim = augmented_item_embeddings.shape
hnsw = hnswlib.Index(space="ip", dim=dim)
hnsw.init_index(max_elements=max_elements, M=M, ef_construction=efC)
hnsw.add_items(augmented_item_embeddings)
hnsw.set_ef(efS)

CPU times: user 1.49 s, sys: 117 ms, total: 1.6 s
Wall time: 286 ms


In [486]:
def recommend_all(query_factors, index_factors, topn=10):
    """Exact top-`topn` search by inner product.

    Args:
        query_factors: 2-D array, one query vector per row.
        index_factors: 2-D array, one indexed vector per row.
        topn: number of best matches to return for every query.

    Returns:
        Tuple ``(labels, distances)``, both shaped ``(n_queries, topn)``:
        row indices into `index_factors` ordered by descending inner product,
        and the inner products themselves.
    """
    scores = query_factors.dot(index_factors.T)

    # Partial sort: the last `topn` columns hold the best candidates per row,
    # in no particular order.
    candidates = np.argpartition(scores, -topn)[:, -topn:]
    candidate_scores = np.take_along_axis(scores, candidates, axis=1)

    # Fully order only those candidates, best first.
    best_first = np.argsort(candidate_scores)[:, ::-1]
    labels = np.take_along_axis(candidates, best_first, axis=1)
    distances = np.take_along_axis(scores, labels, axis=1)
    return labels, distances

In [487]:
recommend_all(user_embeddings[[0], :], item_embeddings)

(array([[ 32, 235,  16,  10,  84,  25,  51, 596,  18, 202]]),
 array([[-34.25965878, -34.39183911, -34.43723705, -34.69208954,
         -34.89087456, -35.00085291, -35.0108253 , -35.04676144,
         -35.10954488, -35.13816857]]))

In [488]:
# Use the K_RECOS constant explicitly; `k=k` only worked because a loop
# variable named `k` happened to be left over from the metrics cell.
label, distance = hnsw.knn_query(augmented_user_embeddings[0], k=K_RECOS)

In [489]:
label, 1 - distance

(array([[ 32, 235,  16,  10,  84,  25,  51, 596,  18, 202]], dtype=uint64),
 array([[-34.25965 , -34.39183 , -34.437237, -34.692093, -34.89087 ,
         -35.00086 , -35.010822, -35.046757, -35.109547, -35.13818 ]],
       dtype=float32))

### Функция для приближенного поиска рекомендаций

In [490]:
def approx_candidates(augmented_user_embeddings, user_ids, hnsw,
                      item_id_map, user_id_map, top_k):
    """Return approximate top-`top_k` item ids for the given users.

    Args:
        augmented_user_embeddings: user vectors, indexed by internal user id.
        user_ids: external user ids to recommend for.
        hnsw: index that answers ``knn_query(vectors, k=...)``.
        item_id_map: mapping used to convert internal item ids to external.
        user_id_map: mapping used to convert external user ids to internal.
        top_k: number of items to return per user.

    Returns:
        Array of external item ids, one row per requested user.
    """
    user_rows = user_id_map.convert_to_internal(user_ids)
    neighbours, _ = hnsw.knn_query(augmented_user_embeddings[user_rows], k=top_k)
    external_rows = [item_id_map.convert_to_external(row) for row in neighbours]
    return np.array(external_rows)

In [493]:
# Approximate recommendations for the three avatars
approx_candidates(augmented_user_embeddings,[1097558,1097559,1097560],hnsw,
                  new_dataset.item_id_map,new_dataset.user_id_map,10)

array([[ 4151,  7571, 15297,  4880,  3734, 10440, 13865,  9728,  2657,
        11237],
       [15297, 10440,  4151,  2657, 12192,  9996,  4880,  3734,  6192,
         9728],
       [ 7571,  4151,  3734, 15297, 10440,  3182, 13865,  9728,  7829,
        16166]])

In [495]:
best_new_model.recommend(
    users=[1097558,1097559,1097560],
    dataset=new_dataset,
    k=10,
    filter_viewed=True,
)

Unnamed: 0,user_id,item_id,score,rank
0,1097558,4151,-22.607544,1
1,1097558,7571,-22.702321,2
2,1097558,15297,-22.708507,3
3,1097558,4880,-22.783512,4
4,1097558,3734,-22.832664,5
5,1097558,10440,-22.85398,6
6,1097558,13865,-23.046195,7
7,1097558,9728,-23.094686,8
8,1097558,2657,-23.305981,9
9,1097558,11237,-23.325321,10


Видим, что функция приближенного поиска выдает практически такие же рекомендации

# Подсчет метрик и класс для прогнозирования

## Загрузка данных

In [538]:
%%time
# Merge train and validation into one training set; the test split is left
# unchanged.
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

# Same cleaning as in the first part of the notebook: drop malformed dates,
# parse the rest and assign the interaction weight.
Columns.Datetime = 'last_watch_dt'
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# Everything up to and including max_date - 7 days is train, the rest is test.
train = interactions[interactions[Columns.Datetime] <= max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] > max_date - pd.Timedelta(days=7)].copy()

train.drop(train.query("total_dur < 300").index, inplace=True)

# User features: keep only users present in train, long (id, value, feature) format.
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

# Item features: keep only items present in train.
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

item_features = pd.concat((genre_feature, content_feature))

dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

TEST_USERS = test[Columns.User].unique()

CPU times: user 9.56 s, sys: 1.55 s, total: 11.1 s
Wall time: 11.4 s


## Обработка холодных пользователей

In [539]:
popular_model = PopularModel()
popular_model.fit(dataset)

# Users whose id is missing from the dataset get the 10 most popular items.
top_popular_internal_ids = popular_model.popularity_list[0][:10]
popular_items = dataset.item_id_map.convert_to_external(top_popular_internal_ids)
popular_items

array([10440, 15297, 13865,  9728,  4151,  3734,  2657,  4880,   142,
        6809])

## Подсчет метрики

In [540]:
best_light_fm_params = {
    'loss': 'warp',
    'no_components': 100,
    'learning_rate': 0.0066594231278431434,
    'user_alpha': 0.0005203512950757777,
    'item_alpha': 0.0003116105897809789,
    'random_state': 42,
}

In [541]:
N_EPOCHS,NUM_THREADS

(10, 16)

In [542]:
best_model = LightFMWrapperModel(LightFM(**best_light_fm_params),
                                        epochs=N_EPOCHS, num_threads=NUM_THREADS)

In [543]:
best_model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f914849cf10>

In [597]:
metrics

{'MAP@10': MAP(k=10, divide_by_k=False)}

In [601]:
# Split the test users into cold (never seen in train) and hot (seen in train).
test_user_ids = set(TEST_USERS)
train_user_ids = set(train['user_id'])
COLD_TEST_USERS = np.array(list(test_user_ids - train_user_ids))
HOT_TEST_USERS = np.array(list(test_user_ids & train_user_ids))

In [602]:
# Sanity check: cold + hot must cover every test user. The names now match the
# arrays defined in the previous cell (COLD_USERS / HOT_USERS do not exist).
COLD_TEST_USERS.shape[0] + HOT_TEST_USERS.shape[0]

167348

In [603]:
test['user_id'].unique().shape[0]

167348

In [604]:
recos_hot = best_model.recommend(
    users=HOT_TEST_USERS,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [605]:
# Every cold user receives the same popular items, ranked 1..n with scores
# -1..-n. The sizes come from `popular_items` itself instead of a hard-coded
# 10, so the columns stay aligned even if fewer popular items are available.
n_cold_users = COLD_TEST_USERS.shape[0]
n_popular = len(popular_items)
popular_ranks = np.arange(1, n_popular + 1)
recos_cold_data = {
    "user_id": np.repeat(COLD_TEST_USERS, n_popular),
    "item_id": np.tile(popular_items, n_cold_users),
    "score": np.tile(-popular_ranks, n_cold_users),
    "rank": np.tile(popular_ranks, n_cold_users),
}
recos_cold = pd.DataFrame(recos_cold_data)

In [606]:
recos = pd.concat([recos_hot,recos_cold])

In [607]:
metric_values = calc_metrics(metrics, recos, test)
metric_values

{'MAP@10': 0.11309423787834331}

## Инференс

### Обучение на всех данных

In [612]:
%%time
# Reload the raw data and train on all available interactions (no hold-out).
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

# Same cleaning as in the first part of the notebook: drop malformed dates,
# parse the rest and assign the interaction weight.
Columns.Datetime = 'last_watch_dt'
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

train = interactions.copy()

train.drop(train.query("total_dur < 300").index, inplace=True)

# User features: keep only users present in train, long (id, value, feature) format.
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

# Item features: keep only items present in train.
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

item_features = pd.concat((genre_feature, content_feature))

dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 9.51 s, sys: 1.61 s, total: 11.1 s
Wall time: 11.4 s


In [613]:
best_model = LightFMWrapperModel(LightFM(**best_light_fm_params),
                                        epochs=N_EPOCHS, num_threads=NUM_THREADS)
best_model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f91483610a0>

In [614]:
# Popular items served to cold users.
popular_model = PopularModel()
popular_model.fit(dataset)
top_popular_internal_ids = popular_model.popularity_list[0][:10]
popular_items = dataset.item_id_map.convert_to_external(top_popular_internal_ids)
popular_items

array([10440, 15297, 13865,  9728,  4151,  3734,  2657,  4880,   142,
        6809])

In [669]:
from joblib import dump
from joblib import load

In [670]:
# Save the fitted models and the dataset.
# Create the target directory first: joblib.dump does not create missing
# parent directories and would fail on a fresh checkout.
Path("models").mkdir(parents=True, exist_ok=True)
dump(best_model,"models/main_model.joblib")
dump(popular_model,"models/popular_model.joblib")
dump(dataset,"models/dataset.joblib")

['models/dataset.joblib']

### Для бота

In [671]:
class FmModel:
    """Approximate top-10 recommender backed by an HNSW index.

    Users known to the dataset get the 10 items returned by an inner-product
    HNSW query over the model's item vectors. Users missing from the dataset
    get the 10 most popular items instead.

    NOTE(review): unlike ``model.recommend(..., filter_viewed=True)``, the
    index query does not exclude items the user has already interacted with.
    """

    def __init__(self, model, popular_model, dataset,
                 M=48, ef_construction=100, ef_search=100):
        """Extract the embeddings from `model` and build the HNSW index.

        Args:
            model: fitted model exposing ``get_vectors(dataset)``.
            popular_model: fitted model exposing ``popularity_list``.
            dataset: object exposing ``item_id_map`` and ``user_id_map``.
            M: HNSW ``M`` parameter passed to ``init_index``.
            ef_construction: construction-time ``ef`` passed to ``init_index``.
            ef_search: query-time ``ef`` passed to ``set_ef``.

        The three HNSW parameters default to the values that used to be
        hard-coded here.
        """
        self.model = model
        self.popular_model = popular_model
        self.dataset = dataset
        self.item_id_map = dataset.item_id_map
        self.user_id_map = dataset.user_id_map
        self.popular_items = dataset.item_id_map.convert_to_external(
            popular_model.popularity_list[0][:10]
        )

        user_embeddings, item_embeddings = self.model.get_vectors(dataset)
        _, augmented_item_embeddings = self.augment_inner_product(item_embeddings)
        # User vectors get a zero in the extra coordinate, so their inner
        # products with the augmented item vectors are unchanged.
        extra_zero = np.zeros((user_embeddings.shape[0], 1))
        augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)

        max_elements, dim = augmented_item_embeddings.shape
        hnsw = hnswlib.Index("ip", dim)
        hnsw.init_index(max_elements, M, ef_construction)
        hnsw.add_items(augmented_item_embeddings)
        hnsw.set_ef(ef_search)

        self.augmented_item_embeddings = augmented_item_embeddings
        self.augmented_user_embeddings = augmented_user_embeddings
        self.hnsw = hnsw

    def __call__(self, user_id):
        """Shortcut for :meth:`get_top_10`."""
        rec_items = self.get_top_10(user_id)
        return rec_items

    def get_top_10(self, user_id):
        """Return 10 external item ids for `user_id`.

        Falls back to the precomputed popular items when the lookup raises
        ``KeyError`` (the cold-user path shown in the notebook).
        """
        try:
            recos = self.approx_candidates(user_id, top_k=10)
            return recos

        except KeyError:
            return self.popular_items

    def approx_candidates(self, user_ids, top_k=10):
        """Query the HNSW index for one user.

        Args:
            user_ids: a single external user id (despite the plural name, the
                value is wrapped in a list before the id conversion).
            top_k: number of items to return.

        Returns:
            1-D array of external item ids when the query yields one row,
            otherwise a 2-D array with one row per query.
        """
        internal_user_ids = self.user_id_map.convert_to_internal([user_ids])
        query = self.augmented_user_embeddings[internal_user_ids]
        item_labels, _ = self.hnsw.knn_query(query, k=top_k)
        real_labels = np.array(
            [self.item_id_map.convert_to_external(row) for row in item_labels]
        )
        if real_labels.shape[0] == 1:
            real_labels = real_labels[0]
        return real_labels

    def augment_inner_product(self, factors):
        """Pad every row of `factors` so that all rows share the same L2 norm.

        Appends the coordinate ``sqrt(max_norm**2 - ||row||**2)`` to each row.

        Returns:
            Tuple ``(max_norm, augmented_factors)``.
        """
        normed_factors = np.linalg.norm(factors, axis=1)
        max_norm = normed_factors.max()

        extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
        augmented_factors = np.append(factors, extra_dim, axis=1)
        return max_norm, augmented_factors

In [672]:
# Reload the artifacts written by the dump cell.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files that
# come from a trusted source.
best_model = load("models/main_model.joblib")
popular_model = load("models/popular_model.joblib")
dataset = load("models/dataset.joblib")

In [673]:
test_model = FmModel(best_model,popular_model,dataset)

In [674]:
test_model(1232)

array([ 9728, 10440, 13865, 15297,   142,  3734,  7829,  7102,  7793,
        2657])

In [675]:
# Cold users: both calls return the popular-items fallback
test_model(1232123123),test_model(999999999)

(array([10440, 15297, 13865,  9728,  4151,  3734,  2657,  4880,   142,
         6809]),
 array([10440, 15297, 13865,  9728,  4151,  3734,  2657,  4880,   142,
         6809]))