# Home Work

В этой работе будем исследовать одну из популярнейших рекомендательных моделей - **Latent Factor Model** - https://arxiv.org/pdf/1912.04754. 

Перед выполнением задания нужно убедиться, что прогоняется бейзлайн. Для этого:
1) Скачайте  файлы - **node2name.json** и **clickstream.parque** с необходимыми данными
2) Положите в репозиторий ноутбука и запустите код

В этой работе вам нужно:
1) перебрать параметры модели - edim, batch_size, lr, epoch, num_negatives - (по **1 баллу - 5 баллов**) 
2) Тип OPTIMIZER_NAME - (**4 балла за 5 оптимизаторов**)
3) На основе имеющихся данных собрать лучшую модель (по **precision@30**) и рассчитать ее метрики (**4 балла**)
4) Попробовать другие модели (например, als - https://benfred.github.io/implicit/, gru4rec, sasrec) - за sasrec на хорошем уровне сразу **10 баллов**. За другие модели по **3 балла**
5) По окончании работы в mlflow настроить графики для сравнения моделей. Можно проявить фантазию, но обязательно должно быть сравнение с бейзлайном (данный ноутбук) против других моделей
6) В mlflow залогировать последнюю версию ноутбука - необходимое условие. Либо в github, но тогда прикрепить ссылку в [mlflow](http://84.201.128.89:90/). Эксперимент в формате - **homework-\<name\>**
7) Доп. баллы (**20 баллов**) тому, у кого будет наибольший скор на тесте. Но ваш ноутбук должен прогоняться и быть воспроизводимым.

Суммарно за работу **20 баллов**

In [1]:
import os
import random
import torch
import numpy as np

In [2]:
# Seed value passed to the random number generators (see set_seed).
RANDOM_STATE = 1

In [3]:
def set_seed(seed: int):
    """Seed NumPy, ``random`` and PyTorch (CPU and CUDA) with ``seed``.

    Also switches cuDNN to deterministic mode with benchmarking disabled,
    exports ``PYTHONHASHSEED`` and prints a confirmation line.

    Args:
        seed: integer seed handed to every generator.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN needs these two switches in addition to the seeds above.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # NOTE(review): this exports the hash seed through the environment;
    # presumably it only matters for processes started afterwards — confirm.
    os.environ["PYTHONHASHSEED"] = f"{seed}"
    print(f"Random seed set as {seed}")

## Загружаем данные

In [4]:
import json

# Load the node-id -> name mapping shipped with the assignment.
with open('data/node2name.json', 'r') as f:
    node2name = json.load(f)

# JSON object keys are always strings; convert them to ints so the mapping
# can be used with the node_id column below.
node2name = {int(k):v for k,v in node2name.items()}

In [5]:
import pandas as pd

# Read the clickstream and keep only the first 100,000 rows.
df = pd.read_parquet('data/clickstream.parque')
df = df.head(100_000)

In [6]:
# Time-based split flag: everything strictly earlier than (latest event - 2 days)
# is train; the remaining rows become the test period.
df['is_train'] = df['event_date']< df['event_date'].max() - pd.Timedelta('2 day')
# Human-readable node names (NaN where node_id is missing from node2name).
df['names'] = df['node_id'].map(node2name)

In [7]:
# Users and items that occur at least once in the train period.
train_cooks = df[df['is_train']]['cookie_id'].unique()
train_items = df[df['is_train']]['node_id'].unique()


# Drop rows whose user or item never appears in train, so the test period
# contains no cold-start users or items.
df = df[(df['cookie_id'].isin(train_cooks)) & (df['node_id'].isin(train_items))]

In [8]:
# Map raw ids to dense 0-based integer codes; the second return value of
# pd.factorize is the lookup back from code to the original id.
user_indes, index2user_id = pd.factorize(df['cookie_id'])
df['user_index'] = user_indes

node_indes, index2node = pd.factorize(df['node_id'])
df['node_index'] = node_indes

In [9]:
df['node_index'].max()

2175

In [10]:
# Split by the is_train flag and reset the row labels so that positional
# access (as done by RecDataset via items[i]) lines up with 0..n-1.
df_train, df_test = df[df['is_train']], df[~df['is_train']]
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


# Notebook cell output: shapes of both parts.
df_train.shape, df_test.shape

((96611, 7), (3333, 7))

# Определяем модель

In [11]:
import torch
from torch import nn
import random 
from tqdm.auto import tqdm

from torch.utils.data import Dataset, DataLoader


class RecDataset(Dataset):
    """Dataset of (user, item) interactions plus each user's seen items.

    Args:
        users: indexable collection of user indices, one per interaction.
        items: indexable collection of item indices aligned with ``users``.
        item_per_users: lookup from a user index to the items that user has
            interacted with.
    """

    def __init__(self, users, items, item_per_users):
        self.users, self.items = users, items
        self.item_per_users = item_per_users

    def __len__(self):
        # One sample per interaction row.
        return len(self.users)

    def __getitem__(self, i):
        """Return ``(user tensor, item tensor, items seen by that user)``."""
        user_id = self.users[i]
        user_tensor = torch.tensor(user_id)
        item_tensor = torch.tensor(self.items[i])
        seen_items = self.item_per_users[user_id]
        return user_tensor, item_tensor, seen_items


class LatentFactorModel(nn.Module):
    """Dot-product latent factor model with user and item embedding tables.

    Args:
        edim: embedding dimension.
        user_indexes: iterable of user indices; the table gets ``max + 1`` rows.
        node_indexes: iterable of item indices; the table gets ``max + 1`` rows.
    """

    def __init__(self, edim, user_indexes, node_indexes):
        super().__init__()
        self.edim = edim
        user_rows = max(user_indexes) + 1
        self.users = nn.Embedding(user_rows, edim)
        item_rows = max(node_indexes) + 1
        self.items = nn.Embedding(item_rows, edim)

    def _user_vectors(self, users):
        # Flatten to (num_users, edim) whatever the shape of ``users`` is.
        return self.users(users).reshape(-1, self.edim)

    def forward(self, users, items):
        """Score every candidate item of every user.

        ``items`` is indexed as (batch, candidates) by the einsum below; the
        result has one dot-product score per (user, candidate) pair.
        """
        user_vecs = self._user_vectors(users)
        item_vecs = self.items(items)
        return torch.einsum('be,bne->bn', user_vecs, item_vecs)

    def pred_top_k(self, users, K=10):
        """Return ``torch.topk`` (values, indices) of the K best items per user,
        scoring each user against the whole item embedding table."""
        all_scores = torch.einsum(
            'ue,ie->ui', self._user_vectors(users), self.items.weight
        )
        return torch.topk(all_scores, K, dim=1)

    


def collate_fn(batch, num_negatives, num_items):
    """Assemble a training batch with randomly sampled negative items.

    Args:
        batch: iterable of ``(user, target_item, seen_items)`` triples, where
            ``user`` and ``target_item`` are 0-d tensors and ``seen_items`` is
            an iterable of hashable item indices the user already interacted
            with.
        num_negatives: number of negatives to draw per positive.
        num_items: largest item index that may be drawn. The bound is
            inclusive because ``random.randint`` includes both ends.

    Returns:
        ``(users, items, labels)``: ``users`` is a 1-D tensor; ``items`` has
        shape (batch, 1 + num_negatives) with the positive in column 0;
        ``labels`` has the same shape with 1 in column 0 and 0 elsewhere.

    Raises:
        ValueError: if negatives are requested for a user who has already seen
            every index in ``0..num_items`` (rejection sampling would never
            terminate).
    """
    users, target_items, users_negatives = [], [], []
    for user, target_item, seen_item in batch:
        # A set makes each rejection test O(1) instead of a list scan.
        seen = set(seen_item)

        # Cheap length test first; the full scan only runs when it could fail.
        if (
            num_negatives > 0
            and len(seen) > num_items
            and all(idx in seen for idx in range(num_items + 1))
        ):
            raise ValueError(
                'cannot sample negatives: user has seen every item '
                f'in 0..{num_items}'
            )

        user_negatives = []
        # Rejection sampling with replacement: duplicates are allowed,
        # already-seen items are not.
        while len(user_negatives) < num_negatives:
            candidate = random.randint(0, num_items)
            if candidate not in seen:
                user_negatives.append(candidate)

        users.append(user)
        target_items.append(target_item)
        users_negatives.append(user_negatives)

    positive = torch.ones(len(batch), 1)
    negatives = torch.zeros(len(batch), num_negatives)
    labels = torch.hstack([positive, negatives])

    positive_column = torch.tensor(target_items).reshape(-1, 1)
    items = torch.hstack([positive_column, torch.tensor(users_negatives)])
    return torch.hstack(users), items, labels

In [12]:
# For every train user: the de-duplicated list of items they interacted with.
# collate_fn uses it to reject already-seen items when sampling negatives.
user2seen = df_train.groupby('user_index')['node_index'].agg(lambda x: list(set(x)))

# 1. Перебор параметров (с помощью оптуны)

In [13]:
# Set up MLflow: point the client at the course tracking server and select
# the experiment that all runs below are logged to.
import mlflow

mlflow.set_tracking_uri('http://84.201.128.89:90/')
mlflow.set_experiment('homework-mekiselev')

<Experiment: artifact_location='mlflow-artifacts:/18', creation_time=1716048375923, experiment_id='18', last_update_time=1716048375923, lifecycle_stage='active', name='homework-mekiselev', tags={}>

In [14]:
import optuna

def calc_hitrate(df_preds, K):
    """Hit rate at K: share of users with at least one relevant item among
    predictions whose ``rank`` is below ``K``.

    ``df_preds`` needs the columns ``user_index``, ``rank`` and ``relevant``.
    """
    top_k = df_preds.loc[df_preds['rank'] < K]
    hit_per_user = top_k.groupby('user_index')['relevant'].max()
    return hit_per_user.mean()


def calc_prec(df_preds, K):
    """Precision at K: per-user mean of ``relevant`` over predictions whose
    ``rank`` is below ``K``, averaged over users.

    ``df_preds`` needs the columns ``user_index``, ``rank`` and ``relevant``.
    """
    within_k = df_preds['rank'] < K
    precision_per_user = df_preds[within_k].groupby('user_index')['relevant'].mean()
    return precision_per_user.mean()


def get_metrics(model: torch.nn.Module, K: int = 30):
    """Evaluate ``model`` on the users of the global ``df_test``.

    Takes the top-``K`` item indices from ``model.pred_top_k`` for every test
    user, marks a prediction relevant when the (user, item) pair occurs in
    ``df_test``, and returns ``(hitrate@K, precision@K)``.
    """
    test_users = df_test['user_index'].unique()

    # pred_top_k returns (values, indices); only the item indices are needed.
    _, top_items = model.pred_top_k(torch.tensor(test_users), K)
    preds = top_items.numpy()

    # One row per user holding list-valued columns, exploded to long format below.
    ranks = [list(range(K)) for _ in range(len(preds))]
    df_preds = pd.DataFrame(
        {'node_index': list(preds), 'user_index': test_users, 'rank': ranks}
    )

    relevant_pairs = (
        df_test[['user_index', 'node_index']].assign(relevant=1).drop_duplicates()
    )
    df_preds = df_preds.explode(['node_index', 'rank']).merge(
        relevant_pairs,
        on=['user_index', 'node_index'],
        how='left',
    )
    # Pairs missing from the test interactions come out of the left join as NaN.
    df_preds['relevant'] = df_preds['relevant'].fillna(0)

    return calc_hitrate(df_preds, K), calc_prec(df_preds, K)


def objective(trial):
    """Optuna objective: train a LatentFactorModel with sampled hyperparameters.

    Samples BATCH_SIZE, NUM_NEGATIVES, EDIM, EPOCH and LR from ``trial``,
    trains with Adam and binary cross-entropy over one positive plus
    NUM_NEGATIVES sampled negatives per interaction, evaluates on the test
    users and logs parameters and metrics to MLflow.

    Side effects: writes ``model_weights_<trial.number>.pt`` to the working
    directory and creates an MLflow run named ``optuna-<trial.number>``.

    Returns:
        precision@30, the value the study maximizes.
    """
    BATCH_SIZE = trial.suggest_int('BATCH_SIZE', 16, 100_000)
    NUM_NEGATIVES = trial.suggest_int('NUM_NEGATIVES', 1, 20)
    EDIM = trial.suggest_int('EDIM', 16, 512)
    EPOCH = trial.suggest_int('EPOCH', 3, 20)
    OPTIMIZER_NAME = 'Adam'
    LR = trial.suggest_float('LR', 1e-3, 1e1, log=True)

    # Largest item index, computed once instead of rescanning the column
    # inside the collate lambda on every batch.
    max_item_index = max(df['node_index'].values)

    train_dataset = RecDataset(df_train['user_index'].values, df_train['node_index'], user2seen)
    dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        num_workers=0,
        batch_size=BATCH_SIZE,
        collate_fn=lambda x: collate_fn(x, NUM_NEGATIVES, max_item_index),
    )

    set_seed(seed=RANDOM_STATE)  # fix the seed, otherwise runs are not reproducible
    model = LatentFactorModel(EDIM, user_indes, node_indes)
    optimizer = torch.optim.Adam(model.parameters(), LR)

    for _ in range(EPOCH):
        for users, items, labels in dataloader:
            optimizer.zero_grad()
            logits = model(users, items)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(
                logits, labels
            )
            loss.backward()
            optimizer.step()

    hitrate, prec = get_metrics(model, K=30)

    path_to_save = f'model_weights_{trial.number}.pt'
    torch.save(model.state_dict(), path_to_save)
    # Log the result to MLflow.
    with mlflow.start_run(run_name=f'optuna-{trial.number}'):
        mlflow.log_metrics(
            {
                'hitrate-at-30': hitrate,
                'precision-at-30': prec
            },
        )
        mlflow.log_params(
            {
                'BATCH_SIZE': BATCH_SIZE,
                'NUM_NEGATIVES': NUM_NEGATIVES,
                'EDIM': EDIM,
                'EPOCH': EPOCH,
                'OPTIMIZER_NAME': OPTIMIZER_NAME,
                # Fixed: the epoch count used to be logged under 'LR'.
                'LR': LR,
            }
        )
        # The weights are not uploaded as an artifact: log_artifact did not
        # work with large files on this tracking server.
    return prec  # precision is the value being maximized

In [15]:
# Run the Optuna search: a seeded TPE sampler, 50 trials, maximizing the
# value returned by objective (precision@30).
tpe_sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(sampler=tpe_sampler, direction='maximize')
study.optimize(objective, n_trials=50)

[I 2024-05-25 13:09:15,592] A new study created in memory with name: no-name-14bfadf9-c4e3-42dd-8773-940b068e9590


Random seed set as 1


[I 2024-05-25 13:09:35,629] Trial 0 finished with value: 0.0013493253373313345 and parameters: {'BATCH_SIZE': 41711, 'NUM_NEGATIVES': 15, 'EDIM': 16, 'EPOCH': 8, 'LR': 0.00386387940515873}. Best is trial 0 with value: 0.0013493253373313345.


Random seed set as 1


[I 2024-05-25 13:09:54,450] Trial 1 finished with value: 0.04497751124437781 and parameters: {'BATCH_SIZE': 9248, 'NUM_NEGATIVES': 4, 'EDIM': 187, 'EPOCH': 10, 'LR': 0.14297724879798399}. Best is trial 1 with value: 0.04497751124437781.


Random seed set as 1


[I 2024-05-25 13:10:43,855] Trial 2 finished with value: 0.0016491754122938533 and parameters: {'BATCH_SIZE': 41929, 'NUM_NEGATIVES': 14, 'EDIM': 117, 'EPOCH': 18, 'LR': 0.0012869165107815027}. Best is trial 1 with value: 0.04497751124437781.


Random seed set as 1


[I 2024-05-25 13:10:59,860] Trial 3 finished with value: 0.0009995002498750627 and parameters: {'BATCH_SIZE': 67052, 'NUM_NEGATIVES': 9, 'EDIM': 293, 'EPOCH': 5, 'LR': 0.006200203677164719}. Best is trial 1 with value: 0.04497751124437781.


Random seed set as 1


[I 2024-05-25 13:11:53,559] Trial 4 finished with value: 0.045327336331834084 and parameters: {'BATCH_SIZE': 80078, 'NUM_NEGATIVES': 20, 'EDIM': 171, 'EPOCH': 15, 'LR': 3.2029975700290483}. Best is trial 4 with value: 0.045327336331834084.


Random seed set as 1


[I 2024-05-25 13:12:00,617] Trial 5 finished with value: 0.021139430284857572 and parameters: {'BATCH_SIZE': 89463, 'NUM_NEGATIVES': 2, 'EDIM': 35, 'EPOCH': 6, 'LR': 3.25514256249193}. Best is trial 4 with value: 0.045327336331834084.


Random seed set as 1


[I 2024-05-25 13:12:38,439] Trial 6 finished with value: 0.0432783608195902 and parameters: {'BATCH_SIZE': 9849, 'NUM_NEGATIVES': 9, 'EDIM': 492, 'EPOCH': 12, 'LR': 0.5854751355295726}. Best is trial 4 with value: 0.045327336331834084.


Random seed set as 1


[I 2024-05-25 13:12:49,965] Trial 7 finished with value: 0.04192903548225887 and parameters: {'BATCH_SIZE': 31562, 'NUM_NEGATIVES': 14, 'EDIM': 430, 'EPOCH': 3, 'LR': 1.0013300735281483}. Best is trial 4 with value: 0.045327336331834084.


Random seed set as 1


[I 2024-05-25 13:13:39,420] Trial 8 finished with value: 0.0013993003498250875 and parameters: {'BATCH_SIZE': 98887, 'NUM_NEGATIVES': 15, 'EDIM': 155, 'EPOCH': 17, 'LR': 0.0025876410154210506}. Best is trial 4 with value: 0.045327336331834084.


Random seed set as 1


[I 2024-05-25 13:14:05,945] Trial 9 finished with value: 0.0012493753123438284 and parameters: {'BATCH_SIZE': 44798, 'NUM_NEGATIVES': 19, 'EDIM': 161, 'EPOCH': 8, 'LR': 0.0033121827306255114}. Best is trial 4 with value: 0.045327336331834084.


Random seed set as 1


[I 2024-05-25 13:15:09,403] Trial 10 finished with value: 0.04592703648175912 and parameters: {'BATCH_SIZE': 74054, 'NUM_NEGATIVES': 20, 'EDIM': 322, 'EPOCH': 14, 'LR': 5.979525255233269}. Best is trial 10 with value: 0.04592703648175912.


Random seed set as 1


[I 2024-05-25 13:16:07,652] Trial 11 finished with value: 0.046776611694152925 and parameters: {'BATCH_SIZE': 70806, 'NUM_NEGATIVES': 19, 'EDIM': 313, 'EPOCH': 14, 'LR': 9.407916196919608}. Best is trial 11 with value: 0.046776611694152925.


Random seed set as 1


[I 2024-05-25 13:17:04,033] Trial 12 finished with value: 0.047426286856571714 and parameters: {'BATCH_SIZE': 68114, 'NUM_NEGATIVES': 18, 'EDIM': 329, 'EPOCH': 14, 'LR': 5.106820625932442}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:18:23,850] Trial 13 finished with value: 0.04222888555722139 and parameters: {'BATCH_SIZE': 64563, 'NUM_NEGATIVES': 17, 'EDIM': 358, 'EPOCH': 20, 'LR': 0.03486808367826234}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:19:18,568] Trial 14 finished with value: 0.045077461269365314 and parameters: {'BATCH_SIZE': 56879, 'NUM_NEGATIVES': 18, 'EDIM': 385, 'EPOCH': 13, 'LR': 0.9149530799982564}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:20:02,905] Trial 15 finished with value: 0.04482758620689655 and parameters: {'BATCH_SIZE': 81938, 'NUM_NEGATIVES': 11, 'EDIM': 245, 'EPOCH': 16, 'LR': 8.874165906134635}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:20:32,267] Trial 16 finished with value: 0.04517741129435282 and parameters: {'BATCH_SIZE': 56407, 'NUM_NEGATIVES': 12, 'EDIM': 244, 'EPOCH': 10, 'LR': 0.179926136110671}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:21:49,644] Trial 17 finished with value: 0.03938030984507746 and parameters: {'BATCH_SIZE': 24126, 'NUM_NEGATIVES': 17, 'EDIM': 413, 'EPOCH': 19, 'LR': 0.0286710413690837}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:23:39,256] Trial 18 finished with value: 0.033233383308345826 and parameters: {'BATCH_SIZE': 96401, 'NUM_NEGATIVES': 17, 'EDIM': 470, 'EPOCH': 11, 'LR': 2.3384535382064855}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:24:14,550] Trial 19 finished with value: 0.04272863568215892 and parameters: {'BATCH_SIZE': 70099, 'NUM_NEGATIVES': 6, 'EDIM': 330, 'EPOCH': 15, 'LR': 0.43283619051234784}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:24:54,571] Trial 20 finished with value: 0.04582708645677162 and parameters: {'BATCH_SIZE': 85044, 'NUM_NEGATIVES': 13, 'EDIM': 272, 'EPOCH': 13, 'LR': 9.316645166969703}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:25:58,508] Trial 21 finished with value: 0.046576711644177905 and parameters: {'BATCH_SIZE': 69689, 'NUM_NEGATIVES': 20, 'EDIM': 321, 'EPOCH': 15, 'LR': 4.9351826460258525}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:27:11,319] Trial 22 finished with value: 0.045627186406796603 and parameters: {'BATCH_SIZE': 58162, 'NUM_NEGATIVES': 20, 'EDIM': 353, 'EPOCH': 17, 'LR': 1.6122026059723826}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:27:56,715] Trial 23 finished with value: 0.045627186406796603 and parameters: {'BATCH_SIZE': 75641, 'NUM_NEGATIVES': 16, 'EDIM': 219, 'EPOCH': 14, 'LR': 4.707247503337901}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:28:56,329] Trial 24 finished with value: 0.043778110944527736 and parameters: {'BATCH_SIZE': 62314, 'NUM_NEGATIVES': 18, 'EDIM': 292, 'EPOCH': 16, 'LR': 0.3605152937009567}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:29:49,394] Trial 25 finished with value: 0.04592703648175912 and parameters: {'BATCH_SIZE': 52047, 'NUM_NEGATIVES': 19, 'EDIM': 390, 'EPOCH': 12, 'LR': 1.8557138853476258}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:31:29,113] Trial 26 finished with value: 0.04552723638180909 and parameters: {'BATCH_SIZE': 90465, 'NUM_NEGATIVES': 18, 'EDIM': 435, 'EPOCH': 14, 'LR': 4.288163771919351}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:32:33,081] Trial 27 finished with value: 0.04667666166916542 and parameters: {'BATCH_SIZE': 72632, 'NUM_NEGATIVES': 16, 'EDIM': 314, 'EPOCH': 18, 'LR': 9.678289030709719}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:33:37,969] Trial 28 finished with value: 0.044427786106946525 and parameters: {'BATCH_SIZE': 78947, 'NUM_NEGATIVES': 16, 'EDIM': 216, 'EPOCH': 20, 'LR': 0.05814185680710665}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:34:39,377] Trial 29 finished with value: 0.044077961019490255 and parameters: {'BATCH_SIZE': 33790, 'NUM_NEGATIVES': 15, 'EDIM': 366, 'EPOCH': 17, 'LR': 9.539727964639411}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:35:28,203] Trial 30 finished with value: 0.003548225887056472 and parameters: {'BATCH_SIZE': 48448, 'NUM_NEGATIVES': 9, 'EDIM': 297, 'EPOCH': 18, 'LR': 0.011192463765709868}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:36:32,032] Trial 31 finished with value: 0.04602698650674662 and parameters: {'BATCH_SIZE': 70406, 'NUM_NEGATIVES': 19, 'EDIM': 326, 'EPOCH': 15, 'LR': 1.2517249290246275}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:37:06,514] Trial 32 finished with value: 0.04582708645677162 and parameters: {'BATCH_SIZE': 62957, 'NUM_NEGATIVES': 16, 'EDIM': 272, 'EPOCH': 10, 'LR': 5.284830602857131}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:38:25,553] Trial 33 finished with value: 0.04542728635682158 and parameters: {'BATCH_SIZE': 72370, 'NUM_NEGATIVES': 20, 'EDIM': 325, 'EPOCH': 18, 'LR': 2.757595234245188}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:39:31,011] Trial 34 finished with value: 0.04577711144427786 and parameters: {'BATCH_SIZE': 87053, 'NUM_NEGATIVES': 18, 'EDIM': 302, 'EPOCH': 16, 'LR': 6.189794654364056}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:40:14,033] Trial 35 finished with value: 0.044477761119440275 and parameters: {'BATCH_SIZE': 66395, 'NUM_NEGATIVES': 6, 'EDIM': 252, 'EPOCH': 19, 'LR': 2.7790446271322446}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:40:54,484] Trial 36 finished with value: 0.04557721139430285 and parameters: {'BATCH_SIZE': 77397, 'NUM_NEGATIVES': 15, 'EDIM': 207, 'EPOCH': 13, 'LR': 0.7337649042323618}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:41:23,048] Trial 37 finished with value: 0.04032983508245877 and parameters: {'BATCH_SIZE': 54086, 'NUM_NEGATIVES': 13, 'EDIM': 74, 'EPOCH': 11, 'LR': 0.17190704265715}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:43:09,539] Trial 38 finished with value: 0.0456271864067966 and parameters: {'BATCH_SIZE': 91247, 'NUM_NEGATIVES': 19, 'EDIM': 387, 'EPOCH': 15, 'LR': 1.7344355452600275}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:43:38,665] Trial 39 finished with value: 0.04472763618190904 and parameters: {'BATCH_SIZE': 68021, 'NUM_NEGATIVES': 14, 'EDIM': 349, 'EPOCH': 8, 'LR': 3.7202976994104695}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:44:52,470] Trial 40 finished with value: 0.0415792103948026 and parameters: {'BATCH_SIZE': 2778, 'NUM_NEGATIVES': 17, 'EDIM': 453, 'EPOCH': 16, 'LR': 0.2793915989030075}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:46:01,373] Trial 41 finished with value: 0.04602698650674662 and parameters: {'BATCH_SIZE': 71373, 'NUM_NEGATIVES': 19, 'EDIM': 327, 'EPOCH': 15, 'LR': 1.1925925780191788}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:47:01,255] Trial 42 finished with value: 0.045877061469265366 and parameters: {'BATCH_SIZE': 60176, 'NUM_NEGATIVES': 20, 'EDIM': 305, 'EPOCH': 14, 'LR': 7.014730182620286}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:47:54,467] Trial 43 finished with value: 0.04632683658170914 and parameters: {'BATCH_SIZE': 82542, 'NUM_NEGATIVES': 19, 'EDIM': 270, 'EPOCH': 13, 'LR': 4.058105551966542}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:48:14,142] Trial 44 finished with value: 0.03853073463268366 and parameters: {'BATCH_SIZE': 82469, 'NUM_NEGATIVES': 1, 'EDIM': 278, 'EPOCH': 12, 'LR': 3.970050788868201}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:48:58,614] Trial 45 finished with value: 0.044977511244377814 and parameters: {'BATCH_SIZE': 77244, 'NUM_NEGATIVES': 18, 'EDIM': 186, 'EPOCH': 13, 'LR': 6.363634479740375}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:49:48,711] Trial 46 finished with value: 0.03933033483258371 and parameters: {'BATCH_SIZE': 95195, 'NUM_NEGATIVES': 20, 'EDIM': 229, 'EPOCH': 12, 'LR': 9.310713900749981}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:50:18,312] Trial 47 finished with value: 0.041929035482258875 and parameters: {'BATCH_SIZE': 83062, 'NUM_NEGATIVES': 17, 'EDIM': 139, 'EPOCH': 9, 'LR': 2.432144064183068}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:50:42,542] Trial 48 finished with value: 0.03633183408295852 and parameters: {'BATCH_SIZE': 47837, 'NUM_NEGATIVES': 19, 'EDIM': 370, 'EPOCH': 5, 'LR': 3.6899910487430927}. Best is trial 12 with value: 0.047426286856571714.


Random seed set as 1


[I 2024-05-25 13:52:06,529] Trial 49 finished with value: 0.04557721139430285 and parameters: {'BATCH_SIZE': 65858, 'NUM_NEGATIVES': 18, 'EDIM': 406, 'EPOCH': 17, 'LR': 6.374981357852146}. Best is trial 12 with value: 0.047426286856571714.


In [16]:
optuna_best_params = study.best_params
optuna_best_params

{'BATCH_SIZE': 68114,
 'NUM_NEGATIVES': 18,
 'EDIM': 329,
 'EPOCH': 14,
 'LR': 5.106820625932442}

# 2. Перебираем оптимизаторы

In [17]:
def fit_model(name_trial: str, config: dict):
    """Train and evaluate a LatentFactorModel for one explicit configuration.

    Args:
        name_trial: MLflow run name; also used in the weights file name.
        config: dict with the keys ``BATCH_SIZE``, ``NUM_NEGATIVES``, ``EDIM``,
            ``EPOCH``, ``LR`` and ``optimizer``. ``optimizer`` is a class that
            is called as ``optimizer(params, LR)``.

    Side effects: writes ``model_weights_<name_trial>.pt`` to the working
    directory and logs parameters and metrics to a new MLflow run.

    Returns:
        ``(hitrate@30, precision@30)`` on the test users.
    """
    BATCH_SIZE = config['BATCH_SIZE']
    NUM_NEGATIVES = config['NUM_NEGATIVES']
    EDIM = config['EDIM']
    EPOCH = config['EPOCH']
    OPTIMIZER_NAME = config['optimizer'].__name__
    LR = config['LR']

    # Largest item index, computed once instead of rescanning the column
    # inside the collate lambda on every batch.
    max_item_index = max(df['node_index'].values)

    train_dataset = RecDataset(df_train['user_index'].values, df_train['node_index'], user2seen)
    dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        num_workers=0,
        batch_size=BATCH_SIZE,
        collate_fn=lambda x: collate_fn(x, NUM_NEGATIVES, max_item_index),
    )

    set_seed(seed=RANDOM_STATE)  # fix the seed, otherwise runs are not reproducible
    model = LatentFactorModel(EDIM, user_indes, node_indes)
    optimizer = config['optimizer'](model.parameters(), LR)

    for _ in range(EPOCH):
        for users, items, labels in dataloader:
            optimizer.zero_grad()
            logits = model(users, items)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(
                logits, labels
            )
            loss.backward()
            optimizer.step()

    hitrate, prec = get_metrics(model, K=30)

    path_to_save = f'model_weights_{name_trial}.pt'
    torch.save(model.state_dict(), path_to_save)
    # Log the result to MLflow.
    with mlflow.start_run(run_name=name_trial):
        mlflow.log_metrics(
            {
                'hitrate-at-30': hitrate,
                'precision-at-30': prec
            },
        )
        mlflow.log_params(
            {
                'BATCH_SIZE': BATCH_SIZE,
                'NUM_NEGATIVES': NUM_NEGATIVES,
                'EDIM': EDIM,
                'EPOCH': EPOCH,
                'OPTIMIZER_NAME': OPTIMIZER_NAME,
                # Fixed: the epoch count used to be logged under 'LR'.
                'LR': LR,
            }
        )
        # The weights are not uploaded as an artifact: log_artifact did not
        # work with large files on this tracking server.
    return hitrate, prec

In [18]:
from torch.optim import AdamW, Adagrad, RMSprop, SGD, ASGD

# Adam is not repeated here: the Optuna trials were already trained with it.
# Every other optimizer reuses the best Optuna hyperparameters unchanged
# (including the learning rate that was tuned for Adam).
for optimizer in [AdamW, Adagrad, RMSprop, SGD, ASGD]:
    hitrate, prec = fit_model(
        name_trial=f'optimizer: {optimizer.__name__}',
        config={**optuna_best_params, 'optimizer': optimizer}
    )
    print(f'{optimizer.__name__}:\n\thitrate: {hitrate}\n\tprecision: {prec}', end='\n\n')

Random seed set as 1
AdamW:
	hitrate: 0.7301349325337332
	precision: 0.04532733633183408

Random seed set as 1
Adagrad:
	hitrate: 0.7226386806596702
	precision: 0.04532733633183408

Random seed set as 1
RMSprop:
	hitrate: 0.7346326836581709
	precision: 0.04672663668165917

Random seed set as 1
SGD:
	hitrate: 0.037481259370314844
	precision: 0.0012493753123438282

Random seed set as 1
ASGD:
	hitrate: 0.037481259370314844
	precision: 0.0012493753123438282



# 3. Выбор лучшей модели

Среди всех моделей лучшей оказалась модель с 12-й итерации оптуны с `precision@30: 0.0474`. Ещё раз соберём её.

In [19]:
# Retrain the best Optuna configuration at notebook level (same steps as in
# objective, without the MLflow logging and without saving the weights).
BATCH_SIZE = optuna_best_params['BATCH_SIZE']
NUM_NEGATIVES = optuna_best_params['NUM_NEGATIVES']
EDIM = optuna_best_params['EDIM']
EPOCH = optuna_best_params['EPOCH']
OPTIMIZER_NAME = 'Adam'
LR = optuna_best_params['LR']

train_dataset = RecDataset(df_train['user_index'].values, df_train['node_index'], user2seen)
# The third collate_fn argument is the largest item index (inclusive upper
# bound of the negative sampler).
dataloader = DataLoader(train_dataset, shuffle=True,num_workers=0, batch_size=BATCH_SIZE,collate_fn=lambda x: collate_fn(x, NUM_NEGATIVES, max(df['node_index'].values)))

set_seed(seed=RANDOM_STATE) # fix the seed (otherwise the run is not reproducible)
model = LatentFactorModel(EDIM, user_indes, node_indes)
optimizer = torch.optim.Adam(model.parameters(), LR)

# NOTE: the inner loop reuses the name `i`, shadowing the epoch counter; this
# is harmless because range() drives the outer loop, but `i` ends up holding
# the last batch.
for i in range(EPOCH):
    losses = []
    for i in dataloader:
        users, items, labels = i
        optimizer.zero_grad()
        logits = model(users, items)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits, labels
        )
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

# hitrate@30 and precision@30 on the test users.
hitrate, prec = get_metrics(model, K=30)

print(hitrate, prec)

Random seed set as 1
0.7421289355322339 0.047426286856571714


In [20]:
# mlflow.log_artifact( # не работает с большими файлами
#     '/Users/max/Desktop/aaa-versioning-hw/model_weights_12.pt'
# )

# 4. Пробуем другие модели

## ALS

In [21]:
from scipy.sparse import csr_matrix

N_NODES = df.loc[df['is_train'], 'node_index'].nunique()

def matrix_string(list: list):
    """Return a dense 0/1 row of length ``N_NODES`` with ones at the given
    item positions.

    NOTE: the parameter name shadows the builtin ``list``; it is kept so that
    existing callers keep working.
    """
    indicator_row = [0 for _ in range(N_NODES)]
    for position in map(int, list):
        indicator_row[position] = 1
    return indicator_row

# One dense 0/1 row per train user (groupby orders the groups by user_index),
# then converted to a CSR user-item matrix.
# NOTE(review): row position equals user_index only if every user_index has
# at least one train row — the cold-start filter earlier appears to guarantee
# this; confirm.
train_user_item_data_list = df.loc[df['is_train'], ['user_index', 'node_index']].groupby('user_index').apply(lambda x: matrix_string(x['node_index'])).tolist()
train_user_item_data = csr_matrix(train_user_item_data_list)

  train_user_item_data_list = df.loc[df['is_train'], ['user_index', 'node_index']].groupby('user_index').apply(lambda x: matrix_string(x['node_index'])).tolist()


In [22]:
def calc_hitrate(df_preds, K):
    """Hit rate at K: share of users with at least one relevant item among
    predictions whose ``rank`` is below ``K``.

    ``df_preds`` needs the columns ``user_index``, ``rank`` and ``relevant``.
    The leftover debug ``print`` of the groupby object was removed; it only
    emitted an object repr into the cell output.
    """
    top_k = df_preds[df_preds['rank'] < K]
    return top_k.groupby('user_index')['relevant'].max().mean()

def calc_prec(df_preds, K):
    """Precision at K: per-user mean of ``relevant`` over predictions whose
    ``rank`` is below ``K``, averaged over users."""
    kept = df_preds.loc[df_preds['rank'] < K]
    per_user_precision = kept.groupby('user_index')['relevant'].mean()
    return per_user_precision.mean()

In [23]:
import implicit

# Number of recommendations per user used for the metrics below.
K = 30

# initialize a model
# NOTE(review): num_threads=0 presumably lets implicit pick the thread count — confirm.
model = implicit.als.AlternatingLeastSquares(factors=16, iterations=20, num_threads=0, random_state=RANDOM_STATE)
# train the model on a sparse matrix of user/item/confidence weights
model.fit(train_user_item_data)

test_users = df_test['user_index'].unique()

# Only element [0] of the recommend() result is used, as the recommended item
# indices. NOTE(review): presumably recommend() returns (ids, scores) and
# filters already-liked items by default — confirm against the implicit docs.
preds = model.recommend(test_users, train_user_item_data[test_users].tocsr(), N=K)[0]
# Same long-format evaluation frame as in get_metrics: one row per
# (user, rank) with the recommended item.
df_preds = pd.DataFrame({'node_index': list(preds), 'user_index': test_users, 'rank': [[j for j in range(0, K)]for i in range(len(preds))]})

df_preds = df_preds.explode(['node_index', 'rank']).merge(
    df_test[['user_index', 'node_index']].assign(relevant=1).drop_duplicates(),
    on = ['user_index', 'node_index'],
    how='left'
)
# Pairs missing from the test interactions come out of the left join as NaN.
df_preds['relevant'] = df_preds['relevant'].fillna(0)

hitrate = calc_hitrate(df_preds, K)
prec = calc_prec(df_preds, K)

# Log under the same metric names as the latent factor runs so the models
# can be compared in MLflow.
with mlflow.start_run(run_name='AlternatingLeastSquares'):
    mlflow.log_metrics(
        {
            'hitrate-at-30': hitrate, 
            'precision-at-30': prec
        },
    )
    mlflow.log_params(
        {
            'factors': model.factors,
            'iterations': model.iterations,
            'num_threads': model.num_threads,
            'random_state': model.random_state
        }
    )

print(hitrate, prec)

  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x2ac909c10>
0.24587706146926536 0.009545227386306848


## SASREC

In [24]:
# по итогу не смог разобраться в доке библиотеки :pepe-clown:

from recommenders.datasets.amazon_reviews import get_review_data
from recommenders.datasets.split_utils import filter_k_core
from recommenders.models.sasrec.model import SASREC
from recommenders.models.sasrec.ssept import SSEPT
from recommenders.models.sasrec.sampler import WarpSampler
from recommenders.models.sasrec.util import SASRecDataSet
from recommenders.utils.notebook_utils import store_metadata
from recommenders.utils.timer import Timer

In [25]:
num_epochs = 5
batch_size = 128
seed = 100  # Set None for non-deterministic result

lr = 0.001             # learning rate
maxlen = 200            # maximum sequence length for each user
num_blocks = 2         # number of transformer blocks
hidden_units = 100     # number of units in the attention calculation
num_heads = 1          # number of attention heads
dropout_rate = 0.1     # dropout rate
l2_emb = 0.0           # L2 regularization coefficient
num_neg_test = 100     # number of negative examples per positive example

In [26]:
# Three-way time split for SASRec.
df['is_train'] = df['event_date']< df['event_date'].max() - pd.Timedelta('2 day') # same train cut-off as before
# Validation: the part of that train window that falls within the last 3 days
# before the latest event.
df['is_val'] = (df['event_date'] >= df['event_date'].max() - pd.Timedelta('3 day')) & df['is_train']
# Remove the validation rows from train; whatever is in neither set is test.
df['is_train'] = df['is_train'] & ~df['is_val']
df['is_test'] = (~df['is_train']) & (~df['is_val'])

# Sanity check: the three flags cover every row; then print the size of each part.
print(df['is_train'].sum() + df['is_val'].sum() + df['is_test'].sum() == df.shape[0])
print(df['is_train'].sum(), df['is_val'].sum(), df['is_test'].sum())

True
95084 1527 3333


In [27]:
user_items = dict(df.groupby('user_index')['node_index'].apply(list))

train_user_items = dict(df[df['is_train']].groupby('user_index')['node_index'].apply(list))
val_user_items = dict(df[df['is_val']].groupby('user_index')['node_index'].apply(list))
test_user_items = dict(df[df['is_test']].groupby('user_index')['node_index'].apply(list))

In [28]:
# Initiate a dataset class.
# An earlier attempt read './train_data.csv' instead:
# train_data = SASRecDataSet(filename='./train_data.csv', col_sep="\t")
# NOTE(review): usernum/itemnum are passed as arrays of unique ids, while the
# statistics below print them with %g as counts — presumably split() rebuilds
# these attributes from the file; confirm against the recommenders docs.
# NOTE(review): './full_data.csv' is not created anywhere in the visible part
# of the notebook — confirm that it exists and matches df.
train_data = SASRecDataSet(
    filename='./full_data.csv',
    col_sep='\t',
    usernum=df['user_index'].unique(),
    itemnum=df['node_index'].unique(),
    User=user_items,
    Items=set(df['node_index']),
    user_train=train_user_items,
    user_valid=val_user_items,
    user_test=test_user_items
)

# create train, validation and test splits
train_data.split()

# some statistics
# Integer number of batches per pass over the training users.
num_steps = int(len(train_data.user_train) / batch_size)
# cc accumulates the total number of training interactions over all users.
cc = 0.0
for u in train_data.user_train:
    cc += len(train_data.user_train[u])
print('%g Users and %g items' % (train_data.usernum, train_data.itemnum))
print('average sequence length: %.2f' % (cc / len(train_data.user_train)))

1663 Users and 2175 items
average sequence length: 58.28


In [29]:
sampler = WarpSampler(train_data.user_train, train_data.usernum, train_data.itemnum, batch_size=batch_size, maxlen=maxlen, n_workers=3)

In [30]:
model = SASREC(item_num=train_data.itemnum,
                seq_max_len=maxlen,
                num_blocks=num_blocks,
                embedding_dim=hidden_units,
                attention_dim=hidden_units,
                attention_num_heads=num_heads,
                dropout_rate=dropout_rate,
                conv_dims = [100, 100],
                l2_reg=l2_emb,
                num_neg_test=num_neg_test
)

In [31]:
# NOTE(review): val_epoch (6) is larger than num_epochs (5) — presumably this
# skips intermediate evaluation and leaves only the final one; confirm against
# the recommenders SASREC.train documentation.
t_test = model.train(train_data, sampler, num_epochs=num_epochs, batch_size=batch_size, lr=lr, val_epoch=6)

                                                                      


epoch: 5, test (NDCG@30: 0.6178284833856107, HR@30: 0.9151515151515152)


