# Лаб-4. Рекомендации для коротких сессий

In [139]:
import torch
from torch import nn

import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

In [140]:
IS_CUDA_USED = False
device = "cuda" if torch.cuda.is_available() and IS_CUDA_USED else "cpu"
print(f'Device: {device}')

Device: cpu


In [141]:
# Как и в предыдущей лабораторной пишем собственный загрузчик датасета
class ECommerceDataset:
    def __init__(self, path):
        self.train_data = pd.read_csv(rf"{path}/train_data.csv")
        self.test_data = pd.read_csv(rf"{path}/test_data.csv")

        # Добавляем колонку с идентификаторами товаров (для эмбедингов)
        all_data = pd.concat([self.train_data, self.test_data])
        unique_items = all_data['product_id'].unique()
        item_to_idx = pd.Series(data=np.arange(len(unique_items)), index=unique_items)
        item_map = pd.DataFrame({'product_id': unique_items, 'product_index': item_to_idx[unique_items].values})
        self.train_data = pd.merge(self.train_data, item_map, on='product_id', how='inner')
        self.test_data  = pd.merge(self.test_data,  item_map, on='product_id', how='inner')

        # Сортируем датасет так, чтобы все сессии оказались рядом, а клики внутри сессии упорядочились по времени
        self.train_data.sort_values(['user_session', 'event_time'], inplace=True)
        self.test_data.sort_values(['user_session', 'event_time'], inplace=True)

# Загрузка большого датасета может занять некоторое время
dataset = ECommerceDataset('./eCommerce')

In [142]:
dataset.train_data

Unnamed: 0,event_time,product_id,user_session,product_index
32678,1604329884,80548,003pEktS1X,4865
34407,1607580196,630753,00ImhDtWxv,4292
21963,1607165660,387956,00xjwy5Rb6,8
31665,1607168978,387956,00xjwy5Rb6,8
23220,1611391773,738,00zEpCxZUK,1478
...,...,...,...,...
14766,1613148682,93765,zzaAzAFcYL,3193
15086,1613148695,93765,zzaAzAFcYL,3193
32226,1613408761,564777,zzveLpjyyb,1226
25067,1613409009,564777,zzveLpjyyb,1226


In [143]:
print(
    'Количество уникальных товаров',
    pd.concat([dataset.train_data, dataset.test_data])['product_id'].nunique(),
    '=',
    pd.concat([dataset.train_data, dataset.test_data])['product_index'].max() + 1
)

Количество уникальных товаров 15316 = 15316


In [144]:
class ECommerceLoader():
    def __init__(self, data, batch_size, shuffle=False):
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.session_count = data['user_session'].nunique()

        # Делаем массив с индексами начала и конца каждой сессии
        session_sizes = np.array(data.groupby('user_session').size().cumsum())
        self.offsets = np.append([0], session_sizes)

    def __iter__(self):
        session_order = np.arange(self.session_count)
        if self.shuffle:
            np.random.shuffle(session_order)

        # Заводим список активных сессий, размером с батч
        active_sessions = np.arange(self.batch_size)
        next_session = self.batch_size # индекс следующей сессии
        start = self.offsets[session_order[active_sessions]]   # индексы начал активных сессий
        end = self.offsets[session_order[active_sessions] + 1] # индексы концов активных сессий

        closed_mask = list(active_sessions) # список сессий, которые открываются на текущей итерации
    
        while True:
            min_len = (end - start).min() # Количество итераций, которые мы можем пройти, пока не закончится какая-то сессия
            idx_target = self.data['product_index'].values[start]

            # Итерируем по сессиям до тех пор, пока какая-то не закончится
            for i in range(min_len - 1):
                idx_input = idx_target
                idx_target = self.data['product_index'].values[start + i + 1]
                input = torch.LongTensor(idx_input)
                target = torch.LongTensor(idx_target)
                yield input, target, closed_mask # маску мы будем использовать чтобы обнулять новые сессии
                closed_mask = []

            start = start + (min_len - 1)

            # Пробегаемся по сессиям, которые должны быть завершены
            closed_mask = np.arange(len(active_sessions))[(end - start) <= 1]
            for idx in closed_mask:
                # Если новых сессий нет, просто завершаемся
                if next_session >= len(self.offsets) - 1:
                    return
                # Обновляем значения для новой сессии
                active_sessions[idx] = next_session
                start[idx] = self.offsets[session_order[next_session]]
                end[idx]   = self.offsets[session_order[next_session] + 1]
                next_session += 1

batch_size = 10

train_loader = ECommerceLoader(dataset.train_data, batch_size, shuffle=True)
test_loader  = ECommerceLoader(dataset.test_data, batch_size)

In [145]:
EMBEDDING_SIZE = 64

HIDDEN_SIZE = 64

ITEM_SIZE = 15316

class GRU4Rec(nn.Module):
    def __init__(self):
        super().__init__()

        embedding_size = EMBEDDING_SIZE
        self.hidden_size = HIDDEN_SIZE
        item_size = ITEM_SIZE
        
        self.num_layers = 1
        self.state = torch.zeros([self.num_layers, batch_size, self.hidden_size])
        self.embedding = nn.Embedding(item_size, embedding_size)
        self.gru = nn.GRU(embedding_size, self.hidden_size, num_layers=self.num_layers, batch_first=True)
        self.output_layer = nn.Linear(self.hidden_size, item_size)
        self.dropout = nn.Dropout(0.5)

    # Перегрузка to чтобы состояние тоже перевести на девайс
    def to(self, device):
        self.state = self.state.to(device)
        return super().to(device)

    # Обнуляем состояние для новых сессий
    def update_state(self, mask=None):
        self.state.detach_()
        if mask is None:
            # self.state = torch.zeros(
            #     self.num_layers, batch_size, self.hidden_size, device=device
            # )
            self.state[:, :, :] = 0
        else:
            self.state[:, mask, :] = 0

    def forward(self, input):
        self.update_state(mask=None)
        v = input.unsqueeze(1)
        v = self.embedding(v)
        v, self.state = self.gru(v, self.state) # (batch_size, 1, hidden_size)
        hidden = v.squeeze(1) # (batch_size, hidden_size)
        v = self.dropout(hidden)
        v = self.output_layer(v)
        return v

In [146]:
# Тренировка происходит и тестирование

def train_iteration(model, data_loader, loss_function, optimizer):
    model.train()

    for batch, (x, y, m) in enumerate(data_loader):
        x, y = x.to(device), y.to(device)
        # Не забываем обнулить состояние
        model.update_state(m)

        pred = model(x)
        loss = loss_function(pred, y)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        if batch % 1000 == 0:
            loss, current = loss.item(), (batch + 1) * len(x)
            print(f"loss: {loss:>7f}  [{current:>5d}]")

def test(model, data_loader, loss_function):
    model.eval()

    loss, correct, count = 0, 0 ,0
    with torch.no_grad():
        for x, y, m in data_loader:
            count += 1
            x, y = x.to(device), y.to(device)
            model.update_state(m)
            pred = model(x)
            loss += loss_function(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    loss = loss / count
    correct /= count * batch_size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {loss:>8f} \n")
    pass


def train(epochs, model, loss_function, optimizer):
    for t in tqdm(range(epochs)):
        print(f"== Epoch {t + 1} ==")
        train_iteration(model, train_loader, loss_function, optimizer)
        test(model, test_loader, loss_function)

In [147]:
class BPRLoss(nn.Module):
    def __init__(self, default_num_negatives=10):
        super(BPRLoss, self).__init__()
        self.default_num_negatives = default_num_negatives
        self.num_negatives = default_num_negatives

    def forward(self, predictions, ground_truth):
        """
        predictions: [B, N] - predicted scores for all items
        ground_truth: [B] - indices of ground truth items for all sessions in a batch
        """
        batch_size, num_items = predictions.size()

        self.num_negatives = num_items - 1 if USE_ALL_NEGATIVES else self.default_num_negatives
        
        positive_scores = predictions[torch.arange(batch_size), ground_truth]

        # negatives = torch.randint(0, num_items, (batch_size, self.num_negatives), device=predictions.device)
        # negative_scores = predictions.gather(1, negatives)  # [B, num_negatives]
        negative_indices = torch.arange(num_items, device=predictions.device).repeat(batch_size, 1) # [0, 1, ..., num_items - 1] x batch_size -> [batch_size, num_items]
        negative_indices.scatter_(1, ground_truth.unsqueeze(1), -1)  # Mask positives with -1
        negatives = negative_indices[negative_indices != -1].view(batch_size, -1)  # Filter out positives
        sampled_negatives = negatives[:, torch.randint(0, negatives.size(1), (self.num_negatives,), device=predictions.device)]
        negative_scores = predictions.gather(1, sampled_negatives)  # [B, num_negatives]

        diff = positive_scores.unsqueeze(1) - negative_scores  # [B, num_negatives]
        loss = -torch.mean(torch.log(torch.sigmoid(diff)))
        return loss

In [148]:
class TOP1Loss(nn.Module):
    def __init__(self, default_num_negatives=10):
        super(TOP1Loss, self).__init__()
        self.default_num_negatives = default_num_negatives
        self.num_negatives = default_num_negatives

    def forward(self, predictions, ground_truth):
        """
        predictions: [B, N] - predicted scores for all items
        ground_truth: [B] - indices of ground truth items
        """
        batch_size, num_items = predictions.size()

        self.num_negatives = num_items - 1 if USE_ALL_NEGATIVES else self.default_num_negatives

        positive_scores = predictions[torch.arange(batch_size), ground_truth]

        # negatives = torch.randint(0, num_items, (batch_size, self.num_negatives), device=predictions.device)
        # negative_scores = predictions.gather(1, negatives)  # [B, num_negatives]
        negative_indices = torch.arange(num_items, device=predictions.device).repeat(batch_size, 1) # [0, 1, ..., num_items - 1] x batch_size -> [batch_size, num_items]
        negative_indices.scatter_(1, ground_truth.unsqueeze(1), -1)  # Mask positives with -1
        negatives = negative_indices[negative_indices != -1].view(batch_size, -1)  # Filter out positives
        sampled_negatives = negatives[:, torch.randint(0, negatives.size(1), (self.num_negatives,), device=predictions.device)]
        negative_scores = predictions.gather(1, sampled_negatives)  # [B, num_negatives]
        
        rank_term = torch.sigmoid(negative_scores - positive_scores.unsqueeze(1))
        regularization_term = torch.sigmoid(negative_scores**2)

        loss = torch.mean(rank_term + regularization_term)
        return loss

In [149]:
LOSS_FUNCTIONS = [
    nn.CrossEntropyLoss,
    BPRLoss,
    TOP1Loss,
]

LOSS_FUNCTION_SAMPLE_ENABLED = [1, 1, 1]

LEARNING_RATE = 0.01

EPOCHS_COUNT = 10

USE_ALL_NEGATIVES = True

def launch_loss_example(loss_function_idx):
    if not LOSS_FUNCTION_SAMPLE_ENABLED[loss_function_idx]:
        return
    
    loss = LOSS_FUNCTIONS[loss_function_idx]()
    print(loss)
    
    model = GRU4Rec().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    
    train(EPOCHS_COUNT, model, loss, optimizer)

In [150]:
launch_loss_example(0)

CrossEntropyLoss()


  0%|          | 0/10 [00:00<?, ?it/s]

== Epoch 1 ==
loss: 9.591864  [   10]
loss: 6.328127  [10010]


 10%|█         | 1/10 [00:13<02:04, 13.80s/it]

Test Error: 
 Accuracy: 3.2%, Avg loss: 11.856998 

== Epoch 2 ==
loss: 4.491426  [   10]
loss: 4.297050  [10010]


 20%|██        | 2/10 [00:27<01:52, 14.00s/it]

Test Error: 
 Accuracy: 3.5%, Avg loss: 13.022250 

== Epoch 3 ==
loss: 4.433193  [   10]
loss: 3.041124  [10010]


 30%|███       | 3/10 [00:41<01:37, 13.91s/it]

Test Error: 
 Accuracy: 3.5%, Avg loss: 13.884212 

== Epoch 4 ==
loss: 2.473818  [   10]
loss: 1.507738  [10010]


 40%|████      | 4/10 [00:54<01:21, 13.63s/it]

Test Error: 
 Accuracy: 3.7%, Avg loss: 14.632884 

== Epoch 5 ==
loss: 3.302671  [   10]
loss: 2.400944  [10010]


 50%|█████     | 5/10 [01:08<01:07, 13.47s/it]

Test Error: 
 Accuracy: 3.8%, Avg loss: 15.385748 

== Epoch 6 ==
loss: 3.110605  [   10]
loss: 1.825890  [10010]


 60%|██████    | 6/10 [01:21<00:54, 13.58s/it]

Test Error: 
 Accuracy: 3.9%, Avg loss: 16.150297 

== Epoch 7 ==
loss: 3.432283  [   10]
loss: 3.249202  [10010]


 70%|███████   | 7/10 [01:35<00:40, 13.57s/it]

Test Error: 
 Accuracy: 3.9%, Avg loss: 16.839370 

== Epoch 8 ==
loss: 2.345320  [   10]
loss: 3.055256  [10010]


 80%|████████  | 8/10 [01:48<00:26, 13.50s/it]

Test Error: 
 Accuracy: 3.9%, Avg loss: 17.468257 

== Epoch 9 ==
loss: 1.991537  [   10]
loss: 2.085716  [10010]


 90%|█████████ | 9/10 [02:02<00:13, 13.44s/it]

Test Error: 
 Accuracy: 3.9%, Avg loss: 18.008459 

== Epoch 10 ==
loss: 1.632447  [   10]
loss: 1.482001  [10010]


100%|██████████| 10/10 [02:15<00:00, 13.52s/it]

Test Error: 
 Accuracy: 3.8%, Avg loss: 18.402867 






In [151]:
launch_loss_example(1)

BPRLoss()


  0%|          | 0/10 [00:00<?, ?it/s]

== Epoch 1 ==
loss: 0.704720  [   10]
loss: 0.178523  [10010]


 10%|█         | 1/10 [00:16<02:25, 16.15s/it]

Test Error: 
 Accuracy: 2.2%, Avg loss: 1.560243 

== Epoch 2 ==
loss: 0.176670  [   10]
loss: 0.057793  [10010]


 20%|██        | 2/10 [00:32<02:07, 15.99s/it]

Test Error: 
 Accuracy: 2.7%, Avg loss: 1.727433 

== Epoch 3 ==
loss: 0.021923  [   10]
loss: 0.017079  [10010]


 30%|███       | 3/10 [00:47<01:50, 15.82s/it]

Test Error: 
 Accuracy: 3.2%, Avg loss: 1.780095 

== Epoch 4 ==
loss: 0.023824  [   10]
loss: 0.049011  [10010]


 40%|████      | 4/10 [01:03<01:34, 15.71s/it]

Test Error: 
 Accuracy: 3.3%, Avg loss: 1.819956 

== Epoch 5 ==
loss: 0.019281  [   10]
loss: 0.003679  [10010]


 50%|█████     | 5/10 [01:19<01:18, 15.77s/it]

Test Error: 
 Accuracy: 3.6%, Avg loss: 1.860985 

== Epoch 6 ==
loss: 0.001717  [   10]
loss: 0.015132  [10010]


 60%|██████    | 6/10 [01:35<01:04, 16.02s/it]

Test Error: 
 Accuracy: 3.5%, Avg loss: 1.774160 

== Epoch 7 ==
loss: 0.006588  [   10]
loss: 0.009558  [10010]


 70%|███████   | 7/10 [01:51<00:47, 15.88s/it]

Test Error: 
 Accuracy: 3.5%, Avg loss: 1.730305 

== Epoch 8 ==
loss: 0.007403  [   10]
loss: 0.000499  [10010]


 80%|████████  | 8/10 [02:06<00:31, 15.79s/it]

Test Error: 
 Accuracy: 3.5%, Avg loss: 1.905481 

== Epoch 9 ==
loss: 0.005632  [   10]
loss: 0.001329  [10010]


 90%|█████████ | 9/10 [02:22<00:15, 15.74s/it]

Test Error: 
 Accuracy: 3.6%, Avg loss: 1.861403 

== Epoch 10 ==
loss: 0.007404  [   10]
loss: 0.002092  [10010]


100%|██████████| 10/10 [02:38<00:00, 15.86s/it]

Test Error: 
 Accuracy: 3.7%, Avg loss: 1.891335 






In [152]:
launch_loss_example(2)

TOP1Loss()


  0%|          | 0/10 [00:00<?, ?it/s]

== Epoch 1 ==
loss: 1.007476  [   10]
loss: 0.958120  [10010]


 10%|█         | 1/10 [00:16<02:26, 16.29s/it]

Test Error: 
 Accuracy: 1.7%, Avg loss: 1.026852 

== Epoch 2 ==
loss: 0.584650  [   10]
loss: 0.624467  [10010]


 20%|██        | 2/10 [00:32<02:08, 16.03s/it]

Test Error: 
 Accuracy: 1.6%, Avg loss: 1.043115 

== Epoch 3 ==
loss: 0.607126  [   10]
loss: 0.576760  [10010]


 30%|███       | 3/10 [00:47<01:51, 15.93s/it]

Test Error: 
 Accuracy: 1.1%, Avg loss: 1.040514 

== Epoch 4 ==
loss: 0.595452  [   10]
loss: 0.576782  [10010]


 40%|████      | 4/10 [01:05<01:40, 16.68s/it]

Test Error: 
 Accuracy: 1.6%, Avg loss: 1.037358 

== Epoch 5 ==
loss: 0.586184  [   10]
loss: 0.579469  [10010]


 50%|█████     | 5/10 [01:21<01:21, 16.37s/it]

Test Error: 
 Accuracy: 1.3%, Avg loss: 1.039829 

== Epoch 6 ==
loss: 0.574290  [   10]
loss: 0.633616  [10010]


 60%|██████    | 6/10 [01:37<01:04, 16.19s/it]

Test Error: 
 Accuracy: 1.3%, Avg loss: 1.038239 

== Epoch 7 ==
loss: 0.568078  [   10]
loss: 0.553382  [10010]


 70%|███████   | 7/10 [01:53<00:48, 16.07s/it]

Test Error: 
 Accuracy: 1.4%, Avg loss: 1.039605 

== Epoch 8 ==
loss: 0.588727  [   10]
loss: 0.596080  [10010]


 80%|████████  | 8/10 [02:09<00:32, 16.10s/it]

Test Error: 
 Accuracy: 1.4%, Avg loss: 1.037847 

== Epoch 9 ==
loss: 0.557286  [   10]
loss: 0.584364  [10010]


 90%|█████████ | 9/10 [02:25<00:16, 16.20s/it]

Test Error: 
 Accuracy: 1.4%, Avg loss: 1.036693 

== Epoch 10 ==
loss: 0.570169  [   10]
loss: 0.582326  [10010]


100%|██████████| 10/10 [02:41<00:00, 16.20s/it]

Test Error: 
 Accuracy: 1.5%, Avg loss: 1.038261 






## Задания

Основные:
- Достичь точности в 3.5% на этом датасете - 5 баллов
- На основе GRU4Rec построить модель для датасета из предыдущей лабораторной (Movielens) - 5 баллов

Дополнительные задания:
- Реализовать одну из функций потерь BPR или TOP1 (https://arxiv.org/pdf/1511.06939) - 5 баллов
- Реализовать вторую функцию потерь - 5 баллов


## Полезные ссылки

Полезные ссылки по рекомендательным системам, модели из лекции и не только

- Репозиторий с кучей информации по рекомендательным системам https://github.com/recommenders-team/recommenders
- Рекомендательные системы на основе свёрток https://arxiv.org/pdf/1809.07426
- Sequence-Aware Factorization Machines (машина факторизации для временных последовательностей) https://arxiv.org/pdf/1911.02752

