# Лаб-4. Рекомендации для коротких сессий

In [19]:
import torch
from torch import nn

import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

In [20]:
IS_CUDA_USED = False
device = "cuda" if torch.cuda.is_available() and IS_CUDA_USED else "cpu"
print(f'Device: {device}')

Device: cpu


In [21]:
EMBEDDING_SIZE = 64

HIDDEN_SIZE = 64

ITEM_SIZE = 15316

class GRU4Rec(nn.Module):
    def __init__(self):
        super().__init__()

        embedding_size = EMBEDDING_SIZE
        self.hidden_size = HIDDEN_SIZE
        item_size = ITEM_SIZE
        
        self.num_layers = 1
        self.state = torch.zeros([self.num_layers, batch_size, self.hidden_size])
        self.embedding = nn.Embedding(item_size, embedding_size)
        self.gru = nn.GRU(embedding_size, self.hidden_size, num_layers=self.num_layers, batch_first=True)
        self.output_layer = nn.Linear(self.hidden_size, item_size)
        self.dropout = nn.Dropout(0.5)

    # Перегрузка to чтобы состояние тоже перевести на девайс
    def to(self, device):
        self.state = self.state.to(device)
        return super().to(device)

    # Обнуляем состояние для новых сессий
    def update_state(self, mask=None):
        self.state.detach_()
        if mask is None:
            self.state = torch.zeros(
                self.num_layers, batch_size, self.hidden_size, device=device
            )
        else:
            self.state[:, mask, :] = 0

    def forward(self, input):
        self.update_state(mask=None)
        v = input.unsqueeze(1)
        v = self.embedding(v)
        v, self.state = self.gru(v, self.state) # (batch_size, 1, hidden_size)
        hidden = v.squeeze(1) # (batch_size, hidden_size)
        v = self.dropout(hidden)
        v = self.output_layer(v)
        return v

In [22]:
# Тренировка происходит и тестирование

def train_iteration(model, data_loader, loss_function, optimizer):
    model.train()

    for batch, (x, y, m) in enumerate(data_loader):
        x, y = x.to(device), y.to(device)
        # Не забываем обнулить состояние
        model.update_state(m)

        pred = model(x)
        loss = loss_function(pred, y)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        if batch % 1000 == 0:
            loss, current = loss.item(), (batch + 1) * len(x)
            print(f"loss: {loss:>7f}  [{current:>5d}]")

def test(model, data_loader, loss_function):
    model.eval()

    loss, correct, count = 0, 0 ,0
    with torch.no_grad():
        for x, y, m in data_loader:
            count += 1
            x, y = x.to(device), y.to(device)
            model.update_state(m)
            pred = model(x)
            loss += loss_function(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    loss = loss / count
    correct /= count * batch_size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {loss:>8f} \n")
    pass


def train(epochs, model, loss_function, optimizer):
    for t in tqdm(range(epochs)):
        print(f"== Epoch {t + 1} ==")
        train_iteration(model, train_loader, loss_function, optimizer)
        test(model, test_loader, loss_function)

In [23]:
class MovieLensDatasetRaw:
    def __init__(self, path):
        ratings = pd.read_csv(rf'{path}/ratings.csv')

        self.train_data = ratings.sample(frac=0.8, random_state=1)
        self.test_data  = ratings.drop(self.train_data.index)
        self.train_data.reset_index(drop=True, inplace=True)
        self.test_data.reset_index(drop=True, inplace=True)

        all_data = pd.concat([self.train_data, self.test_data])
        unique_items = all_data['movieId'].unique()
        item_to_idx = pd.Series(data=np.arange(len(unique_items)), index=unique_items)
        item_map = pd.DataFrame({'movieId': unique_items, 'movieIndex': item_to_idx[unique_items].values})
        self.train_data = pd.merge(self.train_data, item_map, on='movieId', how='inner')
        self.test_data  = pd.merge(self.test_data,  item_map, on='movieId', how='inner')

        # Сортируем датасет так, чтобы все сессии оказались рядом, а клики внутри сессии упорядочились по времени
        self.train_data.sort_values(['userId', 'timestamp'], inplace=True)
        self.test_data.sort_values(['userId', 'timestamp'], inplace=True)

dataset = MovieLensDatasetRaw("MovieLens")

In [24]:
dataset.train_data

Unnamed: 0,userId,movieId,rating,timestamp,movieIndex
76507,1,804,4.0,964980499,4045
15408,1,2826,4.0,964980523,1915
18151,1,2628,4.0,964980523,383
46045,1,3578,5.0,964980668,302
64921,1,3617,4.0,964980683,1085
...,...,...,...,...,...
15699,610,101739,3.5,1495959269,4584
80395,610,70,4.0,1495959282,887
48900,610,328,3.5,1495959299,222
30316,610,2459,3.5,1495959405,2238


In [25]:
print(
    'Количество уникальных фильмов',
    pd.concat([dataset.train_data, dataset.test_data])['movieId'].nunique(),
    '=',
    pd.concat([dataset.train_data, dataset.test_data])['movieIndex'].max() + 1
)

Количество уникальных фильмов 9724 = 9724


In [26]:
class MovieLensLoader():
    def __init__(self, data, batch_size, shuffle=False):
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.session_count = data['userId'].nunique()

        # Делаем массив с индексами начала и конца каждой сессии
        session_sizes = np.array(data.groupby('userId').size().cumsum())
        self.offsets = np.append([0], session_sizes)

    def __iter__(self):
        session_order = np.arange(self.session_count)
        if self.shuffle:
            np.random.shuffle(session_order)

        # Заводим список активных сессий, размером с батч
        active_sessions = np.arange(self.batch_size)
        next_session = self.batch_size # индекс следующей сессии
        start = self.offsets[session_order[active_sessions]]   # индексы начал активных сессий
        end = self.offsets[session_order[active_sessions] + 1] # индексы концов активных сессий

        closed_mask = list(active_sessions) # список сессий, которые открываются на текущей итерации
        while True:
            min_len = (end - start).min() # Количество итераций, которые мы можем пройти, пока не закончится какая-то сессия
            idx_target = self.data['movieIndex'].values[start]

            # Итерируем по сессиям до тех пор, пока какая-то не закончится
            for i in range(min_len - 1):
                idx_input = idx_target
                idx_target = self.data['movieIndex'].values[start + i + 1]
                input = torch.LongTensor(idx_input)
                target = torch.LongTensor(idx_target)
                yield input, target, closed_mask # маску мы будем использовать чтобы обнулять новые сессии
                closed_mask = []

            start = start + (min_len - 1)

            # Пробегаемся по сессиям, которые должны быть завершены
            closed_mask = np.arange(len(active_sessions))[(end - start) <= 1]
            for idx in closed_mask:
                # Если новых сессий нет, просто завершаемся
                if next_session >= len(self.offsets) - 1:
                    return
                # Обновляем значения для новой сессии
                active_sessions[idx] = next_session
                start[idx] = self.offsets[session_order[next_session]]
                end[idx]   = self.offsets[session_order[next_session] + 1]
                next_session += 1

batch_size = 10

train_loader = MovieLensLoader(dataset.train_data, batch_size, shuffle=True)
test_loader  = MovieLensLoader(dataset.test_data, batch_size)

In [27]:

LEARNING_RATE = 0.001
    
model = GRU4Rec().to(device)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    
train(10, model, loss, optimizer)

  0%|          | 0/10 [00:00<?, ?it/s]

== Epoch 1 ==
loss: 9.260013  [   10]
loss: 7.897375  [10010]
loss: 8.394053  [20010]
loss: 8.659293  [30010]
loss: 8.703279  [40010]
loss: 8.545830  [50010]
loss: 8.346582  [60010]
loss: 7.714411  [70010]


 10%|█         | 1/10 [00:23<03:27, 23.02s/it]

Test Error: 
 Accuracy: 0.3%, Avg loss: 8.233953 

== Epoch 2 ==
loss: 6.773754  [   10]
loss: 8.013899  [10010]
loss: 8.274556  [20010]
loss: 7.744389  [30010]
loss: 8.497782  [40010]
loss: 8.191710  [50010]
loss: 7.830641  [60010]
loss: 8.172770  [70010]


 20%|██        | 2/10 [00:45<03:01, 22.70s/it]

Test Error: 
 Accuracy: 0.4%, Avg loss: 8.290567 

== Epoch 3 ==
loss: 6.524554  [   10]
loss: 7.892291  [10010]
loss: 8.287162  [20010]
loss: 8.018817  [30010]
loss: 7.835461  [40010]
loss: 7.845201  [50010]
loss: 8.683865  [60010]
loss: 7.685362  [70010]


 30%|███       | 3/10 [01:08<02:39, 22.81s/it]

Test Error: 
 Accuracy: 0.4%, Avg loss: 8.368523 

== Epoch 4 ==
loss: 7.443153  [   10]
loss: 8.470109  [10010]
loss: 8.630199  [20010]
loss: 8.868936  [30010]
loss: 7.242331  [40010]
loss: 7.936667  [50010]
loss: 8.287271  [60010]
loss: 7.318661  [70010]


 40%|████      | 4/10 [01:29<02:13, 22.21s/it]

Test Error: 
 Accuracy: 0.5%, Avg loss: 8.384969 

== Epoch 5 ==
loss: 6.068664  [   10]
loss: 7.029361  [10010]
loss: 8.007060  [20010]
loss: 8.840477  [30010]
loss: 8.329036  [40010]
loss: 8.036967  [50010]
loss: 7.782338  [60010]
loss: 7.679163  [70010]


 50%|█████     | 5/10 [01:51<01:49, 21.99s/it]

Test Error: 
 Accuracy: 0.4%, Avg loss: 8.412182 

== Epoch 6 ==
loss: 6.683441  [   10]
loss: 6.529668  [10010]
loss: 7.544769  [20010]
loss: 7.999526  [30010]
loss: 8.679219  [40010]
loss: 8.722351  [50010]
loss: 8.048471  [60010]
loss: 8.301979  [70010]


 60%|██████    | 6/10 [02:12<01:27, 21.77s/it]

Test Error: 
 Accuracy: 0.4%, Avg loss: 8.419328 

== Epoch 7 ==
loss: 5.798939  [   10]
loss: 7.791919  [10010]
loss: 6.831003  [20010]
loss: 7.163840  [30010]
loss: 8.136454  [40010]
loss: 8.264112  [50010]
loss: 8.152093  [60010]
loss: 7.886177  [70010]


 70%|███████   | 7/10 [02:34<01:04, 21.64s/it]

Test Error: 
 Accuracy: 0.5%, Avg loss: 8.464479 

== Epoch 8 ==
loss: 4.729800  [   10]
loss: 8.023566  [10010]
loss: 7.901656  [20010]
loss: 7.061580  [30010]
loss: 8.733491  [40010]
loss: 7.416753  [50010]
loss: 7.926496  [60010]
loss: 8.002831  [70010]


 80%|████████  | 8/10 [02:55<00:43, 21.63s/it]

Test Error: 
 Accuracy: 0.6%, Avg loss: 8.485779 

== Epoch 9 ==
loss: 5.711685  [   10]
loss: 7.916479  [10010]
loss: 6.480213  [20010]
loss: 8.370117  [30010]
loss: 7.667938  [40010]
loss: 7.350478  [50010]
loss: 7.565291  [60010]
loss: 9.156727  [70010]


 90%|█████████ | 9/10 [03:17<00:21, 21.76s/it]

Test Error: 
 Accuracy: 0.5%, Avg loss: 8.509403 

== Epoch 10 ==
loss: 6.297974  [   10]
loss: 7.222162  [10010]
loss: 6.498957  [20010]
loss: 7.230359  [30010]
loss: 7.261389  [40010]
loss: 8.384476  [50010]
loss: 7.459121  [60010]
loss: 8.376918  [70010]


100%|██████████| 10/10 [03:38<00:00, 21.86s/it]

Test Error: 
 Accuracy: 0.5%, Avg loss: 8.537762 




