# Лаб-3. Рекомендательные системы

In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd

# Pick the compute device: the GPU when CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f'Device: {device}')

Device: cuda


In [2]:
# Для загрузки датасета напишем свою реализацию класса Dataset
class MovielensDataset(Dataset):
    """MovieLens ratings exposed as a PyTorch ``Dataset`` with an 80/20 split.

    Reads ``ratings.csv`` and ``movies.csv`` from ``source``, replaces every
    ``movieId`` in the ratings by the row index of that movie in
    ``movies.csv`` and keeps either the train part (a random 80% sample of the
    ratings) or the test part (the remaining 20%). ``userId`` is returned
    unchanged.

    Args:
        source: Directory containing ``ratings.csv`` and ``movies.csv``.
        train: ``True`` selects the 80% train part, ``False`` the 20% test part.
        seed: Random state of the split. It must be the same for the train and
            the test dataset, otherwise the two parts are not complementary.

    Attributes:
        movies: The full ``movies.csv`` table.
        ratings: The selected part of the ratings, with ``movieId`` already
            converted to row indices of ``movies``. Samples are served from
            arrays cached at construction time, so later edits to this frame
            are not reflected by ``__getitem__``.

    Raises:
        ValueError: If a rating refers to a ``movieId`` that is missing from
            ``movies.csv``.
    """

    def __init__(self, source, train=True, seed=1):
        ratings = pd.read_csv(rf"{source}/ratings.csv")
        self.movies = pd.read_csv(rf"{source}/movies.csv")

        # Translate raw movie ids into row indices of the movies table so they
        # form a compact range usable as embedding indices.
        index_by_movie_id = dict(zip(self.movies['movieId'], self.movies.index))
        movie_index = ratings['movieId'].map(index_by_movie_id)
        unmapped = movie_index.isna()
        if unmapped.any():
            # Fail early instead of letting NaN reach the index tensors.
            unknown_ids = sorted(ratings.loc[unmapped, 'movieId'].unique().tolist())
            raise ValueError(
                f"ratings.csv references movieId values missing from movies.csv: {unknown_ids[:10]}"
            )
        ratings['movieId'] = movie_index.astype('int64')

        # 80% / 20% split; the test part is whatever the train sample left out.
        train_part = ratings.sample(frac=0.8, random_state=seed)
        self.ratings = train_part if train else ratings.drop(train_part.index)

        # Cache typed columns once: per-row ``.iloc`` access would upcast the
        # integer ids to float and is slow when called for every sample.
        self._user_ids = self.ratings['userId'].to_numpy(dtype='int64')
        self._movie_indices = self.ratings['movieId'].to_numpy(dtype='int64')
        self._rating_values = self.ratings['rating'].to_numpy(dtype='float32')

    def __len__(self):
        """Return the number of ratings in the selected part."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return the ``idx``-th rating as a dict of one-element tensors.

        Keys: ``"user"`` (int64 user id), ``"movie"`` (int64 row index of the
        movie in ``self.movies``) and ``"rating"`` (float32 rating value).
        """
        return {
            "user": torch.tensor([int(self._user_ids[idx])], dtype=torch.long),
            "movie": torch.tensor([int(self._movie_indices[idx])], dtype=torch.long),
            "rating": torch.tensor([float(self._rating_values[idx])], dtype=torch.float32),
        }


# Number of samples per batch for both loaders.
batch_size = 200

# Directory that holds ratings.csv and movies.csv.
dataset_source = r'./data'
# The original, misspelled name is kept as an alias in case other cells use it.
sataset_source = dataset_source

# Both datasets use the default seed, so the two parts are complementary.
movielens_train = MovielensDataset(dataset_source, train=True)
movielens_test = MovielensDataset(dataset_source, train=False)

train_loader = DataLoader(movielens_train, batch_size=batch_size, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps the
# batch-averaged test loss reproducible between runs.
test_loader = DataLoader(movielens_test, batch_size=batch_size, shuffle=False)

In [3]:
# Sanity check: fetch one training batch and show the shape of every field.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    for field_name, tensor in first_batch.items():
        print(field_name, tensor.shape)

user torch.Size([200, 1])
movie torch.Size([200, 1])
rating torch.Size([200, 1])


In [4]:

# Функции обучения и оценки из прошлой лабораторной, адаптированные под батчи с пользователями и фильмами

def train_iteration(model, data_loader, loss_function, optimizer, target_device=None):
    """Run one training epoch over ``data_loader``.

    For every batch the tensors are moved to the target device, the model is
    called with the whole batch dict, the loss is computed against
    ``batch['rating']`` and one optimizer step is taken. Progress is printed
    on every 100th batch.

    Args:
        model: Module called as ``model(batch)``.
        data_loader: Loader yielding dicts of tensors with a ``'rating'`` key;
            ``len(data_loader.dataset)`` is used as the progress total.
        loss_function: Callable ``loss_function(pred, target)`` returning a
            scalar tensor.
        optimizer: Optimizer that updates the parameters of ``model``.
        target_device: Device the batches are moved to. ``None`` (default)
            uses the module-level ``device``.
    """
    if target_device is None:
        target_device = device

    model.train()
    train_size = len(data_loader.dataset)
    samples_seen = 0
    for idx, batch in enumerate(data_loader):
        batch = {k: v.to(target_device) for k, v in batch.items()}
        pred = model(batch)
        loss = loss_function(pred, batch['rating'])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Count what was actually processed instead of assuming a global
        # batch size, so the progress is right for any loader and for a
        # final partial batch.
        samples_seen += len(batch['rating'])
        if idx % 100 == 0:
            loss_value = loss.item()
            print(f"loss: {loss_value:>7f}  [{samples_seen:>5d}/{train_size:>5d}]")

def test(model, data_loader, loss_function, target_device=None):
    """Evaluate ``model`` on ``data_loader`` and report the mean batch loss.

    The model is switched to evaluation mode and run without gradient
    tracking. The loss of every batch (against ``batch['rating']``) is summed
    and divided by the number of batches; the result is printed and returned.

    Args:
        model: Module called as ``model(batch)``.
        data_loader: Loader yielding dicts of tensors with a ``'rating'`` key.
        loss_function: Callable ``loss_function(pred, target)`` returning a
            scalar tensor.
        target_device: Device the batches are moved to. ``None`` (default)
            uses the module-level ``device``.

    Returns:
        The loss averaged over batches, as a Python float.

    Raises:
        ValueError: If ``data_loader`` contains no batches.
    """
    if target_device is None:
        target_device = device

    model.eval()
    num_batches = len(data_loader)
    if num_batches == 0:
        # Explicit error instead of a ZeroDivisionError on the average below.
        raise ValueError("data_loader contains no batches to evaluate")

    total_loss = 0.0
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(target_device) for k, v in batch.items()}
            pred = model(batch)
            total_loss += loss_function(pred, batch['rating']).item()

    avg_loss = total_loss / num_batches
    print(f"Avg loss: {avg_loss:>8f} \n")
    return avg_loss


def train(epochs, model, loss_function, optimizer):
    """Train and evaluate ``model`` for ``epochs`` epochs.

    Each epoch prints a header, runs ``train_iteration`` on the module-level
    ``train_loader`` and then ``test`` on the module-level ``test_loader``.
    A tqdm bar tracks the epochs.
    """
    for epoch_number in tqdm(range(1, epochs + 1)):
        print(f"== Epoch {epoch_number} ==")
        train_iteration(
            model=model,
            data_loader=train_loader,
            loss_function=loss_function,
            optimizer=optimizer,
        )
        test(model=model, data_loader=test_loader, loss_function=loss_function)


In [5]:
class DeepFM(nn.Module):
    """Rating predictor combining an element-wise embedding product with an MLP.

    A user embedding and a movie embedding are looked up, then two feature
    groups are built from them: their element-wise product (the "fm" part)
    and the output of a small MLP applied to their concatenation (the "deep"
    part). Both groups are concatenated and mapped to a single value, which is
    squashed by a sigmoid and scaled by 5.

    Args:
        num_users: Number of rows in the user embedding table; user indices
            must be smaller than this.
        num_movies: Number of rows in the movie embedding table; movie indices
            must be smaller than this.
        embedding_dim: Width of both embedding tables.
    """

    def __init__(self, num_users=1000, num_movies=10000, embedding_dim=32):
        super().__init__()
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.movie_embeddings = nn.Embedding(num_movies, embedding_dim)

        # The leading Flatten is kept so the layer indices (and therefore the
        # state_dict keys) stay the same as before.
        self.deep_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(2 * embedding_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),  # for regularization
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 16),
            nn.ReLU(),
        )

        # Input: the embedding product (embedding_dim) plus the MLP output (16).
        self.final_layer = nn.Linear(embedding_dim + 16, 1)

    def forward(self, batch):
        """Predict ratings for a batch.

        Args:
            batch: Dict with integer index tensors under ``'user'`` and
                ``'movie'``; dimension 1 of each is squeezed away after the
                embedding lookup, i.e. the expected shape is ``(batch, 1)``.

        Returns:
            Tensor with one prediction per sample in its last dimension,
            bounded to the interval from 0 to 5.
        """
        user_emb = self.user_embeddings(batch['user']).squeeze(1)
        movie_emb = self.movie_embeddings(batch['movie']).squeeze(1)

        fm = user_emb * movie_emb

        deep = torch.cat([user_emb, movie_emb], 1)
        deep = self.deep_layers(deep)

        v = torch.cat([fm, deep], 1)
        v = self.final_layer(v)
        # Sigmoid on the output, scaled to the rating range from 0 to 5.
        return torch.sigmoid(v) * 5

# Training hyper-parameters.
EPOCHS_COUNT = 10
LEARNING_RATE = 1e-3

# Model on the selected device, MSE objective and Adam over all model parameters.
improved_deep_mf_model = DeepFM().to(device)
improved_deep_mf_loss = nn.MSELoss()
improved_deep_mf_optimizer = torch.optim.Adam(
    improved_deep_mf_model.parameters(),
    lr=LEARNING_RATE,
)

train(
    epochs=EPOCHS_COUNT,
    model=improved_deep_mf_model,
    loss_function=improved_deep_mf_loss,
    optimizer=improved_deep_mf_optimizer,
)

  0%|          | 0/10 [00:00<?, ?it/s]

== Epoch 1 ==
loss: 2.502452  [  200/80669]
loss: 0.934487  [20200/80669]
loss: 0.952053  [40200/80669]
loss: 1.158852  [60200/80669]
loss: 1.202772  [80200/80669]


 10%|█         | 1/10 [00:04<00:42,  4.76s/it]

Avg loss: 0.996924 

== Epoch 2 ==
loss: 1.040845  [  200/80669]
loss: 0.897625  [20200/80669]
loss: 0.829274  [40200/80669]
loss: 0.861064  [60200/80669]
loss: 0.786841  [80200/80669]


 20%|██        | 2/10 [00:09<00:37,  4.71s/it]

Avg loss: 0.930083 

== Epoch 3 ==
loss: 0.829671  [  200/80669]
loss: 1.031327  [20200/80669]
loss: 0.857903  [40200/80669]
loss: 0.896231  [60200/80669]
loss: 0.892489  [80200/80669]


 30%|███       | 3/10 [00:13<00:30,  4.37s/it]

Avg loss: 0.887039 

== Epoch 4 ==
loss: 0.831093  [  200/80669]
loss: 0.886675  [20200/80669]
loss: 0.794640  [40200/80669]
loss: 0.925615  [60200/80669]
loss: 0.808148  [80200/80669]


 40%|████      | 4/10 [00:17<00:25,  4.24s/it]

Avg loss: 0.858783 

== Epoch 5 ==
loss: 0.976848  [  200/80669]
loss: 0.999120  [20200/80669]
loss: 0.820710  [40200/80669]
loss: 0.764270  [60200/80669]
loss: 0.888101  [80200/80669]


 50%|█████     | 5/10 [00:23<00:24,  4.92s/it]

Avg loss: 0.837335 

== Epoch 6 ==
loss: 0.749072  [  200/80669]
loss: 0.663707  [20200/80669]
loss: 0.806753  [40200/80669]
loss: 0.788210  [60200/80669]
loss: 0.703555  [80200/80669]


 60%|██████    | 6/10 [00:29<00:21,  5.26s/it]

Avg loss: 0.830236 

== Epoch 7 ==
loss: 0.746272  [  200/80669]
loss: 0.613690  [20200/80669]
loss: 0.701744  [40200/80669]
loss: 0.882177  [60200/80669]
loss: 0.688932  [80200/80669]


 70%|███████   | 7/10 [00:35<00:16,  5.39s/it]

Avg loss: 0.810843 

== Epoch 8 ==
loss: 0.836210  [  200/80669]
loss: 0.574685  [20200/80669]
loss: 0.574392  [40200/80669]
loss: 0.702871  [60200/80669]
loss: 0.779107  [80200/80669]


 80%|████████  | 8/10 [00:41<00:11,  5.60s/it]

Avg loss: 0.812486 

== Epoch 9 ==
loss: 0.669574  [  200/80669]
loss: 0.802772  [20200/80669]
loss: 0.546407  [40200/80669]
loss: 0.644585  [60200/80669]
loss: 0.698031  [80200/80669]


 90%|█████████ | 9/10 [00:46<00:05,  5.65s/it]

Avg loss: 0.801277 

== Epoch 10 ==
loss: 0.743206  [  200/80669]
loss: 0.746032  [20200/80669]
loss: 0.842624  [40200/80669]
loss: 0.685301  [60200/80669]
loss: 0.601043  [80200/80669]


100%|██████████| 10/10 [00:52<00:00,  5.30s/it]

Avg loss: 0.793958 




