# Лаб-3. Рекомендательные системы

In [4]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import random

# Выбираем девайс
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

Device: cuda


In [5]:
# Для загрузки датасета напишем свою реализацию класса Dataset
class MovielensDataset(Dataset):
    r"""seed должен быть одинаковым для обучающей и тренировочной выборки"""
    def __init__(self, source, train=True, seed=1, new_user_ratings=None):
        ratings      = pd.read_csv(rf"{source}/ratings.csv")
        self.movies  = pd.read_csv(rf"{source}/movies.csv")

        # Преобразовываем Id фильмов в индексы в таблице movies
        x = self.movies.loc[:,['movieId']]
        x['movieId'], x.index = x.index, x['movieId'].values
        ratings['movieId'] = ratings['movieId'].map(x.to_dict()['movieId'])

        if new_user_ratings:
            new_user_id = ratings['userId'].max() + 1
            new_ratings = pd.DataFrame([
                {
                    'userId': new_user_id,
                    'movieId': movie_idx,
                    'rating': rating
                } for movie_idx, rating in new_user_ratings
            ])
            ratings = pd.concat([ratings, new_ratings], ignore_index=True)

        # делим датасет 80% на 20%
        train_data = ratings.sample(frac=0.8, random_state=seed)
        test_data  = ratings.drop(train_data.index)

        self.ratings = train_data if train else test_data

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        sample = self.ratings.iloc[idx]
        return {
            "user": torch.LongTensor([sample['userId']]),
            "movie": torch.LongTensor([sample['movieId']]),
            "rating": torch.FloatTensor([sample['rating']])
        }

def generate_random_ratings(num_movies, num_ratings=20):
    random_movies = random.sample(range(num_movies), num_ratings)
    ratings = [(movie_idx, random.uniform(1, 5)) for movie_idx in random_movies]
    return ratings

def suggest_movies(model, user_id, movies_df, suggestions_count=10):
    model.eval()
    with torch.no_grad():
        all_movie_ids = torch.arange(len(movies_df), dtype=torch.long).to(device)
        user_tensor = torch.LongTensor([user_id] * len(all_movie_ids)).to(device)
        predictions = model({"user": user_tensor.unsqueeze(1), "movie": all_movie_ids.unsqueeze(1)})
        predictions = predictions.squeeze(1)
        recommended_ids = predictions.argsort(descending=True)[:suggestions_count]
        return movies_df.iloc[recommended_ids.cpu().numpy()]

In [6]:
BATCH_SIZE = 200
DATASET_SOURCE = r'./data'
MOCK_RATINGS_COUNT = 20

mock_ratings = generate_random_ratings(MOCK_RATINGS_COUNT)

movielens_train = MovielensDataset(DATASET_SOURCE, train=True, new_user_ratings=mock_ratings)
movielens_test  = MovielensDataset(DATASET_SOURCE, train=False)

train_loader = DataLoader(movielens_train, BATCH_SIZE, True)
test_loader = DataLoader(movielens_test, BATCH_SIZE, True)

for batch in train_loader:
    for k, v in batch.items():
        print(k, v.shape)
    break

user torch.Size([200, 1])
movie torch.Size([200, 1])
rating torch.Size([200, 1])


In [7]:
# Функции для обучения из прошлой лабы, с учётом юзеров и айтемов

def train_iteration(model, data_loader, loss_function, optimizer):
    model.train()
    train_size = len(data_loader.dataset)
    for idx, batch in enumerate(data_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        pred = model(batch)
        loss = loss_function(pred, batch['rating'])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if idx % 100 == 0:
            loss, current = loss.item(), (idx + 1) * BATCH_SIZE
            print(f"loss: {loss:>7f}  [{current:>5d}/{train_size:>5d}]")

def test(model, data_loader, loss_function):
    model.eval()
    num_batches = len(data_loader)
    loss = 0
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            pred = model(batch)
            loss += loss_function(pred, batch['rating']).item()

    loss /= num_batches
    print(f"Avg loss: {loss:>8f} \n")


def train(epochs, model, loss_function, optimizer):
    for t in tqdm(range(epochs)):
        print(f"== Epoch {t + 1} ==")
        train_iteration(model, train_loader, loss_function, optimizer)
        test(model, test_loader, loss_function)


In [8]:
class DeepFM(nn.Module):
    def __init__(self):
        super().__init__()
        self.user_embeddings = nn.Embedding(1000, 32)
        self.movie_embeddings = nn.Embedding(10000, 32)

        self.deep_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Dropout(0.3),  # for regularization
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 16),
            nn.ReLU(),
        )

        self.final_layer = nn.Linear(32 + 16, 1)  # adjusted input size

    def forward(self, batch):
        movie_emb = self.user_embeddings(batch['user']).squeeze(1)
        user_emb = self.movie_embeddings(batch['movie']).squeeze(1)

        fm = movie_emb * user_emb

        deep = torch.cat([movie_emb, user_emb], 1)
        deep = self.deep_layers(deep)

        v = torch.cat([fm, deep], 1)
        v = self.final_layer(v)
        # делаем сигмоиду на выходе и масштабируем к оценкам от 0 до 5
        return torch.sigmoid(v) * 5

EPOCHS_COUNT = 10
LEARNING_RATE = 1e-3

deep_mf_model = DeepFM().to(device)
deep_mf_loss = nn.MSELoss()
deep_mf_optimizer = torch.optim.Adam(deep_mf_model.parameters(), lr=LEARNING_RATE)

train(EPOCHS_COUNT, deep_mf_model, deep_mf_loss, deep_mf_optimizer)

  0%|          | 0/10 [00:00<?, ?it/s]

== Epoch 1 ==
loss: 2.227392  [  200/80685]
loss: 1.231403  [20200/80685]
loss: 0.996539  [40200/80685]
loss: 0.973332  [60200/80685]
loss: 0.966226  [80200/80685]


 10%|█         | 1/10 [00:04<00:44,  4.95s/it]

Avg loss: 0.992332 

== Epoch 2 ==
loss: 0.923173  [  200/80685]
loss: 1.124682  [20200/80685]
loss: 0.855638  [40200/80685]
loss: 0.977245  [60200/80685]
loss: 0.820266  [80200/80685]


 20%|██        | 2/10 [00:09<00:36,  4.62s/it]

Avg loss: 0.931574 

== Epoch 3 ==
loss: 0.908505  [  200/80685]
loss: 0.892067  [20200/80685]
loss: 0.946081  [40200/80685]
loss: 0.880783  [60200/80685]
loss: 0.985706  [80200/80685]


 30%|███       | 3/10 [00:13<00:32,  4.62s/it]

Avg loss: 0.888152 

== Epoch 4 ==
loss: 0.886115  [  200/80685]
loss: 0.970725  [20200/80685]
loss: 0.876778  [40200/80685]
loss: 0.927692  [60200/80685]
loss: 0.885454  [80200/80685]


 40%|████      | 4/10 [00:18<00:27,  4.54s/it]

Avg loss: 0.860449 

== Epoch 5 ==
loss: 0.692394  [  200/80685]
loss: 0.851198  [20200/80685]
loss: 0.766948  [40200/80685]
loss: 0.843819  [60200/80685]
loss: 0.770755  [80200/80685]


 50%|█████     | 5/10 [00:22<00:22,  4.52s/it]

Avg loss: 0.836700 

== Epoch 6 ==
loss: 0.861831  [  200/80685]
loss: 0.730927  [20200/80685]
loss: 0.783963  [40200/80685]
loss: 0.954893  [60200/80685]
loss: 0.690941  [80200/80685]


 60%|██████    | 6/10 [00:27<00:17,  4.47s/it]

Avg loss: 0.824368 

== Epoch 7 ==
loss: 0.719483  [  200/80685]
loss: 0.688799  [20200/80685]
loss: 0.590382  [40200/80685]
loss: 0.715703  [60200/80685]
loss: 0.650757  [80200/80685]


 70%|███████   | 7/10 [00:31<00:13,  4.43s/it]

Avg loss: 0.809641 

== Epoch 8 ==
loss: 0.771045  [  200/80685]
loss: 0.678787  [20200/80685]
loss: 0.879492  [40200/80685]
loss: 0.719320  [60200/80685]
loss: 0.670103  [80200/80685]


 80%|████████  | 8/10 [00:35<00:08,  4.42s/it]

Avg loss: 0.797578 

== Epoch 9 ==
loss: 0.773797  [  200/80685]
loss: 0.774033  [20200/80685]
loss: 0.957728  [40200/80685]
loss: 0.616723  [60200/80685]
loss: 0.696504  [80200/80685]


 90%|█████████ | 9/10 [00:40<00:04,  4.39s/it]

Avg loss: 0.792528 

== Epoch 10 ==
loss: 0.715858  [  200/80685]
loss: 0.881447  [20200/80685]
loss: 0.697860  [40200/80685]
loss: 0.682960  [60200/80685]
loss: 0.595806  [80200/80685]


100%|██████████| 10/10 [00:44<00:00,  4.46s/it]

Avg loss: 0.785119 






In [11]:
SUGGESTIONS_COUNT = 10

print("Movie Recommendations for me:")
new_user_id = movielens_train.ratings['userId'].max()
suggestions = suggest_movies(deep_mf_model, new_user_id, movielens_train.movies, suggestions_count=10)
suggestions

Movie Recommendations for me:


Unnamed: 0,movieId,title,genres
9575,174551,Obsession (1965),Comedy
6051,40491,"Match Factory Girl, The (Tulitikkutehtaan tytt...",Comedy|Drama
9514,171495,Cosmos,(no genres listed)
8706,124273,Kevin Smith: Too Fat For 40 (2010),Comedy
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
4413,6514,Ring of Terror (1962),Horror
8763,128592,The Boy Next Door (2015),Mystery|Thriller
2740,3678,"Man with the Golden Arm, The (1955)",Drama
461,527,Schindler's List (1993),Drama|War
3043,4077,"With a Friend Like Harry... (Harry, un ami qui...",Drama|Thriller
