# Лаб-3. Рекомендательные системы

In [13]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import random

# Выбираем девайс
USE_CUDA = False
device = "cuda" if USE_CUDA and torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

Device: cpu


In [14]:
# Для загрузки датасета напишем свою реализацию класса Dataset
class MovielensDataset(Dataset):
    r"""seed должен быть одинаковым для обучающей и тренировочной выборки"""
    def __init__(self, source, train=True, seed=1, new_user_ratings=None):
        ratings      = pd.read_csv(rf"{source}/ratings.csv")
        self.movies  = pd.read_csv(rf"{source}/movies.csv")

        # Преобразовываем Id фильмов в индексы в таблице movies
        x = self.movies.loc[:,['movieId']]
        x['movieId'], x.index = x.index, x['movieId'].values
        ratings['movieId'] = ratings['movieId'].map(x.to_dict()['movieId'])

        if new_user_ratings:
            new_user_id = ratings['userId'].max() + 1
            new_ratings = pd.DataFrame([
                {
                    'userId': new_user_id,
                    'movieId': movie_idx,
                    'rating': rating
                } for movie_idx, rating in new_user_ratings
            ])
            ratings = pd.concat([ratings, new_ratings], ignore_index=True)

        # делим датасет 80% на 20%
        train_data = ratings.sample(frac=0.8, random_state=seed)
        test_data  = ratings.drop(train_data.index)

        self.ratings = train_data if train else test_data

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        sample = self.ratings.iloc[idx]
        return {
            "user": torch.LongTensor([sample['userId']]),
            "movie": torch.LongTensor([sample['movieId']]),
            "rating": torch.FloatTensor([sample['rating']])
        }

def generate_random_ratings(num_movies, num_ratings=20):
    random_movies = random.sample(range(num_movies), num_ratings)
    ratings = [(movie_idx, random.uniform(1, 5)) for movie_idx in random_movies]
    return ratings

def suggest_movies(model, user_id, movies_df, suggestions_count=10):
    model.eval()
    with torch.no_grad():
        all_movie_ids = torch.arange(len(movies_df), dtype=torch.long).to(device)
        user_tensor = torch.LongTensor([user_id] * len(all_movie_ids)).to(device)
        predictions = model({"user": user_tensor.unsqueeze(1), "movie": all_movie_ids.unsqueeze(1)})
        predictions = predictions.squeeze(1)
        recommended_ids = predictions.argsort(descending=True)[:suggestions_count]
        return movies_df.iloc[recommended_ids.cpu().numpy()]

In [15]:
BATCH_SIZE = 200
DATASET_SOURCE = r'./data'
MOCK_RATINGS_COUNT = 20

mock_ratings = generate_random_ratings(MOCK_RATINGS_COUNT)
RATINGS = [
    (111, 5.0), # 111,Taxi Driver (1976),Crime|Drama|Thriller
    (55444, 4.5), # 55444,Control (2007),Drama
    (88129, 5.0), # 88129,Drive (2011),Crime|Drama|Film-Noir|Thriller
    (99114, 5.0), # 99114,Django Unchained (2012),Action|Drama|Western
    (27156, 4.5), # 27156,"Neon Genesis Evangelion: The End of Evangelion (Shin seiki Evangelion Gekijô-ban: Air/Magokoro wo, kimi ni) (1997)",Action|Animation|Drama|Fantasy|Sci-Fi
    (47423, 4.0), # 47423,Half Nelson (2006),Drama
    (4306, 5.0), # 4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Romance
    (8360, 5.0), # 8360,Shrek 2 (2004),Adventure|Animation|Children|Comedy|Musical|Romance
    (53121, 5.0), # 53121,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy
    (541, 5.0), # 541,Blade Runner (1982),Action|Sci-Fi|Thriller
    (122886,2.0), # 122886,Star Wars: Episode VII - The Force Awakens (2015),Action|Adventure|Fantasy|Sci-Fi|IMAX
    (5444, 5.0), # 5444,Lilo & Stitch (2002),Adventure|Animation|Children|Sci-Fi
    (171749, 4.0), # 171749,Death Note: Desu nôto (2006–2007),(no genres listed)
    (47, 4.5), # 47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
    (1201, 5.0), # 1201,"Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)",Action|Adventure|Western
    (2951, 5.0), # 2951,"Fistful of Dollars, A (Per un pugno di dollari) (1964)",Action|Western
    (64614, 5.0), # 64614,Gran Torino (2008),Crime|Drama
    (72737, 5.0), # 72737,"Princess and the Frog, The (2009)",Animation|Children|Fantasy|Musical|Romance
    (101525, 3.5), # 101525,"Place Beyond the Pines, The (2012)",Crime|Drama
    (31658, 5.0), # 31658,Howl's Moving Castle (Hauru no ugoku shiro) (2004),Adventure|Animation|Fantasy|Romance
]

movielens_train = MovielensDataset(DATASET_SOURCE, train=True, new_user_ratings=mock_ratings)
movielens_test  = MovielensDataset(DATASET_SOURCE, train=False)

train_loader = DataLoader(movielens_train, BATCH_SIZE, True)
test_loader = DataLoader(movielens_test, BATCH_SIZE, True)

for batch in train_loader:
    for k, v in batch.items():
        print(k, v.shape)
    break

user torch.Size([200, 1])
movie torch.Size([200, 1])
rating torch.Size([200, 1])


In [16]:
# Функции для обучения из прошлой лабы, с учётом юзеров и айтемов

def train_iteration(model, data_loader, loss_function, optimizer):
    model.train()
    train_size = len(data_loader.dataset)
    for idx, batch in enumerate(data_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        pred = model(batch)
        loss = loss_function(pred, batch['rating'])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if idx % 100 == 0:
            loss, current = loss.item(), (idx + 1) * BATCH_SIZE
            print(f"loss: {loss:>7f}  [{current:>5d}/{train_size:>5d}]")

def test(model, data_loader, loss_function):
    model.eval()
    num_batches = len(data_loader)
    loss = 0
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            pred = model(batch)
            loss += loss_function(pred, batch['rating']).item()

    loss /= num_batches
    print(f"Avg loss: {loss:>8f} \n")


def train(epochs, model, loss_function, optimizer):
    for t in tqdm(range(epochs)):
        print(f"== Epoch {t + 1} ==")
        train_iteration(model, train_loader, loss_function, optimizer)
        test(model, test_loader, loss_function)


In [17]:
class DeepFM(nn.Module):
    def __init__(self, num_users=1000, num_movies=10000, embed_dim=32):
        super().__init__()
        self.embed_dim = embed_dim
        self.user_embeddings = nn.Embedding(num_users, self.embed_dim)
        self.movie_embeddings = nn.Embedding(num_movies, self.embed_dim)

        self.deep_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Dropout(0.3),  # for regularization
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 16),
            nn.ReLU(),
        )

        self.final_layer = nn.Linear(self.embed_dim + 16, 1)  # adjusted input size

    def forward(self, batch):
        movie_emb = self.user_embeddings(batch['user']).squeeze(1)
        user_emb = self.movie_embeddings(batch['movie']).squeeze(1)

        fm = movie_emb * user_emb

        deep = torch.cat([movie_emb, user_emb], 1)
        deep = self.deep_layers(deep)

        v = torch.cat([fm, deep], 1)
        v = self.final_layer(v)
        # делаем сигмоиду на выходе и масштабируем к оценкам от 0 до 5
        return torch.sigmoid(v) * 5

EPOCHS_COUNT = 12
LEARNING_RATE = 1e-3

deep_mf_model = DeepFM(
    num_users=movielens_train.ratings['userId'].max() + 1,
    num_movies=len(movielens_train.movies)
).to(device)

deep_mf_loss = nn.MSELoss()
deep_mf_optimizer = torch.optim.Adam(deep_mf_model.parameters(), lr=LEARNING_RATE)

train(EPOCHS_COUNT, deep_mf_model, deep_mf_loss, deep_mf_optimizer)

  0%|          | 0/10 [00:00<?, ?it/s]

== Epoch 1 ==
loss: 2.319873  [  200/80685]
loss: 1.104735  [20200/80685]
loss: 1.090459  [40200/80685]
loss: 0.868956  [60200/80685]
loss: 1.041387  [80200/80685]


 10%|█         | 1/10 [00:04<00:38,  4.25s/it]

Avg loss: 0.992876 

== Epoch 2 ==
loss: 0.950420  [  200/80685]
loss: 0.960867  [20200/80685]
loss: 0.817036  [40200/80685]
loss: 1.056639  [60200/80685]
loss: 0.903974  [80200/80685]


 20%|██        | 2/10 [00:09<00:36,  4.59s/it]

Avg loss: 0.926762 

== Epoch 3 ==
loss: 0.685048  [  200/80685]
loss: 1.097721  [20200/80685]
loss: 0.879893  [40200/80685]
loss: 0.916419  [60200/80685]
loss: 0.959980  [80200/80685]


 30%|███       | 3/10 [00:13<00:31,  4.43s/it]

Avg loss: 0.892357 

== Epoch 4 ==
loss: 0.867665  [  200/80685]
loss: 0.915648  [20200/80685]
loss: 0.877032  [40200/80685]
loss: 0.978197  [60200/80685]
loss: 0.943107  [80200/80685]


 40%|████      | 4/10 [00:17<00:25,  4.30s/it]

Avg loss: 0.857134 

== Epoch 5 ==
loss: 0.787486  [  200/80685]
loss: 0.968856  [20200/80685]
loss: 0.895510  [40200/80685]
loss: 0.829075  [60200/80685]
loss: 0.939290  [80200/80685]


 50%|█████     | 5/10 [00:21<00:21,  4.32s/it]

Avg loss: 0.836722 

== Epoch 6 ==
loss: 0.733167  [  200/80685]
loss: 0.884237  [20200/80685]
loss: 0.807698  [40200/80685]
loss: 0.903801  [60200/80685]
loss: 0.767624  [80200/80685]


 60%|██████    | 6/10 [00:26<00:17,  4.29s/it]

Avg loss: 0.819461 

== Epoch 7 ==
loss: 1.005086  [  200/80685]
loss: 0.593672  [20200/80685]
loss: 0.713389  [40200/80685]
loss: 0.807397  [60200/80685]
loss: 0.805436  [80200/80685]


 70%|███████   | 7/10 [00:30<00:12,  4.22s/it]

Avg loss: 0.805582 

== Epoch 8 ==
loss: 0.748979  [  200/80685]
loss: 0.736821  [20200/80685]
loss: 0.748366  [40200/80685]
loss: 0.754668  [60200/80685]
loss: 0.705801  [80200/80685]


 80%|████████  | 8/10 [00:34<00:08,  4.19s/it]

Avg loss: 0.795675 

== Epoch 9 ==
loss: 0.907880  [  200/80685]
loss: 0.660192  [20200/80685]
loss: 0.674093  [40200/80685]
loss: 0.713338  [60200/80685]
loss: 0.618640  [80200/80685]


 90%|█████████ | 9/10 [00:38<00:04,  4.18s/it]

Avg loss: 0.786795 

== Epoch 10 ==
loss: 0.882472  [  200/80685]
loss: 0.700431  [20200/80685]
loss: 0.724041  [40200/80685]
loss: 0.702267  [60200/80685]
loss: 0.791906  [80200/80685]


100%|██████████| 10/10 [00:42<00:00,  4.26s/it]

Avg loss: 0.791353 






In [18]:
SUGGESTIONS_COUNT = 20

print("Movie Recommendations for me:")
new_user_id = movielens_train.ratings['userId'].max()
suggestions = suggest_movies(deep_mf_model, new_user_id, movielens_train.movies, suggestions_count=SUGGESTIONS_COUNT)
suggestions

Movie Recommendations for me:


Unnamed: 0,movieId,title,genres
3899,5477,Sex and Lucia (Lucía y el sexo) (2001),Drama|Romance
8729,127098,Louis C.K.: Live at The Comedy Store (2015),Comedy
922,1221,"Godfather: Part II, The (1974)",Crime|Drama
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
906,1204,Lawrence of Arabia (1962),Adventure|Drama|War
7180,72226,Fantastic Mr. Fox (2009),Adventure|Animation|Children|Comedy|Crime
2498,3334,Key Largo (1948),Crime|Drama|Film-Noir|Thriller
8154,102217,Bill Hicks: Revelations (1993),Comedy
585,720,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy
6597,55391,10th & Wolf (2006),Crime|Drama|Thriller
