# Лаб-3. Рекомендательные системы

In [37]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import random

# Выбираем девайс
USE_CUDA = False
device = "cuda" if USE_CUDA and torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

Device: cpu


In [38]:
# Для загрузки датасета напишем свою реализацию класса Dataset
class MovielensDataset(Dataset):
    r"""seed должен быть одинаковым для обучающей и тренировочной выборки"""
    def __init__(self, source, train=True, seed=1, new_user_ratings=None):
        ratings      = pd.read_csv(rf"{source}/ratings.csv")
        self.movies  = pd.read_csv(rf"{source}/movies.csv")

        # Преобразовываем Id фильмов в индексы в таблице movies
        # x = self.movies.loc[:,['movieId']]
        # x['movieId'], x.index = x.index, x['movieId'].values
        # ratings['movieId'] = ratings['movieId'].map(x.to_dict()['movieId'])
        
        movie_id_map = pd.Series(self.movies.index, index=self.movies['movieId']).to_dict()
        ratings['movieId'] = ratings['movieId'].map(movie_id_map)
        
        if new_user_ratings:
            new_user_id = ratings['userId'].max() + 1
            new_ratings = pd.DataFrame([
                {
                    'userId': new_user_id,
                    'movieId': movie_idx,
                    'rating': rating
                } for movie_idx, rating in new_user_ratings
            ])
            ratings = pd.concat([ratings, new_ratings], ignore_index=True)

        # делим датасет 80% на 20%
        train_data = ratings.sample(frac=0.8, random_state=seed)
        test_data  = ratings.drop(train_data.index)

        self.ratings = train_data if train else test_data

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        sample = self.ratings.iloc[idx]
        return {
            "user": torch.LongTensor([sample['userId']]),
            "movie": torch.LongTensor([sample['movieId']]),
            "rating": torch.FloatTensor([sample['rating']])
        }

def generate_random_ratings(num_movies, num_ratings=20):
    random_movies = random.sample(range(num_movies), num_ratings)
    ratings = [(movie_idx, random.uniform(1, 5)) for movie_idx in random_movies]
    return ratings

def suggest_movies(model, user_id, movies_df, suggestions_count=10):
    model.eval()
    with torch.no_grad():
        all_movie_ids = torch.arange(len(movies_df), dtype=torch.long).to(device)
        user_tensor = torch.LongTensor([user_id] * len(all_movie_ids)).to(device)
        predictions = model({"user": user_tensor.unsqueeze(1), "movie": all_movie_ids.unsqueeze(1)})
        predictions = predictions.squeeze(1)
        recommended_ids = predictions.argsort(descending=True)[:suggestions_count]
        return movies_df.iloc[recommended_ids.cpu().numpy()]

In [39]:
BATCH_SIZE = 200
DATASET_SOURCE = r'./data'
MOCK_RATINGS_COUNT = 20

mock_ratings = generate_random_ratings(MOCK_RATINGS_COUNT)
RATINGS = [
    (111, 5.0), # 111,Taxi Driver (1976),Crime|Drama|Thriller
    (55444, 4.5), # 55444,Control (2007),Drama
    (88129, 5.0), # 88129,Drive (2011),Crime|Drama|Film-Noir|Thriller
    (99114, 5.0), # 99114,Django Unchained (2012),Action|Drama|Western
    (27156, 4.5), # 27156,"Neon Genesis Evangelion: The End of Evangelion (Shin seiki Evangelion Gekijô-ban: Air/Magokoro wo, kimi ni) (1997)",Action|Animation|Drama|Fantasy|Sci-Fi
    (47423, 4.0), # 47423,Half Nelson (2006),Drama
    (4306, 5.0), # 4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Romance
    (8360, 5.0), # 8360,Shrek 2 (2004),Adventure|Animation|Children|Comedy|Musical|Romance
    (53121, 5.0), # 53121,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy
    (541, 5.0), # 541,Blade Runner (1982),Action|Sci-Fi|Thriller
    (122886,2.0), # 122886,Star Wars: Episode VII - The Force Awakens (2015),Action|Adventure|Fantasy|Sci-Fi|IMAX
    (5444, 5.0), # 5444,Lilo & Stitch (2002),Adventure|Animation|Children|Sci-Fi
    (171749, 4.0), # 171749,Death Note: Desu nôto (2006–2007),(no genres listed)
    (47, 4.5), # 47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
    (1201, 5.0), # 1201,"Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)",Action|Adventure|Western
    (2951, 5.0), # 2951,"Fistful of Dollars, A (Per un pugno di dollari) (1964)",Action|Western
    (64614, 5.0), # 64614,Gran Torino (2008),Crime|Drama
    (72737, 5.0), # 72737,"Princess and the Frog, The (2009)",Animation|Children|Fantasy|Musical|Romance
    (101525, 3.5), # 101525,"Place Beyond the Pines, The (2012)",Crime|Drama
    (31658, 5.0), # 31658,Howl's Moving Castle (Hauru no ugoku shiro) (2004),Adventure|Animation|Fantasy|Romance
]

movielens_train = MovielensDataset(DATASET_SOURCE, train=True, new_user_ratings=mock_ratings)
movielens_test  = MovielensDataset(DATASET_SOURCE, train=False)

train_loader = DataLoader(movielens_train, BATCH_SIZE, True)
test_loader = DataLoader(movielens_test, BATCH_SIZE, True)

for batch in train_loader:
    for k, v in batch.items():
        print(k, v.shape)
    break

user torch.Size([200, 1])
movie torch.Size([200, 1])
rating torch.Size([200, 1])


In [40]:
# Функции для обучения из прошлой лабы, с учётом юзеров и айтемов

def train_iteration(model, data_loader, loss_function, optimizer):
    model.train()
    train_size = len(data_loader.dataset)
    for idx, batch in enumerate(data_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        pred = model(batch)
        loss = loss_function(pred, batch['rating'])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if idx % 100 == 0:
            loss, current = loss.item(), (idx + 1) * BATCH_SIZE
            print(f"loss: {loss:>7f}  [{current:>5d}/{train_size:>5d}]")

def test(model, data_loader, loss_function):
    model.eval()
    num_batches = len(data_loader)
    loss = 0
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            pred = model(batch)
            loss += loss_function(pred, batch['rating']).item()

    loss /= num_batches
    print(f"Avg loss: {loss:>8f} \n")


def train(epochs, model, loss_function, optimizer):
    for t in tqdm(range(epochs)):
        print(f"== Epoch {t + 1} ==")
        train_iteration(model, train_loader, loss_function, optimizer)
        test(model, test_loader, loss_function)


In [41]:
class DeepFM(nn.Module):
    def __init__(self, num_users=1000, num_movies=10000, embed_dim=32):
        super().__init__()
        self.embed_dim = embed_dim
        self.user_embeddings = nn.Embedding(num_users, self.embed_dim)
        self.movie_embeddings = nn.Embedding(num_movies, self.embed_dim)

        self.flatten = nn.Flatten()

        self.deep_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Dropout(0.3),  # for regularization
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 16),
            nn.ReLU(),
        )

        self.final_layer = nn.Linear(self.embed_dim + 16, 1)  # adjusted input size

    def forward(self, batch):
        movie_emb = self.flatten(self.user_embeddings(batch['user']))
        user_emb = self.flatten(self.movie_embeddings(batch['movie']))

        fm = movie_emb * user_emb

        deep = torch.cat([movie_emb, user_emb], 1)
        deep = self.deep_layers(deep)

        v = torch.cat([fm, deep], 1)
        v = self.final_layer(v)
        # делаем сигмоиду на выходе и масштабируем к оценкам от 0 до 5
        return torch.sigmoid(v) * 5

EPOCHS_COUNT = 13
LEARNING_RATE = 1e-3

deep_mf_model = DeepFM(
    num_users=movielens_train.ratings['userId'].max() + 1,
    num_movies=len(movielens_train.movies)
).to(device)

deep_mf_loss = nn.MSELoss()
deep_mf_optimizer = torch.optim.Adam(deep_mf_model.parameters(), lr=LEARNING_RATE)

train(EPOCHS_COUNT, deep_mf_model, deep_mf_loss, deep_mf_optimizer)

  0%|          | 0/13 [00:00<?, ?it/s]

== Epoch 1 ==
loss: 2.642320  [  200/80685]
loss: 1.058934  [20200/80685]
loss: 1.260755  [40200/80685]
loss: 1.018093  [60200/80685]
loss: 0.798168  [80200/80685]


  8%|▊         | 1/13 [00:04<00:52,  4.39s/it]

Avg loss: 0.976862 

== Epoch 2 ==
loss: 0.947594  [  200/80685]
loss: 1.014058  [20200/80685]
loss: 1.026422  [40200/80685]
loss: 0.957744  [60200/80685]
loss: 1.018689  [80200/80685]


 15%|█▌        | 2/13 [00:09<00:51,  4.64s/it]

Avg loss: 0.925653 

== Epoch 3 ==
loss: 0.911419  [  200/80685]
loss: 1.021043  [20200/80685]
loss: 1.163471  [40200/80685]
loss: 0.861967  [60200/80685]
loss: 0.933053  [80200/80685]


 23%|██▎       | 3/13 [00:13<00:44,  4.43s/it]

Avg loss: 0.889290 

== Epoch 4 ==
loss: 0.959428  [  200/80685]
loss: 1.027751  [20200/80685]
loss: 0.832486  [40200/80685]
loss: 0.786244  [60200/80685]
loss: 0.842584  [80200/80685]


 31%|███       | 4/13 [00:17<00:38,  4.29s/it]

Avg loss: 0.860662 

== Epoch 5 ==
loss: 0.811056  [  200/80685]
loss: 0.987389  [20200/80685]
loss: 0.905670  [40200/80685]
loss: 0.775031  [60200/80685]
loss: 0.693482  [80200/80685]


 38%|███▊      | 5/13 [00:21<00:33,  4.23s/it]

Avg loss: 0.839266 

== Epoch 6 ==
loss: 0.784071  [  200/80685]
loss: 0.811723  [20200/80685]
loss: 0.821504  [40200/80685]
loss: 0.757574  [60200/80685]
loss: 0.889614  [80200/80685]


 46%|████▌     | 6/13 [00:25<00:29,  4.20s/it]

Avg loss: 0.816364 

== Epoch 7 ==
loss: 0.696388  [  200/80685]
loss: 0.749600  [20200/80685]
loss: 0.778311  [40200/80685]
loss: 0.878067  [60200/80685]
loss: 0.792375  [80200/80685]


 54%|█████▍    | 7/13 [00:29<00:25,  4.20s/it]

Avg loss: 0.803574 

== Epoch 8 ==
loss: 0.719129  [  200/80685]
loss: 0.677569  [20200/80685]
loss: 0.855662  [40200/80685]
loss: 0.810840  [60200/80685]
loss: 0.834628  [80200/80685]


 62%|██████▏   | 8/13 [00:34<00:21,  4.23s/it]

Avg loss: 0.790108 

== Epoch 9 ==
loss: 0.780617  [  200/80685]
loss: 0.722388  [20200/80685]
loss: 0.794400  [40200/80685]
loss: 0.766459  [60200/80685]
loss: 0.703725  [80200/80685]


 69%|██████▉   | 9/13 [00:38<00:16,  4.24s/it]

Avg loss: 0.799272 

== Epoch 10 ==
loss: 0.601091  [  200/80685]
loss: 0.631220  [20200/80685]
loss: 0.766165  [40200/80685]
loss: 0.733948  [60200/80685]
loss: 0.646526  [80200/80685]


 77%|███████▋  | 10/13 [00:42<00:12,  4.21s/it]

Avg loss: 0.789789 

== Epoch 11 ==
loss: 0.537196  [  200/80685]
loss: 0.811058  [20200/80685]
loss: 0.645603  [40200/80685]
loss: 0.571829  [60200/80685]
loss: 0.622958  [80200/80685]


 85%|████████▍ | 11/13 [00:46<00:08,  4.21s/it]

Avg loss: 0.784290 

== Epoch 12 ==
loss: 0.570571  [  200/80685]
loss: 0.637923  [20200/80685]
loss: 0.601875  [40200/80685]
loss: 0.654006  [60200/80685]
loss: 0.761951  [80200/80685]


 92%|█████████▏| 12/13 [00:51<00:04,  4.22s/it]

Avg loss: 0.789787 

== Epoch 13 ==
loss: 0.647972  [  200/80685]
loss: 0.696590  [20200/80685]
loss: 0.600250  [40200/80685]
loss: 0.661636  [60200/80685]
loss: 0.650217  [80200/80685]


100%|██████████| 13/13 [00:55<00:00,  4.26s/it]

Avg loss: 0.769187 






In [42]:
SUGGESTIONS_COUNT = 20

print("Movie Recommendations for me:")
new_user_id = movielens_train.ratings['userId'].max()
suggestions = suggest_movies(deep_mf_model, new_user_id, movielens_train.movies, suggestions_count=SUGGESTIONS_COUNT)
suggestions

Movie Recommendations for me:


Unnamed: 0,movieId,title,genres
9711,187717,Won't You Be My Neighbor? (2018),Documentary
9649,180095,Wonder (2017),Drama
7402,80083,Dragon Ball Z: Dead Zone (Doragon bôru Z 1: Or...,Action|Adventure|Animation|Fantasy|Sci-Fi
8878,134109,Radio Day (2008),Comedy
9236,153070,Rabbits (2002),Comedy|Drama|Fantasy
7521,84273,Zeitgeist: Moving Forward (2011),Documentary
9152,147410,A Perfect Day (2015),Comedy|Drama
3782,5288,"Atomic Cafe, The (1982)",Documentary|War
9560,173351,Wow! A Talking Fish! (1983),Animation|Children|Comedy|Fantasy
8877,134095,My Love (2006),Animation|Drama
