# Лаб-3. Рекомендательные системы

In [57]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import random

# Pick the compute device: CUDA is used only when it is both requested via
# USE_CUDA and reported as available by torch; otherwise fall back to the CPU.
USE_CUDA = False
if USE_CUDA and torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f'Device: {device}')

Device: cpu


In [58]:
# Для загрузки датасета напишем свою реализацию класса Dataset
class MovielensDataset(Dataset):
    r"""MovieLens ratings exposed as a PyTorch ``Dataset``.

    Reads ``ratings.csv`` and ``movies.csv`` from ``source``, replaces every
    raw ``movieId`` by the row position of that movie in ``movies.csv`` and
    keeps either the training part (80%) or the test part (20%) of the ratings.

    ``seed`` must be the same for the training and the test dataset; with
    different seeds the two parts come from different shuffles and overlap.

    Args:
        source: directory containing ``ratings.csv`` and ``movies.csv``.
        train: keep the training part when true, the test part otherwise.
        seed: random state used for the split.
        new_user_ratings: optional iterable of ``(movie_index, rating)`` pairs
            for one additional user. ``movie_index`` is a row position in
            ``movies.csv`` (not a raw ``movieId``). The user gets the id
            ``max(userId) + 1``.

    Attributes:
        movies: the movies table as read from ``movies.csv``.
        ratings: the selected part of the ratings, ``movieId`` already mapped
            to row positions.
    """

    # Share of the ratings that goes into the training part.
    _TRAIN_FRACTION = 0.8

    def __init__(self, source, train=True, seed=1, new_user_ratings=None):
        ratings = pd.read_csv(rf"{source}/ratings.csv")
        self.movies = pd.read_csv(rf"{source}/movies.csv")

        # Map raw movie ids to row positions in the movies table.
        movie_id_map = pd.Series(self.movies.index, index=self.movies['movieId']).to_dict()
        ratings = ratings.assign(movieId=ratings['movieId'].map(movie_id_map))
        # A rating whose movie is absent from movies.csv maps to NaN, which
        # cannot be used as an index; drop it and restore the integer dtype.
        ratings = ratings.dropna(subset=['movieId']).astype({'movieId': 'int64'})

        # Split the original ratings BEFORE adding the extra user. This way the
        # partition depends only on ``seed``, so a training set built with
        # ``new_user_ratings`` and a test set built without it stay disjoint.
        base_train = ratings.sample(frac=self._TRAIN_FRACTION, random_state=seed)
        base_test = ratings.drop(base_train.index)
        selected = base_train if train else base_test

        new_user_ratings = list(new_user_ratings) if new_user_ratings is not None else []
        if new_user_ratings:
            new_user_id = ratings['userId'].max() + 1
            new_ratings = pd.DataFrame([
                {
                    'userId': new_user_id,
                    'movieId': movie_idx,
                    'rating': rating,
                } for movie_idx, rating in new_user_ratings
            ])
            # The extra user's ratings are split with the same fraction and
            # seed and appended to the matching part.
            new_train = new_ratings.sample(frac=self._TRAIN_FRACTION, random_state=seed)
            new_part = new_train if train else new_ratings.drop(new_train.index)
            selected = pd.concat([selected, new_part], ignore_index=True)

        self.ratings = selected

    def __len__(self):
        """Return the number of ratings in the selected part."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return the ``idx``-th rating as a dict of one-element tensors.

        Keys: ``"user"`` and ``"movie"`` (int64), ``"rating"`` (float32).
        """
        sample = self.ratings.iloc[idx]
        return {
            "user": torch.tensor([int(sample['userId'])], dtype=torch.long),
            "movie": torch.tensor([int(sample['movieId'])], dtype=torch.long),
            "rating": torch.tensor([float(sample['rating'])], dtype=torch.float32),
        }

def generate_random_ratings(num_movies, num_ratings=20):
    """Build random ``(movie_index, rating)`` pairs for a synthetic user.

    ``num_ratings`` distinct indices are sampled from ``range(num_movies)``;
    each gets a rating drawn uniformly between 1 and 5.
    """
    picked = random.sample(range(num_movies), num_ratings)
    pairs = []
    for movie_idx in picked:
        pairs.append((movie_idx, random.uniform(1, 5)))
    return pairs

def suggest_movies(model, user_id, movies_df, suggestions_count=10):
    """Return the rows of ``movies_df`` that the model scores highest for a user.

    Every movie (row position ``0 .. len(movies_df) - 1``) is scored for
    ``user_id`` in a single forward pass and the ``suggestions_count`` best
    rows are returned in descending score order.

    Args:
        model: module called with ``{"user": ..., "movie": ...}``, both index
            tensors of shape ``(len(movies_df), 1)``; expected to return one
            score per row with shape ``(len(movies_df), 1)``.
        user_id: index of the user to recommend for.
        movies_df: movies table; row position is used as the movie index.
        suggestions_count: maximum number of rows to return.

    Returns:
        A slice of ``movies_df`` with at most ``suggestions_count`` rows.

    Note:
        The model is switched to eval mode and left in that mode. Movies the
        user has already rated are not filtered out.
    """
    # Build the inputs on the device the model actually lives on instead of
    # relying on a module-level device setting that may not match the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    with torch.no_grad():
        movie_indices = torch.arange(len(movies_df), dtype=torch.long, device=model_device)
        user_indices = torch.full_like(movie_indices, int(user_id))
        scores = model({
            "user": user_indices.unsqueeze(1),
            "movie": movie_indices.unsqueeze(1),
        })
        scores = scores.squeeze(1)
        best = scores.argsort(descending=True)[:suggestions_count]
    return movies_df.iloc[best.cpu().numpy()]

In [59]:
BATCH_SIZE = 200
DATASET_SOURCE = r'./data'
MOCK_RATINGS_COUNT = 20

# The test split is loaded first because the size of the movie catalogue is
# needed to draw the mock ratings.
movielens_test  = MovielensDataset(DATASET_SOURCE, train=False)

# generate_random_ratings takes the number of movies first and the number of
# ratings second; passing only MOCK_RATINGS_COUNT made it sample from the
# first 20 movie indices only.
mock_ratings = generate_random_ratings(len(movielens_test.movies), MOCK_RATINGS_COUNT)

# Hand-picked ratings keyed by the raw movieId quoted in each trailing comment.
# Not referenced anywhere in the visible code.
# NOTE(review): MovielensDataset stores new_user_ratings movie values as row
# positions in movies.csv, so these raw ids would need mapping before use.
RATINGS = [
    (111, 5.0), # 111,Taxi Driver (1976),Crime|Drama|Thriller
    (55444, 4.5), # 55444,Control (2007),Drama
    (88129, 5.0), # 88129,Drive (2011),Crime|Drama|Film-Noir|Thriller
    (99114, 5.0), # 99114,Django Unchained (2012),Action|Drama|Western
    (27156, 4.5), # 27156,"Neon Genesis Evangelion: The End of Evangelion (Shin seiki Evangelion Gekijô-ban: Air/Magokoro wo, kimi ni) (1997)",Action|Animation|Drama|Fantasy|Sci-Fi
    (47423, 4.0), # 47423,Half Nelson (2006),Drama
    (4306, 5.0), # 4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Romance
    (8360, 5.0), # 8360,Shrek 2 (2004),Adventure|Animation|Children|Comedy|Musical|Romance
    (53121, 5.0), # 53121,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy
    (541, 5.0), # 541,Blade Runner (1982),Action|Sci-Fi|Thriller
    (122886,2.0), # 122886,Star Wars: Episode VII - The Force Awakens (2015),Action|Adventure|Fantasy|Sci-Fi|IMAX
    (5444, 5.0), # 5444,Lilo & Stitch (2002),Adventure|Animation|Children|Sci-Fi
    (171749, 4.0), # 171749,Death Note: Desu nôto (2006–2007),(no genres listed)
    (47, 4.5), # 47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
    (1201, 5.0), # 1201,"Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)",Action|Adventure|Western
    (2951, 5.0), # 2951,"Fistful of Dollars, A (Per un pugno di dollari) (1964)",Action|Western
    (64614, 5.0), # 64614,Gran Torino (2008),Crime|Drama
    (72737, 5.0), # 72737,"Princess and the Frog, The (2009)",Animation|Children|Fantasy|Musical|Romance
    (101525, 3.5), # 101525,"Place Beyond the Pines, The (2012)",Crime|Drama
    (31658, 5.0), # 31658,Howl's Moving Castle (Hauru no ugoku shiro) (2004),Adventure|Animation|Fantasy|Romance
]

movielens_train = MovielensDataset(DATASET_SOURCE, train=True, new_user_ratings=mock_ratings)

# Training batches are shuffled; evaluation batches are not, so repeated
# evaluations iterate the test split in the same order.
train_loader = DataLoader(movielens_train, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(movielens_test, batch_size=BATCH_SIZE, shuffle=False)

# Sanity check: print the shape of every tensor in the first training batch.
for batch in train_loader:
    for k, v in batch.items():
        print(k, v.shape)
    break

user torch.Size([200, 1])
movie torch.Size([200, 1])
rating torch.Size([200, 1])


In [60]:
# Функции для обучения из прошлой лабы, с учётом юзеров и айтемов

def train_iteration(model, data_loader, loss_function, optimizer):
    """Run one training epoch over ``data_loader``.

    For every batch: forward pass, loss against ``batch['rating']``, backward
    pass, optimizer step, gradient reset. Progress is printed on every 100th
    batch.

    Args:
        model: module called with the whole batch dict.
        data_loader: loader yielding dicts of tensors with a ``'rating'`` key.
        loss_function: callable ``(prediction, target) -> scalar loss tensor``.
        optimizer: optimizer over the model's parameters.
    """
    # Move batches to the device the model lives on rather than relying on a
    # module-level device setting.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    train_size = len(data_loader.dataset)
    seen = 0  # samples processed so far, independent of any global batch size
    for idx, batch in enumerate(data_loader):
        batch = {k: v.to(model_device) for k, v in batch.items()}
        pred = model(batch)
        loss = loss_function(pred, batch['rating'])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        seen += len(batch['rating'])
        if idx % 100 == 0:
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{train_size:>5d}]")

def test(model, data_loader, loss_function):
    """Evaluate ``model`` on ``data_loader`` and report the average batch loss.

    The model is switched to eval mode and gradients are disabled. The
    reported value is the mean of the per-batch losses (each batch counts
    equally, whatever its size).

    Args:
        model: module called with the whole batch dict.
        data_loader: loader yielding dicts of tensors with a ``'rating'`` key.
        loss_function: callable ``(prediction, target) -> scalar loss tensor``.

    Returns:
        The average loss as a float; ``nan`` when the loader yields no batches.
    """
    # Move batches to the device the model lives on rather than relying on a
    # module-level device setting.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(model_device) for k, v in batch.items()}
            pred = model(batch)
            batch_losses.append(loss_function(pred, batch['rating']).item())

    # An empty loader used to raise ZeroDivisionError; report nan instead.
    if batch_losses:
        avg_loss = sum(batch_losses) / len(batch_losses)
    else:
        avg_loss = float("nan")
    print(f"Avg loss: {avg_loss:>8f} \n")
    return avg_loss


def train(epochs, model, loss_function, optimizer, train_data_loader=None, test_data_loader=None):
    """Train ``model`` for ``epochs`` epochs, evaluating after each one.

    Args:
        epochs: number of passes over the training data.
        model: module to train.
        loss_function: callable ``(prediction, target) -> scalar loss tensor``.
        optimizer: optimizer over the model's parameters.
        train_data_loader: loader for training; defaults to the module-level
            ``train_loader``.
        test_data_loader: loader for evaluation; defaults to the module-level
            ``test_loader``.
    """
    # Fall back to the notebook-wide loaders so existing calls keep working.
    if train_data_loader is None:
        train_data_loader = train_loader
    if test_data_loader is None:
        test_data_loader = test_loader

    for t in tqdm(range(epochs)):
        print(f"== Epoch {t + 1} ==")
        train_iteration(model, train_data_loader, loss_function, optimizer)
        test(model, test_data_loader, loss_function)


In [61]:
# Scratch check: combined width of two 32-wide embeddings, i.e. the value
# DeepFM computes for deep_input_dim from embeddings_dim = [32, 32].
sum([32,32])

64

In [62]:
class DeepFM(nn.Module):
    """Rating predictor combining an embedding product with an MLP.

    Users and movies are embedded into 32-dimensional vectors. Two branches
    are computed from them and concatenated before a final linear layer:

    * the element-wise product of the two embeddings (``fm_dim`` values);
    * an MLP over the concatenated embeddings (``deep_output_dim`` values).

    The output is ``sigmoid(...) * 5``, i.e. a value between 0 and 5.

    Args:
        num_users: number of rows in the user embedding table.
        num_movies: number of rows in the movie embedding table.
    """

    def __init__(self, num_users=1000, num_movies=10000):
        super().__init__()

        # [user embedding width, movie embedding width]
        self.embeddings_dim = [32, 32]
        self.fm_dim = self.embeddings_dim[0]

        user_dim, movie_dim = self.embeddings_dim
        self.user_embeddings = nn.Embedding(num_users, user_dim)
        self.movie_embeddings = nn.Embedding(num_movies, movie_dim)

        self.deep_input_dim = sum(self.embeddings_dim)
        self.deep_linear_dim = 128
        self.deep_output_dim = 128

        self.flatten = nn.Flatten()

        # Flatten followed by three Linear -> ReLU -> Dropout stages.
        widths = [
            self.deep_input_dim,
            self.deep_linear_dim,
            self.deep_linear_dim,
            self.deep_output_dim,
        ]
        dropout_rates = [0.5, 0.6, 0.7]
        stages = [nn.Flatten()]
        for in_features, out_features, rate in zip(widths, widths[1:], dropout_rates):
            stages.append(nn.Linear(in_features, out_features))
            stages.append(nn.ReLU())
            stages.append(nn.Dropout(rate))
        self.deep_layers = nn.Sequential(*stages)

        # Input is the MLP output concatenated with the embedding product.
        self.final_layer = nn.Linear(self.deep_output_dim + self.fm_dim, 1)

    def forward(self, batch):
        """Predict ratings for a batch dict with ``'user'`` and ``'movie'`` index tensors."""
        user_vec = self.flatten(self.user_embeddings(batch['user']))
        movie_vec = self.flatten(self.movie_embeddings(batch['movie']))

        interaction = user_vec * movie_vec

        hidden = self.deep_layers(torch.cat([user_vec, movie_vec], 1))

        logits = self.final_layer(torch.cat([interaction, hidden], 1))
        # Sigmoid on the output, scaled to the 0..5 rating range.
        return torch.sigmoid(logits) * 5

EPOCHS_COUNT = 12
LEARNING_RATE = 1e-3

# Size the user embedding table from BOTH splits: a user id that occurs only
# in the test split would otherwise index past the end of the table.
max_user_id = pd.concat([
    movielens_train.ratings['userId'],
    movielens_test.ratings['userId'],
]).max()

deep_mf_model = DeepFM(
    num_users=int(max_user_id) + 1,
    num_movies=len(movielens_train.movies),
).to(device)

deep_mf_loss = nn.MSELoss()
deep_mf_optimizer = torch.optim.Adam(deep_mf_model.parameters(), lr=LEARNING_RATE)

train(EPOCHS_COUNT, deep_mf_model, deep_mf_loss, deep_mf_optimizer)

  0%|          | 0/12 [00:00<?, ?it/s]

== Epoch 1 ==
loss: 2.482069  [  200/80685]
loss: 1.103876  [20200/80685]
loss: 0.918766  [40200/80685]
loss: 1.127464  [60200/80685]
loss: 1.067473  [80200/80685]


  8%|▊         | 1/12 [00:04<00:54,  4.98s/it]

Avg loss: 1.039148 

== Epoch 2 ==
loss: 0.904474  [  200/80685]
loss: 1.079380  [20200/80685]
loss: 1.114872  [40200/80685]
loss: 1.002349  [60200/80685]
loss: 0.850086  [80200/80685]


 17%|█▋        | 2/12 [00:09<00:45,  4.59s/it]

Avg loss: 0.951010 

== Epoch 3 ==
loss: 1.010133  [  200/80685]
loss: 0.770265  [20200/80685]
loss: 0.830922  [40200/80685]
loss: 1.105410  [60200/80685]
loss: 0.904671  [80200/80685]


 25%|██▌       | 3/12 [00:14<00:42,  4.74s/it]

Avg loss: 0.905547 

== Epoch 4 ==
loss: 0.759846  [  200/80685]
loss: 0.716846  [20200/80685]
loss: 0.887310  [40200/80685]
loss: 0.806256  [60200/80685]
loss: 0.945911  [80200/80685]


 33%|███▎      | 4/12 [00:18<00:36,  4.57s/it]

Avg loss: 0.889341 

== Epoch 5 ==
loss: 0.835617  [  200/80685]
loss: 0.784559  [20200/80685]
loss: 0.825776  [40200/80685]
loss: 0.868418  [60200/80685]
loss: 0.858549  [80200/80685]


 42%|████▏     | 5/12 [00:22<00:31,  4.45s/it]

Avg loss: 0.852828 

== Epoch 6 ==
loss: 0.878177  [  200/80685]
loss: 0.763795  [20200/80685]
loss: 0.840801  [40200/80685]
loss: 0.769634  [60200/80685]
loss: 0.825126  [80200/80685]


 50%|█████     | 6/12 [00:27<00:27,  4.53s/it]

Avg loss: 0.830706 

== Epoch 7 ==
loss: 0.866474  [  200/80685]
loss: 0.807480  [20200/80685]
loss: 0.770772  [40200/80685]
loss: 0.706116  [60200/80685]
loss: 0.850535  [80200/80685]


 58%|█████▊    | 7/12 [00:32<00:23,  4.68s/it]

Avg loss: 0.816970 

== Epoch 8 ==
loss: 0.665293  [  200/80685]
loss: 0.722152  [20200/80685]
loss: 0.619567  [40200/80685]
loss: 0.896103  [60200/80685]
loss: 0.758516  [80200/80685]


 67%|██████▋   | 8/12 [00:36<00:18,  4.62s/it]

Avg loss: 0.801850 

== Epoch 9 ==
loss: 0.814984  [  200/80685]
loss: 0.854755  [20200/80685]
loss: 0.951071  [40200/80685]
loss: 0.724962  [60200/80685]
loss: 0.763381  [80200/80685]


 75%|███████▌  | 9/12 [00:41<00:13,  4.63s/it]

Avg loss: 0.809219 

== Epoch 10 ==
loss: 0.809213  [  200/80685]
loss: 0.682481  [20200/80685]
loss: 0.690941  [40200/80685]
loss: 0.687914  [60200/80685]
loss: 0.959663  [80200/80685]


 83%|████████▎ | 10/12 [00:45<00:09,  4.54s/it]

Avg loss: 0.795539 

== Epoch 11 ==
loss: 0.846716  [  200/80685]
loss: 0.782209  [20200/80685]
loss: 0.698917  [40200/80685]
loss: 0.885329  [60200/80685]
loss: 0.719368  [80200/80685]


 92%|█████████▏| 11/12 [00:50<00:04,  4.49s/it]

Avg loss: 0.784551 

== Epoch 12 ==
loss: 0.638244  [  200/80685]
loss: 0.745285  [20200/80685]
loss: 0.900345  [40200/80685]
loss: 0.729906  [60200/80685]
loss: 0.772844  [80200/80685]


100%|██████████| 12/12 [00:54<00:00,  4.56s/it]

Avg loss: 0.776100 






In [63]:
SUGGESTIONS_COUNT = 20

print("Movie Recommendations for me:")
# MovielensDataset gives the injected user the id max(userId) + 1, so the
# largest user id in the training split is presumed to be that user.
new_user_id = movielens_train.ratings['userId'].max()
suggestions = suggest_movies(
    model=deep_mf_model,
    user_id=new_user_id,
    movies_df=movielens_train.movies,
    suggestions_count=SUGGESTIONS_COUNT,
)
suggestions

Movie Recommendations for me:


Unnamed: 0,movieId,title,genres
3451,4708,Marat/Sade (1966),Drama|Musical
8717,126088,A Flintstones Christmas Carol (1994),Animation|Children|Comedy
8113,100843,Oh Boy (A Coffee in Berlin) (2012),Comedy|Drama
4292,6270,Akira Kurosawa's Dreams (Dreams) (1990),Drama|Fantasy
5205,8484,"Human Condition I, The (Ningen no joken I) (1959)",Drama|War
5286,8730,To End All Wars (2001),Action|Drama|War
6189,44943,9/11 (2002),Documentary
906,1204,Lawrence of Arabia (1962),Adventure|Drama|War
7535,84844,Brother 2 (Brat 2) (2000),Crime|Drama
9146,147374,"Doctor Who: The Doctor, the Widow and the Ward...",Adventure|Drama
