# Лаб-3. Рекомендательные системы

In [7]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import random

# Выбираем девайс
USE_CUDA = False
device = "cuda" if USE_CUDA and torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

Device: cpu


In [8]:
OPTIMAL_TAGS_PER_PAIR_COUNT = 6
# Для загрузки датасета напишем свою реализацию класса Dataset
class MovielensDataset(Dataset):
    r"""seed должен быть одинаковым для обучающей и тренировочной выборки"""
    def __init__(self, source, train=True, seed=1, new_user_ratings=None, max_tags_per_pair=OPTIMAL_TAGS_PER_PAIR_COUNT):
        self.max_tags_per_pair = max_tags_per_pair
        ratings      = pd.read_csv(rf"{source}/ratings.csv")
        self.movies  = pd.read_csv(rf"{source}/movies.csv")
        self.tags = pd.read_csv(rf"{source}/tags.csv")
        
        title_basics = pd.read_csv(f"{source}/title.basics.filtered.csv", dtype={
            'imdbId': str,  # make sure imdbId is treated as a string
            'isAdult': 'int8',  # isAdult should be a binary column (0 or 1)
            'primaryTitle': str,
            'originalTitle': str,
            'startYear': str,
            'endYear': str,
            'runtimeMinutes': str,
            'genres': str
        })
        # Step 2: Parse links.csv to link movieId and imdbId
        links = pd.read_csv(f"{source}/links.csv", dtype={'movieId': 'int32', 'imdbId': str, 'tmdbId': str})
        
        # Merge links df with title_basics df on imdbId. This allows us to map movieId to the corresponding title_basics data
        movie_details = pd.merge(links, title_basics[['imdbId', 'isAdult']], on='imdbId', how='left')
        # merge isAdult column into self.movies    
        self.movies = pd.merge(self.movies, movie_details[['movieId', 'isAdult']], on='movieId', how='left')

    
        # Преобразовываем Id фильмов в индексы в таблице movies
        # x = self.movies.loc[:,['movieId']]
        # x['movieId'], x.index = x.index, x['movieId'].values
        # ratings['movieId'] = ratings['movieId'].map(x.to_dict()['movieId'])
        
        movie_id_map = pd.Series(self.movies.index, index=self.movies['movieId']).to_dict()
        ratings['movieId'] = ratings['movieId'].map(movie_id_map)
        
        self.tag_id_map = {
            tag: idx
            for idx, tag in enumerate(self.tags['tag'].unique())
        }
        self.tags['movieId'] = self.tags['movieId'].map(movie_id_map)
        self.tags['tagId'] = self.tags['tag'].map(self.tag_id_map)
        print(self.tags)
        
        if new_user_ratings:
            new_user_id = ratings['userId'].max() + 1
            new_ratings = pd.DataFrame([
                {
                    'userId': new_user_id,
                    'movieId': movie_idx,
                    'rating': rating
                } for movie_idx, rating in new_user_ratings
            ])
            ratings = pd.concat([ratings, new_ratings], ignore_index=True)

        # делим датасет 80% на 20%
        train_data = ratings.sample(frac=0.8, random_state=seed)
        test_data  = ratings.drop(train_data.index)

        self.ratings = train_data if train else test_data

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        sample = self.ratings.iloc[idx]
        user, movie = sample['userId'], sample['movieId']
        tag_ids = self.tags[(self.tags['userId'] == user) & (self.tags['movieId'] == movie)]['tagId'].tolist()
        
        # pad/truncate tag_ids to fixed size
        if len(tag_ids) < self.max_tags_per_pair:
            tag_ids += [0] * (self.max_tags_per_pair - len(tag_ids))
        else:
            tag_ids = tag_ids[:self.max_tags_per_pair] 
        
        return {
            "user": torch.LongTensor([user]),
            "movie": torch.LongTensor([movie]),
            "rating": torch.FloatTensor([sample['rating']]),
            "tags": torch.LongTensor(tag_ids)
        }

def generate_random_ratings(num_movies, num_ratings=20):
    random_movies = random.sample(range(num_movies), num_ratings)
    ratings = [(movie_idx, random.uniform(1, 5)) for movie_idx in random_movies]
    return ratings

def suggest_movies(model, user_id, movies_df, tags_tensor=None, suggestions_count=10):
    model.eval()
    with torch.no_grad():
        all_movie_ids = torch.arange(len(movies_df), dtype=torch.long).to(device)
        user_tensor = torch.LongTensor([user_id] * len(all_movie_ids)).to(device)
        
        input_tags_tensor = torch.zeros(len(movies_df), OPTIMAL_TAGS_PER_PAIR_COUNT, dtype=torch.long) if tags_tensor is None else tags_tensor # lol
        
        predictions = model({
            "user": user_tensor.unsqueeze(1),
            "movie": all_movie_ids.unsqueeze(1),
            "tags": input_tags_tensor.to(device),
        })
        predictions = predictions.squeeze(1)
        recommended_ids = predictions.argsort(descending=True)[:suggestions_count]
        return movies_df.iloc[recommended_ids.cpu().numpy()]

In [9]:
BATCH_SIZE = 200
DATASET_SOURCE = r'./data'
MOCK_RATINGS_COUNT = 20

mock_ratings = generate_random_ratings(MOCK_RATINGS_COUNT)
RATINGS = [
    (111, 5.0), # 111,Taxi Driver (1976),Crime|Drama|Thriller
    (55444, 4.5), # 55444,Control (2007),Drama
    (88129, 5.0), # 88129,Drive (2011),Crime|Drama|Film-Noir|Thriller
    (99114, 5.0), # 99114,Django Unchained (2012),Action|Drama|Western
    (27156, 4.5), # 27156,"Neon Genesis Evangelion: The End of Evangelion (Shin seiki Evangelion Gekijô-ban: Air/Magokoro wo, kimi ni) (1997)",Action|Animation|Drama|Fantasy|Sci-Fi
    (47423, 4.0), # 47423,Half Nelson (2006),Drama
    (4306, 5.0), # 4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Romance
    (8360, 5.0), # 8360,Shrek 2 (2004),Adventure|Animation|Children|Comedy|Musical|Romance
    (53121, 5.0), # 53121,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy
    (541, 5.0), # 541,Blade Runner (1982),Action|Sci-Fi|Thriller
    (122886,2.0), # 122886,Star Wars: Episode VII - The Force Awakens (2015),Action|Adventure|Fantasy|Sci-Fi|IMAX
    (5444, 5.0), # 5444,Lilo & Stitch (2002),Adventure|Animation|Children|Sci-Fi
    (171749, 4.0), # 171749,Death Note: Desu nôto (2006–2007),(no genres listed)
    (47, 4.5), # 47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
    (1201, 5.0), # 1201,"Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)",Action|Adventure|Western
    (2951, 5.0), # 2951,"Fistful of Dollars, A (Per un pugno di dollari) (1964)",Action|Western
    (64614, 5.0), # 64614,Gran Torino (2008),Crime|Drama
    (72737, 5.0), # 72737,"Princess and the Frog, The (2009)",Animation|Children|Fantasy|Musical|Romance
    (101525, 3.5), # 101525,"Place Beyond the Pines, The (2012)",Crime|Drama
    (31658, 5.0), # 31658,Howl's Moving Castle (Hauru no ugoku shiro) (2004),Adventure|Animation|Fantasy|Romance
]

movielens_train = MovielensDataset(DATASET_SOURCE, train=True, new_user_ratings=mock_ratings, max_tags_per_pair=OPTIMAL_TAGS_PER_PAIR_COUNT)
movielens_test  = MovielensDataset(DATASET_SOURCE, train=False, max_tags_per_pair=OPTIMAL_TAGS_PER_PAIR_COUNT)

train_loader = DataLoader(movielens_train, BATCH_SIZE, True)
test_loader = DataLoader(movielens_test, BATCH_SIZE, True)

for batch in train_loader:
    for k, v in batch.items():
        print(k, v.shape)
    break

      userId  movieId               tag   timestamp  tagId
0          2     6801             funny  1445714994      0
1          2     6801   Highly quotable  1445714996      1
2          2     6801      will ferrell  1445714992      2
3          2     7697      Boxing story  1445715207      3
4          2     7697               MMA  1445715200      4
...      ...      ...               ...         ...    ...
3678     606     4925         for katie  1171234019   1584
3679     606     5062           austere  1173392334   1585
3680     610     2452            gun fu  1493843984   1586
3681     610     2452  heroic bloodshed  1493843978   1587
3682     610     9461  Heroic Bloodshed  1493844270   1588

[3683 rows x 5 columns]
      userId  movieId               tag   timestamp  tagId
0          2     6801             funny  1445714994      0
1          2     6801   Highly quotable  1445714996      1
2          2     6801      will ferrell  1445714992      2
3          2     7697      Boxi

In [10]:
# Функции для обучения из прошлой лабы, с учётом юзеров и айтемов

def train_iteration(model, data_loader, loss_function, optimizer):
    model.train()
    train_size = len(data_loader.dataset)
    for idx, batch in enumerate(data_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        pred = model(batch)
        loss = loss_function(pred, batch['rating'])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if idx % 100 == 0:
            loss, current = loss.item(), (idx + 1) * BATCH_SIZE
            print(f"loss: {loss:>7f}  [{current:>5d}/{train_size:>5d}]")

def test(model, data_loader, loss_function):
    model.eval()
    num_batches = len(data_loader)
    loss = 0
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            pred = model(batch)
            loss += loss_function(pred, batch['rating']).item()

    loss /= num_batches
    print(f"Avg loss: {loss:>8f} \n")


def train(epochs, model, loss_function, optimizer):
    for t in tqdm(range(epochs)):
        print(f"== Epoch {t + 1} ==")
        train_iteration(model, train_loader, loss_function, optimizer)
        test(model, test_loader, loss_function)


In [11]:
sum([32,32])

64

In [12]:
class DeepFM(nn.Module):
    def __init__(self, num_users=1000, num_movies=10000, num_tags=5000):
        super().__init__()
       
        self.embeddings_dim = [32, 32, 16]
        self.fm_dim = self.embeddings_dim[0]
        
        self.user_embeddings = nn.Embedding(num_users, self.embeddings_dim[0])
        self.movie_embeddings = nn.Embedding(num_movies, self.embeddings_dim[1])
        self.tag_embeddings = nn.Embedding(num_tags, self.embeddings_dim[2], padding_idx=0)

        self.deep_input_dim = sum(self.embeddings_dim)
        self.deep_linear_dim = 128
        self.deep_output_dim = 128
        
        self.flatten = nn.Flatten()
        self.deep_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(self.deep_input_dim, self.deep_linear_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(self.deep_linear_dim, self.deep_linear_dim),
            nn.ReLU(),
            nn.Dropout(0.6),
            nn.Linear(self.deep_linear_dim, self.deep_output_dim),
            nn.ReLU(),
            nn.Dropout(0.7),
        )

        self.final_layer = nn.Linear(self.deep_output_dim + self.fm_dim, 1)  # adjusted input size

    def forward(self, batch):
        movie_emb = self.flatten(self.user_embeddings(batch['user']))
        user_emb = self.flatten(self.movie_embeddings(batch['movie']))

        # Compute mean of tag embeddings while ignoring padding (0)
        tag_emb = self.tag_embeddings(batch['tags'])  # Shape: [batch_size, max_tags_per_pair, tag_embedding_dim]
        mask = (batch['tags'] != 0).float().unsqueeze(2)  # Shape: [batch_size, max_tags_per_pair, 1] (.unsqueeze(2) adds an extra dimension to make the mask compatible with tag_emb)
        tag_emb = (tag_emb * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)  # Avoid division by 0
        
        fm = movie_emb * user_emb

        deep = torch.cat([movie_emb, user_emb, tag_emb], 1)
        deep = self.deep_layers(deep)

        v = torch.cat([fm, deep], 1)
        v = self.final_layer(v)
        # делаем сигмоиду на выходе и масштабируем к оценкам от 0 до 5
        return torch.sigmoid(v) * 5

EPOCHS_COUNT = 1
LEARNING_RATE = 1e-3

deep_mf_model = DeepFM(
    num_users=movielens_train.ratings['userId'].max() + 1,
    num_movies=len(movielens_train.movies),
    num_tags=len(movielens_train.tag_id_map)
).to(device)

deep_mf_loss = nn.MSELoss()
deep_mf_optimizer = torch.optim.Adam(deep_mf_model.parameters(), lr=LEARNING_RATE)

train(EPOCHS_COUNT, deep_mf_model, deep_mf_loss, deep_mf_optimizer)

  0%|          | 0/1 [00:00<?, ?it/s]

== Epoch 1 ==


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


loss: 2.412887  [  200/80685]
loss: 1.277790  [20200/80685]


  0%|          | 0/1 [00:12<?, ?it/s]


KeyboardInterrupt: 

In [None]:
SUGGESTIONS_COUNT = 20

print("Movie Recommendations for me:")
new_user_id = movielens_train.ratings['userId'].max()
suggestions = suggest_movies(deep_mf_model, new_user_id, movielens_train.movies, suggestions_count=SUGGESTIONS_COUNT)
suggestions