In [37]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import mlflow
from tqdm import tqdm
from itertools import product
from collections import Counter

In [38]:
# ---------------------------------------------------------------------------
# Load MovieLens (ml-latest-small) and keep only positive interactions.
# ---------------------------------------------------------------------------
path = '../data/raw/ml-latest-small/'

movies = pd.read_csv(path + 'movies.csv')
ratings = pd.read_csv(path + 'ratings.csv')

# Ratings >= 3.5 are treated as positive interactions. `.copy()` detaches the
# filtered frame from the original so the column assignments further down do
# not raise SettingWithCopyWarning.
ratings = ratings[ratings['rating'] >= 3.5].copy()

# Contiguous 0-based indices (used as embedding row numbers) and their inverses.
user_ids = ratings['userId'].unique().tolist()
movie_ids = ratings['movieId'].unique().tolist()
user_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}
idx_to_user = {idx: user_id for user_id, idx in user_to_idx.items()}
idx_to_movie = {idx: movie_id for movie_id, idx in movie_to_idx.items()}

ratings['user_idx'] = ratings['userId'].map(user_to_idx)
ratings['movie_idx'] = ratings['movieId'].map(movie_to_idx)

num_users = len(user_ids)
num_items = len(movie_ids)

# Split BEFORE deriving any per-user feature so that held-out interactions
# cannot leak into the user genre profile built below.
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

# ---------------------------------------------------------------------------
# Genre processing
# ---------------------------------------------------------------------------
MAX_GENRES = 5  # every genre list is padded / truncated to this length


def pad_genre_indices(genre_indices, max_len=MAX_GENRES):
    """Return a new list of exactly `max_len` genre indices.

    Shorter lists are right-padded with 0 (the padding index); longer lists
    are truncated to their first `max_len` entries.
    """
    trimmed = list(genre_indices[:max_len])
    return trimmed + [0] * (max_len - len(trimmed))


geners = set()
for genre_list in movies['genres']:
    geners.update(genre_list.split('|'))

# Index 0 is reserved for padding, so real genres start at 1.
gener_to_idx = {genre: idx + 1 for idx, genre in enumerate(sorted(geners))}
num_genres = len(geners) + 1  # embedding table size (includes padding index 0)

# movieId -> fixed-length list of genre indices.
movie_genre_map = {
    movie_id: pad_genre_indices([gener_to_idx.get(genre, 0) for genre in genre_str.split('|')])
    for movie_id, genre_str in zip(movies['movieId'], movies['genres'])
}

# Full ratings/movies join, kept under its original name.
merged = pd.merge(ratings, movies, on='movieId')

# userId -> fixed-length list of that user's most frequent genres, computed
# from TRAINING interactions only. Picking the top-MAX_GENRES genres by count
# (instead of the first MAX_GENRES genre tokens in row order) makes the profile
# reflect the whole history rather than whichever movie happens to come first.
train_merged = pd.merge(train_ratings, movies, on='movieId')

user_genre_map = {}
for user_id, group in train_merged.groupby('userId'):
    genre_counts = Counter(
        genre
        for genre_str in group['genres']
        for genre in genre_str.split('|')
    )
    top_genres = [genre for genre, _ in genre_counts.most_common(MAX_GENRES)]
    user_genre_map[user_id] = pad_genre_indices(
        [gener_to_idx.get(genre, 0) for genre in top_genres]
    )

In [39]:
class MovieLensTwoTowerDataset(Dataset):
    """Pointwise (user, item, label) samples with genre features.

    Every row of `df` becomes one positive sample (label 1.0). For each
    positive, `num_negatives` items are drawn uniformly at random for the same
    user and labelled 0.0. A drawn item that is a positive of that user in `df`
    is re-drawn up to `MAX_RESAMPLE_ROUNDS` times; anything still colliding
    afterwards is dropped, so no pair present in `df` is ever labelled 0.

    Args:
        df: DataFrame with integer columns 'user_idx' and 'movie_idx'.
        user_genre_map: dict mapping raw user id -> fixed-length genre index list.
        movie_genre_map: dict mapping raw movie id -> fixed-length genre index list.
        movie_ids: sequence of raw movie ids; its length bounds the sampled
            item indices.
        num_negatives: negatives drawn per positive row.
        idx_to_user: optional dict user_idx -> raw user id. When omitted, the
            module-level `idx_to_user` is used.
        idx_to_movie: optional dict movie_idx -> raw movie id. When omitted,
            the module-level `idx_to_movie` is used.
        seed: optional int. When given, negatives are drawn from a private
            `np.random.RandomState(seed)`; otherwise the global NumPy random
            state is used.
    """

    # How many times colliding negatives are re-drawn before being discarded.
    MAX_RESAMPLE_ROUNDS = 5
    # Genre-list width used only when both genre maps are empty.
    DEFAULT_GENRE_WIDTH = 5

    def __init__(self, df, user_genre_map, movie_genre_map, movie_ids, num_negatives=4,
                 idx_to_user=None, idx_to_movie=None, seed=None):
        # Fall back to the notebook-level index maps when none are passed in.
        if idx_to_user is None:
            idx_to_user = globals()['idx_to_user']
        if idx_to_movie is None:
            idx_to_movie = globals()['idx_to_movie']
        self._idx_to_user = idx_to_user
        self._idx_to_movie = idx_to_movie
        self._seed = seed

        self.users, self.items, self.labels, self.user_genres, self.item_genres = \
            self._get_dataset(df, user_genre_map, movie_genre_map, movie_ids, num_negatives)

    @classmethod
    def _genre_width(cls, *genre_maps):
        """Length of the genre lists stored in the maps (default if all are empty)."""
        for genre_map in genre_maps:
            for genres in genre_map.values():
                return len(genres)
        return cls.DEFAULT_GENRE_WIDTH

    @staticmethod
    def _lookup_genres(indices, idx_to_id, genre_map, width):
        """Return a (len(indices), width) int64 array of genre lists.

        Each distinct index is resolved once (index -> raw id -> genre list);
        ids missing from `genre_map` get an all-padding row.
        """
        unique_idx, inverse = np.unique(indices, return_inverse=True)
        rows = [genre_map.get(idx_to_id[int(i)], [0] * width) for i in unique_idx]
        table = np.array(rows, dtype=np.int64).reshape(len(unique_idx), width)
        return table[inverse.reshape(-1)]

    def _get_dataset(self, df, user_genre_map, movie_genre_map, movie_ids, num_negatives):
        """Build the sample arrays: users, items, labels, user_genres, item_genres."""
        num_items = len(movie_ids)
        width = self._genre_width(movie_genre_map, user_genre_map)
        rng = np.random if self._seed is None else np.random.RandomState(self._seed)

        # Positive samples: one per row of df.
        users_pos = np.asarray(df['user_idx'].values, dtype=np.int64)
        items_pos = np.asarray(df['movie_idx'].values, dtype=np.int64)

        # Negative samples: uniform draws over all item indices.
        users_neg = np.repeat(users_pos, num_negatives)
        items_neg = rng.randint(0, num_items, size=len(users_neg), dtype=np.int64)

        # Encode each (user, item) pair as one integer so membership in the
        # positive set can be tested with a vectorised np.isin.
        pos_keys = np.unique(users_pos * num_items + items_pos)
        pending = np.flatnonzero(np.isin(users_neg * num_items + items_neg, pos_keys))

        for _ in range(self.MAX_RESAMPLE_ROUNDS):
            if pending.size == 0:
                break
            items_neg[pending] = rng.randint(0, num_items, size=pending.size, dtype=np.int64)
            still_positive = np.isin(users_neg[pending] * num_items + items_neg[pending], pos_keys)
            pending = pending[still_positive]

        # Whatever still collides would be a mislabelled positive: discard it.
        if pending.size:
            keep = np.ones(len(users_neg), dtype=bool)
            keep[pending] = False
            users_neg = users_neg[keep]
            items_neg = items_neg[keep]

        users = np.concatenate([users_pos, users_neg])
        items = np.concatenate([items_pos, items_neg])
        labels = np.concatenate([
            np.ones(len(users_pos), dtype=np.float32),
            np.zeros(len(users_neg), dtype=np.float32),
        ])
        user_genres = self._lookup_genres(users, self._idx_to_user, user_genre_map, width)
        item_genres = self._lookup_genres(items, self._idx_to_movie, movie_genre_map, width)

        return users, items, labels, user_genres, item_genres

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        """Return (user_idx, item_idx, label, user_genres, item_genres) tensors."""
        return (
            torch.tensor(self.users[idx], dtype=torch.long),
            torch.tensor(self.items[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float),
            torch.tensor(self.user_genres[idx], dtype=torch.long),
            torch.tensor(self.item_genres[idx], dtype=torch.long)
        )


In [40]:
class MovieLensTwoTowerModel(nn.Module):
    """Two-tower scorer: dot product of a user vector and an item vector.

    Each tower's vector is the sum of an ID embedding and the mean of the
    embeddings of its non-padding genre indices. User and item towers share a
    single genre embedding table.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        num_genres: number of rows in the genre embedding table, including the
            padding index 0.
        embedding_dim: size of every embedding vector.
    """

    def __init__(self, num_users, num_items, num_genres, embedding_dim=64):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, embedding_dim=embedding_dim)
        self.item_emb = nn.Embedding(num_items, embedding_dim=embedding_dim)
        # Genre index 0 is padding and receives no gradient.
        self.genre_emb = nn.Embedding(num_genres, embedding_dim=embedding_dim, padding_idx=0)

        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)
        nn.init.normal_(self.genre_emb.weight, std=0.01)
        # normal_ re-initialises every row, including the padding row that
        # nn.Embedding had zeroed; restore it so padding stays a zero vector.
        with torch.no_grad():
            self.genre_emb.weight[self.genre_emb.padding_idx].zero_()

    @staticmethod
    def _masked_mean(genre_vectors, genre_idx):
        """Average genre embeddings over non-padding slots.

        genre_vectors: (batch, n_genres, dim); genre_idx: (batch, n_genres).
        Rows whose slots are all padding yield a zero vector.
        """
        # (batch, n_genres) -> (batch, n_genres, 1), 1.0 for real genres.
        mask = (genre_idx != 0).unsqueeze(-1).to(genre_vectors.dtype)
        summed = (genre_vectors * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1)  # avoid division by zero
        return summed / counts

    def forward(self, user_idx, item_idx, user_genre_idx, item_genre_idx):
        """Return one similarity logit per (user, item) pair, shape (batch,)."""
        user_emb = self.user_emb(user_idx)
        item_emb = self.item_emb(item_idx)

        # Mean pooling over genres, ignoring padding (index 0).
        user_genre_emb = self._masked_mean(self.genre_emb(user_genre_idx), user_genre_idx)
        item_genre_emb = self._masked_mean(self.genre_emb(item_genre_idx), item_genre_idx)

        # Combine the ID embedding with the pooled genre embedding.
        user_vec = user_emb + user_genre_emb
        item_vec = item_emb + item_genre_emb

        # Dot-product similarity.
        similarity = (user_vec * item_vec).sum(dim=1)

        return similarity

In [None]:

# Seed the global RNGs so negative sampling, weight initialisation and batch
# shuffling are repeatable across runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Always a torch.device, whichever backend is picked. `torch.backends.mps`
# is looked up defensively because older torch builds do not expose it.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Hyper-parameter grid and its cartesian product (not consumed by the smoke
# test below).
param_grid = {
    "learning_rate": [0.01, 0.001],
    "num_epochs": [10, 20],
    "embedding_dim": [64, 128],
    "num_negatives": [4, 6, 10],
}

keys = param_grid.keys()
combinations = list(product(*param_grid.values()))

# Sanity check: datasets and dataloaders can be built.
train_dataset = MovieLensTwoTowerDataset(train_ratings, user_genre_map, movie_genre_map, movie_ids, num_negatives=6)
test_dataset = MovieLensTwoTowerDataset(test_ratings, user_genre_map, movie_genre_map, movie_ids, num_negatives=6)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)

model = MovieLensTwoTowerModel(num_users, num_items, num_genres, embedding_dim=64)
model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Sanity check: the training loop runs end to end. The running mean loss is
# shown on the progress bar so a non-decreasing loss is visible immediately.
for epoch in range(5):
    model.train()
    running_loss = 0.0
    pbar = tqdm(train_dataloader, desc="Train")
    for step, batch in enumerate(pbar, start=1):
        user_batch, item_batch, label_batch, user_genre_batch, movie_genre_batch = (
            tensor.to(device) for tensor in batch
        )

        optimizer.zero_grad()
        outputs = model(user_batch, item_batch, user_genre_batch, movie_genre_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # refresh=False: let tqdm redraw on its own schedule instead of every step.
        pbar.set_postfix(epoch=epoch + 1, loss=running_loss / step, refresh=False)

Train: 100%|██████████| 2701/2701 [00:14<00:00, 181.26it/s]
Train: 100%|██████████| 2701/2701 [00:14<00:00, 182.02it/s]
Train: 100%|██████████| 2701/2701 [00:15<00:00, 178.58it/s]
Train: 100%|██████████| 2701/2701 [00:15<00:00, 177.72it/s]
Train: 100%|██████████| 2701/2701 [00:16<00:00, 166.38it/s]
