In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import mlflow
from tqdm import tqdm
from itertools import product
from collections import Counter

In [None]:
path = '../data/raw/ml-latest-small/'

movies = pd.read_csv(path + 'movies.csv')
ratings = pd.read_csv(path + 'ratings.csv')

ratings = ratings[ratings['rating'] >=   3.5]

user_ids = ratings['userId'].unique().tolist()
movie_ids = ratings['movieId'].unique().tolist()
user_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

# 장르 처리
MAX_GENRES = 5 # 영화당 최대 5개 장르
geners = set()

for genre_list in movies['genres']:
    geners.update(genre_list.split('|'))

# 0번 인덱스는 padding을 위한 인덱스
gener_to_idx = {genre: idx+1 for idx, genre in enumerate(sorted(geners))}

movie_genre_map = {}

for _, row in movies.iterrows():
    movie_id = row['movieId']
    genre_list = row['genres'].split('|')
    
    # 장르를 id로 변환
    genre_indices = [gener_to_idx.get(genre, 0) for genre in genre_list]
    
    # 장르가 MAX_GENRES보다 작으면 padding
    if len(genre_indices) < MAX_GENRES:
        genre_indices += [0] * (MAX_GENRES - len(genre_indices))
    else:
        genre_indices = genre_indices[:MAX_GENRES]
    
    movie_genre_map[movie_id] = genre_indices

# mean pooling을 위한 유저 장르 맵
user_genre_map = {}

merged = pd.merge(ratings, movies, on='movieId')

for user_id, group in merged.groupby('userId'):
    all_genres = [] # 한 사용자가 본 모든 장르 리스트
    for genre_str in group['genres']:
        all_genres.extend(genre_str.split('|'))

    if not all_genres:
        user_genre_map[user_id] = 0
        continue

    # user가 본 모든 장르를 담음
    user_genre_map[user_id] = [gener_to_idx.get(genre, 0) for genre in all_genres]

ratings['user_idx'] = ratings['userId'].map(user_to_idx)
ratings['movie_idx'] = ratings['movieId'].map(movie_to_idx)

num_users = len(user_ids)
num_items = len(movie_ids)
num_genres = len(geners) + 1 # 임베딩 레이어 크기 (0번을 포함)

7363
0            0
1            1
2            2
3            3
4            4
          ... 
100830     696
100831    2124
100832    1259
100833    2125
100834     988
Name: movie_idx, Length: 61716, dtype: int64


In [None]:
class MovieLensTwoTowerDataset(Dataset):
    def __init__(self, df, user_genre_map, movie_genre_map, movie_ids, num_negatives=4):
        self.users, self.items, self.labels, self.user_genres, self.item_genres = \
            self._get_dataset(df, user_genre_map, movie_genre_map, movie_ids, num_negatives)

    def _get_dataset(self, df, user_genre_map, movie_genre_map, movie_ids, num_negatives):
        # postivie 샘플링
        users_pos = df['user_idx'].values
        items_pos = df['movie_idx'].values
        labels = np.ones(len(users_pos), dtype=np.float32)

        user_genres_pos = [user_genre_map.get(user, 0) for user in users_pos]
        item_genres_pos = [item_genre_map.get(item, 0) for item in items_pos]

        # negative 샘플링
        num_items = len(movie_ids)
        pos_set = set(zip(users_pos, items_pos))

        user_neg = np.repeat(users_pos, num_negatives)
        item_neg = movie_ids[np.random.randint(0, num_items, size=len(users_neg))]
        label_neg = np.zeros(len(user_neg), dtype=np.float32)

        # positive를 제외한 negative 샘플링
        for _ in range(5):
            mask = np.array([(u, i) in pos_set for u, i in zip(user_neg, item_neg)])
            if np.sum(mask) == 0:
                break
            item_neg[mask] = movie_ids[np.random.randint(0, num_items, size=np.sum(mask))]

        user_genre_neg = [user_genre_map[u] for user in user_neg]
        item_genre_neg = [item_genre_map.get(item, [0]*5) for item in item_neg]

        users = np.concatenate([users_pos, user_neg])
        items = np.concatenate([items_pos, item_neg])
        labels = np.concatenate([labels, label_neg])
        user_genres = np.concatenate([user_genres_pos, user_genre_neg])
        item_genres = np.concatenate([item_genres_pos, item_genre_neg])

        return users, items, labels, user_genres, item_genres

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.users[idx], dtype=torch.long),
            torch.tensor(self.items[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float),
            torch.tensor(self.user_genres[idx], dtype=torch.long),
            torch.tensor(self.item_genres[idx], dtype=torch.long)
        )


In [5]:
# Negative Sampling 추가
positive_samples = ratings[['userId', 'movieId']].copy()
positive_samples['label'] = 1

# 모든 유저-아이템 조합에서 positive를 제외한 것이 negative
all_user_item_pairs = set(product(user_ids, movie_ids))
positive_pairs = set(zip(positive_samples['userId'], positive_samples['movieId']))
negative_pairs = all_user_item_pairs - positive_pairs

# Negative sampling (positive 개수만큼 샘플링)
negative_samples = pd.DataFrame(
    list(negative_pairs), 
    columns=['userId', 'movieId']
).sample(n=len(positive_samples), random_state=42)
negative_samples['label'] = 0

# 전체 데이터 합치기
df = pd.concat([positive_samples, negative_samples], ignore_index=True)

# ============================
# Train/Test Split (stratified)
# ============================
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label']  # positive/negative 비율 유지
)

print(f'Train size: {len(train_df)}, Test size: {len(test_df)}')
print(f"Train label ratio: {train_df['label'].mean():.3f}")
print(f"Test label ratio: {test_df['label'].mean():.3f}")

# ============================
# Dataset & DataLoader 생성
# ============================
train_dataset = MovieLensTwoTowerDataset(train_df, user_genre_map, movie_genre_map)
test_dataset = MovieLensTwoTowerDataset(test_df, user_genre_map, movie_genre_map)

BATCH_SIZE = 256

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f'Train batches: {len(train_loader)}, Test batches: {len(test_loader)}')

Train size: 98745, Test size: 24687
Train label ratio: 0.500
Test label ratio: 0.500
Train batches: 386, Test batches: 97
