In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import mlflow

In [20]:
# Load the MovieLens (ml-latest-small) ratings and movie metadata.
path = "../data/raw/ml-latest-small/"
ratings = pd.read_csv(path + "ratings.csv")
movies = pd.read_csv(path + "movies.csv")

# Treat only ratings at or above this threshold as positive interactions.
MIN_POSITIVE_RATING = 3.5

# .copy() detaches the filtered rows from the frame returned by read_csv, so
# the column assignments below write to an independent frame instead of a
# slice (which is what triggers pandas' SettingWithCopyWarning).
ratings = ratings[ratings['rating'] >= MIN_POSITIVE_RATING].copy()

# Map raw ids to contiguous 0-based indices, in order of first appearance.
# Only users/movies that still have at least one row after filtering get an index.
user_ids = ratings['userId'].unique().tolist()
movie_ids = ratings['movieId'].unique().tolist()
user_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

ratings['user_idx'] = ratings['userId'].map(user_to_idx)
ratings['movie_idx'] = ratings['movieId'].map(movie_to_idx)

num_users = len(user_ids)
num_items = len(movie_ids)

print(f"Number of users: {num_users}, Number of movies: {num_items}")

Number of users: 609, Number of movies: 7363


In [21]:
# NCF용 데이터셋 클래스
class MovieLensNCFDataset(Dataset):
    """Implicit-feedback dataset of (user, item, label) triples for NCF training.

    Every row of ``ratings_df`` becomes a positive example (label 1.0) and is
    paired with randomly drawn items labelled 0.0 (negatives).

    Args:
        ratings_df: DataFrame with integer ``user_idx`` and ``movie_idx`` columns.
        num_items: Size of the item index space; negatives are drawn uniformly
            from ``[0, num_items)``. The collision checks assume every
            ``movie_idx`` value lies in that range.
        num_negatives: Number of negatives drawn per positive row. A user who
            has already interacted with every item has no valid negative, so
            the exact samplers emit none for that user.

    Note:
        Sampling uses NumPy's global RNG (``np.random``); call
        ``np.random.seed(...)`` beforehand for reproducible negatives.
    """

    # Vectorised rejection rounds attempted before falling back to an exact
    # per-user draw for whatever collisions are still left.
    _MAX_RESAMPLE_ROUNDS = 5

    def __init__(self, ratings_df, num_items, num_negatives=3):
        self.users, self.items, self.labels = self._get_dataset_optimized(ratings_df, num_items, num_negatives)

    @staticmethod
    def _to_tensors(users, items, labels):
        """Convert parallel user/item/label sequences to (long, long, float32) tensors."""
        return (
            torch.tensor(users, dtype=torch.long),
            torch.tensor(items, dtype=torch.long),
            torch.tensor(labels, dtype=torch.float32),
        )

    def _get_dataset(self, ratings_df, num_items, num_negatives):
        """Reference implementation (educational): per-pair rejection sampling.

        Duplicate (user, item) rows are collapsed because positives are held in
        a set. Each negative is redrawn until it is not a known positive, so no
        positive pair is ever labelled 0.0.

        Returns:
            Tuple ``(users, items, labels)`` of 1-D tensors of equal length.
        """
        users, items, labels = [], [], []
        # The index column is named 'movie_idx' everywhere else in this class.
        user_item_set = set(zip(ratings_df['user_idx'], ratings_df['movie_idx']))

        # Number of distinct positives per user, used to detect users for whom
        # the rejection loop below could never terminate.
        positives_per_user = {}
        for u, _ in user_item_set:
            positives_per_user[u] = positives_per_user.get(u, 0) + 1

        for (u, i) in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1.0)

            # Every item is already a positive for this user: there is nothing
            # to sample, and redrawing would loop forever.
            if positives_per_user[u] >= num_items:
                continue

            for _ in range(num_negatives):
                neg_item = np.random.randint(num_items)
                while (u, neg_item) in user_item_set:
                    neg_item = np.random.randint(num_items)
                users.append(u)
                items.append(neg_item)
                labels.append(.0)

        return self._to_tensors(users, items, labels)

    def _get_dataset_vectorized(self, ratings_df, num_items, num_negatives):
        """Fully vectorised sampling WITHOUT any collision check.

        Negatives are drawn uniformly from ``[0, num_items)`` and never compared
        against the positives, so a sampled "negative" may actually be one of
        the user's positive items.

        Returns:
            Tuple ``(users, items, labels)`` of 1-D tensors of equal length.
        """
        user_pos = ratings_df['user_idx'].values
        item_pos = ratings_df['movie_idx'].values
        label_pos = np.ones(len(user_pos), dtype=np.float32)

        user_neg = np.repeat(user_pos, num_negatives)
        item_neg = np.random.randint(0, num_items, size=len(user_neg))
        label_neg = np.zeros(len(user_neg), dtype=np.float32)

        users = np.concatenate([user_pos, user_neg])
        items = np.concatenate([item_pos, item_neg])
        labels = np.concatenate([label_pos, label_neg])

        return self._to_tensors(users, items, labels)

    def _get_dataset_optimized(self, ratings_df, num_items, num_negatives):
        """Vectorised sampling with collision removal.

        Draws all negatives at once, then redraws the ones that collide with a
        positive for up to ``_MAX_RESAMPLE_ROUNDS`` rounds. Any collision still
        present after the last round is resolved exactly by drawing from the
        user's remaining (non-positive) items, so no positive pair is labelled
        0.0. Negatives of a user with no remaining items are dropped.

        Returns:
            Tuple ``(users, items, labels)`` of 1-D tensors of equal length.
        """
        user_pos = ratings_df['user_idx'].values
        item_pos = ratings_df['movie_idx'].values
        label_pos = np.ones(len(user_pos), dtype=np.float32)

        # Encode each (user, item) pair as a single integer so membership can
        # be tested with one np.isin call.
        pos_hashed = user_pos * num_items + item_pos

        user_neg = np.repeat(user_pos, num_negatives)
        item_neg = np.random.randint(0, num_items, size=len(user_neg))

        for _ in range(self._MAX_RESAMPLE_ROUNDS):
            mask = np.isin(user_neg * num_items + item_neg, pos_hashed)
            if not mask.any():
                break
            item_neg[mask] = np.random.randint(0, num_items, size=mask.sum())
        else:
            # The final redraw above has not been checked yet; verify it and
            # fix any leftovers exactly instead of silently keeping them.
            mask = np.isin(user_neg * num_items + item_neg, pos_hashed)
            if mask.any():
                keep = np.ones(len(user_neg), dtype=bool)
                all_items = np.arange(num_items)
                for u in np.unique(user_neg[mask]):
                    slots = np.flatnonzero(mask & (user_neg == u))
                    candidates = np.setdiff1d(all_items, item_pos[user_pos == u])
                    if candidates.size == 0:
                        # This user has interacted with every item.
                        keep[slots] = False
                    else:
                        item_neg[slots] = np.random.choice(candidates, size=slots.size)
                user_neg = user_neg[keep]
                item_neg = item_neg[keep]

        label_neg = np.zeros(len(user_neg), dtype=np.float32)

        users = np.concatenate([user_pos, user_neg])
        items = np.concatenate([item_pos, item_neg])
        labels = np.concatenate([label_pos, label_neg])

        return self._to_tensors(users, items, labels)

    def __len__(self):
        """Return the total number of (positive + negative) examples."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` triple at position ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

# Build the training dataset.
# Negative sampling inside MovieLensNCFDataset draws from NumPy's global RNG,
# so seed it here to get the same sampled negatives on every run.
SEED = 42
np.random.seed(SEED)

NUM_NEGATIVES = 6  # negatives drawn per positive interaction
train_dataset = MovieLensNCFDataset(ratings_df=ratings, num_items=num_items, num_negatives=NUM_NEGATIVES)
print(f"데이터셋 크기: {len(train_dataset)}")

데이터셋 크기: 432012


In [22]:
# Number of (user, item, label) examples per mini-batch.
BATCH_SIZE = 256

# Wrap the training dataset in a DataLoader that reshuffles on every pass.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Pull a single batch to sanity-check the tensor shapes.
data_iter = iter(train_loader)
user_batch, item_batch, label_batch = next(data_iter)

print("User Batch Shape:", user_batch.size())
print("Item Batch Shape:", item_batch.size())
print("Label Batch Shape:", label_batch.size())

User Batch Shape: torch.Size([256])
Item Batch Shape: torch.Size([256])
Label Batch Shape: torch.Size([256])
