In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import mlflow
from tqdm import tqdm
from itertools import product

In [2]:
# MovieLens 데이터 로드
path = "../data/raw/ml-latest-small/"
ratings = pd.read_csv(path + "ratings.csv")
movies = pd.read_csv(path + "movies.csv")

ratings = ratings[ratings['rating'] >=   3.5]

user_ids = ratings['userId'].unique().tolist()
movie_ids = ratings['movieId'].unique().tolist()
user_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

ratings['user_idx'] = ratings['userId'].map(user_to_idx)
ratings['movie_idx'] = ratings['movieId'].map(movie_to_idx)

num_users = len(user_ids)
num_items = len(movie_ids)

print(f"Number of users: {num_users}, Number of movies: {num_items}")

Number of users: 609, Number of movies: 7363


In [3]:
# NCF용 데이터셋 클래스
class MovieLensNCFDataset(Dataset):
    def __init__(self, ratings_df, num_items, num_negatives=3):
        self.users, self.items, self.labels = self._get_dataset_optimized(ratings_df, num_items, num_negatives)

    # 기본 구현 (교육용))    
    def _get_dataset(self, ratings_df, num_items, num_negatives):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings_df['user_idx'], ratings_df['item_idx']))

        for (u, i) in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1.0)

            for _ in range(num_negatives):
                neg_item = np.random.randint(num_items)
                while (u, neg_item) in user_item_set:
                    neg_item = np.random.randint(num_items)
                users.append(u)
                items.append(neg_item)
                labels.append(.0)

        return torch.tensor(users, dtype=torch.long), torch.tensor(items, dtype=torch.long), torch.tensor(labels, dtype=torch.float32)

    # 벡터화된 구현 (하지만 negative 샘플링이 완전하지 않음, 실제로 positive 샘플이 포함될 수 있음)
    def _get_dataset_vectorized(self, ratings_df, num_items, num_negatives):
        user_pos = ratings_df['user_idx'].values
        item_pos = ratings_df['movie_idx'].values
        label_pos= np.ones(len(user_pos), dtype=np.float32)

        user_neg = np.repeat(user_pos, num_negatives)
        item_neg = np.random.randint(0, num_items, size=len(user_neg))
        label_neg = np.zeros(len(user_neg), dtype=np.float32)

        users = np.concatenate([user_pos, user_neg])
        items = np.concatenate([item_pos, item_neg])
        labels = np.concatenate([label_pos, label_neg])

        return torch.tensor(users, dtype=torch.long), torch.tensor(items, dtype=torch.long), torch.tensor(labels, dtype=torch.float32)
    
    # 최적화된 구현 (negative 샘플링이 완전하지는 않을 확률이 극히 적음, 사실상 완전하다고 봐도 됨)
    def _get_dataset_optimized(self, ratings_df, num_items, num_negatives):
        user_pos = ratings_df['user_idx'].values
        item_pos = ratings_df['movie_idx'].values
        label_pos = np.ones(len(user_pos), dtype=np.float32)

        pos_hashed = user_pos * num_items + item_pos

        user_neg = np.repeat(user_pos, num_negatives)
        item_neg = np.random.randint(0, num_items, size=len(user_neg))
        label_neg = np.zeros(len(user_neg), dtype=np.float32)
        
        for _ in range(5):
            neg_hashed = user_neg * num_items + item_neg

            mask = np.isin(neg_hashed, pos_hashed)

            if np.sum(mask) == 0:
                break
            
            item_neg[mask] = np.random.randint(0, num_items, size=mask.sum())
        
        users = np.concatenate([user_pos, user_neg])
        items = np.concatenate([item_pos, item_neg])
        labels = np.concatenate([label_pos, label_neg])

        return torch.tensor(users, dtype=torch.long), torch.tensor(items, dtype=torch.long), torch.tensor(labels, dtype=torch.float32)
        
    
    def __len__(self):
        return len(self.users)
    
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

# 데이터셋 생성
train_dataset = MovieLensNCFDataset(ratings_df=ratings, num_items=num_items, num_negatives=6)
print(f"데이터셋 크기: {len(train_dataset)}")

데이터셋 크기: 432012


In [4]:
# 배치 사이즈
BATCH_SIZE = 256

# 데이터로더 생성
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

data_iter = iter(train_loader)
user_batch, item_batch, label_batch = next(data_iter)


print("User Batch Shape:", user_batch.shape)
print("Item Batch Shape:", item_batch.shape)
print("Label Batch Shape:", label_batch.shape)

User Batch Shape: torch.Size([256])
Item Batch Shape: torch.Size([256])
Label Batch Shape: torch.Size([256])


In [5]:
# NCF Two Tower Model
class NCFModel(nn.Module):
    def __init__(self, num_users, num_items, latent_dim=32):
        super().__init__()

        # 임베딩 레이어 (Lookup Table)
        self.user_embedding = nn.Embedding(num_users, latent_dim)
        self.item_embedding = nn.Embedding(num_items, latent_dim)

        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)

    def forward(self, user_indices, item_indices):
        user_vec = self.user_embedding(user_indices)
        item_vec = self.item_embedding(item_indices)

        # 배치 차원에서 두 벡터간의 내적 계산
        dot_product = (user_vec * item_vec).sum(dim=1)

        return dot_product

In [7]:
# cpu, cuda, mps 중 사용 가능한 디바이스 선택
device = "cpu"
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")

# 하이퍼파라미터 설정
param_grid = {
    "epochs": [5],
    "lr": [0.001, 0.0001],
    "batch_size": [512, 1024],
    "latent_dim": [32, 64, 128],
    "num_negatives": [4]
}

keys = param_grid.keys()
# 모든 하이퍼파라미터 조합 생성
combinations = list(product(*param_grid.values()))

mlflow.set_experiment("NCF_MovieLens_Recsys")

for i, values in enumerate(combinations):
    params = dict(zip(keys, values))
    run_name = f"NCF_lr{params['lr']}_bs{params['batch_size']}_dim{params['latent_dim']}"
    print(f"\n[{i+1}/{len(combinations)}] {run_name} 시작!!!")
    
    train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)

    model = NCFModel(num_users=num_users, num_items=num_items, latent_dim=params["latent_dim"])
    model = model.to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=params["lr"])


    with mlflow.start_run(run_name="Basic_Two_Tower_NCF"):
        mlflow.log_params(params)
        model.train()
        best_loss = float('inf')

        for epoch in range(params["epochs"]):
            epoch_loss = 0.0
            pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{params['epochs']}")
            for user_batch, item_batch, label_batch in pbar:
                
                # 배치를 GPU로 이동
                user_batch = user_batch.to(device)
                item_batch = item_batch.to(device)
                label_batch = label_batch.to(device)
                
                # 순전파
                outputs = model(user_batch, item_batch) # 예측 값 계산
                loss = criterion(outputs, label_batch) # 손실 계산 
                
                # 역전파
                optimizer.zero_grad() # 기울기 초기화
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                pbar.set_postfix({"loss": loss.item()})

            avg_loss = epoch_loss / len(train_loader)
            mlflow.log_metric("train_loss", avg_loss, step=epoch)

            if avg_loss < best_loss:
                best_loss = avg_loss

        mlflow.log_metric("best_loss", best_loss)
        mlflow.pytorch.log_model(model, "model")
        print(f"Best loss: {best_loss:.4f}")
        
print("모든 실험 완료")


[1/12] NCF_lr0.001_bs512_dim32 시작!!!


Epoch 1/5: 100%|██████████| 844/844 [00:04<00:00, 173.50it/s, loss=0.338]
Epoch 2/5: 100%|██████████| 844/844 [00:04<00:00, 180.40it/s, loss=0.277]
Epoch 3/5: 100%|██████████| 844/844 [00:04<00:00, 184.59it/s, loss=0.261]
Epoch 4/5: 100%|██████████| 844/844 [00:04<00:00, 180.66it/s, loss=0.251]
Epoch 5/5: 100%|██████████| 844/844 [00:04<00:00, 179.88it/s, loss=0.261]


Best loss: 0.2706

[2/12] NCF_lr0.001_bs512_dim64 시작!!!


Epoch 1/5: 100%|██████████| 844/844 [00:05<00:00, 153.19it/s, loss=0.344]
Epoch 2/5: 100%|██████████| 844/844 [00:05<00:00, 160.94it/s, loss=0.297]
Epoch 3/5: 100%|██████████| 844/844 [00:05<00:00, 153.93it/s, loss=0.267]
Epoch 4/5: 100%|██████████| 844/844 [00:05<00:00, 154.77it/s, loss=0.285]
Epoch 5/5: 100%|██████████| 844/844 [00:05<00:00, 161.59it/s, loss=0.266]


Best loss: 0.2456

[3/12] NCF_lr0.001_bs512_dim128 시작!!!


Epoch 1/5: 100%|██████████| 844/844 [00:06<00:00, 133.59it/s, loss=0.347]
Epoch 2/5: 100%|██████████| 844/844 [00:06<00:00, 133.09it/s, loss=0.235]
Epoch 3/5: 100%|██████████| 844/844 [00:06<00:00, 131.89it/s, loss=0.253]
Epoch 4/5: 100%|██████████| 844/844 [00:06<00:00, 134.79it/s, loss=0.225]
Epoch 5/5: 100%|██████████| 844/844 [00:06<00:00, 133.16it/s, loss=0.168]


Best loss: 0.1986

[4/12] NCF_lr0.001_bs1024_dim32 시작!!!


Epoch 1/5: 100%|██████████| 422/422 [00:03<00:00, 113.74it/s, loss=0.357]
Epoch 2/5: 100%|██████████| 422/422 [00:03<00:00, 114.89it/s, loss=0.264]
Epoch 3/5: 100%|██████████| 422/422 [00:03<00:00, 112.65it/s, loss=0.287]
Epoch 4/5: 100%|██████████| 422/422 [00:03<00:00, 112.54it/s, loss=0.306]
Epoch 5/5: 100%|██████████| 422/422 [00:03<00:00, 119.81it/s, loss=0.271]


Best loss: 0.2740

[5/12] NCF_lr0.001_bs1024_dim64 시작!!!


Epoch 1/5: 100%|██████████| 422/422 [00:03<00:00, 111.63it/s, loss=0.337]
Epoch 2/5: 100%|██████████| 422/422 [00:03<00:00, 113.72it/s, loss=0.32] 
Epoch 3/5: 100%|██████████| 422/422 [00:03<00:00, 114.06it/s, loss=0.313]
Epoch 4/5: 100%|██████████| 422/422 [00:03<00:00, 116.60it/s, loss=0.299]
Epoch 5/5: 100%|██████████| 422/422 [00:03<00:00, 113.69it/s, loss=0.281]


Best loss: 0.2628

[6/12] NCF_lr0.001_bs1024_dim128 시작!!!


Epoch 1/5: 100%|██████████| 422/422 [00:04<00:00, 99.45it/s, loss=0.277] 
Epoch 2/5: 100%|██████████| 422/422 [00:04<00:00, 99.76it/s, loss=0.301] 
Epoch 3/5: 100%|██████████| 422/422 [00:04<00:00, 102.95it/s, loss=0.293]
Epoch 4/5: 100%|██████████| 422/422 [00:04<00:00, 99.39it/s, loss=0.248] 
Epoch 5/5: 100%|██████████| 422/422 [00:04<00:00, 99.05it/s, loss=0.236] 


Best loss: 0.2390

[7/12] NCF_lr0.0001_bs512_dim32 시작!!!


Epoch 1/5: 100%|██████████| 844/844 [00:04<00:00, 170.18it/s, loss=0.692]
Epoch 2/5: 100%|██████████| 844/844 [00:04<00:00, 176.55it/s, loss=0.656]
Epoch 3/5: 100%|██████████| 844/844 [00:04<00:00, 182.67it/s, loss=0.576]
Epoch 4/5: 100%|██████████| 844/844 [00:04<00:00, 174.45it/s, loss=0.484]
Epoch 5/5: 100%|██████████| 844/844 [00:04<00:00, 178.26it/s, loss=0.407]


Best loss: 0.4608

[8/12] NCF_lr0.0001_bs512_dim64 시작!!!


Epoch 1/5: 100%|██████████| 844/844 [00:05<00:00, 152.19it/s, loss=0.69] 
Epoch 2/5: 100%|██████████| 844/844 [00:05<00:00, 146.38it/s, loss=0.624]
Epoch 3/5: 100%|██████████| 844/844 [00:06<00:00, 140.10it/s, loss=0.494]
Epoch 4/5: 100%|██████████| 844/844 [00:05<00:00, 148.71it/s, loss=0.406]
Epoch 5/5: 100%|██████████| 844/844 [00:05<00:00, 145.72it/s, loss=0.348]


Best loss: 0.3773

[9/12] NCF_lr0.0001_bs512_dim128 시작!!!


Epoch 1/5: 100%|██████████| 844/844 [00:06<00:00, 136.63it/s, loss=0.689]
Epoch 2/5: 100%|██████████| 844/844 [00:06<00:00, 134.02it/s, loss=0.578]
Epoch 3/5: 100%|██████████| 844/844 [00:06<00:00, 133.86it/s, loss=0.401]
Epoch 4/5: 100%|██████████| 844/844 [00:06<00:00, 138.33it/s, loss=0.357]
Epoch 5/5: 100%|██████████| 844/844 [00:06<00:00, 129.58it/s, loss=0.316]


Best loss: 0.3230

[10/12] NCF_lr0.0001_bs1024_dim32 시작!!!


Epoch 1/5: 100%|██████████| 422/422 [00:03<00:00, 121.09it/s, loss=0.693]
Epoch 2/5: 100%|██████████| 422/422 [00:03<00:00, 120.01it/s, loss=0.686]
Epoch 3/5: 100%|██████████| 422/422 [00:03<00:00, 117.12it/s, loss=0.653]
Epoch 4/5: 100%|██████████| 422/422 [00:03<00:00, 121.23it/s, loss=0.601]
Epoch 5/5: 100%|██████████| 422/422 [00:03<00:00, 115.77it/s, loss=0.538]


Best loss: 0.5737

[11/12] NCF_lr0.0001_bs1024_dim64 시작!!!


Epoch 1/5: 100%|██████████| 422/422 [00:03<00:00, 107.23it/s, loss=0.693]
Epoch 2/5: 100%|██████████| 422/422 [00:04<00:00, 102.62it/s, loss=0.679]
Epoch 3/5: 100%|██████████| 422/422 [00:04<00:00, 104.78it/s, loss=0.618]
Epoch 4/5: 100%|██████████| 422/422 [00:03<00:00, 107.39it/s, loss=0.536]
Epoch 5/5: 100%|██████████| 422/422 [00:03<00:00, 116.29it/s, loss=0.452]


Best loss: 0.4924

[12/12] NCF_lr0.0001_bs1024_dim128 시작!!!


Epoch 1/5: 100%|██████████| 422/422 [00:04<00:00, 99.43it/s, loss=0.692] 
Epoch 2/5: 100%|██████████| 422/422 [00:04<00:00, 98.75it/s, loss=0.664] 
Epoch 3/5: 100%|██████████| 422/422 [00:04<00:00, 101.19it/s, loss=0.557]
Epoch 4/5: 100%|██████████| 422/422 [00:04<00:00, 102.31it/s, loss=0.454]
Epoch 5/5: 100%|██████████| 422/422 [00:04<00:00, 99.15it/s, loss=0.37]  


Best loss: 0.4032
모든 실험 완료
