# GNN 기반 영화 추천 시스템
# V11A_FAST: In-batch Negative Mining (MacBook M4 최적화)
# Step 2 - 빠른 breakthrough 실험

In [1]:
import os
import pickle
import random
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import degree

from tqdm.notebook import tqdm

# 시각화 설정
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("=" * 60)
print("환경 설정")
print("=" * 60)
print(f"PyTorch 버전: {torch.__version__}")
print(f"CUDA 사용 가능: {torch.cuda.is_available()}")
print(f"MPS 사용 가능: {torch.backends.mps.is_available()}")
print("=" * 60)

환경 설정
PyTorch 버전: 2.9.0
CUDA 사용 가능: False
MPS 사용 가능: True


In [None]:
# Hyperparameter settings (V11A_FAST - in-batch negative mining).
# Single source of configuration read by the cells below.
CONFIG = {
    # Filtering thresholds.
    # NOTE(review): not read by any visible cell; presumably applied when the
    # V6 splits were produced - confirm.
    'min_user_interactions': 30,
    'min_item_interactions': 10,
    'rating_threshold': None,
    
    # Split ratios.
    # NOTE(review): not read by any visible cell (the splits are loaded from CSV).
    'train_ratio': 0.70,
    'valid_ratio': 0.15,
    'test_ratio': 0.15,
    
    # Model parameters (passed to the LightGCN constructor)
    'embedding_dim': 64,
    'n_layers': 2,
    
    # Training parameters
    'learning_rate': 0.001,
    'weight_decay': 1e-4,
    'batch_size': 512,
    'epochs': 100,
    'patience': 20,   # epochs without validation-recall improvement before early stop
    'neg_ratio': 6,   # negatives sampled per positive interaction
    
    # InfoNCE parameter (scores are divided by this value before the softmax)
    'temperature': 0.2,
    
    # Evaluation cut-off K used during validation
    'top_k': 10,
    
    # System: prefer CUDA, then Apple MPS, then CPU
    'device': 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'),
    'seed': 42,
    
    # Paths (relative to the notebook's working directory)
    'data_dir': '../data',
    'processed_dir': '../data/processed',
    'model_dir': '../models',
    'result_dir': '../results',
}

# Create the output folders up front so later saves cannot fail on a missing directory.
for dir_path in [CONFIG['processed_dir'], CONFIG['model_dir'], CONFIG['result_dir']]:
    os.makedirs(dir_path, exist_ok=True)

# Random seed 고정
def set_seed(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch, plus the
    CUDA and MPS generators when those backends report as available.

    Args:
        seed: integer seed applied to all generators.
    """
    # Backend-independent generators are always seeded, in this order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Accelerator generators are only touched when the backend exists.
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed(seed)

    mps_ready = torch.backends.mps.is_available()
    if mps_ready:
        torch.mps.manual_seed(seed)

# Fix all RNG seeds before any sampling or model initialisation happens.
set_seed(CONFIG['seed'])

# Banner summarising the experiment: the selected device and the switch from
# random negatives (V9c) to in-batch negatives (V11a_fast).
print("=" * 60)
print("V11A_FAST 설정 완료! (In-batch Negative Mining)")
print("=" * 60)
print(f"Device: {CONFIG['device']}")
print("\n⚡ 핵심: Random → In-batch Negative (MacBook M4 최적화)")
print(f"  V9c: Random sampling")
print(f"  V11a_fast: In-batch only → V9c와 동일한 속도!")
print(f"\n전략:")
print(f"  - 같은 배치 내 다른 유저의 positive items 사용")
print(f"  - 모델 forward 불필요 → 매우 빠름")
print(f"\n예상: Recall@10 > 17% (V9c 속도로!)")
print("=" * 60)

## 데이터 로드 (V6 전처리 파일 사용)

In [3]:
# Load the splits and id mappings that the V6 preprocessing wrote to disk.
print("=" * 60)
print("V6 전처리 데이터 로드")
print("=" * 60)

# Train / validation / test interaction tables (read from CONFIG['processed_dir']).
train_df = pd.read_csv(os.path.join(CONFIG['processed_dir'], 'train_split_v6.csv'))
valid_df = pd.read_csv(os.path.join(CONFIG['processed_dir'], 'valid_split_v6.csv'))
test_df = pd.read_csv(os.path.join(CONFIG['processed_dir'], 'test_split_v6.csv'))

# NOTE(review): pickle.load can execute arbitrary code; this is only safe because
# the file is a locally produced artifact. Never point it at untrusted input.
with open(os.path.join(CONFIG['processed_dir'], 'id_mappings_v6.pkl'), 'rb') as f:
    mappings = pickle.load(f)

# The number of users / items is taken from the size of the id maps.
n_users = len(mappings['user_id_map'])
n_items = len(mappings['item_id_map'])

print(f"\n데이터 로드 완료!")
print(f"  Users: {n_users}")
print(f"  Items: {n_items}")
print(f"  Train: {len(train_df):,}")
print(f"  Valid: {len(valid_df):,}")
print(f"  Test:  {len(test_df):,}")
print("\n✅ V6와 동일한 데이터 사용 → 공정한 비교 가능")

V6 전처리 데이터 로드

데이터 로드 완료!
  Users: 529
  Items: 2283
  Train: 56,072
  Valid: 11,826
  Test:  12,572

✅ V6와 동일한 데이터 사용 → 공정한 비교 가능


## Graph 구성

In [4]:
# Graph 구성
print("=" * 60)
print("Graph 구성")
print("=" * 60)

def create_graph(df, n_users, n_items):
    """Build the bidirectional edge index of the user-item bipartite graph.

    User ids are used directly as node indices; item ids are shifted by
    ``n_users`` so both node types share one index space of size
    ``n_users + n_items``.

    Args:
        df: DataFrame with integer ``user_id`` and ``item_id`` columns, one
            row per interaction.
        n_users: number of users; also the offset added to item ids.
        n_items: number of items (only used in the printed summary).

    Returns:
        ``torch.long`` tensor of shape ``(2, 2 * len(df))``. The first half of
        the columns are user -> item edges, the second half the reversed
        item -> user edges.
    """
    user_nodes = df['user_id'].to_numpy()
    item_nodes = df['item_id'].to_numpy() + n_users

    # Stack into ONE ndarray before handing it to torch. Passing a Python list
    # of ndarrays to torch.tensor() goes through a slow per-element conversion
    # and emits a UserWarning.
    edges = np.stack([
        np.concatenate([user_nodes, item_nodes]),
        np.concatenate([item_nodes, user_nodes]),
    ])
    edge_index = torch.as_tensor(edges, dtype=torch.long)

    print(f"Node 수: {n_users + n_items} (User: {n_users}, Item: {n_items})")
    print(f"Edge 수: {edge_index.shape[1]:,} (양방향)")

    return edge_index

# Graph used during training and validation: train interactions only.
print("\n[1] Train Graph (학습용)")
train_edge_index = create_graph(train_df, n_users, n_items)

# Graph used for the final test evaluation: train and validation interactions
# are concatenated so that validation edges are also propagated over.
print("\n[2] Train+Valid Graph (Test 평가용)")
train_valid_df = pd.concat([train_df, valid_df])
train_valid_edge_index = create_graph(train_valid_df, n_users, n_items)

print("\n✅ Graph 생성 완료!")

Graph 구성

[1] Train Graph (학습용)
Node 수: 2812 (User: 529, Item: 2283)
Edge 수: 112,144 (양방향)

[2] Train+Valid Graph (Test 평가용)
Node 수: 2812 (User: 529, Item: 2283)
Edge 수: 135,796 (양방향)

✅ Graph 생성 완료!


  edge_index = torch.tensor([


## 모델 정의

In [None]:
# ⭐ In-batch Negative Mining (FAST!)
def create_user_item_dict(df):
    """Map every user id to the set of item ids that user interacted with.

    Args:
        df: DataFrame with ``user_id`` and ``item_id`` columns, one row per
            interaction. Other columns are ignored.

    Returns:
        ``defaultdict(set)`` keyed by user id. Looking up a user that never
        appears in ``df`` yields an empty set instead of raising.
    """
    user_items = defaultdict(set)
    # Walk the two id columns directly. DataFrame.iterrows() builds a Series
    # per row (slow) and upcasts integer ids to float as soon as the frame
    # also holds a float column; .tolist() keeps them as native Python ints.
    for user_id, item_id in zip(df['user_id'].tolist(), df['item_id'].tolist()):
        user_items[user_id].add(item_id)
    return user_items

def _fill_with_random_negatives(candidates, user_pos_items, n_items, neg_ratio):
    """Top up a too-small in-batch candidate list with random non-positive items.

    Returns a list of exactly ``neg_ratio`` item ids. Raises ``ValueError``
    when the user has no non-positive item at all.
    """
    sampled = [int(item) for item in candidates]
    taken = set(sampled)
    # Items in [0, n_items) that are neither positives nor already chosen.
    unused = [
        item for item in range(n_items)
        if item not in user_pos_items and item not in taken
    ]
    missing = neg_ratio - len(sampled)
    if len(unused) >= missing:
        sampled.extend(random.sample(unused, missing))
        return sampled

    # Fewer distinct negatives exist than were requested. Rejection sampling
    # would spin forever here, so take everything that is left and pad by
    # repeating valid negatives to keep the fixed neg_ratio-per-row layout.
    sampled.extend(unused)
    if not sampled:
        raise ValueError(
            "cannot sample negatives: the user has interacted with all "
            f"{n_items} items"
        )
    sampled.extend(random.choices(list(sampled), k=neg_ratio - len(sampled)))
    return sampled


def inbatch_negative_sampling(batch_df, user_items_dict, n_items, neg_ratio=6):
    """Sample ``neg_ratio`` negatives per batch row from other batch users' positives.

    Every batch row contributes the FULL positive item set of its user to a
    shared pool. For a given row, the candidates are the pool entries that are
    not positives of that row's user (pool entries are not de-duplicated, so
    items liked by many batch users are proportionally more likely). When
    fewer than ``neg_ratio`` candidates exist, the remainder is drawn at
    random from items in ``[0, n_items)`` that are not positives of the user.

    Args:
        batch_df: DataFrame with a ``user_id`` column, one row per positive.
        user_items_dict: mapping user id -> set of that user's positive items.
        n_items: size of the item id space used by the random fallback.
        neg_ratio: number of negatives per batch row.

    Returns:
        ``(neg_users, neg_items)``: two 1-D arrays of length
        ``len(batch_df) * neg_ratio``. Entries ``[r * neg_ratio, (r + 1) * neg_ratio)``
        belong to batch row ``r``; ``neg_items`` is int64.

    Raises:
        ValueError: if a user has interacted with every item, so no negative
            exists.
    """
    pos_users = batch_df['user_id'].values
    if len(pos_users) == 0:
        return np.array([], dtype=np.int64), np.array([], dtype=np.int64)

    row_users = pos_users.tolist()

    # Group row positions by user: the candidate set depends only on the user,
    # so it is computed once per distinct user instead of once per row.
    rows_by_user = defaultdict(list)
    for row, user_id in enumerate(row_users):
        rows_by_user[user_id].append(row)

    user_pos = {user_id: user_items_dict[user_id] for user_id in rows_by_user}
    user_pos_arr = {
        user_id: np.array([int(item) for item in items], dtype=np.int64)
        for user_id, items in user_pos.items()
    }

    # Shared pool: one copy of a user's positives for every row of that user.
    pool = np.concatenate([user_pos_arr[user_id] for user_id in row_users])

    neg_items = np.empty((len(row_users), neg_ratio), dtype=np.int64)
    for user_id, rows in rows_by_user.items():
        # Dropping the user's own positives also drops everything contributed
        # by the user's own rows, leaving only other users' items.
        candidates = pool[~np.isin(pool, user_pos_arr[user_id])]
        for row in rows:
            if len(candidates) >= neg_ratio:
                neg_items[row] = np.random.choice(candidates, neg_ratio, replace=False)
            else:
                neg_items[row] = _fill_with_random_negatives(
                    candidates, user_pos[user_id], n_items, neg_ratio
                )

    return np.repeat(pos_users, neg_ratio), neg_items.reshape(-1)

# Per-user positive sets: train only (used for training and validation) and
# train+valid (passed to the test evaluation as the items to exclude).
train_user_items = create_user_item_dict(train_df)
train_valid_user_items = create_user_item_dict(train_valid_df)

print("In-batch Negative Mining 함수 정의 완료!")

In [6]:
# LightGCN 모델 (V6와 동일)
class LightGCNConv(MessagePassing):
    """Parameter-free propagation layer with symmetric degree normalisation.

    Each edge carries the weight ``deg(row)^-0.5 * deg(col)^-0.5`` and messages
    are aggregated by summation.
    """

    def __init__(self):
        super().__init__(aggr='add')

    def forward(self, x, edge_index):
        """Propagate node features ``x`` once along ``edge_index``."""
        src, dst = edge_index
        node_degree = degree(dst, x.size(0), dtype=x.dtype)

        # Nodes with degree 0 produce inf under pow(-0.5); zero them out so
        # they contribute nothing.
        inv_sqrt_degree = node_degree.pow(-0.5)
        inv_sqrt_degree = inv_sqrt_degree.masked_fill(
            inv_sqrt_degree == float('inf'), 0
        )

        edge_weight = inv_sqrt_degree[src] * inv_sqrt_degree[dst]
        return self.propagate(edge_index, x=x, norm=edge_weight)

    def message(self, x_j, norm):
        """Scale each neighbour feature row by its edge weight."""
        return x_j * norm.view(-1, 1)

class LightGCN(nn.Module):
    """LightGCN recommender: learned id embeddings averaged over propagation layers.

    Args:
        n_users: number of user nodes (rows of the user embedding table).
        n_items: number of item nodes (rows of the item embedding table).
        embedding_dim: size of every embedding vector.
        n_layers: number of ``LightGCNConv`` propagation layers.
    """

    def __init__(self, n_users, n_items, embedding_dim=64, n_layers=2):
        super().__init__()

        self.n_users, self.n_items = n_users, n_items
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers

        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)

        # Re-initialise both tables from N(0, 0.1^2), users first.
        for table in (self.user_embedding, self.item_embedding):
            nn.init.normal_(table.weight, std=0.1)

        self.convs = nn.ModuleList([LightGCNConv() for _ in range(n_layers)])

    def forward(self, edge_index):
        """Return ``(user_embeddings, item_embeddings)`` after propagation.

        The result is the mean of the raw embeddings and the output of every
        propagation layer. Users occupy the first ``n_users`` rows of the
        joint node matrix, items the rest.
        """
        current = torch.cat(
            [self.user_embedding.weight, self.item_embedding.weight], dim=0
        )
        per_layer = [current]

        for conv in self.convs:
            current = conv(current, edge_index)
            per_layer.append(current)

        pooled = torch.stack(per_layer, dim=0).mean(dim=0)
        return pooled[:self.n_users], pooled[self.n_users:]

    def predict(self, users, items, edge_index):
        """Return the dot-product score for each ``(users[i], items[i])`` pair.

        Runs a full ``forward`` pass on every call.
        """
        all_users, all_items = self.forward(edge_index)
        return (all_users[users] * all_items[items]).sum(dim=1)

print("LightGCN 모델 정의 완료!")

LightGCN 모델 정의 완료!


In [7]:
# ⭐ InfoNCE Loss (핵심 변경!)
def infonce_loss(pos_scores, neg_scores, neg_ratio=4, temperature=0.2):
    """InfoNCE (contrastive) loss over one positive and ``neg_ratio`` negatives.

    For every positive the loss is::

        -log( exp(pos/t) / (exp(pos/t) + sum_i exp(neg_i/t)) )

    i.e. the cross-entropy of picking the positive out of
    ``[pos, neg_1, ..., neg_k]``. Unlike BPR, which compares the positive with
    each negative independently, all negatives of a row share one softmax.

    Args:
        pos_scores: 1-D tensor with one score per positive pair.
        neg_scores: 1-D tensor of length ``len(pos_scores) * neg_ratio``; each
            run of ``neg_ratio`` consecutive entries belongs to one positive.
        neg_ratio: number of negatives per positive.
        temperature: divisor applied to all scores before the softmax.

    Returns:
        Scalar tensor: the mean loss over the batch.
    """
    n_pos = pos_scores.size(0)

    # Column 0 holds the positive, columns 1..neg_ratio its negatives.
    logits = torch.cat(
        [pos_scores.unsqueeze(1), neg_scores.view(n_pos, neg_ratio)],
        dim=1,
    )
    logits = logits / temperature

    # Log-probability assigned to the positive (index 0) in every row.
    positive_log_prob = F.log_softmax(logits, dim=1)[:, 0]
    return -positive_log_prob.mean()

print("InfoNCE Loss 함수 정의 완료!")
print("\n수식:")
print("  BPR:     loss = -log(σ(s_pos - s_neg)) for each pair")
print("  InfoNCE: loss = -log(exp(s_pos/τ) / Σexp(s_all/τ))")
print("\n차이: BPR은 독립적 pair, InfoNCE는 모든 negative 동시 고려")

InfoNCE Loss 함수 정의 완료!

수식:
  BPR:     loss = -log(σ(s_pos - s_neg)) for each pair
  InfoNCE: loss = -log(exp(s_pos/τ) / Σexp(s_all/τ))

차이: BPR은 독립적 pair, InfoNCE는 모든 negative 동시 고려


In [8]:
# 평가 함수 (V6와 동일)
def evaluate_model(model, edge_index, eval_df, user_items_dict, n_items, k=10, device='cpu'):
    """Compute mean Precision@K, Recall@K and NDCG@K over the users in ``eval_df``.

    Args:
        model: callable as ``model(edge_index)`` returning
            ``(user_embeddings, item_embeddings)``; switched to eval mode here.
        edge_index: graph to propagate over; moved to ``device`` before use.
        eval_df: DataFrame with ``user_id`` and ``item_id`` columns holding the
            held-out interactions.
        user_items_dict: mapping user id -> items to exclude from the ranking
            (their scores are set to ``-inf``).
        n_items: accepted for interface symmetry; not read in this function.
        k: ranking cut-off.
        device: device the graph is moved to.

    Returns:
        Dict with keys ``precision@{k}``, ``recall@{k}`` and ``ndcg@{k}``, each
        the mean over all users present in ``eval_df``.
    """
    model.eval()

    per_user_precision = []
    per_user_recall = []
    per_user_ndcg = []

    with torch.no_grad():
        all_user_emb, all_item_emb = model(edge_index.to(device))

        for user_id, user_rows in eval_df.groupby('user_id'):
            relevant = set(user_rows['item_id'].values)
            seen = user_items_dict[user_id]

            # Score every item for this user with a dot product.
            user_vec = all_user_emb[user_id].unsqueeze(0)
            scores_np = torch.matmul(user_vec, all_item_emb.t()).squeeze().cpu().numpy()

            # Push already-seen items to the bottom of the ranking.
            seen_idx = np.fromiter((int(item_id) for item_id in seen), dtype=np.intp)
            scores_np[seen_idx] = -np.inf

            # Indices of the k best scores, best first.
            ranked = np.argsort(scores_np)[-k:][::-1]

            n_hits = len(set(ranked) & relevant)
            n_relevant = len(relevant)

            per_user_precision.append(n_hits / k)
            per_user_recall.append(n_hits / n_relevant if n_relevant > 0 else 0)

            dcg = sum(
                1 / np.log2(rank + 2)
                for rank, item in enumerate(ranked)
                if item in relevant
            )
            ideal_dcg = sum(1 / np.log2(rank + 2) for rank in range(min(n_relevant, k)))
            per_user_ndcg.append(dcg / ideal_dcg if ideal_dcg > 0 else 0)

    return {
        f'precision@{k}': np.mean(per_user_precision),
        f'recall@{k}': np.mean(per_user_recall),
        f'ndcg@{k}': np.mean(per_user_ndcg),
    }

print("평가 함수 정의 완료!")

평가 함수 정의 완료!


In [None]:
# Training 함수 (In-batch Negative 적용)
def train_one_epoch(model, edge_index, train_df, user_items_dict, n_items, 
                    optimizer, batch_size, neg_ratio, temperature, device):
    """Train for one epoch with the InfoNCE loss and in-batch negatives.

    Args:
        model: callable as ``model(edge_index)`` returning
            ``(user_embeddings, item_embeddings)``.
        edge_index: training graph; moved to ``device`` once per epoch.
        train_df: DataFrame with ``user_id`` and ``item_id`` columns, one row
            per positive interaction.
        user_items_dict: mapping user id -> set of positive items, forwarded
            to ``inbatch_negative_sampling``.
        n_items: size of the item id space, forwarded to the sampler.
        optimizer: optimizer stepping the model parameters.
        batch_size: number of positives per optimisation step.
        neg_ratio: negatives sampled per positive.
        temperature: InfoNCE temperature.
        device: device for the graph and the index tensors.

    Returns:
        Mean loss over all batches of the epoch.
    """
    model.train()

    # Loop-invariant: transfer the graph once instead of twice per batch.
    edge_index = edge_index.to(device)

    train_df_shuffled = train_df.sample(frac=1).reset_index(drop=True)

    total_loss = 0
    n_batches = 0

    for start_idx in range(0, len(train_df_shuffled), batch_size):
        # iloc clamps the upper bound, so the last (short) batch needs no min().
        batch_df = train_df_shuffled.iloc[start_idx:start_idx + batch_size]

        pos_users = torch.tensor(batch_df['user_id'].values, dtype=torch.long).to(device)
        pos_items = torch.tensor(batch_df['item_id'].values, dtype=torch.long).to(device)

        # Negatives come from the positives of the other users in this batch.
        neg_users_np, neg_items_np = inbatch_negative_sampling(
            batch_df, user_items_dict, n_items, neg_ratio
        )
        neg_users = torch.tensor(neg_users_np, dtype=torch.long).to(device)
        neg_items = torch.tensor(neg_items_np, dtype=torch.long).to(device)

        # One graph propagation per batch. Positive and negative scores index
        # into the same embeddings, so the gradients match those of two
        # separate model.predict() calls at half the propagation cost.
        user_emb, item_emb = model(edge_index)
        pos_scores = (user_emb[pos_users] * item_emb[pos_items]).sum(dim=1)
        neg_scores = (user_emb[neg_users] * item_emb[neg_items]).sum(dim=1)

        loss = infonce_loss(pos_scores, neg_scores, neg_ratio, temperature)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    return total_loss / n_batches

print("Training 함수 정의 완료! (InfoNCE + In-batch Negative)")

## Training

In [None]:
# 메인 Training Loop (V11A_FAST)
# Build the model and optimizer from CONFIG.
model = LightGCN(
    n_users=n_users,
    n_items=n_items,
    embedding_dim=CONFIG['embedding_dim'],
    n_layers=CONFIG['n_layers']
).to(CONFIG['device'])

optimizer = torch.optim.Adam(
    model.parameters(), 
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay']
)

# Per-epoch training loss and validation metrics, used by the plotting cell.
history = {
    'train_loss': [],
    'valid_precision': [],
    'valid_recall': [],
    'valid_ndcg': [],
}

# Start below any achievable recall so the first epoch always writes a
# checkpoint. The test cell loads that file unconditionally; starting at 0
# would leave it missing (or stale from an earlier run) whenever validation
# recall never rises above 0.
best_recall = float('-inf')
patience_counter = 0

# Metric keys and the checkpoint path are loop-invariant. Building the keys
# here also avoids nesting same-quote f-strings, which only parses on
# Python 3.12+.
eval_k = CONFIG['top_k']
precision_key = f'precision@{eval_k}'
recall_key = f'recall@{eval_k}'
ndcg_key = f'ndcg@{eval_k}'
best_model_path = os.path.join(CONFIG['model_dir'], 'lightgcn_v11a_fast_best.pth')

print("=" * 60)
print("Training 시작 (V11A_FAST - In-batch Negative)")
print("=" * 60)
print(f"Model: LightGCN")
print(f"  - Users: {n_users}, Items: {n_items}")
print(f"  - Embedding: {CONFIG['embedding_dim']}, Layers: {CONFIG['n_layers']}")
total_params = sum(p.numel() for p in model.parameters())
print(f"  - Total params: {total_params:,}")
print(f"\n⚡ In-batch Negative (V9c 속도!)")
print(f"Device: {CONFIG['device']}")
print("=" * 60)

for epoch in range(CONFIG['epochs']):
    train_loss = train_one_epoch(
        model, train_edge_index, train_df, train_user_items,
        n_items, optimizer, CONFIG['batch_size'], 
        CONFIG['neg_ratio'], CONFIG['temperature'], CONFIG['device']
    )

    # Validate on the training graph, excluding each user's training items.
    val_metrics = evaluate_model(
        model, train_edge_index, valid_df, train_user_items,
        n_items, k=eval_k, device=CONFIG['device']
    )
    current_recall = val_metrics[recall_key]

    history['train_loss'].append(train_loss)
    history['valid_precision'].append(val_metrics[precision_key])
    history['valid_recall'].append(current_recall)
    history['valid_ndcg'].append(val_metrics[ndcg_key])

    # Log the first epoch and every fifth epoch after that.
    if (epoch + 1) % 5 == 0 or epoch == 0:
        print(f"Epoch {epoch+1:3d}/{CONFIG['epochs']} | "
              f"Loss: {train_loss:.4f} | "
              f"R@{eval_k}: {current_recall:.4f}")

    # Checkpoint on improvement; otherwise count towards early stopping.
    if current_recall > best_recall:
        best_recall = current_recall
        patience_counter = 0
        torch.save(model.state_dict(), best_model_path)
    else:
        patience_counter += 1

    if patience_counter >= CONFIG['patience']:
        print(f"\nEarly stopping at epoch {epoch+1}")
        break

print("=" * 60)
print(f"Training 완료! Best Recall@{CONFIG['top_k']}: {best_recall:.4f}")
print("=" * 60)

In [None]:
# Training 결과 시각화
# Two side-by-side panels: training loss (left) and validation recall (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: training loss per epoch.
axes[0].plot(history['train_loss'], label='Train Loss', linewidth=2, color='#e74c3c')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('V11A_FAST Training Loss')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right panel: validation Recall@K per epoch.
axes[1].plot(history['valid_recall'], label=f'Recall@{CONFIG["top_k"]}', linewidth=2)
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Score')
axes[1].set_title('V11A_FAST Validation Recall')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# Save the figure into CONFIG['result_dir'] before showing it.
plt.tight_layout()
plt.savefig(os.path.join(CONFIG['result_dir'], 'training_curves_v11a_fast.png'), dpi=300, bbox_inches='tight')
plt.show()

## Test Set 평가

In [None]:
# Test Set 평가
# Restore the checkpoint written by the training loop.
# NOTE(review): torch.load is called without map_location, so the checkpoint is
# restored onto the device it was saved from - confirm this is intended if the
# file is ever moved between machines.
model.load_state_dict(torch.load(os.path.join(CONFIG['model_dir'], 'lightgcn_v11a_fast_best.pth')))

print("=" * 60)
print("Test Set 평가 (V11A_FAST)")
print("=" * 60)

# Test metrics keyed by cut-off K; read by the comparison cell.
v11a_fast_results = {}

# Evaluate on the train+valid graph, excluding each user's train+valid items
# from the ranking, at three cut-offs.
for k in [5, 10, 20]:
    test_metrics = evaluate_model(
        model, 
        train_valid_edge_index,
        test_df, 
        train_valid_user_items,
        n_items, 
        k=k, 
        device=CONFIG['device']
    )
    
    v11a_fast_results[k] = test_metrics
    
    print(f"\nTop-{k} 추천:")
    print(f"  Precision@{k}: {test_metrics[f'precision@{k}']:.4f}")
    print(f"  Recall@{k}:    {test_metrics[f'recall@{k}']:.4f}")
    print(f"  NDCG@{k}:      {test_metrics[f'ndcg@{k}']:.4f}")

print("=" * 60)

## V11A_FAST vs V9c 비교

In [None]:
# Compare this run's test metrics at K=10 against the V9c baseline.
print("=" * 60)
print("V11A_FAST vs V9C 결과 비교")
print("=" * 60)

# V9c results, hard-coded here (only K=10 is available).
v9c_baseline = {
    10: {'precision@10': 0.2756, 'recall@10': 0.1578, 'ndcg@10': 0.3129}
}

print("\n핵심 변경:")
print(f"  V9c: Random sampling")
print(f"  V11a_fast: In-batch negative")

# NOTE: the "(15.78%)" text below is a literal and must be kept in sync with
# v9c_baseline by hand.
print("\nTest Recall@10:")
print(f"  V9c:        {v9c_baseline[10]['recall@10']:.4f} (15.78%)")
print(f"  V11a_fast:  {v11a_fast_results[10]['recall@10']:.4f} ({v11a_fast_results[10]['recall@10']*100:.2f}%)")

# Relative change of Recall@10 with respect to the baseline, in percent.
improvement = (v11a_fast_results[10]['recall@10'] - v9c_baseline[10]['recall@10']) / v9c_baseline[10]['recall@10'] * 100
print(f"  변화: {improvement:+.1f}%")

# Same relative change for every metric; a check mark flags strictly positive changes.
print("\n전체 지표:")
for metric in ['precision@10', 'recall@10', 'ndcg@10']:
    v9c_val = v9c_baseline[10][metric]
    v11a_val = v11a_fast_results[10][metric]
    change = (v11a_val - v9c_val) / v9c_val * 100
    symbol = '✅' if change > 0 else '❌'
    print(f"  {metric:15s}: {v9c_val:.4f} → {v11a_val:.4f} ({change:+.1f}%) {symbol}")

# Verdict thresholds on the relative Recall@10 change: above 3%, above 0%, otherwise.
print("=" * 60)
if improvement > 3:
    print("✅ 성공! In-batch가 효과적!")
elif improvement > 0:
    print("✅ 소폭 개선")
else:
    print("➖ 비슷한 수준")
print("=" * 60)