# B버전 (rating 활용: 이진 + 회귀 실험)
- label1: rating >= 4 → 1, else 0 (이진 분류)
- label2: 회귀로 rating 예측 후 threshold=4로 추천
- 네거티브: 평점<4만 음성으로 사용, 미관측 음성 미사용
- 스플릿: 유저별 8/1/1 랜덤
- 평가지표: AUC/F1/PR@thres (이진), RMSE/MAE + 분류메트릭(회귀→임계) 


In [1]:
# 환경 준비
import sys
from pathlib import Path
sys.path.append('..')  # 상위 디렉토리 import 허용

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, mean_squared_error, mean_absolute_error

from common import (
    load_interactions,
    encode_ids,
    split_userwise,
    make_binary_label,
)

print(torch.__version__)


2.9.0


## 설정

In [2]:
DATA_PATH = Path('../data/train.csv')
MODEL_DIR = Path('../codex_models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

SEED = 42
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
THRESHOLDS = [4.0]  # 분류 임계값 후보

EMBED_DIM = 64
N_LAYERS = 2
LR = 1e-3
BATCH_SIZE = 1024
EPOCHS = 5  # 필요 시 조정
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

rng = np.random.default_rng(SEED)


## 데이터 로드/인덱싱/라벨링

In [3]:
df_raw = load_interactions(DATA_PATH)
print(df_raw.head())

df_bin = make_binary_label(df_raw, threshold=4.0)

df, user2idx, item2idx = encode_ids(df_bin)
print(f"users={len(user2idx)}, items={len(item2idx)}, interactions={len(df)}")

train_df, val_df, test_df = split_userwise(df, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO, seed=SEED)
print(len(train_df), len(val_df), len(test_df))


   user  item  rating
0     1    16     4.0
1     1    24     1.5
2     1    32     4.0
3     1    47     4.0
4     1    50     4.0
users=668, items=10321, interactions=105139
83855 10226 11058


## 이진 분류용 데이터셋/모델

In [4]:
class PairDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.users = df['user_idx'].to_numpy()
        self.items = df['item_idx'].to_numpy()
        self.labels = df['label'].to_numpy().astype(np.float32)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return int(self.users[idx]), int(self.items[idx]), float(self.labels[idx])


class MFClassifier(nn.Module):
    def __init__(self, num_users: int, num_items: int, embed_dim: int):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, embed_dim)
        self.item_emb = nn.Embedding(num_items, embed_dim)
        nn.init.normal_(self.user_emb.weight, std=0.1)
        nn.init.normal_(self.item_emb.weight, std=0.1)

    def forward(self, u, i):
        u_e = self.user_emb(u)
        i_e = self.item_emb(i)
        logits = (u_e * i_e).sum(dim=1)
        return logits


## 이진 분류 학습/평가

In [5]:
train_loader = DataLoader(PairDataset(train_df), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(PairDataset(val_df), batch_size=BATCH_SIZE)

def evaluate_binary(model, loader, device='cpu'):
    model.eval()
    ys = []
    ps = []
    with torch.no_grad():
        for u, i, y in loader:
            u = torch.tensor(u, device=device)
            i = torch.tensor(i, device=device)
            y = torch.tensor(y, device=device, dtype=torch.float32)
            logits = model(u, i)
            prob = torch.sigmoid(logits)
            ys.append(y.cpu().numpy())
            ps.append(prob.cpu().numpy())
    y_true = np.concatenate(ys)
    y_prob = np.concatenate(ps)
    metrics = {}
    metrics['auc'] = roc_auc_score(y_true, y_prob)
    for th in [0.5]:
        y_pred = (y_prob >= th).astype(int)
        metrics[f'f1@{th}'] = f1_score(y_true, y_pred)
        metrics[f'prec@{th}'] = precision_score(y_true, y_pred)
        metrics[f'recall@{th}'] = recall_score(y_true, y_pred)
    return metrics

model_bin = MFClassifier(len(user2idx), len(item2idx), EMBED_DIM).to(DEVICE)
optim_bin = torch.optim.Adam(model_bin.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    model_bin.train()
    total_loss = 0.0
    for u, i, y in train_loader:
        u = torch.tensor(u, device=DEVICE)
        i = torch.tensor(i, device=DEVICE)
        y = torch.tensor(y, device=DEVICE, dtype=torch.float32)
        logits = model_bin(u, i)
        loss = nn.functional.binary_cross_entropy_with_logits(logits, y)
        optim_bin.zero_grad()
        loss.backward()
        optim_bin.step()
        total_loss += loss.item() * u.shape[0]
    metrics = evaluate_binary(model_bin, val_loader, device=DEVICE)
    print(f"[BIN] epoch={epoch} loss={total_loss/len(train_loader.dataset):.4f} metrics={metrics}")


  u = torch.tensor(u, device=DEVICE)
  i = torch.tensor(i, device=DEVICE)
  y = torch.tensor(y, device=DEVICE)
  u = torch.tensor(u, device=device)
  i = torch.tensor(i, device=device)
  y = torch.tensor(y, device=device)
  u = torch.tensor(u, device=DEVICE)
  i = torch.tensor(i, device=DEVICE)
  y = torch.tensor(y, device=DEVICE)


[BIN] epoch=1 loss=0.6939 metrics={'auc': 0.5033773718058987, 'f1@0.5': 0.5039446771208727, 'prec@0.5': 0.5, 'recall@0.5': 0.5079520911054388}


  u = torch.tensor(u, device=device)
  i = torch.tensor(i, device=device)
  y = torch.tensor(y, device=device)
  u = torch.tensor(u, device=DEVICE)
  i = torch.tensor(i, device=DEVICE)
  y = torch.tensor(y, device=DEVICE)


[BIN] epoch=2 loss=0.6849 metrics={'auc': 0.5063906985629343, 'f1@0.5': 0.504523786360541, 'prec@0.5': 0.5, 'recall@0.5': 0.5091301786766149}


  u = torch.tensor(u, device=device)
  i = torch.tensor(i, device=device)
  y = torch.tensor(y, device=device)
  u = torch.tensor(u, device=DEVICE)
  i = torch.tensor(i, device=DEVICE)
  y = torch.tensor(y, device=DEVICE)


[BIN] epoch=3 loss=0.6745 metrics={'auc': 0.5138197689735005, 'f1@0.5': 0.5098995415975812, 'prec@0.5': 0.5065891472868217, 'recall@0.5': 0.5132534851757314}


  u = torch.tensor(u, device=device)
  i = torch.tensor(i, device=device)
  y = torch.tensor(y, device=device)
  u = torch.tensor(u, device=DEVICE)
  i = torch.tensor(i, device=DEVICE)
  y = torch.tensor(y, device=DEVICE)


[BIN] epoch=4 loss=0.6604 metrics={'auc': 0.5277778957216923, 'f1@0.5': 0.5223053746719798, 'prec@0.5': 0.5171285604311009, 'recall@0.5': 0.5275868839583743}
[BIN] epoch=5 loss=0.6405 metrics={'auc': 0.5509159671030579, 'f1@0.5': 0.5385212497574229, 'prec@0.5': 0.5323230385574526, 'recall@0.5': 0.5448655016689574}


  u = torch.tensor(u, device=device)
  i = torch.tensor(i, device=device)
  y = torch.tensor(y, device=device)


## 회귀용 모델/평가

In [6]:
class MFRegressor(nn.Module):
    def __init__(self, num_users: int, num_items: int, embed_dim: int):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, embed_dim)
        self.item_emb = nn.Embedding(num_items, embed_dim)
        self.bias = nn.Parameter(torch.zeros(1))
        nn.init.normal_(self.user_emb.weight, std=0.1)
        nn.init.normal_(self.item_emb.weight, std=0.1)

    def forward(self, u, i):
        u_e = self.user_emb(u)
        i_e = self.item_emb(i)
        pred = (u_e * i_e).sum(dim=1) + self.bias
        return pred


In [7]:
train_loader_reg = DataLoader(PairDataset(train_df.assign(label=train_df['rating'])), batch_size=BATCH_SIZE, shuffle=True)
val_loader_reg = DataLoader(PairDataset(val_df.assign(label=val_df['rating'])), batch_size=BATCH_SIZE)

def evaluate_reg(model, loader, device='cpu', threshold=4.0):
    model.eval()
    ys = []
    ps = []
    with torch.no_grad():
        for u, i, y in loader:
            u = torch.tensor(u, device=device)
            i = torch.tensor(i, device=device)
            y = torch.tensor(y, device=device, dtype=torch.float32)
            pred = model(u, i)
            ys.append(y.cpu().numpy())
            ps.append(pred.cpu().numpy())
    y_true = np.concatenate(ys)
    y_pred = np.concatenate(ps)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    y_bin = (y_true >= threshold).astype(int)
    y_bin_pred = (y_pred >= threshold).astype(int)
    f1 = f1_score(y_bin, y_bin_pred)
    return {'rmse': rmse, 'mae': mae, f'f1@{threshold}': f1}

model_reg = MFRegressor(len(user2idx), len(item2idx), EMBED_DIM).to(DEVICE)
optim_reg = torch.optim.Adam(model_reg.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    model_reg.train()
    total_loss = 0.0
    for u, i, y in train_loader_reg:
        u = torch.tensor(u, device=DEVICE)
        i = torch.tensor(i, device=DEVICE)
        y = torch.tensor(y, device=DEVICE, dtype=torch.float32)
        pred = model_reg(u, i)
        loss = nn.functional.mse_loss(pred, y)
        optim_reg.zero_grad()
        loss.backward()
        optim_reg.step()
        total_loss += loss.item() * u.shape[0]
    metrics = evaluate_reg(model_reg, val_loader_reg, device=DEVICE, threshold=4.0)
    print(f"[REG] epoch={epoch} loss={total_loss/len(train_loader_reg.dataset):.4f} metrics={metrics}")


  u = torch.tensor(u, device=DEVICE)
  i = torch.tensor(i, device=DEVICE)
  y = torch.tensor(y, device=DEVICE)


RuntimeError: Found dtype Double but expected Float

## 체크포인트 저장

In [None]:
torch.save({
    'model_state': model_bin.state_dict(),
    'num_users': len(user2idx),
    'num_items': len(item2idx),
    'embed_dim': EMBED_DIM,
    'type': 'binary'
}, MODEL_DIR / 'codexb1_binary.pth')

torch.save({
    'model_state': model_reg.state_dict(),
    'num_users': len(user2idx),
    'num_items': len(item2idx),
    'embed_dim': EMBED_DIM,
    'type': 'regression'
}, MODEL_DIR / 'codexb1_regression.pth')
print('saved checkpoints')


## 추론 유틸 (O/X)

In [None]:
def predict_ox_binary(model, user_enc, item_enc, csv_path: Path, threshold=0.5):
    df_in = pd.read_csv(csv_path)
    users = df_in['user'].map(user_enc)
    items = df_in['item'].map(item_enc)
    if users.isnull().any() or items.isnull().any():
        raise ValueError('미등록 user/item 존재')
    u = torch.tensor(users.to_numpy(), device=DEVICE)
    i = torch.tensor(items.to_numpy(), device=DEVICE)
    with torch.no_grad():
        prob = torch.sigmoid(model(u, i)).cpu().numpy()
    df_in['recommend'] = np.where(prob >= threshold, 'O', 'X')
    total_o = (df_in['recommend'] == 'O').sum()
    total = len(df_in)
    print(df_in)
    print(f"====================
Total recommends = {total_o}/{total}
Not recommend = {total - total_o}/{total}")
    return df_in


def predict_ox_reg(model, user_enc, item_enc, csv_path: Path, threshold=4.0):
    df_in = pd.read_csv(csv_path)
    users = df_in['user'].map(user_enc)
    items = df_in['item'].map(item_enc)
    if users.isnull().any() or items.isnull().any():
        raise ValueError('미등록 user/item 존재')
    u = torch.tensor(users.to_numpy(), device=DEVICE)
    i = torch.tensor(items.to_numpy(), device=DEVICE)
    with torch.no_grad():
        pred = model(u, i).cpu().numpy()
    df_in['recommend'] = np.where(pred >= threshold, 'O', 'X')
    total_o = (df_in['recommend'] == 'O').sum()
    total = len(df_in)
    print(df_in)
    print(f"====================
Total recommends = {total_o}/{total}
Not recommend = {total - total_o}/{total}")
    return df_in


## 추론 예시

In [None]:
# 학습 후 실행 예시
# predict_ox_binary(model_bin, user2idx, item2idx, Path('../data/sample1.csv'), threshold=0.5)
# predict_ox_reg(model_reg, user2idx, item2idx, Path('../data/sample1.csv'), threshold=4.0)
