# B버전 codexb2 (rating 활용: 이진 + 회귀)
- 이진(label=rating>=4, 미관측 neg 미사용) + 회귀(pred->thres=4)
- early stopping + 임계 최적화 + 규칙기반 평가 + 시각화

In [None]:

import sys
from pathlib import Path
sys.path.append('..')

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

# Input data location and output directory for checkpoints/figures.
DATA_PATH = Path('../data/train.csv')
MODEL_DIR = Path('../codex_models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)  # created up front so later saves cannot fail on a missing dir

# Reproducibility seed and per-user split proportions (the remainder becomes the test split).
SEED = 42
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1

# Model hyper-parameters.
EMBED_DIM = 64
HIDDEN = [64]  # NOTE(review): not referenced by any cell visible here

# Optimisation hyper-parameters.
LR = 1e-3
WEIGHT_DECAY = 1e-5
BATCH_SIZE = 1024
EPOCHS_BIN = 20
EPOCHS_REG = 30
PATIENCE = 5  # epochs without validation improvement before early stopping

# Prefer the GPU when one is available.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed NumPy and PyTorch RNGs.
np.random.seed(SEED)
torch.manual_seed(SEED)


In [None]:

def load_interactions(path):
    """Read the CSV at ``path`` (path or file-like object) into a DataFrame.

    The file is parsed with pandas' default ``read_csv`` settings.
    """
    interactions = pd.read_csv(path)
    return interactions

def encode_ids(df):
    """Map raw ``user`` / ``item`` values to contiguous zero-based indices.

    Indices follow the sorted order of the unique raw values.

    Returns:
        A tuple ``(encoded, user_to_idx, item_to_idx)`` where ``encoded`` is a
        copy of ``df`` with extra ``user_idx`` and ``item_idx`` columns and the
        two dicts map raw id -> index. The input frame is not modified.
    """
    user_to_idx = {raw: pos for pos, raw in enumerate(sorted(df['user'].unique()))}
    item_to_idx = {raw: pos for pos, raw in enumerate(sorted(df['item'].unique()))}

    encoded = df.copy()
    encoded['user_idx'] = encoded['user'].map(user_to_idx)
    encoded['item_idx'] = encoded['item'].map(item_to_idx)
    return encoded, user_to_idx, item_to_idx

def split_userwise(df, train_ratio=0.8, val_ratio=0.1, seed=42):
    """Split interactions into train/val/test separately for every user.

    Each ``user_idx`` group is shuffled with a generator seeded by ``seed``.
    The first ``int(n * train_ratio)`` rows go to train, the next
    ``int(n * val_ratio)`` rows to validation and whatever remains to test.

    Returns:
        ``(train, val, test)`` DataFrames, each with a fresh 0..N-1 index.
    """
    rng = np.random.default_rng(seed)
    train_parts, val_parts, test_parts = [], [], []

    for _, rows in df.groupby('user_idx'):
        total = len(rows)
        shuffled = rows.iloc[rng.permutation(total)]
        train_end = int(total * train_ratio)
        val_end = train_end + int(total * val_ratio)

        train_parts.append(shuffled.iloc[:train_end])
        val_parts.append(shuffled.iloc[train_end:val_end])
        test_parts.append(shuffled.iloc[val_end:])

    return tuple(
        pd.concat(parts, ignore_index=True)
        for parts in (train_parts, val_parts, test_parts)
    )

def rule_k(count):
    """Return the rule-based cut-off k for a user with ``count`` interactions.

    Users with at most 10 interactions get a fixed k of 2; everyone else gets
    20% of their interaction count (rounded down), never less than 1.
    """
    small_user_k = 2
    if count <= 10:
        return small_user_k
    twenty_percent = int(np.floor(0.2 * count))
    return max(twenty_percent, 1)


In [None]:

class PairDataset(Dataset):
    """Dataset of ``(user_idx, item_idx, target)`` triples backed by a DataFrame.

    The target is read from ``label_col`` and stored as float32.
    """

    def __init__(self, df, label_col='label'):
        self.users = df['user_idx'].to_numpy()
        self.items = df['item_idx'].to_numpy()
        self.labels = df[label_col].to_numpy().astype(np.float32)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        # Plain Python scalars are returned; batching is left to the DataLoader.
        user = int(self.users[idx])
        item = int(self.items[idx])
        target = float(self.labels[idx])
        return user, item, target

class MFClassifier(nn.Module):
    """Matrix-factorisation scorer: the logit is the user/item embedding dot product."""

    def __init__(self, num_users, num_items, embed_dim):
        super().__init__()
        self.u = nn.Embedding(num_users, embed_dim)
        self.i = nn.Embedding(num_items, embed_dim)
        # Both tables are re-initialised with small Gaussian noise (std 0.1).
        for table in (self.u, self.i):
            nn.init.normal_(table.weight, std=0.1)

    def forward(self, u, it):
        """Return one raw score (no sigmoid applied) per ``(u, it)`` index pair."""
        user_vec = self.u(u)
        item_vec = self.i(it)
        return torch.sum(user_vec * item_vec, dim=1)

class MFRegressor(nn.Module):
    """Matrix-factorisation regressor: embedding dot product plus one global bias."""

    def __init__(self, num_users, num_items, embed_dim):
        super().__init__()
        self.u = nn.Embedding(num_users, embed_dim)
        self.i = nn.Embedding(num_items, embed_dim)
        self.bias = nn.Parameter(torch.zeros(1))  # single learnable offset shared by all pairs
        # Both tables are re-initialised with small Gaussian noise (std 0.1).
        for table in (self.u, self.i):
            nn.init.normal_(table.weight, std=0.1)

    def forward(self, u, it):
        """Return one predicted value per ``(u, it)`` index pair."""
        interaction = torch.sum(self.u(u) * self.i(it), dim=1)
        return interaction + self.bias


In [None]:

def eval_binary(model, loader, device='cpu', thresholds=(0.5,)):
    """Evaluate a binary scorer on ``loader``.

    Args:
        model: Callable ``model(user_idx, item_idx)`` returning logits.
        loader: Iterable yielding ``(user_idx, item_idx, label)`` batches.
        device: Device the index tensors are moved to before the forward pass.
        thresholds: Probability cut-offs at which F1 / precision / recall are
            reported. Any iterable works; the default is an immutable tuple so
            the default value can never be mutated between calls.

    Returns:
        ``(metrics, y_true, y_prob)`` where ``metrics`` holds ``'auc'`` plus
        ``'f1@<th>'``, ``'prec@<th>'`` and ``'recall@<th>'`` for each threshold,
        and the two arrays are the concatenated labels and sigmoid scores.
    """
    model.eval()
    ys = []
    ps = []
    with torch.no_grad():
        for u, i, y in loader:
            # as_tensor moves existing tensors without the copy-construct
            # warning that torch.tensor(tensor) triggers.
            u = torch.as_tensor(u, device=device)
            i = torch.as_tensor(i, device=device)
            prob = torch.sigmoid(model(u, i)).cpu().numpy()
            ys.append(np.asarray(y))
            ps.append(prob)
    y_true = np.concatenate(ys)
    y_prob = np.concatenate(ps)

    metrics = {'auc': roc_auc_score(y_true, y_prob)}
    for th in thresholds:
        y_pred = (y_prob >= th).astype(int)
        metrics[f'f1@{th}'] = f1_score(y_true, y_pred)
        metrics[f'prec@{th}'] = precision_score(y_true, y_pred)
        metrics[f'recall@{th}'] = recall_score(y_true, y_pred)
    return metrics, y_true, y_prob

def eval_reg(model, loader, device='cpu', threshold=4.0, clip_range=(0.5,5.0)):
    """Evaluate a rating regressor on ``loader``.

    Args:
        model: Callable ``model(user_idx, item_idx)`` returning predictions.
        loader: Iterable yielding ``(user_idx, item_idx, target)`` batches.
        device: Device the index tensors are moved to before the forward pass.
        threshold: Cut-off used to binarise both targets and predictions for F1.
        clip_range: ``(lo, hi)`` bounds applied to the predictions before any
            metric is computed; a falsy value disables clipping.

    Returns:
        ``(metrics, y_pred)`` where ``metrics`` holds ``'rmse'``, ``'mae'`` and
        ``'f1@<threshold>'`` and ``y_pred`` is the (possibly clipped) prediction
        array.
    """
    model.eval()
    ys = []
    ps = []
    with torch.no_grad():
        for u, i, y in loader:
            # as_tensor moves existing tensors without the copy-construct
            # warning that torch.tensor(tensor) triggers.
            u = torch.as_tensor(u, device=device)
            i = torch.as_tensor(i, device=device)
            pred = model(u, i).cpu().numpy()
            ys.append(np.asarray(y))
            ps.append(pred)
    y_true = np.concatenate(ys)
    y_pred = np.concatenate(ps)

    if clip_range:
        lo, hi = clip_range
        y_pred = np.clip(y_pred, lo, hi)

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    # Classification view: an item counts as positive when its value reaches the threshold.
    y_bin = (y_true >= threshold).astype(int)
    y_hat = (y_pred >= threshold).astype(int)
    f1 = f1_score(y_bin, y_hat)
    return {'rmse': rmse, 'mae': mae, f'f1@{threshold}': f1}, y_pred

def thresholds_from_val(y_true, y_prob, n=5):
    """Pick the probability threshold with the highest F1 on the given labels.

    ``n`` evenly spaced candidates between 0.3 and 0.7 are tried in ascending
    order and the first one achieving the highest F1 wins. When no candidate
    scores above zero, the fallback ``(0.5, 0)`` is returned.

    Returns:
        ``(best_threshold, best_f1)``.
    """
    chosen_th, top_f1 = 0.5, 0
    for candidate in np.linspace(0.3, 0.7, n):
        score = f1_score(y_true, (y_prob >= candidate).astype(int))
        if score > top_f1:
            chosen_th, top_f1 = candidate, score
    return chosen_th, top_f1


In [None]:

# Load the data and build labels: a rating of 4 or more is a positive.
df_raw = load_interactions(DATA_PATH)
df_raw['label'] = df_raw['rating'].ge(4).astype(int)

# Encode raw ids as contiguous indices, then split each user's rows.
df, user2idx, item2idx = encode_ids(df_raw)
train_df, val_df, test_df = split_userwise(
    df,
    train_ratio=TRAIN_RATIO,
    val_ratio=VAL_RATIO,
    seed=SEED,
)

# Embedding table sizes.
num_users, num_items = len(user2idx), len(item2idx)


In [None]:

# Prepare data loaders.
# Binary task: the target is the 0/1 'label' column; only the training loader shuffles.
train_loader_bin = DataLoader(
    PairDataset(train_df, 'label'), batch_size=BATCH_SIZE, shuffle=True
)
val_loader_bin = DataLoader(PairDataset(val_df, 'label'), batch_size=BATCH_SIZE)
test_loader_bin = DataLoader(PairDataset(test_df, 'label'), batch_size=BATCH_SIZE)

# Regression task: same pairs, but the target is the raw 'rating' column.
train_loader_reg = DataLoader(
    PairDataset(train_df, 'rating'), batch_size=BATCH_SIZE, shuffle=True
)
val_loader_reg = DataLoader(PairDataset(val_df, 'rating'), batch_size=BATCH_SIZE)
test_loader_reg = DataLoader(PairDataset(test_df, 'rating'), batch_size=BATCH_SIZE)


In [None]:

# Binary training + early stopping (on validation AUC) + best-threshold tracking
clf = MFClassifier(num_users, num_items, EMBED_DIM).to(DEVICE)
opt = torch.optim.Adam(clf.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
best_auc = -1
patience = 0
history_auc = []
history_f1 = []
best_state = None
best_th = 0.5
for epoch in range(1, EPOCHS_BIN + 1):
    clf.train()
    total = 0.0
    for u, i, y in train_loader_bin:
        # Batches from the DataLoader are already tensors; as_tensor moves them
        # without the copy-construct warning that torch.tensor(tensor) triggers.
        u = torch.as_tensor(u, device=DEVICE)
        i = torch.as_tensor(i, device=DEVICE)
        # Python-float targets are collated as float64; cast to match the
        # float32 logits so the loss never sees mixed dtypes.
        y = torch.as_tensor(y, dtype=torch.float32, device=DEVICE)
        logit = clf(u, i)
        loss = nn.functional.binary_cross_entropy_with_logits(logit, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
        total += loss.item() * u.shape[0]  # sum of per-sample losses for the epoch average
    metrics, y_true_val, y_prob_val = eval_binary(clf, val_loader_bin, device=DEVICE, thresholds=[0.5])
    th_opt, f1_opt = thresholds_from_val(y_true_val, y_prob_val)
    history_auc.append(metrics['auc'])
    history_f1.append(metrics['f1@0.5'])
    if metrics['auc'] > best_auc:
        best_auc = metrics['auc']
        # clone() is essential: on CPU, .cpu() returns the live parameter tensor,
        # so without a copy the snapshot would keep changing as training continues.
        best_state = {
            k: v.detach().cpu().clone() if torch.is_tensor(v) else v
            for k, v in clf.state_dict().items()
        }
        patience = 0
        best_th = th_opt
    else:
        patience += 1
    print(f"[BIN] epoch={epoch} loss={total/len(train_loader_bin.dataset):.4f} auc={metrics['auc']:.4f} f1@0.5={metrics['f1@0.5']:.4f} th_opt={th_opt:.2f} (patience {patience}/{PATIENCE})")
    if patience >= PATIENCE:
        print('Early stopping bin')
        break
# Restore the weights from the epoch with the best validation AUC.
if best_state is not None:
    clf.load_state_dict(best_state)


In [None]:

# Regression training + early stopping (on validation RMSE)
reg = MFRegressor(num_users, num_items, EMBED_DIM).to(DEVICE)
opt_r = torch.optim.Adam(reg.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
best_rmse = 1e9
patience = 0
hist_rmse = []
# Reset explicitly: otherwise a snapshot left over from the binary cell would be
# loaded into the regressor if no epoch ever improves on best_rmse.
best_state = None
for epoch in range(1, EPOCHS_REG + 1):
    reg.train()
    total = 0.0
    for u, i, y in train_loader_reg:
        # Batches from the DataLoader are already tensors; as_tensor moves them
        # without the copy-construct warning that torch.tensor(tensor) triggers.
        u = torch.as_tensor(u, device=DEVICE)
        i = torch.as_tensor(i, device=DEVICE)
        # Python-float targets are collated as float64; cast to match the
        # float32 predictions so the loss never sees mixed dtypes.
        y = torch.as_tensor(y, dtype=torch.float32, device=DEVICE)
        pred = reg(u, i)
        loss = nn.functional.mse_loss(pred, y)
        opt_r.zero_grad()
        loss.backward()
        opt_r.step()
        total += loss.item() * u.shape[0]  # sum of per-sample losses for the epoch average
    metrics, _ = eval_reg(reg, val_loader_reg, device=DEVICE, threshold=4.0)
    hist_rmse.append(metrics['rmse'])
    if metrics['rmse'] < best_rmse:
        best_rmse = metrics['rmse']
        # clone() is essential: on CPU, .cpu() returns the live parameter tensor,
        # so without a copy the snapshot would keep changing as training continues.
        best_state = {
            k: v.detach().cpu().clone() if torch.is_tensor(v) else v
            for k, v in reg.state_dict().items()
        }
        patience = 0
    else:
        patience += 1
    print(f"[REG] epoch={epoch} loss={total/len(train_loader_reg.dataset):.4f} rmse={metrics['rmse']:.4f} mae={metrics['mae']:.4f} f1@4={metrics['f1@4.0']:.4f} (patience {patience}/{PATIENCE})")
    if patience >= PATIENCE:
        print('Early stopping reg')
        break
# Restore the weights from the epoch with the lowest validation RMSE.
if best_state is not None:
    reg.load_state_dict(best_state)


In [None]:

# Evaluation (val + test) of the binary model at the threshold chosen on validation.
metrics_val, y_true_val, y_prob_val = eval_binary(
    clf, val_loader_bin, device=DEVICE, thresholds=[best_th]
)
metrics_test, y_true_test, y_prob_test = eval_binary(
    clf, test_loader_bin, device=DEVICE, thresholds=[best_th]
)

print('BINARY VAL', metrics_val, 'best_th', best_th)
print('BINARY TEST', metrics_test)


In [None]:

# Evaluation (val + test) of the regression model; predictions >= 4.0 count as positive.
metrics_val_reg, _ = eval_reg(
    reg, val_loader_reg, device=DEVICE, threshold=4.0
)
metrics_test_reg, _ = eval_reg(
    reg, test_loader_reg, device=DEVICE, threshold=4.0
)

print('REG VAL', metrics_val_reg)
print('REG TEST', metrics_test_reg)


In [None]:

# Visualization: validation learning curves for both models, saved next to the checkpoints.
plt.figure(figsize=(10,3))

# Left panel: binary model, validation AUC per epoch.
plt.subplot(1,2,1)
plt.plot(history_auc)
plt.title('Bin Val AUC')
plt.xlabel('epoch')

# Right panel: regression model, validation RMSE per epoch.
plt.subplot(1,2,2)
plt.plot(hist_rmse)
plt.title('Reg Val RMSE')
plt.xlabel('epoch')

plt.tight_layout()
plt.savefig(MODEL_DIR/'codexb2_learning_curves.png')
plt.close()  # the figure is only written to disk, never displayed


In [None]:

# Save checkpoints: weights plus the sizes needed to rebuild each model.
torch.save(
    {
        'state_dict': clf.state_dict(),
        'num_users': num_users,
        'num_items': num_items,
        'embed_dim': EMBED_DIM,
        'type': 'binary',
        'best_th': best_th,  # validation-selected decision threshold
    },
    MODEL_DIR/'codexb2_binary.pth',
)
torch.save(
    {
        'state_dict': reg.state_dict(),
        'num_users': num_users,
        'num_items': num_items,
        'embed_dim': EMBED_DIM,
        'type': 'regression',
    },
    MODEL_DIR/'codexb2_regression.pth',
)


In [None]:

# 추론 유틸

def predict_binary(model, user_enc, item_enc, csv_path: Path, threshold=0.5, return_prob=False):
    """Score the user/item pairs in a CSV with the binary model and print a summary.

    Args:
        model: Callable ``model(user_idx, item_idx)`` returning logits.
        user_enc: Mapping from raw ``user`` values to embedding indices.
        item_enc: Mapping from raw ``item`` values to embedding indices.
        csv_path: CSV (path or file-like) with ``user`` and ``item`` columns.
        threshold: Probability at or above which a pair is marked ``'O'``.
        return_prob: When true, add a ``prob`` column with the sigmoid scores.

    Returns:
        The input frame with a ``recommend`` column (``'O'`` / ``'X'``) and,
        optionally, ``prob``.

    Raises:
        ValueError: If any user or item is missing from the encoders.
    """
    df_in = pd.read_csv(csv_path)
    users = df_in['user'].map(user_enc)
    items = df_in['item'].map(item_enc)
    if users.isnull().any() or items.isnull().any():
        raise ValueError('미등록 user/item')

    # Put the inputs on the device the model lives on; fall back to the
    # notebook-wide DEVICE for callables that expose no parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = DEVICE
    u = torch.tensor(users.to_numpy(dtype=np.int64), device=device)
    i = torch.tensor(items.to_numpy(dtype=np.int64), device=device)
    with torch.no_grad():
        prob = torch.sigmoid(model(u, i)).cpu().numpy()

    df_in['recommend'] = np.where(prob >= threshold, 'O', 'X')
    if return_prob:
        df_in['prob'] = prob
    total_o = df_in['recommend'].eq('O').sum()
    total = len(df_in)
    print(df_in)
    # Escaped newlines keep the summary a single valid string literal.
    print(f"====================\nTotal recommends = {total_o}/{total}\nNot recommend = {total-total_o}/{total}")
    return df_in

def predict_reg(model, user_enc, item_enc, csv_path: Path, threshold=4.0, return_score=False):
    """Score the user/item pairs in a CSV with the regression model and print a summary.

    Args:
        model: Callable ``model(user_idx, item_idx)`` returning predicted values.
        user_enc: Mapping from raw ``user`` values to embedding indices.
        item_enc: Mapping from raw ``item`` values to embedding indices.
        csv_path: CSV (path or file-like) with ``user`` and ``item`` columns.
        threshold: Prediction at or above which a pair is marked ``'O'``.
        return_score: When true, add a ``score`` column with the raw predictions.

    Returns:
        The input frame with a ``recommend`` column (``'O'`` / ``'X'``) and,
        optionally, ``score``.

    Raises:
        ValueError: If any user or item is missing from the encoders.
    """
    df_in = pd.read_csv(csv_path)
    users = df_in['user'].map(user_enc)
    items = df_in['item'].map(item_enc)
    if users.isnull().any() or items.isnull().any():
        raise ValueError('미등록 user/item')

    # Put the inputs on the device the model lives on; fall back to the
    # notebook-wide DEVICE for callables that expose no parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = DEVICE
    u = torch.tensor(users.to_numpy(dtype=np.int64), device=device)
    i = torch.tensor(items.to_numpy(dtype=np.int64), device=device)
    with torch.no_grad():
        pred = model(u, i).cpu().numpy()

    df_in['recommend'] = np.where(pred >= threshold, 'O', 'X')
    if return_score:
        df_in['score'] = pred
    total_o = df_in['recommend'].eq('O').sum()
    total = len(df_in)
    print(df_in)
    # Escaped newlines keep the summary a single valid string literal.
    print(f"====================\nTotal recommends = {total_o}/{total}\nNot recommend = {total-total_o}/{total}")
    return df_in

# 예시
# predict_binary(clf, user2idx, item2idx, Path('../data/sample1.csv'), threshold=best_th, return_prob=True)
# predict_reg(reg, user2idx, item2idx, Path('../data/sample1.csv'), threshold=4.0, return_score=True)
