# A버전 codexa2 (LightGCN, implicit)
- rating 무시, 유저별 8/1/1, popularity 네거티브 혼합
- BPR + Recall/NDCG@K + 규칙기반 추천수 평가
- early stopping + 베스트 checkpoint + 시각화

In [None]:

import sys
from pathlib import Path
sys.path.append('..')

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import scipy.sparse as sp

# Configuration

# Paths (the model directory is created on import if missing)
DATA_PATH = Path('../data/train.csv')
MODEL_DIR = Path('../codex_models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Data split
SEED = 42
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1

# Model
EMBED_DIM = 64
N_LAYERS = 3

# Optimisation
LR = 1e-3
WEIGHT_DECAY = 1e-5
BATCH_SIZE = 2048
EPOCHS = 50
PATIENCE = 5

# Negative sampling
POP_RATIO = 0.7
N_NEG = 2

# Evaluation cut-offs
K_EVALS = [10, 20]

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed both NumPy's and torch's global generators.
np.random.seed(SEED)
torch.manual_seed(SEED)


In [None]:

def load_interactions(path: Path):
    """Read the CSV file at ``path`` and return its contents as a DataFrame."""
    frame = pd.read_csv(path)
    return frame

def encode_ids(df):
    """Attach contiguous integer indices for the ``user`` and ``item`` columns.

    Indices are assigned in sorted order of the distinct raw values, starting
    at 0. The input frame is not modified.

    Returns:
        A tuple ``(encoded, user2idx, item2idx)`` where ``encoded`` is a copy
        of ``df`` with extra ``user_idx`` / ``item_idx`` columns and the two
        dicts map raw ids to their index.
    """
    user2idx = {raw: pos for pos, raw in enumerate(sorted(df['user'].unique()))}
    item2idx = {raw: pos for pos, raw in enumerate(sorted(df['item'].unique()))}

    encoded = df.copy()
    encoded['user_idx'] = encoded['user'].map(user2idx)
    encoded['item_idx'] = encoded['item'].map(item2idx)
    return encoded, user2idx, item2idx

def split_userwise(df, train_ratio=0.8, val_ratio=0.1, seed=42):
    """Shuffle each user's rows and split them into train / val / test frames.

    For a user with ``n`` rows, the first ``int(n * train_ratio)`` shuffled
    rows go to train, the next ``int(n * val_ratio)`` to validation and the
    remainder to test. Because of the truncation, users with few rows may
    contribute nothing to train or validation.

    Returns:
        ``(train, val, test)`` DataFrames, each with a fresh 0..n-1 index.
    """
    rng = np.random.default_rng(seed)
    train_parts, val_parts, test_parts = [], [], []

    for _, group in df.groupby('user_idx'):
        size = len(group)
        shuffled = group.iloc[rng.permutation(size)]
        train_end = int(size * train_ratio)
        val_end = train_end + int(size * val_ratio)

        train_parts.append(shuffled.iloc[:train_end])
        val_parts.append(shuffled.iloc[train_end:val_end])
        test_parts.append(shuffled.iloc[val_end:])

    train = pd.concat(train_parts, ignore_index=True)
    val = pd.concat(val_parts, ignore_index=True)
    test = pd.concat(test_parts, ignore_index=True)
    return train, val, test

def build_user_pos(df):
    """Return a dict mapping each ``user_idx`` to the set of its ``item_idx`` values."""
    def _to_set(values):
        return set(values.tolist())

    items_by_user = df.groupby('user_idx')['item_idx']
    per_user_sets = items_by_user.agg(_to_set)
    return per_user_sets.to_dict()

def item_popularity(df, n_items):
    """Return a length-``n_items`` probability vector proportional to item counts.

    Items that never appear in ``df['item_idx']`` keep a tiny pseudo-count of
    1e-8 so that every entry of the result is strictly positive.
    """
    freq = np.full(n_items, 1e-8)
    counts = df['item_idx'].value_counts()
    # Overwrite the pseudo-count with the real count for every observed item.
    freq[counts.index.to_numpy()] = counts.to_numpy()
    total = freq.sum()
    return freq / total

def make_norm_adj(num_users, num_items, edges):
    """Build the symmetrically normalised user-item adjacency as a sparse tensor.

    Nodes ``0..num_users-1`` are users and ``num_users..num_users+num_items-1``
    are items. Each row of ``edges`` (columns ``user_idx`` / ``item_idx``)
    contributes weight 1 in both directions, and the result is
    ``D^-1/2 A D^-1/2`` with zero-degree nodes left as all-zero rows/columns.

    Returns:
        A coalesced float32 ``torch.sparse_coo_tensor`` of shape
        ``(num_users + num_items, num_users + num_items)``.
    """
    n_nodes = num_users + num_items
    user_nodes = edges['user_idx'].to_numpy()
    item_nodes = edges['item_idx'].to_numpy() + num_users  # shift items past the user block
    weights = np.ones(len(edges), dtype=np.float32)

    upper = sp.coo_matrix((weights, (user_nodes, item_nodes)), shape=(n_nodes, n_nodes))
    symmetric = upper + upper.T

    degree = np.array(symmetric.sum(axis=1)).flatten()
    inv_sqrt_degree = np.power(degree, -0.5)
    # Isolated nodes have degree 0 -> inf; zero them so they contribute nothing.
    inv_sqrt_degree[np.isinf(inv_sqrt_degree)] = 0.0

    scale = sp.diags(inv_sqrt_degree)
    normalised = (scale @ symmetric @ scale).tocoo()

    coords = np.vstack((normalised.row, normalised.col))
    indices = torch.tensor(coords, dtype=torch.long)
    values = torch.tensor(normalised.data, dtype=torch.float32)
    sparse = torch.sparse_coo_tensor(indices, values, size=normalised.shape)
    return sparse.coalesce()

def recall_at_k(ranked, gt, k):
    """Fraction of hits in the top-``k`` of ``ranked``, normalised by ``min(k, len(gt))``.

    Returns 0.0 when the ground-truth collection ``gt`` is empty.
    """
    if not gt:
        return 0.0
    top = ranked[:k]
    hits = sum(item in gt for item in top)
    best_possible = min(k, len(gt))
    return hits / best_possible

def ndcg_at_k(ranked, gt, k):
    """Binary-relevance NDCG over the top-``k`` entries of ``ranked``.

    A hit at zero-based position ``p`` contributes ``1 / log2(p + 2)``; the
    ideal DCG places ``min(len(gt), k)`` hits at the top. Returns 0.0 when
    the ideal DCG is zero (for example when ``gt`` is empty).
    """
    dcg = sum(
        (1 / np.log2(pos + 2) for pos, item in enumerate(ranked[:k]) if item in gt),
        0.0,
    )
    n_ideal = min(len(gt), k)
    idcg = sum(1 / np.log2(pos + 2) for pos in range(n_ideal))
    if idcg > 0:
        return dcg / idcg
    return 0.0

def rule_k(count):
    """Return the rule-based number of recommendations for a user.

    Users with at most 10 interactions get 2 recommendations; everyone else
    gets ``floor(0.2 * count)``, never less than 1.
    """
    if count > 10:
        scaled = int(np.floor(0.2 * count))
        return max(scaled, 1)
    # max(2, 1) in the original rule is simply the constant 2.
    return 2


In [None]:

class BPRDataset(Dataset):
    """Yields ``(user, positive_item, negative_item)`` triples for BPR training.

    Each sample takes one row of ``df`` as the positive pair and draws one
    negative item by rejection sampling: with probability ``pop_ratio`` the
    candidate is drawn from ``pop_prob`` (popularity-weighted), otherwise
    uniformly over ``range(num_items)``. Candidates found in the user's
    positive set are rejected and redrawn.

    Args:
        df: frame with ``user_idx`` and ``item_idx`` columns.
        num_items: size of the item index space.
        user_pos: dict mapping user index -> set of positive item indices.
        pop_prob: length-``num_items`` probability vector for popularity draws.
        n_neg: stored on the instance but not used by ``__getitem__``, which
            always returns exactly one negative per sample.
        pop_ratio: probability of using the popularity distribution per draw.
        seed: seed for the dataset's private NumPy generator.
    """

    def __init__(self, df, num_items, user_pos, pop_prob, n_neg=1, pop_ratio=0.7, seed=42):
        self.users = df['user_idx'].to_numpy()
        self.pos = df['item_idx'].to_numpy()
        self.num_items = num_items
        self.user_pos = user_pos
        self.pop_prob = pop_prob
        self.n_neg = n_neg
        self.pop_ratio = pop_ratio
        self.rng = np.random.default_rng(seed)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(user, positive_item, negative_item)`` for row ``idx``.

        Raises:
            ValueError: if the user's positive set covers every item, in which
                case no negative exists and rejection sampling would never end.
        """
        u = int(self.users[idx])
        i_pos = int(self.pos[idx])
        seen = self.user_pos.get(u, set())

        # The cheap length test runs first; the full scan only happens for
        # users who could plausibly have seen the entire catalogue.
        if len(seen) >= self.num_items and all(c in seen for c in range(self.num_items)):
            raise ValueError(
                f'user {u} has interacted with all {self.num_items} items; '
                'no negative item can be sampled'
            )

        while True:
            if self.rng.random() < self.pop_ratio:
                j = int(self.rng.choice(self.num_items, p=self.pop_prob))
            else:
                j = int(self.rng.integers(0, self.num_items))
            if j not in seen:
                return u, i_pos, j

class LightGCN(nn.Module):
    """LightGCN-style propagation over a fixed normalised adjacency matrix.

    A single embedding table holds users (rows ``0..num_users-1``) followed by
    items. ``forward`` repeatedly multiplies the table by ``adj`` and averages
    the initial embeddings with the output of every layer.

    Args:
        num_users: number of user nodes.
        num_items: number of item nodes.
        embed_dim: embedding width.
        n_layers: number of propagation steps.
        adj: sparse ``(num_users + num_items)``-square adjacency tensor.
    """

    def __init__(self, num_users, num_items, embed_dim, n_layers, adj):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.n_layers = n_layers
        # Registered as a buffer so that model.to(device) / model.double()
        # convert the adjacency together with the embeddings. It is
        # non-persistent so the state_dict (and saved checkpoints) still
        # contain only 'emb.weight'.
        self.register_buffer('adj', adj, persistent=False)
        self.emb = nn.Embedding(num_users + num_items, embed_dim)
        nn.init.normal_(self.emb.weight, std=0.1)

    def forward(self):
        """Return ``(user_embeddings, item_embeddings)`` after propagation."""
        x = self.emb.weight
        embs = [x]
        for _ in range(self.n_layers):
            x = torch.sparse.mm(self.adj, x)
            embs.append(x)
        # Layer combination: plain mean over the input and every layer output.
        out = torch.stack(embs, dim=0).mean(dim=0)
        return out[:self.num_users], out[self.num_users:]


In [None]:

def bpr_loss(u, i, j, reg=1e-4):
    """Bayesian Personalised Ranking loss with L2 regularisation.

    Args:
        u: user embeddings for the batch, one row per sample.
        i: positive-item embeddings, same shape as ``u``.
        j: negative-item embeddings, same shape as ``u``.
        reg: weight of the squared-L2 penalty on the three embedding batches.

    Returns:
        Scalar tensor: mean of ``-log sigmoid(pos - neg)`` plus the penalty
        divided by the batch size.
    """
    pos = (u * i).sum(dim=1)
    neg = (u * j).sum(dim=1)
    # logsigmoid is evaluated in a numerically stable way, so strongly
    # mis-ranked pairs keep a meaningful loss and gradient instead of
    # saturating at log(eps) as log(sigmoid(x) + eps) does.
    loss = -nn.functional.logsigmoid(pos - neg).mean()
    reg_loss = reg * (u.norm(2).pow(2) + i.norm(2).pow(2) + j.norm(2).pow(2)) / u.shape[0]
    return loss + reg_loss

def evaluate(model, user_pos_train, eval_df, k_list, device='cpu'):
    """Compute mean Recall@k and NDCG@k over the users present in ``eval_df``.

    For every user in ``eval_df`` all items are scored by dot product, items
    in the user's training set are masked out, and the top ``max(k_list)``
    items are ranked once and reused for every cut-off.

    Args:
        model: module whose ``forward()`` returns ``(user_emb, item_emb)``.
        user_pos_train: dict user index -> set of training item indices.
        eval_df: frame with ``user_idx`` / ``item_idx`` columns (ground truth).
        k_list: cut-offs to report.
        device: device on which scoring is performed.

    Returns:
        Dict ``{k: (mean_recall, mean_ndcg)}``.
    """
    model.eval()
    user_pos_eval = build_user_pos(eval_df)
    metrics = {k: {'rec': [], 'ndcg': []} for k in k_list}

    # Evaluation never back-propagates; without no_grad every forward pass
    # and score matrix would be tracked by autograd for nothing.
    with torch.no_grad():
        u_emb, i_emb = model.forward()
        u_emb = u_emb.to(device)
        i_emb = i_emb.to(device)
        # topk raises if k exceeds the number of scored items.
        top_n = min(max(k_list), i_emb.shape[0])

        for u, gt in user_pos_eval.items():
            scores = u_emb[u] @ i_emb.T
            seen = user_pos_train.get(u, set())
            if seen:
                # Push already-seen training items to the bottom of the ranking.
                scores[list(seen)] = -1e9
            ranked = torch.topk(scores, k=top_n).indices.cpu().numpy()
            for k in k_list:
                metrics[k]['rec'].append(recall_at_k(ranked, gt, k))
                metrics[k]['ndcg'].append(ndcg_at_k(ranked, gt, k))

    return {
        k: (float(np.mean(v['rec'])), float(np.mean(v['ndcg'])))
        for k, v in metrics.items()
    }

def evaluate_rule(model, user_pos_train, eval_df, device='cpu'):
    """Evaluate with a per-user recommendation count chosen by ``rule_k``.

    Each user in ``eval_df`` receives ``rule_k(number of training items)``
    recommendations (training items are masked out first). Recall here is
    ``hits / len(ground_truth)``.

    Args:
        model: module whose ``forward()`` returns ``(user_emb, item_emb)``.
        user_pos_train: dict user index -> set of training item indices.
        eval_df: frame with ``user_idx`` / ``item_idx`` columns (ground truth).
        device: device on which scoring is performed.

    Returns:
        ``(mean_recall, mean_hit_count)`` over the evaluated users.
    """
    model.eval()
    user_pos_eval = build_user_pos(eval_df)
    recalls = []
    hits = []

    # No gradients are needed for evaluation; skip autograd bookkeeping.
    with torch.no_grad():
        u_emb, i_emb = model.forward()
        u_emb = u_emb.to(device)
        i_emb = i_emb.to(device)
        n_items = i_emb.shape[0]

        for u, gt in user_pos_eval.items():
            scores = u_emb[u] @ i_emb.T
            seen = user_pos_train.get(u, set())
            if seen:
                # Push already-seen training items to the bottom of the ranking.
                scores[list(seen)] = -1e9
            # Clamp so topk cannot be asked for more items than exist.
            k = min(rule_k(len(seen)), n_items)
            ranked = torch.topk(scores, k=k).indices.cpu().numpy()
            hit = sum(1 for x in ranked if x in gt)
            hits.append(hit)
            recalls.append(hit / max(len(gt), 1))

    return float(np.mean(recalls)), float(np.mean(hits))


In [None]:

# Data preparation: load, index-encode and split the interactions per user.
raw = load_interactions(DATA_PATH)
df, user2idx, item2idx = encode_ids(raw)
num_users = len(user2idx)
num_items = len(item2idx)
train_df, val_df, test_df = split_userwise(
    df,
    train_ratio=TRAIN_RATIO,
    val_ratio=VAL_RATIO,
    seed=SEED,
)

# Everything below is derived from the training split only.
user_pos_train = build_user_pos(train_df)
pop_prob = item_popularity(train_df, num_items)
adj = make_norm_adj(num_users, num_items, train_df).to(DEVICE)

# Training dataset and loader
train_ds = BPRDataset(
    train_df,
    num_items,
    user_pos_train,
    pop_prob,
    n_neg=N_NEG,
    pop_ratio=POP_RATIO,
    seed=SEED,
)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

# Model and optimiser
model = LightGCN(num_users, num_items, EMBED_DIM, N_LAYERS, adj).to(DEVICE)
opt = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)


In [None]:

# Training loop with early stopping on validation recall@K_EVALS[0]
best_val = -1
patience_counter = 0
history = {'loss': [], 'val': []}
best_state = None
for epoch in range(1, EPOCHS+1):
    model.train()
    total = 0.0
    for u, i, j in train_loader:
        u = u.to(DEVICE); i = i.to(DEVICE); j = j.to(DEVICE)
        # Full-graph propagation is recomputed for every batch.
        user_emb, item_emb = model.forward()
        loss = bpr_loss(user_emb[u], item_emb[i], item_emb[j])
        opt.zero_grad(); loss.backward(); opt.step()
        # Weight by batch size so avg_loss is a per-sample mean.
        total += loss.item()*u.shape[0]
    avg_loss = total/len(train_loader.dataset)
    val_metrics = evaluate(model, user_pos_train, val_df, K_EVALS, device=DEVICE)
    history['loss'].append(avg_loss); history['val'].append(val_metrics)
    best_k = K_EVALS[0]; val_recall = val_metrics[best_k][0]
    if val_recall > best_val:
        best_val = val_recall
        patience_counter = 0
        # Snapshot by value. state_dict() tensors share storage with the live
        # parameters and Tensor.cpu() returns the same tensor when it is
        # already on the CPU, so without an explicit copy the "best" snapshot
        # would silently follow later optimiser steps on CPU runs.
        best_state = {
            k: v.detach().to('cpu', copy=True) if torch.is_tensor(v) else v
            for k, v in model.state_dict().items()
        }
    else:
        patience_counter += 1
    print(f"Epoch {epoch} loss={avg_loss:.4f} val@{best_k} recall={val_metrics[best_k][0]:.4f} ndcg={val_metrics[best_k][1]:.4f} (patience {patience_counter}/{PATIENCE})")
    if patience_counter >= PATIENCE:
        print('Early stopping'); break
# Restore the best checkpoint seen during training.
if best_state is not None:
    model.load_state_dict(best_state)


In [None]:

# Evaluation on the val and test splits (fixed-K metrics and rule-based K)
eval_splits = [('val', val_df), ('test', test_df)]
for split_name, data in eval_splits:
    tag = split_name.upper()
    metrics = evaluate(model, user_pos_train, data, K_EVALS, device=DEVICE)
    rule_rec, rule_hit = evaluate_rule(model, user_pos_train, data, device=DEVICE)
    print(f"[{tag}] rule recall={rule_rec:.4f} hit={rule_hit:.4f}")
    for k, (recall_k, ndcg_k) in metrics.items():
        print(f"[{tag}] k={k} recall={recall_k:.4f} ndcg={ndcg_k:.4f}")


In [None]:

# Visualisation: training loss and validation recall@K_EVALS[0] per epoch
ks = K_EVALS[0]
recalls = [epoch_metrics[ks][0] for epoch_metrics in history['val']]
curve_path = MODEL_DIR/'codexa2_learning_curves.png'

plt.figure(figsize=(10,4))
panels = [(history['loss'], 'Train Loss'), (recalls, f'Val Recall@{ks}')]
for panel_pos, (series, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_pos)
    plt.plot(series)
    plt.title(panel_title)
    plt.xlabel('epoch')
plt.tight_layout()
plt.savefig(curve_path)
plt.close()
print('saved plot to', curve_path)


In [None]:

# Save the model weights together with the hyper-parameters needed to rebuild it
checkpoint = {
    'state_dict': model.state_dict(),
    'num_users': num_users,
    'num_items': num_items,
    'embed_dim': EMBED_DIM,
    'n_layers': N_LAYERS,
}
torch.save(checkpoint, MODEL_DIR/'codexa2_lightgcn.pth')


In [None]:

# Inference utility
def predict_ox(model, user_enc, item_enc, csv_path: Path, k=10, return_score=False):
    """Label each (user, item) row of a CSV as recommended ('O') or not ('X').

    A row is 'O' when its item is among the user's top-``k`` items by
    embedding dot product over all items. Items are not filtered by training
    history here.

    Args:
        model: module whose ``forward()`` returns ``(user_emb, item_emb)``.
        user_enc: dict mapping raw user id -> user index.
        item_enc: dict mapping raw item id -> item index.
        csv_path: CSV file with ``user`` and ``item`` columns.
        k: size of the per-user recommendation list.
        return_score: when true, also add a ``score`` column with the raw
            dot-product score of each row.

    Returns:
        The input frame with an added ``recommend`` column (and ``score`` if
        requested). The frame and a summary are also printed.

    Raises:
        ValueError: if any user or item in the CSV is missing from the encoders.
    """
    df_in = pd.read_csv(csv_path)
    users = df_in['user'].map(user_enc)
    items = df_in['item'].map(item_enc)
    if users.isnull().any() or items.isnull().any():
        raise ValueError('미등록 user/item 존재')

    # Same mode switch that evaluate()/evaluate_rule() perform before scoring.
    model.eval()
    results = []
    scores_list = []
    topk_by_user = {}  # user index -> set of top-k item indices, computed once per user
    # Inference only: keep the forward pass out of the autograd graph as well.
    with torch.no_grad():
        user_emb, item_emb = model.forward()
        user_emb = user_emb.to(DEVICE)
        item_emb = item_emb.to(DEVICE)
        # topk raises if k exceeds the number of items.
        top_n = min(k, item_emb.shape[0])
        for u, i in zip(users.to_numpy(), items.to_numpy()):
            u = int(u)
            i = int(i)
            if u not in topk_by_user:
                scores = user_emb[u] @ item_emb.T
                topk_by_user[u] = set(torch.topk(scores, k=top_n).indices.cpu().tolist())
            score = float((user_emb[u] * item_emb[i]).sum().cpu())
            scores_list.append(score)
            results.append('O' if i in topk_by_user[u] else 'X')

    df_in['recommend'] = results
    if return_score:
        df_in['score'] = scores_list
    total_o = df_in['recommend'].eq('O').sum()
    total = len(df_in)
    print(df_in)
    # The summary was a double-quoted f-string broken across physical lines,
    # which is a syntax error; the line breaks are now explicit escapes.
    print(f"====================\nTotal recommends = {total_o}/{total}\nNot recommend = {total-total_o}/{total}")
    return df_in

# Example usage
# predict_ox(model, user2idx, item2idx, Path('../data/sample1.csv'), k=10, return_score=True)
