In [3]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

# rtdl 라이브러리 활용
from rtdl_revisiting_models import FTTransformer

# --- [1] 환경 설정 ---
def seed_everything(seed=0):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and the current CUDA
    device), switches cuDNN to deterministic mode with auto-tuning disabled,
    and additionally seeds the MPS backend when it is available.

    Args:
        seed: Integer seed applied to all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN speed for run-to-run reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # The MPS generator is only seeded when the backend is usable.
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)

seed_everything(0)
# Run on the MPS backend when it is available, otherwise fall back to CPU.
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Input directory with the raw CSVs, and the two output directories.
DATA_PATH = "../../data/raw/"
OOF_PATH = "./oof_data/"
SUB_PATH = "./submissions/"
# Create BOTH output directories up front. The submission CSV is written into
# SUB_PATH at the very end of the script, so a missing directory would
# otherwise only surface after the full cross-validation run has completed.
for _out_dir in (OOF_PATH, SUB_PATH):
    os.makedirs(_out_dir, exist_ok=True)

# --- [2] m8 데이터 전처리 파이프라인 ---
def get_m8_data():
    """Load the raw train/test CSVs and turn them into model-ready frames.

    Steps, in order:
      1. Read ``train.csv`` and ``test_x.csv`` from ``DATA_PATH``.
      2. Filter training rows (``familysize <= 50`` and non-constant Q*A answers).
      3. Recode the ``voted`` target from {1, 2} to {0, 1}.
      4. Add engineered features to both frames.
      5. Label-encode the categorical columns (encoder fitted on train + test).
      6. Standardize the numeric columns (scaler fitted on train only).

    Returns:
        A tuple ``(train, test, num_cols, cat_cols, cardinalities)`` where
        ``num_cols`` / ``cat_cols`` are the numeric / categorical feature
        column names and ``cardinalities[i]`` is the number of distinct encoded
        values of ``cat_cols[i]`` (i.e. the required embedding table size).
    """
    train = pd.read_csv(f"{DATA_PATH}train.csv")
    test = pd.read_csv(f"{DATA_PATH}test_x.csv")

    # Row filtering inherited from m7: keep family sizes of at most 50 and drop
    # respondents whose Q*A answers have zero spread (same answer everywhere).
    qa_cols = [f'Q{i}A' for i in 'abcdefghijklmnopqrst']
    train = train[train.familysize <= 50].copy()
    train = train[train[qa_cols].std(axis=1) > 0].reset_index(drop=True)

    # Recode the target: raw 2 -> 1 and raw 1 -> 0.
    # The original author labels this orientation "Large = Voted".
    # NOTE(review): confirm against the raw label definition of `voted`.
    train['voted'] = train['voted'].replace({2: 1, 1: 0})

    def engineering(df):
        """Add the derived features to ``df`` in place and return it."""
        # Q*E columns: clip at the 99th percentile, then log1p (per exp03).
        # The clip threshold is computed from the frame passed in, so train and
        # test are each clipped at their own 99th percentile.
        qe_cols = [f'Q{i}E' for i in 'abcdefghijklmnopqrst']
        for col in qe_cols:
            df[col] = np.log1p(df[col].clip(upper=df[col].quantile(0.99)))

        # Summary statistics over the Q*A answers (per m7).
        df['Q_Var'] = df[qa_cols].var(axis=1)
        df['mach_score'] = df[qa_cols].mean(axis=1)
        df['conflict_index'] = (df['QqA'] - df['QcA']).abs()

        # Derived tp / wr / wf features (per exp03).
        df['tp_Openness'] = (df['tp05'] + (7 - df['tp10'])) / 2
        df['wr_total'] = df[[f'wr_{i:02d}' for i in range(1, 14)]].sum(axis=1)
        df['wf_total'] = df[[f'wf_{i:02d}' for i in range(1, 4)]].sum(axis=1)

        # Numeric age: first run of digits in `age_group`; rows without any
        # digits fall back to 60. A raw string keeps `\d` a regex escape, and
        # expand=False makes `str.extract` return a Series for the single group.
        df['age_encoded'] = (
            df['age_group']
            .str.extract(r'(\d+)', expand=False)
            .astype(float)
            .fillna(60)
        )
        return df

    train = engineering(train)
    test = engineering(test)

    cat_cols = ['race', 'religion', 'urban', 'education', 'hand', 'married', 'engnat', 'gender']
    # Columns excluded from the numeric feature set: the raw tp items, the raw
    # age group string, the row index and the target.
    drop_list = [f'tp{i:02d}' for i in range(1, 11)] + ['age_group', 'index', 'voted']
    num_cols = [c for c in train.columns if c not in cat_cols and c not in drop_list]

    # Label encoding for the categorical embeddings. The encoder is fitted on
    # train + test, so the embedding size must be the size of the fitted
    # vocabulary. Counting distinct values in train alone would be too small
    # whenever a category occurs only in test (or was removed from train by the
    # row filters), producing out-of-range embedding indices.
    cardinalities = []
    for col in cat_cols:
        le = LabelEncoder()
        combined = pd.concat([train[col], test[col]]).astype(str)
        le.fit(combined)
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))
        cardinalities.append(len(le.classes_))

    # Standardize numeric features; statistics come from train only.
    scaler = StandardScaler()
    train[num_cols] = scaler.fit_transform(train[num_cols])
    test[num_cols] = scaler.transform(test[num_cols])

    return train, test, num_cols, cat_cols, cardinalities

# --- [3] 학습 및 결과 생성 ---
train_df, test_df, num_cols, cat_cols, cardinalities = get_m8_data()

N_SPLITS = 7
N_EPOCHS = 25  # epoch count carried over from exp03

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
oof_preds = np.zeros(len(train_df))   # out-of-fold probability per training row
test_preds = np.zeros(len(test_df))   # fold-averaged probability per test row

# Best hyper-parameters from exp03, trial 13.
PARAMS = {
    'lr': 0.000975,
    'batch_size': 256,
    'n_blocks': 2,
    'd_block': 128,
    'attention_dropout': 0.2999,
    'ffn_dropout': 0.2310
}

# The test tensors are the same for every fold and epoch, so build them once.
test_num_t = torch.tensor(test_df[num_cols].values, dtype=torch.float32).to(DEVICE)
test_cat_t = torch.tensor(test_df[cat_cols].values, dtype=torch.long).to(DEVICE)

for fold, (t_idx, v_idx) in enumerate(skf.split(train_df, train_df['voted'])):
    fold_train = train_df.iloc[t_idx]
    fold_valid = train_df.iloc[v_idx]

    x_num_t = torch.tensor(fold_train[num_cols].values, dtype=torch.float32)
    x_cat_t = torch.tensor(fold_train[cat_cols].values, dtype=torch.long)
    y_t = torch.tensor(fold_train['voted'].values, dtype=torch.float32).view(-1, 1)

    train_loader = DataLoader(TensorDataset(x_num_t, x_cat_t, y_t), batch_size=PARAMS['batch_size'], shuffle=True)

    # Validation tensors do not change between epochs; build them once per fold.
    v_n = torch.tensor(fold_valid[num_cols].values, dtype=torch.float32).to(DEVICE)
    v_c = torch.tensor(fold_valid[cat_cols].values, dtype=torch.long).to(DEVICE)
    y_valid = fold_valid['voted'].values

    model = FTTransformer(
        n_cont_features=len(num_cols),
        cat_cardinalities=cardinalities,
        d_out=1,
        _is_default=False,
        n_blocks=PARAMS['n_blocks'],
        d_block=PARAMS['d_block'],
        attention_n_heads=8,
        ffn_d_hidden_multiplier=4/3,
        attention_dropout=PARAMS['attention_dropout'],
        ffn_dropout=PARAMS['ffn_dropout'],
        residual_dropout=0.0,
    ).to(DEVICE)

    optimizer = optim.AdamW(model.parameters(), lr=PARAMS['lr'], weight_decay=1e-2)
    criterion = nn.BCEWithLogitsLoss()

    best_auc = 0
    best_test_probs = None  # test prediction taken at this fold's best epoch
    for epoch in range(N_EPOCHS):
        model.train()
        for b_n, b_c, b_y in train_loader:
            b_n, b_c, b_y = b_n.to(DEVICE), b_c.to(DEVICE), b_y.to(DEVICE)
            optimizer.zero_grad()
            loss = criterion(model(b_n, b_c), b_y)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            # squeeze(-1) drops only the d_out=1 axis, so a one-row batch
            # still yields a 1-D array.
            v_probs = torch.sigmoid(model(v_n, v_c).squeeze(-1)).cpu().numpy()
            auc = roc_auc_score(y_valid, v_probs)

            # NOTE: the checkpoint is chosen by validation AUC, so the reported
            # per-fold and OOF AUC are optimistic estimates.
            if auc > best_auc:
                best_auc = auc
                oof_preds[v_idx] = v_probs
                # Overwrite (do NOT accumulate) the fold's test prediction.
                # Adding to `test_preds` here would count every improving
                # epoch, not just the best one.
                best_test_probs = torch.sigmoid(
                    model(test_num_t, test_cat_t).squeeze(-1)
                ).cpu().numpy()

    # Each fold contributes exactly one share of the test-set average.
    if best_test_probs is not None:
        test_preds += best_test_probs / N_SPLITS

    print(f"Fold {fold+1} Best AUC: {best_auc:.5f}")

# --- [4] 파일 저장 ---
# Overall out-of-fold AUC; the same value is embedded in both output file names.
total_auc = roc_auc_score(y_true=train_df['voted'], y_score=oof_preds)
print(f"Final OOF AUC: {total_auc:.5f}")

# OOF predictions, named exp<number>_<model>_AUC_<score>.npy
np.save(f"{OOF_PATH}exp28_m8_ft_AUC_{total_auc:.5f}.npy", oof_preds)

# Submission CSV, named <number>_<model>_<score>.csv
output_name = f"{SUB_PATH}28_m8_ft_{total_auc:.5f}.csv"
sub = pd.read_csv(f"{DATA_PATH}sample_submission.csv")
# Flip the score direction (2.0 - p). Per the original author's note this
# matches m1's "Small = Voted" orientation.
# NOTE(review): confirm against the competition's expected submission direction.
sub['voted'] = 2.0 - test_preds
sub.to_csv(output_name, index=False)

Fold 1 Best AUC: 0.76268
Fold 2 Best AUC: 0.77388
Fold 3 Best AUC: 0.77160
Fold 4 Best AUC: 0.76268
Fold 5 Best AUC: 0.77115
Fold 6 Best AUC: 0.76522
Fold 7 Best AUC: 0.77198
Final OOF AUC: 0.76664
