In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm
import os
import gc

# 1. Environment setup
# Input data directory and the output directories for out-of-fold predictions
# and submission files (all relative to the working directory).
DATA_PATH = "../../data/raw/"
OOF_PATH = "./oof_data/"
SUB_PATH = "./submissions/"
# Use Apple's MPS backend when PyTorch reports it as available, otherwise the CPU.
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(OOF_PATH, exist_ok=True)
os.makedirs(SUB_PATH, exist_ok=True)

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random  # local import: the file's import header does not bring in `random`

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # cuDNN flags only matter when a CUDA/cuDNN backend is in use; setting them
    # elsewhere is harmless.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(42)

# 2. Load data and preprocess (shared by train and test)
train_df = pd.read_csv(f'{DATA_PATH}train.csv')
test_df = pd.read_csv(f'{DATA_PATH}test_x.csv')

# [Outlier removal - train only]
# Columns whose name starts with 'Q' and ends with 'A'.
a_cols = [name for name in train_df.columns if name.startswith('Q') and name.endswith('A')]
# Keep a row only when familysize is at most 50 AND its values across a_cols
# are not all identical (row-wise standard deviation above zero).
plausible_family = train_df['familysize'] <= 50
varied_answers = train_df[a_cols].std(axis=1) > 0
train_df = train_df[plausible_family & varied_answers]

# [Derived-feature builder]
def apply_fe(df):
    """Build the one-hot encoded feature table from a raw data frame.

    The input frame is left untouched; all work happens on a private copy.

    Steps:
      1. Winsorize every column ending in 'E': clip to [25, 99th percentile
         of that column within `df`].
      2. Compute `Mach_Score` as the row mean of the 'Q*A' columns after
         reverse-scoring (6 - value) the items listed in `reverse_cols`.
      3. Add row-wise mean/std of the 'E' columns and of `tp01`..`tp10`.
      4. Drop the 'E' columns, 'index', 'hand', the temporary reverse-scored
         columns and, when present, 'voted'.
      5. Cast 'education', 'engnat', 'married', 'urban' to str and one-hot
         encode with `pd.get_dummies`.

    Args:
        df: Frame containing the columns referenced above.

    Returns:
        A new DataFrame of features.
    """
    # Work on a copy so neither the caller's frame nor a filtered slice of
    # another frame is modified in place.
    df = df.copy()

    e_cols = [col for col in df.columns if col.endswith('E')]
    # Derived from `df` itself so the function does not depend on a
    # module-level variable being defined beforehand.
    a_cols = [col for col in df.columns if col.startswith('Q') and col.endswith('A')]
    tp_cols = [f'tp{i:02d}' for i in range(1, 11)]
    reverse_cols = {'QaA', 'QdA', 'QeA', 'QfA', 'QgA', 'QiA', 'QkA', 'QnA', 'QqA', 'QrA', 'QtA'}

    # Winsorize the time columns
    for col in e_cols:
        upper = df[col].quantile(0.99)
        df[col] = df[col].clip(lower=25, upper=upper)

    # Mach score (with reverse scoring applied)
    # NOTE(review): `6 - value` presumably assumes answers on a 1..5 scale - confirm.
    rev_a_cols = []
    for col in a_cols:
        rev_name = col + '_rev'
        df[rev_name] = 6 - df[col] if col in reverse_cols else df[col]
        rev_a_cols.append(rev_name)
    df['Mach_Score'] = df[rev_a_cols].mean(axis=1)

    # Row-wise summary statistics
    df['E_Mean'] = df[e_cols].mean(axis=1)
    df['E_Std'] = df[e_cols].std(axis=1)
    df['TP_Mean'] = df[tp_cols].mean(axis=1)
    df['TP_Std'] = df[tp_cols].std(axis=1)

    # Drop columns that are not used as features
    drop_list = e_cols + ['index', 'hand'] + rev_a_cols
    if 'voted' in df.columns:
        drop_list.append('voted')

    # Cast these columns to str so get_dummies one-hot encodes them.
    replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}
    res = df.drop(drop_list, axis=1).astype(replace_dict)
    return pd.get_dummies(res)

# Apply feature generation
train_x = apply_fe(train_df)
# Align the test columns (names and order) with train; columns missing from
# the test frame are filled with 0.
test_x = apply_fe(test_df).reindex(columns=train_x.columns, fill_value=0)

# Tensor conversion and scaling
# Target is computed as 2 - voted, stored as float32.
train_y = np.subtract(2, train_df['voted'].to_numpy()).astype(np.float32)
train_x_t = torch.tensor(train_x.to_numpy().astype(np.float32))
test_x_t = torch.tensor(test_x.to_numpy().astype(np.float32))

# Standardize both sets with per-column statistics of the training features;
# the small constant keeps the divisor away from zero.
mean = train_x_t.mean(dim=0)
std = train_x_t.std(dim=0) + 1e-7
train_x_std = train_x_t.sub(mean).div(std)
test_x_std = test_x_t.sub(mean).div(std)

# 3. Model and training process
class ImprovedMLP(nn.Module):
    """Feed-forward network: input_dim -> 256 -> 64 -> 1.

    Each hidden stage is Linear -> BatchNorm1d -> LeakyReLU -> Dropout
    (dropout 0.2 after the first stage, 0.1 after the second). The final
    Linear layer emits one value per sample with no activation applied.
    """

    def __init__(self, input_dim):
        super().__init__()
        first_stage = [
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
        ]
        second_stage = [
            nn.Linear(256, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Dropout(0.1),
        ]
        output_layer = nn.Linear(64, 1)
        self.net = nn.Sequential(*first_stage, *second_stage, output_layer)

    def forward(self, x):
        """Run `x` through the layer stack and return the result."""
        out = self.net(x)
        return out

def _predict_proba(model, loader):
    """Return sigmoid(model output) over every batch of `loader` as one 1-D array.

    Gradient tracking is disabled here; the caller is responsible for having
    put the model in eval mode.
    """
    with torch.no_grad():
        chunks = [
            torch.sigmoid(model(batch[0].to(DEVICE)).view(-1)).cpu().numpy()
            for batch in loader
        ]
    return np.concatenate(chunks)


def run_m5_full(n_repeat=5, n_skfold=7, n_epoch=50, batch_size=72):
    """Train ImprovedMLP with repeated stratified K-fold CV and save predictions.

    For every (repeat, fold) a fresh model is trained for `n_epoch` epochs.
    After each epoch the validation AUC is computed; the validation and test
    predictions of the best-AUC epoch are kept. Out-of-fold predictions are
    averaged over repeats, test predictions over all repeat x fold models.

    NOTE(review): the kept epoch is chosen using the same validation fold whose
    predictions go into the OOF array, so the reported OOF AUC is optimistic.

    Args:
        n_repeat: Number of K-fold repetitions (split seed is 42 + repeat index).
        n_skfold: Number of stratified folds per repetition.
        n_epoch: Training epochs per fold.
        batch_size: Mini-batch size for all loaders.

    Raises:
        ValueError: If `n_repeat`, `n_epoch` or `batch_size` is below 1.

    Side effects:
        Writes the OOF array to OOF_PATH (.npy) and a submission CSV to
        SUB_PATH, and prints per-fold and final AUC.
    """
    if min(n_repeat, n_epoch, batch_size) < 1:
        raise ValueError("n_repeat, n_epoch and batch_size must all be >= 1")

    oof_preds = np.zeros(len(train_x_std))
    test_preds = np.zeros(len(test_x_std))  # accumulator for test predictions

    # The test loader and input width do not depend on the fold: build once.
    test_loader = DataLoader(TensorDataset(test_x_std), batch_size=batch_size, shuffle=False)
    input_dim = train_x_std.shape[1]

    for r in range(n_repeat):
        skf = StratifiedKFold(n_splits=n_skfold, random_state=42 + r, shuffle=True)
        for f, (t_idx, v_idx) in enumerate(skf.split(train_x_std, train_y)):
            # Data loaders; drop_last=True discards the final incomplete
            # training batch.
            t_loader = DataLoader(
                TensorDataset(train_x_std[t_idx], torch.tensor(train_y[t_idx])),
                batch_size=batch_size, shuffle=True, drop_last=True,
            )
            v_loader = DataLoader(TensorDataset(train_x_std[v_idx]), batch_size=batch_size, shuffle=False)

            model = ImprovedMLP(input_dim).to(DEVICE)
            optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
            criterion = nn.BCEWithLogitsLoss()

            # Starting from -inf guarantees the first epoch is always recorded,
            # so best_v / best_test are never None after the loop.
            best_auc = -np.inf
            best_v = None
            best_test = None
            for _ in range(n_epoch):
                model.train()
                for xx, yy in t_loader:
                    optimizer.zero_grad()
                    loss = criterion(model(xx.to(DEVICE)).view(-1), yy.to(DEVICE))
                    loss.backward()
                    optimizer.step()

                # Validation
                model.eval()
                v_p = _predict_proba(model, v_loader)
                auc = roc_auc_score(train_y[v_idx], v_p)
                if auc > best_auc:
                    best_auc = auc
                    best_v = v_p
                    # Predict the test set now, while the model is at its
                    # best epoch so far.
                    best_test = _predict_proba(model, test_loader)

            oof_preds[v_idx] += best_v / n_repeat
            test_preds += best_test / (n_repeat * n_skfold)
            print(f"R{r} F{f} | Best AUC: {best_auc:.5f}")
            del model
            gc.collect()

    # Final save
    total_auc = roc_auc_score(train_y, oof_preds)
    np.save(f"{OOF_PATH}exp20_m5_FE_AUC_{total_auc:.5f}.npy", oof_preds)

    sub = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
    sub['voted'] = test_preds
    sub.to_csv(f"{SUB_PATH}m5_FE_test_preds.csv", index=False)
    print(f"\n[DONE] OOF AUC: {total_auc:.5f} | Test Preds Saved.")

run_m5_full()

R0 F0 | Best AUC: 0.77020
R0 F1 | Best AUC: 0.77764
R0 F2 | Best AUC: 0.76848
R0 F3 | Best AUC: 0.76774
R0 F4 | Best AUC: 0.77508
R0 F5 | Best AUC: 0.76763
R0 F6 | Best AUC: 0.76604
R1 F0 | Best AUC: 0.77391
R1 F1 | Best AUC: 0.76442
R1 F2 | Best AUC: 0.76995
R1 F3 | Best AUC: 0.77319
R1 F4 | Best AUC: 0.76651
R1 F5 | Best AUC: 0.77541
R1 F6 | Best AUC: 0.76836
R2 F0 | Best AUC: 0.77286
R2 F1 | Best AUC: 0.77014
R2 F2 | Best AUC: 0.76725
R2 F3 | Best AUC: 0.75844
R2 F4 | Best AUC: 0.77596
R2 F5 | Best AUC: 0.77773
R2 F6 | Best AUC: 0.77178
R3 F0 | Best AUC: 0.76229
R3 F1 | Best AUC: 0.77312
R3 F2 | Best AUC: 0.76719
R3 F3 | Best AUC: 0.77420
R3 F4 | Best AUC: 0.76965
R3 F5 | Best AUC: 0.77180
R3 F6 | Best AUC: 0.76592
R4 F0 | Best AUC: 0.77153
R4 F1 | Best AUC: 0.76812
R4 F2 | Best AUC: 0.76499
R4 F3 | Best AUC: 0.76876
R4 F4 | Best AUC: 0.77619
R4 F5 | Best AUC: 0.76902
R4 F6 | Best AUC: 0.77197

[DONE] OOF AUC: 0.77353 | Test Preds Saved.
