# 코드 변경사항 (요약)

1. **TabNet 모델 추가**: 기존 SNN에 더해 TabNet(테이블형 데이터에 특화된 딥러닝 모델)을 추가하여 앙상블 다양성 확보
2. **자동 앙상블 탐색**: 2-Model/3-Model 조합 20+개를 자동으로 테스트하고 예상 AUC + 다양성 점수 계산
3. **Top 3 자동 저장**: 최고 점수 조합을 자동으로 선택한 뒤 상위 3개 조합을 각각 파일로 저장 (기존에는 고정 비율 조합 1개 파일만 저장)
4. **상관계수 분석**: 3개 모델 간 상관계수 자동 출력으로 다양성 판단 근거 제공

In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm
from pytorch_tabnet.tab_model import TabNetClassifier
import gc
import os

# Environment setup: directory the input CSVs are read from and directory
# the submission files are read from / written to.
DATA_PATH = "../../data/raw/"
SUB_PATH = "./submissions/"

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(SUB_PATH, exist_ok=True)

def seed_everything(seed=42):
    """Seed the random number generators used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    with the same value so repeated runs start from the same RNG state.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported locally because the file-level imports do not include it.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


seed_everything(42)

# Load data (same preprocessing as the previous pipeline).
# Columns removed from both train and test before modelling.
drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE', 'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE', 'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']
# Columns cast to str so that get_dummies one-hot encodes them.
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

train_data = pd.read_csv(f'{DATA_PATH}train.csv')
test_data = pd.read_csv(f'{DATA_PATH}test_x.csv')
# Remove training rows whose familysize exceeds 50.
train_data = train_data.drop(train_data[train_data.familysize > 50].index)

# Label is 2 - voted (presumably voted is coded 1/2, giving 1.0/0.0 — confirm
# against the dataset description).
train_y = (2 - train_data['voted'].to_numpy()).astype(np.float32)
train_x = train_data.drop(drop_list + ['voted'], axis=1).astype(replace_dict)
test_x = test_data.drop(drop_list, axis=1).astype(replace_dict)

train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)
# get_dummies is applied to each frame independently, so a category present in
# only one split would yield different columns. Force the test matrix onto the
# training columns (missing dummies become 0, unseen ones are dropped) so both
# arrays share the same feature layout.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

# TabNet training
def train_tabnet(n_repeat=3, n_skfold=5, base_seed=42):
    """Train TabNet with repeated stratified K-fold CV and average test predictions.

    Reads the module-level ``train_x``, ``train_y`` and ``test_x``. One
    classifier is fitted per (repeat, fold) pair; each one is scored on its
    validation fold and used to predict the test set.

    Args:
        n_repeat: Number of times the K-fold split is repeated; repeat ``r``
            uses seed ``base_seed + r`` for both the split and the model.
        n_skfold: Number of stratified folds per repeat.
        base_seed: Seed of the first repeat.

    Returns:
        A tuple ``(final_preds, mean_auc)``. ``final_preds`` is the average
        over all fitted models of ``2 - predict_proba(test)[:, 1]``;
        ``mean_auc`` is the mean of the per-fold validation AUCs.

    Raises:
        ValueError: If ``train_x`` and ``test_x`` do not have identical
            columns in identical order.
    """
    # Fail before any training: a mismatched feature layout would otherwise
    # surface only at test-time prediction, or silently mis-map features.
    if list(train_x.columns) != list(test_x.columns):
        raise ValueError(
            "train_x and test_x must have identical columns in the same order "
            f"(train: {train_x.shape[1]} columns, test: {test_x.shape[1]} columns)"
        )

    n_models = n_repeat * n_skfold
    final_preds = np.zeros(len(test_x))
    total_val_aucs = []

    train_x_np = train_x.to_numpy().astype(np.float32)
    test_x_np = test_x.to_numpy().astype(np.float32)

    # The context manager closes the progress bar even if a fold raises.
    with tqdm(total=n_models, desc="[TabNet] Training") as overall_pbar:
        for r in range(n_repeat):
            seed = base_seed + r
            skf = StratifiedKFold(n_splits=n_skfold, random_state=seed, shuffle=True)

            for t_idx, v_idx in skf.split(train_x_np, train_y):
                clf = TabNetClassifier(
                    n_d=32, n_a=32, n_steps=3, gamma=1.3, lambda_sparse=1e-3,
                    optimizer_fn=torch.optim.Adam, optimizer_params=dict(lr=2e-2),
                    mask_type='entmax', seed=seed, verbose=0
                )

                # NOTE(review): the validation fold passed as eval_set (with
                # patience-based stopping) is the same fold scored below, so
                # the reported CV AUC may be slightly optimistic.
                clf.fit(
                    train_x_np[t_idx], train_y[t_idx],
                    eval_set=[(train_x_np[v_idx], train_y[v_idx])],
                    max_epochs=100, patience=15, batch_size=256, virtual_batch_size=128,
                    eval_metric=['auc']
                )

                v_pred = clf.predict_proba(train_x_np[v_idx])[:, 1]
                total_val_aucs.append(roc_auc_score(train_y[v_idx], v_pred))

                # Submission score is 2 - P(column 1), averaged over all models.
                test_pred = clf.predict_proba(test_x_np)[:, 1]
                final_preds += (2. - test_pred) / n_models

                overall_pbar.update(1)
                overall_pbar.set_postfix({'AUC': f'{np.mean(total_val_aucs):.5f}'})
                # Release the fitted model before the next fold is trained.
                del clf
                gc.collect()

    return final_preds, np.mean(total_val_aucs)

# Run TabNet training: print a banner, train, then report the mean CV AUC.
print("=" * 60, "TabNet 모델 학습 시작", "=" * 60, sep="\n")
tabnet_preds, tabnet_auc = train_tabnet()
print(f"\n[TabNet] CV AUC: {tabnet_auc:.5f}\n")

# Load the 'voted' column of the previous best submission.
m1 = pd.read_csv(f"{SUB_PATH}0130-1923.csv")
m1 = m1['voted']

# Report the correlation between the previous predictions and TabNet's
# (Series.corr with its default method).
tabnet_series = pd.Series(tabnet_preds)
print(f"기존(0.78116) vs TabNet 상관계수: {m1.corr(tabnet_series):.4f}\n")

# Ensemble weight search.
# NOTE(review): `score` is a weighted average of two AUC numbers (the constant
# 0.78116 and tabnet_auc), not an AUC measured on blended predictions. Being
# linear in w, its maximum over the grid is always at one end of the grid.
print("=" * 60, "앙상블 비율 최적화", "=" * 60, sep="\n")
best_score, best_w = 0, 0.95

for w in (0.88, 0.90, 0.92, 0.93, 0.94, 0.95, 0.96):
    score = 0.78116 * w + tabnet_auc * (1-w)
    print(f"{w:.2f}:{1-w:.2f} → 예상 AUC: {score:.5f}")
    # Keep the first weight that attains the highest score.
    if not score > best_score:
        continue
    best_score, best_w = score, w

# Blend the two prediction vectors with the selected weight and write the
# result into the sample-submission layout.
blend = best_w * m1 + (1-best_w) * tabnet_preds
final_sub = pd.read_csv(f"{DATA_PATH}sample_submission.csv")
final_sub['voted'] = blend
filename = f"{SUB_PATH}Final_Ensemble_{best_w:.0%}_{1-best_w:.0%}_AUC{best_score:.5f}.csv"
final_sub.to_csv(filename, index=False)

print(f"\n✅ 최종 파일 저장: {filename}")
print(f"예상 AUC: {best_score:.5f}")

TabNet 모델 학습 시작





Early stopping occurred at epoch 23 with best_epoch = 8 and best_val_0_auc = 0.76975


[TabNet] Overall:   0%|          | 0/15 [1:46:25<?, ?it/s]



Early stopping occurred at epoch 25 with best_epoch = 10 and best_val_0_auc = 0.76263





Early stopping occurred at epoch 21 with best_epoch = 6 and best_val_0_auc = 0.75956





Early stopping occurred at epoch 24 with best_epoch = 9 and best_val_0_auc = 0.76655





Early stopping occurred at epoch 19 with best_epoch = 4 and best_val_0_auc = 0.75576





Early stopping occurred at epoch 25 with best_epoch = 10 and best_val_0_auc = 0.757





Early stopping occurred at epoch 24 with best_epoch = 9 and best_val_0_auc = 0.7676





Early stopping occurred at epoch 24 with best_epoch = 9 and best_val_0_auc = 0.76337





Early stopping occurred at epoch 21 with best_epoch = 6 and best_val_0_auc = 0.76985





Early stopping occurred at epoch 21 with best_epoch = 6 and best_val_0_auc = 0.75916





Early stopping occurred at epoch 26 with best_epoch = 11 and best_val_0_auc = 0.76056





Early stopping occurred at epoch 23 with best_epoch = 8 and best_val_0_auc = 0.76857





Early stopping occurred at epoch 24 with best_epoch = 9 and best_val_0_auc = 0.76582





Early stopping occurred at epoch 25 with best_epoch = 10 and best_val_0_auc = 0.76471





Early stopping occurred at epoch 25 with best_epoch = 10 and best_val_0_auc = 0.76214


[TabNet] Training: 100%|██████████| 15/15 [32:15<00:00, 129.06s/it, AUC=0.76354]


[TabNet] CV AUC: 0.76354

기존(0.78116) vs TabNet 상관계수: 0.9883

앙상블 비율 최적화
0.88:0.12 → 예상 AUC: 0.77905
0.90:0.10 → 예상 AUC: 0.77940
0.92:0.08 → 예상 AUC: 0.77975
0.93:0.07 → 예상 AUC: 0.77993
0.94:0.06 → 예상 AUC: 0.78010
0.95:0.05 → 예상 AUC: 0.78028
0.96:0.04 → 예상 AUC: 0.78046

✅ 최종 파일 저장: ./submissions/Final_Ensemble_96%_4%_AUC0.78046.csv
예상 AUC: 0.78046



