In [2]:
import pandas as pd
import numpy as np
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from datetime import datetime
import gc

# 1. 환경 설정 및 데이터 로드 (91개 피처 로직 완벽 유지)
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"

train_data = pd.read_csv('../../data/raw/train.csv')
test_data = pd.read_csv('../../data/raw/test_x.csv')

train_data = train_data.drop(train_data[train_data.familysize > 50].index)

drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE', 'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE', 'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

train_y = 2 - train_data['voted'].to_numpy() # voted 1->1, 2->0
train_x = train_data.drop(drop_list + ['voted'], axis=1).astype(replace_dict)
test_x = test_data.drop(drop_list, axis=1).astype(replace_dict)

train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)

# 수동 스케일링 (0.78116 성공 로직)
train_x_np = train_x.to_numpy().astype(float)
test_x_np = test_x.to_numpy().astype(float)

train_x_np[:, :20] = (train_x_np[:, :20] - 3.) / 2.
test_x_np[:, :20] = (test_x_np[:, :20] - 3.) / 2.
train_x_np[:, 20] = (train_x_np[:, 20] - 5.) / 4.
test_x_np[:, 20] = (test_x_np[:, 20] - 5.) / 4.
train_x_np[:, 21:31] = (train_x_np[:, 21:31] - 3.5) / 3.5
test_x_np[:, 21:31] = (test_x_np[:, 21:31] - 3.5) / 3.5

# 2. TabNet 설정 및 학습 (스케줄러 오류 수정)
N_SKFOLD = 5
skf = StratifiedKFold(n_splits=N_SKFOLD, shuffle=True, random_state=42)

final_predictions = np.zeros(len(test_x_np))
fold_aucs = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(train_x_np, train_y)):
    X_train, y_train = train_x_np[train_idx], train_y[train_idx]
    X_valid, y_valid = train_x_np[valid_idx], train_y[valid_idx]
    
    # TabNet 모델 정의 (StepLR 사용으로 'metrics' 에러 원천 봉쇄)
    clf = TabNetClassifier(
        n_d=32, n_a=32, n_steps=3, # n_steps를 조금 줄여 일반화 성능 강화
        gamma=1.3, lambda_sparse=1e-3,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=1e-2),
        scheduler_fn=torch.optim.lr_scheduler.StepLR, # 'metrics' 인자가 필요 없는 안전한 스케줄러
        scheduler_params=dict(step_size=10, gamma=0.9),
        mask_type='sparsemax',
        device_name=DEVICE
    )
    
    clf.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        eval_name=['valid'],
        eval_metric=['auc'],
        max_epochs=100, 
        patience=15,
        batch_size=1024, virtual_batch_size=128,
        num_workers=0,
        drop_last=False
    )
    
    valid_preds = clf.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, valid_preds)
    fold_aucs.append(auc_score)
    print(f"Fold {fold+1} Validation AUC: {auc_score:.5f}")
    
    # 0.78116 방식의 확률값 누적
    final_predictions += clf.predict_proba(test_x_np)[:, 1] / N_SKFOLD
    
    del clf; gc.collect()

# 3. 결과 출력 및 '2 - 확률' 로직 적용 제출
mean_auc = np.mean(fold_aucs)
print("\n" + "="*40)
print(f"최종 평균 Validation AUC: {mean_auc:.5f}")
print("="*40)

submission = pd.read_csv('../../data/raw/sample_submission.csv')

# [핵심] 0.78116의 '2. - sigmoid' 결과를 재현하기 위한 변환
# TabNet의 1(voted) 확률이 1에 가까울수록 결과는 1(voted)에 가깝게, 0에 가까울수록 2(not voted)에 가깝게 만듭니다.
submission['voted'] = 2.0 - final_predictions 

filename = f"./submissions/TabNet_Fixed_{datetime.now().strftime('%m%d-%H%M')}_AUC_{mean_auc:.4f}.csv"
submission.to_csv(filename, index=False)
print(f"제출 파일이 생성되었습니다: {filename}")



epoch 0  | loss: 0.73121 | valid_auc: 0.67094 |  0:00:03s




epoch 1  | loss: 0.63003 | valid_auc: 0.72127 |  0:00:06s




epoch 2  | loss: 0.60482 | valid_auc: 0.7358  |  0:00:09s




epoch 3  | loss: 0.59374 | valid_auc: 0.73998 |  0:00:11s




epoch 4  | loss: 0.58492 | valid_auc: 0.74479 |  0:00:14s




epoch 5  | loss: 0.58098 | valid_auc: 0.74702 |  0:00:17s




epoch 6  | loss: 0.57643 | valid_auc: 0.74827 |  0:00:20s




epoch 7  | loss: 0.5758  | valid_auc: 0.7536  |  0:00:23s




epoch 8  | loss: 0.57245 | valid_auc: 0.75508 |  0:00:26s




epoch 9  | loss: 0.57103 | valid_auc: 0.75543 |  0:00:29s




epoch 10 | loss: 0.56985 | valid_auc: 0.75344 |  0:00:32s




epoch 11 | loss: 0.56899 | valid_auc: 0.7563  |  0:00:35s




epoch 12 | loss: 0.56703 | valid_auc: 0.75213 |  0:00:38s




epoch 13 | loss: 0.56637 | valid_auc: 0.75618 |  0:00:41s




epoch 14 | loss: 0.56336 | valid_auc: 0.75405 |  0:00:44s




epoch 15 | loss: 0.56158 | valid_auc: 0.75685 |  0:00:47s




epoch 16 | loss: 0.55856 | valid_auc: 0.75426 |  0:00:50s




epoch 17 | loss: 0.55664 | valid_auc: 0.75666 |  0:00:53s




epoch 18 | loss: 0.55468 | valid_auc: 0.75795 |  0:00:56s




epoch 19 | loss: 0.55255 | valid_auc: 0.76093 |  0:00:59s




epoch 20 | loss: 0.5501  | valid_auc: 0.75774 |  0:01:02s




epoch 21 | loss: 0.54691 | valid_auc: 0.75851 |  0:01:05s




epoch 22 | loss: 0.54551 | valid_auc: 0.75959 |  0:01:08s




epoch 23 | loss: 0.54138 | valid_auc: 0.75762 |  0:01:10s




epoch 24 | loss: 0.5441  | valid_auc: 0.7599  |  0:01:13s




epoch 25 | loss: 0.53932 | valid_auc: 0.76048 |  0:01:16s




epoch 26 | loss: 0.53732 | valid_auc: 0.75716 |  0:01:19s




epoch 27 | loss: 0.53425 | valid_auc: 0.75838 |  0:01:22s




epoch 28 | loss: 0.53282 | valid_auc: 0.7606  |  0:01:25s




epoch 29 | loss: 0.52972 | valid_auc: 0.7584  |  0:01:28s




epoch 30 | loss: 0.52581 | valid_auc: 0.75792 |  0:01:31s




epoch 31 | loss: 0.52133 | valid_auc: 0.75453 |  0:01:34s




epoch 32 | loss: 0.52144 | valid_auc: 0.75456 |  0:01:37s




epoch 33 | loss: 0.51646 | valid_auc: 0.75162 |  0:01:40s




epoch 34 | loss: 0.51472 | valid_auc: 0.75433 |  0:01:43s

Early stopping occurred at epoch 34 with best_epoch = 19 and best_valid_auc = 0.76093




TypeError: Cannot convert a MPS Tensor to float64 dtype as the MPS framework doesn't support float64. Please use float32 instead.