In [2]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm
from rtdl_revisiting_models import FTTransformer
from autogluon.tabular import TabularPredictor

# [1] Environment setup and seed fixing
SEED = 42


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED`` in the environment and seeds Python's ``random``
    module, NumPy, and PyTorch. When the MPS backend is available its
    generator is seeded as well.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)


seed_everything(SEED)

# Use the MPS device when the backend reports it as available, else the CPU.
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Input directory for the raw CSV files and output directory for submissions.
DATA_PATH = "../../data/raw/"
SUB_PATH = "./submissions/"

# [2] Preprocessing pipeline (separate paths for the MLP and the FT model)
def get_smoothed_map(train_df, col, target_col, m=10):
    """Build a smoothed per-category mean of ``target_col``.

    Each category's mean is blended with the overall mean as
    ``(count * mean + m * overall_mean) / (count + m)``, so categories with
    few rows are pulled toward the overall mean.

    Args:
        train_df: DataFrame containing ``col`` and ``target_col``.
        col: Name of the column to group by.
        target_col: Name of the column whose mean is smoothed.
        m: Weight given to the overall mean in the blend.

    Returns:
        A ``(smooth_map, overall_mean)`` tuple: a Series indexed by the
        values of ``col`` holding the smoothed means, and the overall mean
        of ``target_col``.
    """
    prior = train_df[target_col].mean()
    per_category = train_df.groupby(col)[target_col].agg(['count', 'mean'])
    n_rows = per_category['count']
    category_mean = per_category['mean']
    numerator = n_rows * category_mean + m * prior
    denominator = n_rows + m
    return numerator / denominator, prior

def load_and_preprocess():
    """Load the raw CSVs and build the MLP and FT-Transformer feature sets.

    Reads ``train.csv`` and ``test_x.csv`` from ``DATA_PATH``, remaps the
    ``voted`` label (2 -> 1, 1 -> 0), applies the same feature engineering to
    both frames, and then derives two views of the data:

    * MLP view: the categorical columns are replaced by smoothed target
      encodings (``<col>_enc``).
    * FT view: the categorical columns are label-encoded to integer codes.

    Returns:
        An 8-tuple ``(X_mlp, X_test_mlp, ft_num_train, ft_num_test,
        ft_cat_train, ft_cat_test, y, cardinalities)`` where ``y`` is the
        remapped ``voted`` column and ``cardinalities`` lists, for each
        categorical column, the number of classes known to its label encoder.
    """
    train = pd.read_csv(DATA_PATH + 'train.csv')
    test = pd.read_csv(DATA_PATH + 'test_x.csv')
    # Remap the label values: 2 -> 1 and 1 -> 0.
    train['voted'] = train['voted'].replace({2: 1, 1: 0})

    def common_engineering(df):
        """Add the engineered columns to ``df`` in place and return it."""
        qa_cols = [f'Q{i}A' for i in 'abcdefghijklmnopqrst']
        df['mach_score'] = df[qa_cols].mean(axis=1)
        qe_cols = [f'Q{i}E' for i in 'abcdefghijklmnopqrst']
        for col in qe_cols:
            # Cap each column at its own 99th percentile, then apply log1p.
            df[col] = np.log1p(df[col].clip(upper=df[col].quantile(0.99)))
        df['familysize'] = df['familysize'].clip(upper=df['familysize'].quantile(0.99))
        # Pairwise tp-item composites; the second item of each pair enters as (7 - value).
        df['tp_Extraversion'] = (df['tp01'] + (7 - df['tp06'])) / 2
        df['tp_Agreeableness'] = ((7 - df['tp02']) + df['tp07']) / 2
        df['tp_Conscientiousness'] = (df['tp03'] + (7 - df['tp08'])) / 2
        df['tp_EmotionalStability'] = ((7 - df['tp04']) + df['tp09']) / 2
        df['tp_Openness'] = (df['tp05'] + (7 - df['tp10'])) / 2
        df['wr_total'] = df[[f'wr_{i:02d}' for i in range(1, 14)]].sum(axis=1)
        df['wf_total'] = df[[f'wf_{i:02d}' for i in range(1, 4)]].sum(axis=1)
        df['gender_val'] = df['gender'].map({'Male': 0, 'Female': 1})
        # Raw string: '\d' in a normal string literal is an invalid escape sequence.
        # Rows without digits, and rows whose extracted number is 0, become 60.
        df['age_encoded'] = df['age_group'].str.extract(r'(\d+)').astype(float).fillna(60).replace(0, 60)
        return df

    train = common_engineering(train)
    test = common_engineering(test)

    # --- MLP preprocessing (target encoding) ---
    target_enc_cols = ['race', 'religion', 'urban', 'education', 'hand', 'married', 'engnat']
    train_mlp = train.copy()
    test_mlp = test.copy()
    for col in target_enc_cols:
        # NOTE(review): the map is computed from all of `train`, including the
        # rows it is then applied to, so each training row's encoding contains
        # its own label.
        smooth_map, global_mean = get_smoothed_map(train, col, 'voted', m=10)
        # Values missing from the map (e.g. NaN categories, which groupby drops)
        # fall back to the global mean in both frames.
        train_mlp[f'{col}_enc'] = train[col].map(smooth_map).fillna(global_mean)
        test_mlp[f'{col}_enc'] = test[col].map(smooth_map).fillna(global_mean)

    # --- FT preprocessing (label encoding) ---
    cat_cols = ['race', 'religion', 'urban', 'education', 'hand', 'married', 'engnat', 'gender']
    train_ft = train.copy()
    test_ft = test.copy()
    cardinalities = []
    for col in cat_cols:
        le = LabelEncoder()
        # Fit on train + test so every test category has a code.
        combined = pd.concat([train[col].astype(str), test[col].astype(str)])
        le.fit(combined)
        train_ft[col] = le.transform(train_ft[col].astype(str))
        test_ft[col] = le.transform(test_ft[col].astype(str))
        # Size by the encoder's full class list, not by the categories seen in
        # train: a test-only category would otherwise get a code outside the
        # reported cardinality.
        cardinalities.append(len(le.classes_))

    # Columns to drop: raw items that were aggregated above, plus identifiers.
    drop_list = [f'tp{i:02d}' for i in range(1, 11)] + [f'wr_{i:02d}' for i in range(1, 14)] + \
                [f'wf_{i:02d}' for i in range(1, 4)] + ['age_group', 'index', 'gender']

    # Final MLP data: raw categoricals are removed, their `_enc` versions stay.
    X_mlp = train_mlp.drop(columns=['voted'] + drop_list + target_enc_cols, errors='ignore')
    X_test_mlp = test_mlp.drop(columns=drop_list + target_enc_cols, errors='ignore')

    # Final FT data: everything left after the drop that is not categorical is numeric.
    X_ft_all = train_ft.drop(columns=['voted'] + drop_list, errors='ignore')
    num_cols = [c for c in X_ft_all.columns if c not in cat_cols]

    return X_mlp, X_test_mlp, train_ft[num_cols], test_ft[num_cols], train_ft[cat_cols], test_ft[cat_cols], train['voted'], cardinalities

# Run the data loading / preprocessing pipeline
(
    X_mlp,
    X_test_mlp,
    ft_num,
    ft_test_num,
    ft_cat,
    ft_test_cat,
    y,
    cardinalities,
) = load_and_preprocess()

# Scaling: each scaler is fit on the training features only and then applied
# to both the training and the test features.
sc_mlp = StandardScaler().fit(X_mlp)
X_mlp_sc = sc_mlp.transform(X_mlp)
X_test_mlp_sc = sc_mlp.transform(X_test_mlp)

sc_ft = StandardScaler().fit(ft_num)
ft_num_sc = sc_ft.transform(ft_num)
ft_test_num_sc = sc_ft.transform(ft_test_num)

# [3] Out-of-fold (OOF) prediction extraction (5-fold)
# For every stratified fold a fresh MLP and a fresh FT-Transformer are trained
# on the training split. Their sigmoid outputs on the validation split fill the
# OOF arrays, and their test-set outputs are accumulated as a mean over folds.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
# OOF predictions, one entry per training row.
mlp_oof = np.zeros(len(y))
ft_oof = np.zeros(len(y))
# Fold-averaged test predictions, one entry per test row.
mlp_test_preds = np.zeros(len(X_test_mlp))
ft_test_preds = np.zeros(len(ft_test_num))

# The same row indices are used to slice both the MLP and the FT inputs.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_mlp_sc, y)):
    print(f"\nFold {fold+1}/{n_splits} 학습 시작...")
    
    # --- MLP fold training ---
    X_tr_m = torch.tensor(X_mlp_sc[train_idx], dtype=torch.float32).to(DEVICE)
    # Targets are reshaped to (N, 1) to match the model's single-logit output.
    y_tr_m = torch.tensor(y.values[train_idx], dtype=torch.float32).view(-1, 1).to(DEVICE)
    X_va_m = torch.tensor(X_mlp_sc[val_idx], dtype=torch.float32).to(DEVICE)
    
    # A new model is built per fold so no weights carry over between folds.
    mlp_model = nn.Sequential(
        nn.Linear(X_mlp_sc.shape[1], 256), nn.LeakyReLU(0.05), nn.Dropout(0.3),
        nn.Linear(256, 32), nn.ReLU(), nn.Linear(32, 1)
    ).to(DEVICE)
    
    opt_m = optim.AdamW(mlp_model.parameters(), lr=0.001)
    # The model emits raw logits; this loss applies the sigmoid internally.
    # `crit` is reused for the FT-Transformer below.
    crit = nn.BCEWithLogitsLoss()
    
    loader_m = DataLoader(TensorDataset(X_tr_m, y_tr_m), batch_size=1024, shuffle=True)
    # 25 training epochs.
    for _ in range(25):
        mlp_model.train()
        for bx, by in loader_m:
            opt_m.zero_grad(); crit(mlp_model(bx), by).backward(); opt_m.step()
    
    # Inference: eval mode and no gradient tracking.
    mlp_model.eval()
    with torch.no_grad():
        mlp_oof[val_idx] = torch.sigmoid(mlp_model(X_va_m).squeeze()).cpu().numpy()
        # Dividing by n_splits makes the accumulated value the mean over folds.
        mlp_test_preds += torch.sigmoid(mlp_model(torch.tensor(X_test_mlp_sc, dtype=torch.float32).to(DEVICE)).squeeze()).cpu().numpy() / n_splits

    # --- FT-Transformer fold training ---
    # Numeric features go in as float tensors, categorical codes as long tensors.
    X_n_tr = torch.tensor(ft_num_sc[train_idx], dtype=torch.float32).to(DEVICE)
    X_c_tr = torch.tensor(ft_cat.values[train_idx], dtype=torch.long).to(DEVICE)
    y_tr_f = torch.tensor(y.values[train_idx], dtype=torch.float32).view(-1, 1).to(DEVICE)
    X_n_va = torch.tensor(ft_num_sc[val_idx], dtype=torch.float32).to(DEVICE)
    X_c_va = torch.tensor(ft_cat.values[val_idx], dtype=torch.long).to(DEVICE)
    
    # `cardinalities` is the per-column list returned by load_and_preprocess().
    ft_model = FTTransformer(
        n_cont_features=ft_num_sc.shape[1], cat_cardinalities=cardinalities, d_out=1,
        _is_default=False, n_blocks=3, d_block=128, attention_n_heads=8,
        ffn_d_hidden_multiplier=4/3, attention_dropout=0.2, ffn_dropout=0.2, residual_dropout=0.0
    ).to(DEVICE)
    
    opt_f = optim.AdamW(ft_model.parameters(), lr=0.0005)
    loader_f = DataLoader(TensorDataset(X_n_tr, X_c_tr, y_tr_f), batch_size=512, shuffle=True)
    
    for _ in range(15): # 15 epochs to keep the run time down
        ft_model.train()
        for bn, bc, by in loader_f:
            opt_f.zero_grad(); crit(ft_model(bn, bc), by).backward(); opt_f.step()
            
    ft_model.eval()
    with torch.no_grad():
        ft_oof[val_idx] = torch.sigmoid(ft_model(X_n_va, X_c_va).squeeze()).cpu().numpy()
        # Same fold-mean accumulation as for the MLP test predictions.
        ft_test_preds += torch.sigmoid(ft_model(torch.tensor(ft_test_num_sc, dtype=torch.float32).to(DEVICE), 
                                               torch.tensor(ft_test_cat.values, dtype=torch.long).to(DEVICE)).squeeze()).cpu().numpy() / n_splits
    # Per-fold validation AUC of both models.
    print(f"   Fold {fold+1} 완료: MLP AUC={roc_auc_score(y.values[val_idx], mlp_oof[val_idx]):.4f}, FT AUC={roc_auc_score(y.values[val_idx], ft_oof[val_idx]):.4f}")



Fold 1/5 학습 시작...
   Fold 1 완료: MLP AUC=0.7688, FT AUC=0.7790

Fold 2/5 학습 시작...
   Fold 2 완료: MLP AUC=0.7621, FT AUC=0.7726

Fold 3/5 학습 시작...
   Fold 3 완료: MLP AUC=0.7556, FT AUC=0.7627

Fold 4/5 학습 시작...
   Fold 4 완료: MLP AUC=0.7534, FT AUC=0.7617

Fold 5/5 학습 시작...
   Fold 5 완료: MLP AUC=0.7593, FT AUC=0.7655


In [3]:
# [4] AutoGluon ensemble (deep learning only)
# Stacks the two custom models: their OOF predictions are the only training
# features, and their fold-averaged test predictions are the test features.
print("\n딥러닝 전용 AutoGluon 앙상블 최적화 시작...")

# Use the OOF scores produced earlier as features
train_ag = pd.DataFrame({
    'custom_mlp': mlp_oof,
    'custom_ft': ft_oof,
    'target': y.values
    })
test_ag = pd.DataFrame({
    'custom_mlp': mlp_test_preds, 
    'custom_ft': ft_test_preds
    })

# Restrict AutoGluon to deep-learning model families
deep_learning_models = {
    'NN_TORCH': {},           # PyTorch-based neural network
    'FASTAI': {},             # FastAI-based neural network
    'FT_TRANSFORMER': {},     # AutoGluon's built-in FT-Transformer
}

predictor = TabularPredictor(
    label='target', 
    eval_metric='roc_auc'
).fit(
    train_data=train_ag,
    hyperparameters=deep_learning_models, # excludes tree-based ML models
    presets='best_quality',               # enables stacking
    num_gpus=1,                           # attempt to use M1 Pro GPU acceleration
    time_limit=900,                       # about 15 minutes of search
    verbosity=2
)

# [5] Take the AUC from the first row of the ensemble leaderboard
# NOTE(review): assumes the leaderboard's first row is the best model by
# validation score - confirm against the AutoGluon leaderboard ordering.
leaderboard = predictor.leaderboard(silent=True)
best_auc = leaderboard.iloc[0]['score_val'] 
print(f"\nDeep Learning Ensemble Best AUC: {best_auc:.4f}")

# [6] Final prediction and file saving (AUC included in the file name)
# Second probability column; the log's class mapping shows class 1 = 1.
final_probs = predictor.predict_proba(test_ag).iloc[:, 1]
sample_sub = pd.read_csv(DATA_PATH + "sample_submission.csv")
sample_sub["voted"] = final_probs

# The file name marks this as the deep-learning ensemble and embeds the AUC
file_name = f"sub_DL_Ensemble_MLP_FT_AUC_{best_auc:.4f}.csv"
save_full_path = os.path.join(SUB_PATH, file_name)

# Create the output directory if needed. exist_ok=True replaces the former
# exists()-then-makedirs() pair, which could fail if the directory appeared
# between the check and the creation.
os.makedirs(SUB_PATH, exist_ok=True)

sample_sub.to_csv(save_full_path, index=False)
print(f"\n딥러닝 전용 제출 파일이 저장되었습니다: {save_full_path}")

No path specified. Models will be saved in: "AutogluonModels/ag-20260130_040248"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.6.0: Thu Sep 12 23:35:29 PDT 2024; root:xnu-10063.141.1.701.1~1/RELEASE_ARM64_T6000
CPU Count:          10
Pytorch Version:    2.9.1
CUDA Version:       CUDA is not available
Memory Avail:       2.56 GB / 16.00 GB (16.0%)
Disk Space Avail:   369.02 GB / 460.43 GB (80.1%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to 


딥러닝 전용 AutoGluon 앙상블 최적화 시작...


Beginning AutoGluon training ... Time limit = 225s
AutoGluon will save models to "/Users/admin/AI_HC/prj_01/vote-AI-1/notebooks/taehun/AutogluonModels/ag-20260130_040248/ds_sub_fit/sub_fit_ho"
Train Data Rows:    40472
Train Data Columns: 2
Label Column:       target
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2621.37 MB
	Train Data (Original)  Memory Usage: 0.62 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting Dro

KeyboardInterrupt: 