In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm
import os
import gc

# 1. Environment setup
# Input data directory and the two output directories (OOF predictions, submissions).
DATA_PATH = "../../data/raw/"
OOF_PATH = "./oof_data/"
SUB_PATH = "./submissions/"

# Pick the best available accelerator: Apple MPS first (the original target),
# then CUDA, and finally fall back to the CPU.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of an os.path.exists() guard.
for p in (OOF_PATH, SUB_PATH):
    os.makedirs(p, exist_ok=True)

def seed_everything(seed=42):
    """Seed the random number generators used by this script.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch's
    default generator, and asks cuDNN for deterministic behaviour
    (deterministic kernels on, auto-tuner off).

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: the file's import block does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False
seed_everything(42)

# 2. Load the data and clean it
train_df = pd.read_csv(f'{DATA_PATH}train.csv')
test_df = pd.read_csv(f'{DATA_PATH}test_x.csv')

# [Outlier removal - applied to the training set only]
train_df = train_df[train_df['familysize'] <= 50]  # drop rows with familysize above 50
# Answer columns: names that start with 'Q' and end with 'A'.
a_cols = [col for col in train_df.columns if col.endswith('A') and col.startswith('Q')]
# Drop respondents whose Q*A answers have zero spread (the same answer everywhere).
# The explicit .copy() gives an independent frame, so later in-place column
# assignments do not trigger pandas' SettingWithCopyWarning.
train_df = train_df[train_df[a_cols].std(axis=1) > 0].copy()

# [Deep feature-engineering function]
def apply_deep_fe(df):
    """Build the one-hot encoded feature frame from a raw survey frame.

    The input frame is left untouched; all work happens on a private copy.

    Steps:
      1. Clip every column whose name ends in 'E' to [25, 99th percentile].
         The percentile is taken from ``df`` itself, so separate calls on
         separate frames use separate upper limits.
      2. Reverse-key selected Q*A answers (``6 - x``) into temporary
         ``*_rev`` columns.
      3. Derive ``Views_Score``, ``Tactics_Score`` and ``Conflict_Index``
         from the ``*_rev`` columns.
      4. Derive per-item relative time plus ``E_Mean``, ``E_Std`` and
         ``TP_Mean``.
      5. Drop the raw 'E' columns, the helper columns, 'index', 'hand' and
         (if present) 'voted', then one-hot encode with ``pd.get_dummies``.

    Args:
        df: Frame containing the Q*A / Q*E items, tp01..tp10, 'index',
            'hand' and the categorical columns listed in ``cat_cols``.

    Returns:
        A new DataFrame of engineered features.
    """
    # Work on a copy so the caller's frame is not mutated (clipping, new
    # columns and string casts below would otherwise leak back to it).
    df = df.copy()

    e_cols = [col for col in df.columns if col.endswith('E')]
    # Derived locally so the function does not depend on a module-level list.
    a_cols = [col for col in df.columns if col.startswith('Q') and col.endswith('A')]
    tp_cols = [f'tp{i:02d}' for i in range(1, 11)]
    reverse_cols = {'QaA', 'QdA', 'QeA', 'QfA', 'QgA', 'QiA', 'QkA', 'QnA', 'QqA', 'QrA', 'QtA'}

    # (1) Winsorize the time columns: floor at 25, cap at the 99th percentile.
    for col in e_cols:
        limit = df[col].quantile(0.99)
        df[col] = df[col].clip(lower=25, upper=limit)

    # (2) Reverse scoring: reverse-keyed items become 6 - x, others are copied.
    for col in a_cols:
        if col in reverse_cols:
            df[col + '_rev'] = 6 - df[col]
        else:
            df[col + '_rev'] = df[col]
    rev_a_cols = [col + '_rev' for col in a_cols]

    # (3) Psychological sub-scales
    # Views (cynicism): Qc, Qf, Qg, Qj, Ql, Qq
    views_cols = ['QcA_rev', 'QfA_rev', 'QgA_rev', 'QjA_rev', 'QlA_rev', 'QqA_rev']
    df['Views_Score'] = df[views_cols].mean(axis=1)
    # Tactics (manipulation): Qa, Qb, Qd, Qe, Qi, Qm, Qn, Qo, Qs, Qt
    tactics_cols = ['QaA_rev', 'QbA_rev', 'QdA_rev', 'QeA_rev', 'QiA_rev',
                    'QmA_rev', 'QnA_rev', 'QoA_rev', 'QsA_rev', 'QtA_rev']
    df['Tactics_Score'] = df[tactics_cols].mean(axis=1)

    # (4) Response-contradiction indicator (Conflict Index):
    # gap between Qq ("people are good") and Qc ("do not trust") after rescoring.
    df['Conflict_Index'] = np.abs(df['QqA_rev'] - df['QcA_rev'])

    # (5) Time-share and summary statistics (computed on the clipped times).
    df['Total_E_Time'] = df[e_cols].sum(axis=1)
    for col in e_cols:
        # The epsilon guards against division by zero.
        df[f'{col}_Relative'] = df[col] / (df['Total_E_Time'] + 1e-7)

    df['E_Mean'] = df[e_cols].mean(axis=1)
    df['E_Std'] = df[e_cols].std(axis=1)
    df['TP_Mean'] = df[tp_cols].mean(axis=1)

    # Columns that must not reach the model: raw times, identifiers, helper
    # columns and, when present, the target itself.
    drop_list = e_cols + ['index', 'hand'] + rev_a_cols + ['Total_E_Time']
    if 'voted' in df.columns:
        drop_list.append('voted')

    # Cast the categorical codes to strings so get_dummies one-hot encodes
    # every code that occurs (the original note says code 0 means no response).
    cat_cols = ['education', 'urban', 'gender', 'engnat', 'married']
    df[cat_cols] = df[cat_cols].astype(str)
    res = df.drop(drop_list, axis=1)
    return pd.get_dummies(res)

# Run the feature engineering; the test frame is then aligned to the training
# column layout (columns missing from the test frame are filled with 0).
train_x = apply_deep_fe(train_df)
test_x = apply_deep_fe(test_df).reindex(columns=train_x.columns, fill_value=0)

# Pre-processing (scaling)
# Target: 2 - voted, stored as float32 (a raw 1 becomes 1.0, a raw 2 becomes 0.0).
train_y = np.asarray(2 - train_df['voted'].to_numpy()).astype(np.float32)
train_x_t = torch.tensor(train_x.to_numpy().astype(np.float32))
test_x_t = torch.tensor(test_x.to_numpy().astype(np.float32))

# Standardise both matrices with the training mean / std; the small epsilon
# keeps zero-variance columns from dividing by zero.
mean = train_x_t.mean(dim=0)
std = train_x_t.std(dim=0) + 1e-7
train_x_std = train_x_t.sub(mean).div(std)
test_x_std = test_x_t.sub(mean).div(std)

# 4. m6 model definition (288-64 hidden structure)
class M6_DeepMLP(nn.Module):
    """Two-hidden-layer MLP that emits one raw logit per input row.

    Each hidden stage is Linear -> BatchNorm1d -> LeakyReLU -> Dropout, with
    widths 288 and 64 and dropout rates 0.4 and 0.2; a final Linear maps the
    64 features to a single output.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are created in forward order so seeded initialisation is
        # the same as building them inline inside nn.Sequential.
        stages = [
            # hidden stage 1
            nn.Linear(input_dim, 288),
            nn.BatchNorm1d(288),
            nn.LeakyReLU(),
            nn.Dropout(0.4),
            # hidden stage 2
            nn.Linear(288, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            # output head (logit, no activation)
            nn.Linear(64, 1),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return the network output for ``x`` (no sigmoid applied)."""
        logits = self.net(x)
        return logits

# 5. Training process (N_REPEAT=5, N_SKFOLD=7)
def _predict_proba(model, loader):
    """Return sigmoid probabilities for every row of ``loader`` as a 1-D array.

    ``loader`` must yield batches whose first element is the feature tensor.
    The caller is responsible for putting ``model`` into eval mode.
    """
    chunks = []
    with torch.no_grad():
        for batch in loader:
            logits = model(batch[0].to(DEVICE)).view(-1)
            chunks.append(torch.sigmoid(logits).cpu())
    return torch.cat(chunks).numpy()


def run_m6_process():
    """Train M6_DeepMLP with repeated stratified K-fold CV and save the results.

    Reads the module-level ``train_x_std``, ``train_y``, ``test_x_std``,
    ``DEVICE``, ``DATA_PATH``, ``OOF_PATH`` and ``SUB_PATH``.

    For each of N_REPEAT x N_SKFOLD folds a fresh model is trained for
    N_EPOCH epochs with AdamW and BCEWithLogitsLoss. After every epoch the
    validation AUC is computed; the validation and test predictions of the
    best-AUC epoch are kept. Out-of-fold predictions are averaged over the
    repeats and test predictions over all folds of all repeats.

    Side effects: prints per-fold and final AUC, writes the OOF array as
    ``.npy`` under ``OOF_PATH`` and a submission CSV under ``SUB_PATH``.

    NOTE: the best epoch is selected on the same validation fold that is
    scored, so the printed AUC values are optimistic estimates.
    """
    N_REPEAT, N_SKFOLD, N_EPOCH, BATCH_SIZE = 5, 7, 60, 72
    oof_preds = np.zeros(len(train_x_std))
    test_preds = np.zeros(len(test_x_std))

    # The test loader does not depend on the repeat or the fold: build it once.
    test_loader = DataLoader(TensorDataset(test_x_std), batch_size=BATCH_SIZE, shuffle=False)

    for r in range(N_REPEAT):
        # A different split seed per repeat gives different fold assignments.
        skf = StratifiedKFold(n_splits=N_SKFOLD, random_state=42+r, shuffle=True)
        for f, (t_idx, v_idx) in enumerate(skf.split(train_x_std, train_y)):
            # drop_last=True discards the final partial training batch.
            t_loader = DataLoader(TensorDataset(train_x_std[t_idx], torch.tensor(train_y[t_idx])), batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
            v_loader = DataLoader(TensorDataset(train_x_std[v_idx]), batch_size=BATCH_SIZE, shuffle=False)

            model = M6_DeepMLP(train_x_std.shape[1]).to(DEVICE)
            optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=5e-2)
            criterion = nn.BCEWithLogitsLoss()

            # -inf guarantees the first epoch is always recorded as the best so far.
            best_auc = -np.inf
            best_v = None
            best_test = None
            for epoch in range(N_EPOCH):
                model.train()
                for xx, yy in t_loader:
                    optimizer.zero_grad()
                    loss = criterion(model(xx.to(DEVICE)).view(-1), yy.to(DEVICE))
                    loss.backward()
                    optimizer.step()

                model.eval()
                v_p = _predict_proba(model, v_loader)

                auc = roc_auc_score(train_y[v_idx], v_p)
                if auc > best_auc:
                    best_auc = auc
                    best_v = v_p
                    # Test predictions are taken from the same (best) epoch.
                    best_test = _predict_proba(model, test_loader)

            # Each row is validated once per repeat, hence the 1/N_REPEAT weight.
            oof_preds[v_idx] += best_v / N_REPEAT
            test_preds += best_test / (N_REPEAT * N_SKFOLD)
            print(f"R{r} F{f} | Best AUC: {best_auc:.5f}")
            # The optimizer also references the parameters; drop both before collecting.
            del model, optimizer
            gc.collect()

    final_auc = roc_auc_score(train_y, oof_preds)
    print(f"\n[m6 결과] 최종 OOF AUC: {final_auc:.5f}")

    # Save the OOF predictions and the submission file.
    np.save(f"{OOF_PATH}exp24_m6_DFE_AUC_{final_auc:.5f}.npy", oof_preds)
    sub = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
    # NOTE(review): test_preds is the probability of the positive training
    # target (built as 2 - voted) — confirm the submission expects this orientation.
    sub['voted'] = test_preds
    sub.to_csv(f"{SUB_PATH}m6_DeepFE_test_preds.csv", index=False)

# Run the full repeated-CV training, which also writes the OOF array and the submission CSV.
run_m6_process()

R0 F0 | Best AUC: 0.77104
R0 F1 | Best AUC: 0.77603
R0 F2 | Best AUC: 0.76782
R0 F3 | Best AUC: 0.76954
R0 F4 | Best AUC: 0.77329
R0 F5 | Best AUC: 0.76752
R0 F6 | Best AUC: 0.76685
R1 F0 | Best AUC: 0.77210
R1 F1 | Best AUC: 0.76549
R1 F2 | Best AUC: 0.76986
R1 F3 | Best AUC: 0.77528
R1 F4 | Best AUC: 0.76782
R1 F5 | Best AUC: 0.77179
R1 F6 | Best AUC: 0.76736
R2 F0 | Best AUC: 0.77167
R2 F1 | Best AUC: 0.76841
R2 F2 | Best AUC: 0.76706
R2 F3 | Best AUC: 0.75898
R2 F4 | Best AUC: 0.77405
R2 F5 | Best AUC: 0.77495
R2 F6 | Best AUC: 0.77072
R3 F0 | Best AUC: 0.76412
R3 F1 | Best AUC: 0.77162
R3 F2 | Best AUC: 0.76796
R3 F3 | Best AUC: 0.77519
R3 F4 | Best AUC: 0.77166
R3 F5 | Best AUC: 0.77191
R3 F6 | Best AUC: 0.76963
R4 F0 | Best AUC: 0.77113
R4 F1 | Best AUC: 0.76769
R4 F2 | Best AUC: 0.76711
R4 F3 | Best AUC: 0.76732
R4 F4 | Best AUC: 0.77756
R4 F5 | Best AUC: 0.76647
R4 F6 | Best AUC: 0.77230

[m6 결과] 최종 OOF AUC: 0.77275
