In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm
import os
import gc

# 1. Environment setup
# Input data directory and the two output directories (out-of-fold
# predictions and submission files) used by the rest of the script.
DATA_PATH = "../../data/raw/"
OOF_PATH = "./oof_data/"
SUB_PATH = "./submissions/"
# Use the MPS backend when torch reports it as available, otherwise the CPU.
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of testing os.path.exists() before calling os.makedirs().
for p in [OOF_PATH, SUB_PATH]:
    os.makedirs(p, exist_ok=True)

def seed_everything(seed=42):
    """Seed the random number generators this script can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    torch's global generator with ``seed``, and sets the cuDNN
    ``deterministic`` flag.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Function-scope import keeps this block self-contained.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(42)

# 2. Load and clean the data
train_df = pd.read_csv(f'{DATA_PATH}train.csv')
test_df = pd.read_csv(f'{DATA_PATH}test_x.csv')

# Outlier removal: keep only training rows whose familysize is at most 50
# and whose answers to the Q*A columns are not all identical (row-wise
# standard deviation greater than zero).
a_cols = [col for col in train_df.columns if col.startswith('Q') and col.endswith('A')]
train_df = train_df[
    (train_df['familysize'] <= 50) & (train_df[a_cols].std(axis=1) > 0)
]

# [Exp 25: Refined Feature Engineering]
def apply_refined_fe(df):
    """Build the model feature frame from a raw survey frame.

    Steps, in order:
      1. Clip every column whose name ends in 'E' to the range
         [25, that column's 99th percentile].
      2. Create reverse-scored copies (``6 - value``) of the Q*A answers
         listed in ``reverse_cols``; the other Q*A answers are copied as-is.
      3. Add the aggregate features ``Views_Score``, ``Tactics_Score``,
         ``Q_Var``, ``Conflict_Index``, ``E_Mean``, ``E_Std`` and ``TP_Mean``.
      4. Drop the 'E' columns, 'index', 'hand', the temporary ``*_rev``
         columns and, when present, 'voted'.
      5. Cast the categorical columns to ``str`` and one-hot encode the
         result with ``pd.get_dummies``.

    The input frame is never modified: all work happens on a copy, so the
    caller can keep reading its original columns (e.g. 'voted') afterwards.

    Args:
        df: Raw frame containing the Q*A / Q*E columns, ``tp01``..``tp10``,
            'index', 'hand' and the categorical columns named in ``cat_cols``.

    Returns:
        A new one-hot encoded DataFrame of features.

    Raises:
        KeyError: If a column required above is missing from ``df``.
    """
    # Work on a private copy so the caller's frame is not mutated, and so
    # assigning into a filtered slice cannot raise SettingWithCopyWarning.
    df = df.copy()

    e_cols = [col for col in df.columns if col.endswith('E')]
    # Derive the answer columns from the frame itself rather than from a
    # module-level global, so the function is self-contained.
    a_cols = [col for col in df.columns if col.startswith('Q') and col.endswith('A')]
    tp_cols = [f'tp{i:02d}' for i in range(1, 11)]
    reverse_cols = ['QaA', 'QdA', 'QeA', 'QfA', 'QgA', 'QiA', 'QkA', 'QnA', 'QqA', 'QrA', 'QtA']

    # (1) Winsorize the time columns: floor at 25, cap at the 99th percentile.
    # NOTE(review): the percentile comes from whichever frame is passed in, so
    # train and test get different caps -- confirm this is intended.
    for col in e_cols:
        limit = df[col].quantile(0.99)
        df[col] = df[col].clip(lower=25, upper=limit)

    # (2) Reverse scoring and the core psychological indicators.
    for col in a_cols:
        df[col + '_rev'] = 6 - df[col] if col in reverse_cols else df[col]
    rev_a_cols = [col + '_rev' for col in a_cols]

    # [F1] Psychological sub-scales (only the validated ones are kept).
    views_cols = ['QcA_rev', 'QfA_rev', 'QgA_rev', 'QjA_rev', 'QlA_rev', 'QqA_rev']
    df['Views_Score'] = df[views_cols].mean(axis=1)
    tactics_cols = ['QaA_rev', 'QbA_rev', 'QdA_rev', 'QeA_rev', 'QiA_rev', 'QmA_rev', 'QnA_rev', 'QoA_rev', 'QsA_rev', 'QtA_rev']
    df['Tactics_Score'] = df[tactics_cols].mean(axis=1)

    # [F2] Response consistency / variance (newly added).
    df['Q_Var'] = df[a_cols].var(axis=1)  # how extreme or flat the raw answers are
    df['Conflict_Index'] = np.abs(df['QqA_rev'] - df['QcA_rev'])  # contradiction indicator

    # [F3] Summary statistics (simplified), computed on the clipped E columns.
    df['E_Mean'] = df[e_cols].mean(axis=1)
    df['E_Std'] = df[e_cols].std(axis=1)
    df['TP_Mean'] = df[tp_cols].mean(axis=1)

    # Drop columns that must not reach the model (the 20 relative E-time
    # features from earlier experiments are no longer produced).
    drop_list = e_cols + ['index', 'hand'] + rev_a_cols
    if 'voted' in df.columns:
        drop_list.append('voted')

    # Cast to str so get_dummies one-hot encodes these columns even when
    # their values are stored as integers.
    cat_cols = ['education', 'urban', 'gender', 'engnat', 'married']
    df[cat_cols] = df[cat_cols].astype(str)
    res = df.drop(drop_list, axis=1)
    return pd.get_dummies(res)

# Build the feature frames; the test frame is re-aligned to the training
# columns so both matrices share the same layout (columns missing from the
# test frame are filled with 0).
train_x = apply_refined_fe(train_df)
test_x = apply_refined_fe(test_df).reindex(columns=train_x.columns, fill_value=0)

# Preprocessing
# Target is 2 - voted as float32 (assuming voted is coded 1/2, that maps
# 1 -> 1.0 and 2 -> 0.0 -- TODO confirm against the data description).
train_y = np.subtract(2, train_df['voted'].to_numpy()).astype(np.float32)
train_x_t = torch.tensor(train_x.to_numpy().astype(np.float32))
test_x_t = torch.tensor(test_x.to_numpy().astype(np.float32))

# Standardise both matrices with the training-set statistics; the small
# epsilon keeps the division finite for constant columns.
mean = train_x_t.mean(dim=0)
std = train_x_t.std(dim=0) + 1e-7
train_x_std, test_x_std = ((t - mean) / std for t in (train_x_t, test_x_t))

# 4. m7 model definition (stable 256-32 architecture)
class M7_RefinedMLP(nn.Module):
    """MLP emitting one raw logit per input row.

    Two hidden stages, each Linear -> BatchNorm1d -> LeakyReLU -> Dropout,
    with widths 256 and 32 and dropout rates 0.4 and 0.2, followed by a
    final Linear layer down to a single output.
    """

    def __init__(self, input_dim):
        super().__init__()
        stages = []
        fan_in = input_dim
        # (hidden width, dropout probability) per stage, in forward order.
        for width, drop_p in ((256, 0.4), (32, 0.2)):
            stages.extend([
                nn.Linear(fan_in, width),
                nn.BatchNorm1d(width),
                nn.LeakyReLU(),
                nn.Dropout(drop_p),
            ])
            fan_in = width
        stages.append(nn.Linear(fan_in, 1))
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return the logits produced by the layer stack for batch ``x``."""
        return self.net(x)

# 5. Training and inference (N_REPEAT=5, N_SKFOLD=7)
def _predict_proba(model, loader):
    """Return sigmoid probabilities of ``model`` over ``loader`` as a 1-D array.

    Only the first tensor of each batch is fed to the model. The caller is
    responsible for putting the model into eval mode beforehand.
    """
    with torch.no_grad():
        chunks = [
            torch.sigmoid(model(batch[0].to(DEVICE)).view(-1)).cpu()
            for batch in loader
        ]
    return torch.cat(chunks).numpy()


def run_m7_process():
    """Train the m7 MLP with repeated stratified K-fold CV and save predictions.

    For each of N_REPEAT repetitions and N_SKFOLD folds a fresh model is
    trained for N_EPOCH epochs. After every epoch the validation AUC is
    computed; the validation and test predictions from the best-AUC epoch
    are kept. Out-of-fold predictions are averaged over repetitions, and
    test predictions over all repetition/fold models.

    Reads the module-level ``train_x_std``, ``test_x_std``, ``train_y``,
    ``DEVICE``, ``DATA_PATH``, ``OOF_PATH`` and ``SUB_PATH``.

    Side effects: prints per-fold and final AUC, writes the OOF predictions
    as ``.npy`` under ``OOF_PATH`` and a submission CSV under ``SUB_PATH``.

    NOTE(review): the epoch is selected on the same fold that is scored, so
    the reported OOF AUC is optimistic relative to a true hold-out estimate.
    """
    N_REPEAT, N_SKFOLD, N_EPOCH, BATCH_SIZE = 5, 7, 50, 72
    oof_preds = np.zeros(len(train_x_std))
    test_preds = np.zeros(len(test_x_std))

    # The test loader does not depend on the fold, so build it once.
    test_loader = DataLoader(TensorDataset(test_x_std), batch_size=BATCH_SIZE, shuffle=False)

    for r in range(N_REPEAT):
        skf = StratifiedKFold(n_splits=N_SKFOLD, random_state=100+r, shuffle=True)
        for f, (t_idx, v_idx) in enumerate(skf.split(train_x_std, train_y)):
            # drop_last=True discards the final partial training batch of each epoch.
            t_loader = DataLoader(TensorDataset(train_x_std[t_idx], torch.tensor(train_y[t_idx])), batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
            v_loader = DataLoader(TensorDataset(train_x_std[v_idx]), batch_size=BATCH_SIZE, shuffle=False)

            model = M7_RefinedMLP(train_x_std.shape[1]).to(DEVICE)
            optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=3e-2)
            criterion = nn.BCEWithLogitsLoss()

            # Start from -inf so the first epoch always records predictions;
            # with the old sentinel of 0, best_v/best_test could stay None.
            best_auc = float("-inf")
            best_v = None
            best_test = None
            for _ in range(N_EPOCH):
                model.train()
                for xx, yy in t_loader:
                    optimizer.zero_grad()
                    loss = criterion(model(xx.to(DEVICE)).view(-1), yy.to(DEVICE))
                    loss.backward()
                    optimizer.step()

                model.eval()
                v_p = _predict_proba(model, v_loader)

                auc = roc_auc_score(train_y[v_idx], v_p)
                if auc > best_auc:
                    best_auc, best_v = auc, v_p
                    # Score the test set only when the epoch improves on the best.
                    best_test = _predict_proba(model, test_loader)

            # Each row is validated once per repetition, hence / N_REPEAT;
            # every repetition/fold model contributes to the test average.
            oof_preds[v_idx] += best_v / N_REPEAT
            test_preds += best_test / (N_REPEAT * N_SKFOLD)
            print(f"R{r} F{f} | Best AUC: {best_auc:.5f}")
            del model; gc.collect()

    final_auc = roc_auc_score(train_y, oof_preds)
    print(f"\n[m7 결과] 최종 OOF AUC: {final_auc:.5f}")

    np.save(f"{OOF_PATH}exp25_m7_Refined_AUC_{final_auc:.5f}.npy", oof_preds)
    sub = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
    sub['voted'] = test_preds
    sub.to_csv(f"{SUB_PATH}m7_Refined_test_preds.csv", index=False)

run_m7_process()

R0 F0 | Best AUC: 0.76605
R0 F1 | Best AUC: 0.77416
R0 F2 | Best AUC: 0.77510
R0 F3 | Best AUC: 0.77212
R0 F4 | Best AUC: 0.77337
R0 F5 | Best AUC: 0.76896
R0 F6 | Best AUC: 0.77132
R1 F0 | Best AUC: 0.77273
R1 F1 | Best AUC: 0.77010
R1 F2 | Best AUC: 0.76857
R1 F3 | Best AUC: 0.77010
R1 F4 | Best AUC: 0.77563
R1 F5 | Best AUC: 0.77101
R1 F6 | Best AUC: 0.76976
R2 F0 | Best AUC: 0.76924
R2 F1 | Best AUC: 0.76289
R2 F2 | Best AUC: 0.76810
R2 F3 | Best AUC: 0.77868
R2 F4 | Best AUC: 0.77330
R2 F5 | Best AUC: 0.77529
R2 F6 | Best AUC: 0.77354
R3 F0 | Best AUC: 0.77465
R3 F1 | Best AUC: 0.77144
R3 F2 | Best AUC: 0.77693
R3 F3 | Best AUC: 0.76254
R3 F4 | Best AUC: 0.76899
R3 F5 | Best AUC: 0.76475
R3 F6 | Best AUC: 0.77881
R4 F0 | Best AUC: 0.76325
R4 F1 | Best AUC: 0.77562
R4 F2 | Best AUC: 0.77002
R4 F3 | Best AUC: 0.77086
R4 F4 | Best AUC: 0.77963
R4 F5 | Best AUC: 0.76409
R4 F6 | Best AUC: 0.77575

[m7 결과] 최종 OOF AUC: 0.77375
