In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm
import os
import gc

# 1. Environment setup and paths (follows the author's usual project layout)
DATA_PATH = '../../data/raw/'
OOF_PATH = './oof_data/'
SUB_PATH = './submissions/'
# Use the MPS backend when torch reports it as available; otherwise fall back to CPU.
DEVICE = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

# Exp 26: currently the most trusted best-scoring result; used as the teacher
# model whose predictions generate the pseudo labels.
BEST_SUB_FILE = f'{SUB_PATH}26_m1_m7_Rank_Ensemble_0.77364.csv'

# exist_ok=True replaces the check-then-create pattern, which can raise
# FileExistsError if the directory appears between the check and the call.
os.makedirs(OOF_PATH, exist_ok=True)

def seed_everything(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch's
    global generator, and asks cuDNN for deterministic kernels.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: the file's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per run, which can pick different
    # algorithms between runs; keep it off alongside `deterministic`.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

# 2. Load the data (the feature-engineering function is defined right after)
train_df = pd.read_csv(f'{DATA_PATH}train.csv')
test_df = pd.read_csv(f'{DATA_PATH}test_x.csv')
best_sub_df = pd.read_csv(BEST_SUB_FILE)

# Answer columns: names that start with 'Q' and end with 'A'.
a_cols = [col for col in train_df.columns if col.startswith('Q') and col.endswith('A')]

# Remove outliers and careless respondents (same rule as m7): keep rows with
# familysize <= 50 whose answers are not all identical (row-wise std > 0).
train_df = train_df[
    (train_df['familysize'] <= 50) & (train_df[a_cols].std(axis=1) > 0)
]

def apply_hybrid_fe(df):
    """Build the m7 "hybrid" feature matrix from a raw survey frame.

    The input frame is copied first, so the caller's DataFrame is left
    untouched. The answer columns are derived from the frame itself (names
    starting with 'Q' and ending with 'A') rather than from a module-level
    global.

    Args:
        df: DataFrame containing the ``Q?A`` answer columns referenced below
            and the categorical columns ``education``, ``urban``, ``gender``,
            ``engnat`` and ``married``. ``voted``, ``index`` and ``hand`` are
            dropped when present.

    Returns:
        A new one-hot encoded DataFrame without the columns ending in 'E',
        without ``index``/``hand``/``voted``, and with the added
        ``Views_Score``, ``Tactics_Score``, ``Q_Var`` and ``Conflict_Index``
        columns.
    """
    df = df.copy()
    e_cols = [col for col in df.columns if col.endswith('E')]
    answer_cols = [col for col in df.columns if col.startswith('Q') and col.endswith('A')]

    # Winsorize the time columns: floor at 25, cap at the column's 99th percentile.
    # NOTE(review): these columns are dropped below, so the clipping currently
    # has no effect on the returned features -- confirm whether they were
    # meant to be kept or aggregated first.
    for col in e_cols:
        limit = df[col].quantile(0.99)
        df[col] = df[col].clip(lower=25, upper=limit)

    # [existing m7 features]
    df['Views_Score'] = df[['QcA', 'QfA', 'QgA', 'QjA', 'QlA', 'QqA']].mean(axis=1)
    df['Tactics_Score'] = df[['QaA', 'QbA', 'QdA', 'QeA', 'QiA', 'QmA', 'QnA', 'QoA', 'QsA', 'QtA']].mean(axis=1)
    df['Q_Var'] = df[answer_cols].var(axis=1)
    df['Conflict_Index'] = np.abs(df['QqA'] - df['QcA'])

    # Cast the categorical codes to strings so get_dummies one-hot encodes them
    # instead of passing them through as numeric columns.
    cat_cols = ['education', 'urban', 'gender', 'engnat', 'married']
    df[cat_cols] = df[cat_cols].astype(str)
    res = df.drop(e_cols + ['index', 'hand'], axis=1, errors='ignore')
    return pd.get_dummies(res.drop('voted', axis=1, errors='ignore'))

# 3. Pseudo-labeling strategy (select only the 5% most confident rows per side)
n_pseudo = int(len(test_df) * 0.05) # tightened from 10% to 5%
# The n_pseudo lowest teacher scores get label 1 and the n_pseudo highest get
# label 2 (see the 'voted' assignment below).
# NOTE(review): the index labels of best_sub_df are used as positions into
# test_df via .iloc, which presumably relies on best_sub_df having a default
# RangeIndex in the same row order as test_df -- confirm.
top_voted_idx = best_sub_df.nsmallest(n_pseudo, 'voted').index
top_not_voted_idx = best_sub_df.nlargest(n_pseudo, 'voted').index

# Copy the selected test rows so adding the label column does not touch test_df.
pseudo_train_x = test_df.iloc[np.concatenate([top_voted_idx, top_not_voted_idx])].copy()
pseudo_train_x['voted'] = [1] * n_pseudo + [2] * n_pseudo

# Record how many rows belong to the original train data (used later to
# compute the OOF AUC on the original rows only).
original_train_len = len(train_df)

# Original rows come first, pseudo-labeled rows are appended after them.
combined_train_df = pd.concat([train_df, pseudo_train_x], axis=0).reset_index(drop=True)
train_x = apply_hybrid_fe(combined_train_df)
test_x = apply_hybrid_fe(test_df)
# Align the test columns to the train columns; dummy columns missing from the
# test frame are filled with 0 and columns absent from train are discarded.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

# Binary target: voted == 1 becomes 1.0 and voted == 2 becomes 0.0.
train_y = (2 - combined_train_df['voted'].to_numpy()).astype(np.float32)
train_x_t = torch.tensor(train_x.to_numpy().astype(np.float32))
test_x_t = torch.tensor(test_x.to_numpy().astype(np.float32))

# Standardize with per-column statistics of the combined train tensor; the same
# statistics are reused for the test tensor. The 1e-7 guards against division
# by zero for constant columns.
mean = train_x_t.mean(dim=0); std = train_x_t.std(dim=0) + 1e-7
train_x_std = (train_x_t - mean) / std
test_x_std = (test_x_t - mean) / std

# 4. Model definition (inherits the m7 architecture)
class M7_HybridMLP(nn.Module):
    """MLP with two hidden layers that emits one raw logit per input row.

    Layout: input_dim -> 256 -> 32 -> 1. Each hidden Linear layer is followed
    by BatchNorm1d, LeakyReLU and Dropout (0.4 after the first hidden layer,
    0.2 after the second).
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = (input_dim, 256, 32)
        drop_rates = (0.4, 0.2)

        layers = []
        for fan_in, fan_out, rate in zip(widths[:-1], widths[1:], drop_rates):
            layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.LeakyReLU(),
                nn.Dropout(rate),
            ])
        # Output head: a single logit (no activation applied here).
        layers.append(nn.Linear(widths[-1], 1))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        return self.net(x)

# 5. Run the training process (5 repeats, 7 folds)
def run_exp40_process():
    """Train M7_HybridMLP with repeated stratified K-fold CV and save the results.

    Reads the module-level ``train_x_std``, ``train_y``, ``test_x_std``,
    ``original_train_len``, ``DEVICE`` and the path constants. For every
    repeat and fold a fresh model is trained for N_EPOCH epochs; the epoch
    with the highest validation AUC supplies that fold's out-of-fold (OOF)
    predictions and its test predictions. OOF predictions are averaged over
    repeats, test predictions over all repeat x fold models.

    Side effects: prints the final AUC computed on the first
    ``original_train_len`` rows, saves the OOF array as ``.npy`` under
    ``OOF_PATH`` and writes a submission CSV under ``SUB_PATH``.

    Returns:
        None.
    """
    N_REPEAT, N_SKFOLD, N_EPOCH, BATCH_SIZE = 5, 7, 50, 72
    oof_preds = np.zeros(len(train_x_std))
    test_preds = np.zeros(len(test_x_std))
    
    overall_pbar = tqdm(range(N_REPEAT), desc="Exp 40 Total Progress")
    
    for r in overall_pbar:
        # A different split seed per repeat so each repeat sees different folds.
        skf = StratifiedKFold(n_splits=N_SKFOLD, random_state=r+2026, shuffle=True)
        repeat_oof = np.zeros(len(train_x_std))
        
        # NOTE(review): folds are drawn from all of train_x_std, so validation
        # folds (and therefore best-epoch selection) presumably also contain
        # pseudo-labeled rows -- confirm this is intended.
        for f, (t_idx, v_idx) in enumerate(skf.split(train_x_std, train_y)):
            # drop_last=True discards the final incomplete training batch.
            t_loader = DataLoader(TensorDataset(train_x_std[t_idx], torch.tensor(train_y[t_idx])), 
                                  batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
            v_loader = DataLoader(TensorDataset(train_x_std[v_idx], torch.tensor(train_y[v_idx])), 
                                  batch_size=BATCH_SIZE, shuffle=False)
            
            model = M7_HybridMLP(train_x_std.shape[1]).to(DEVICE)
            optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=3e-2)
            # The model outputs raw logits; the sigmoid is applied inside the loss.
            criterion = nn.BCEWithLogitsLoss()
            
            # Best validation AUC so far and the predictions made at that epoch.
            best_auc, best_v, best_t = 0, None, None
            epoch_pbar = tqdm(range(N_EPOCH), desc=f"R{r} F{f}", leave=False)
            
            for epoch in epoch_pbar:
                model.train()
                # Accumulated for bookkeeping only; it is never read afterwards.
                train_loss = 0
                for xx, yy in t_loader:
                    optimizer.zero_grad()
                    loss = criterion(model(xx.to(DEVICE)).view(-1), yy.to(DEVICE))
                    loss.backward(); optimizer.step()
                    train_loss += loss.item()
                
                model.eval()
                v_p, v_true, v_loss = [], [], 0
                with torch.no_grad():
                    for vx, vy in v_loader:
                        out = model(vx.to(DEVICE)).view(-1)
                        v_loss += criterion(out, vy.to(DEVICE)).item()
                        v_p.extend(torch.sigmoid(out).cpu().numpy())
                        v_true.extend(vy.numpy())
                
                cur_auc = roc_auc_score(v_true, v_p)
                epoch_pbar.set_postfix({'val_loss': f'{v_loss/len(v_loader):.4f}', 'auc': f'{cur_auc:.4f}'})
                
                # On a new best validation AUC, keep this epoch's validation
                # predictions and predict the test set with the current weights.
                if cur_auc > best_auc:
                    best_auc = cur_auc
                    best_v = v_p
                    t_p = []
                    with torch.no_grad():
                        for tx in DataLoader(TensorDataset(test_x_std), batch_size=BATCH_SIZE):
                            t_p.extend(torch.sigmoid(model(tx[0].to(DEVICE)).view(-1)).cpu().numpy())
                    best_t = t_p
            
            # v_loader is not shuffled, so best_v is in the same order as v_idx.
            repeat_oof[v_idx] = best_v
            # Running mean of the test predictions over all repeat x fold models.
            test_preds += np.array(best_t) / (N_REPEAT * N_SKFOLD)
            del model; gc.collect()
            
        oof_preds += repeat_oof / N_REPEAT
        # Report the interim AUC on the original-data rows only. The factor
        # N_REPEAT/(r+1) rescales the partial sum to the mean over the repeats
        # finished so far.
        current_pure_auc = roc_auc_score(train_y[:original_train_len], oof_preds[:original_train_len] * (N_REPEAT/(r+1)))
        overall_pbar.set_postfix({'Pure_OOF_AUC': f'{current_pure_auc:.5f}'})

    # 6. Save the results
    # Final OOF AUC on the original data only, excluding the pseudo-labeled rows
    final_pure_auc = roc_auc_score(train_y[:original_train_len], oof_preds[:original_train_len])
    print(f"\n[Final Results] Pure OOF AUC (Original Data Only): {final_pure_auc:.5f}")
    
    # NOTE(review): the saved array has len(train_x_std) entries, i.e. it
    # covers every training row rather than only the first original_train_len
    # -- confirm downstream consumers expect that length.
    np.save(f"{OOF_PATH}exp40_pure_pl_5pct_AUC_{final_pure_auc:.5f}.npy", oof_preds)
    
    sub = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
    # Submit 2 minus the percentile rank of the averaged test predictions, so
    # a higher predicted score yields a lower submitted value.
    sub['voted'] = 2.0 - pd.Series(test_preds).rank(pct=True).values
    sub.to_csv(f"{SUB_PATH}40_Pure_PL_5pct_AUC_{final_pure_auc:.5f}.csv", index=False)
    print(f"Exp 40 Finished. Files saved.")

run_exp40_process()

Exp 40 Total Progress:   0%|          | 0/5 [00:00<?, ?it/s]

R0 F0:   0%|          | 0/50 [00:00<?, ?it/s]

R0 F1:   0%|          | 0/50 [00:00<?, ?it/s]

R0 F2:   0%|          | 0/50 [00:00<?, ?it/s]

R0 F3:   0%|          | 0/50 [00:00<?, ?it/s]

R0 F4:   0%|          | 0/50 [00:00<?, ?it/s]

R0 F5:   0%|          | 0/50 [00:00<?, ?it/s]

R0 F6:   0%|          | 0/50 [00:00<?, ?it/s]

R1 F0:   0%|          | 0/50 [00:00<?, ?it/s]

R1 F1:   0%|          | 0/50 [00:00<?, ?it/s]

R1 F2:   0%|          | 0/50 [00:00<?, ?it/s]

R1 F3:   0%|          | 0/50 [00:00<?, ?it/s]

R1 F4:   0%|          | 0/50 [00:00<?, ?it/s]

R1 F5:   0%|          | 0/50 [00:00<?, ?it/s]

R1 F6:   0%|          | 0/50 [00:00<?, ?it/s]

R2 F0:   0%|          | 0/50 [00:00<?, ?it/s]

R2 F1:   0%|          | 0/50 [00:00<?, ?it/s]

R2 F2:   0%|          | 0/50 [00:00<?, ?it/s]

R2 F3:   0%|          | 0/50 [00:00<?, ?it/s]

R2 F4:   0%|          | 0/50 [00:00<?, ?it/s]

R2 F5:   0%|          | 0/50 [00:00<?, ?it/s]

R2 F6:   0%|          | 0/50 [00:00<?, ?it/s]

R3 F0:   0%|          | 0/50 [00:00<?, ?it/s]

R3 F1:   0%|          | 0/50 [00:00<?, ?it/s]

R3 F2:   0%|          | 0/50 [00:00<?, ?it/s]

R3 F3:   0%|          | 0/50 [00:00<?, ?it/s]

R3 F4:   0%|          | 0/50 [00:00<?, ?it/s]

R3 F5:   0%|          | 0/50 [00:00<?, ?it/s]

R3 F6:   0%|          | 0/50 [00:00<?, ?it/s]

R4 F0:   0%|          | 0/50 [00:00<?, ?it/s]

R4 F1:   0%|          | 0/50 [00:00<?, ?it/s]

R4 F2:   0%|          | 0/50 [00:00<?, ?it/s]

R4 F3:   0%|          | 0/50 [00:00<?, ?it/s]

R4 F4:   0%|          | 0/50 [00:00<?, ?it/s]

R4 F5:   0%|          | 0/50 [00:00<?, ?it/s]

R4 F6:   0%|          | 0/50 [00:00<?, ?it/s]


[Final Results] Pure OOF AUC (Original Data Only): 0.77376
Exp 40 Finished. Files saved.
