In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

# --- [1] Paths and environment setup ---
# Directory the raw CSV files (train, test, sample submission) are read from.
DATA_PATH = '../../data/raw/'
# Directory the out-of-fold prediction array is saved to.
OOF_PATH = './oof_data/'
# Directory the submission CSV is written to.
SUB_PATH = './submissions/'

# Create both output directories up front; exist_ok=True makes re-running harmless.
os.makedirs(OOF_PATH, exist_ok=True)
os.makedirs(SUB_PATH, exist_ok=True)

def seed_everything(seed=42):
    """Seed every random number generator this script can touch.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and asks cuDNN for deterministic kernels so repeated runs are comparable.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    # Local import: `random` is not among the file's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every GPU, not just the current one, and is a
    # silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

# Seed all generators before any fold split or model is created.
seed_everything()

# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
# `torch.backends.mps` does not exist on older PyTorch builds, so look it up
# defensively instead of letting an AttributeError abort the script.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

# --- [2] Feature preparation that keeps the m7 recipe ---
def get_m7_final_features(data_path=None):
    """Load the raw CSVs and build the train/test feature matrices.

    Training rows are dropped when ``familysize`` exceeds 50 or when all
    twenty ``Q?A`` answers are identical (zero standard deviation).  Train and
    test are then stacked so the derived columns and their min-max scaling
    are computed from the same statistics.

    Args:
        data_path: Directory holding ``train.csv`` and ``test_x.csv``.
            Defaults to the module-level ``DATA_PATH``.

    Returns:
        A tuple ``(X_train, y, X_test)``.  ``X_train`` and ``X_test`` are
        NumPy arrays with 24 columns (the twenty ``Q?A`` answers followed by
        ``familysize``, ``age_numeric``, ``Q_Var`` and ``mach_score``).  ``y``
        is a float32 array equal to ``2 - voted``.
    """
    if data_path is None:
        data_path = DATA_PATH

    train = pd.read_csv(os.path.join(data_path, 'train.csv'))
    test = pd.read_csv(os.path.join(data_path, 'test_x.csv'))
    a_cols = [f'Q{i}A' for i in 'abcdefghijklmnopqrst']

    # m7 row filter: drop implausible family sizes and flat answer patterns.
    # NOTE(review): the filter is applied to train only, so larger familysize
    # values in test still widen the scaling range below — confirm intended.
    keep = (train.familysize <= 50) & (train[a_cols].std(axis=1) > 0)
    train = train[keep].reset_index(drop=True)
    # presumably `voted` is coded 1/2, making y 1/0 — verify against the data description
    y = (2 - train['voted'].to_numpy()).astype(np.float32)

    combined = pd.concat([train.drop('voted', axis=1), test], axis=0).reset_index(drop=True)

    # 1. Core m7 features computed on the raw answers.
    combined['Q_Var'] = combined[a_cols].var(axis=1)
    combined['mach_score'] = combined[a_cols].mean(axis=1)

    # 2. The single feature borrowed from m1: the leading number of age_group.
    #    Raw string avoids the invalid-escape warning; expand=False yields a
    #    Series rather than a one-column DataFrame.  Unparseable rows get 30.
    combined['age_numeric'] = (
        combined['age_group'].str.extract(r'(\d+)', expand=False).astype(float).fillna(30)
    )

    # Min-max scale the derived/large columns into [0, 5] so they sit on the
    # same footing as the Q?A answers (which the original author notes are 1-5).
    big_cols = ['familysize', 'age_numeric', 'Q_Var', 'mach_score']
    for col in big_cols:
        c_min, c_max = combined[col].min(), combined[col].max()
        span = c_max - c_min
        if span > 0:
            combined[col] = (combined[col] - c_min) / span * 5
        else:
            # A constant column would divide by zero and become all-NaN;
            # it carries no information, so map it to 0 instead.
            combined[col] = 0.0

    features = a_cols + big_cols
    n_train = len(train)
    return (
        combined.iloc[:n_train][features].values,
        y,
        combined.iloc[n_train:][features].values,
    )

# --- [3] Frozen m7 architecture (256-128-64 ReLU) ---
class M7FinalNet(nn.Module):
    """Fully connected network that emits one raw logit per input row.

    The stack is three ``Linear -> BatchNorm1d -> ReLU`` stages of width 256,
    128 and 64, with ``Dropout(0.2)`` after the first two stages, followed by
    a final ``Linear`` down to a single output.  No sigmoid is applied here.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_widths = (256, 128, 64)
        last_stage = len(hidden_widths) - 1

        # Modules are appended in the same order as a hand-written Sequential,
        # so parameter names (net.0, net.1, ...) and initialisation order match.
        stack = []
        fan_in = input_dim
        for stage, width in enumerate(hidden_widths):
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.BatchNorm1d(width))
            stack.append(nn.ReLU())
            if stage != last_stage:
                stack.append(nn.Dropout(0.2))
            fan_in = width
        stack.append(nn.Linear(fan_in, 1))

        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the stack and return the logits."""
        logits = self.net(x)
        return logits

# --- [4] Training loop ---
# Single source of truth for the fold count: it is used both to build the
# splitter and to average the per-fold test predictions.
N_SPLITS = 7
# Each "epoch" below is one full-batch gradient step on the training fold.
N_EPOCHS = 80

X, y, X_test = get_m7_final_features()
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
oof = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

X_test_tensor = torch.FloatTensor(X_test).to(DEVICE)

for fold, (t_idx, v_idx) in enumerate(skf.split(X, y)):
    X_t, X_v = torch.FloatTensor(X[t_idx]).to(DEVICE), torch.FloatTensor(X[v_idx]).to(DEVICE)
    # Training targets go to the device as a column; validation targets stay
    # in NumPy because they are only consumed by roc_auc_score.
    y_t, y_v = torch.FloatTensor(y[t_idx]).view(-1, 1).to(DEVICE), y[v_idx]

    model = M7FinalNet(X.shape[1]).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCEWithLogitsLoss()

    # Start below any attainable AUC and reset the per-fold test predictions,
    # so the first epoch always records a snapshot and a fold can never reuse
    # the previous fold's test predictions.
    best_fold_auc = float('-inf')
    test_probs = None
    pbar = tqdm(range(N_EPOCHS), desc=f'Fold {fold+1}')

    for epoch in pbar:
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(X_t), y_t)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            # reshape(-1) keeps a 1-D array even for a single-row fold,
            # where squeeze() would collapse to a 0-d scalar.
            v_probs = torch.sigmoid(model(X_v)).reshape(-1).cpu().numpy()
            auc = roc_auc_score(y_v, v_probs)
            # NOTE: the best epoch is chosen on the same validation fold that
            # is scored, so the reported OOF AUC is optimistic.
            if auc > best_fold_auc:
                best_fold_auc = auc
                oof[v_idx] = v_probs
                test_probs = torch.sigmoid(model(X_test_tensor)).reshape(-1).cpu().numpy()

        pbar.set_postfix(Loss=f'{loss.item():.4f}', AUC=f'{auc:.4f}', Best=f'{best_fold_auc:.4f}')

    # Average the best-epoch test predictions across folds.
    test_preds += test_probs / N_SPLITS
    print(f'Fold {fold+1} Best: {best_fold_auc:.5f}')

# --- [5] Save ---
total_auc = roc_auc_score(y, oof)
print(f'\n[Final Result] m7_final OOF AUC: {total_auc:.5f}')

np.save(f'{OOF_PATH}exp35_m7_final_AUC_{total_auc:.5f}.npy', oof)
sub = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
# Only the ordering of predictions is kept: percentile ranks in (0, 1] are
# flipped into [1, 2), mirroring the y = 2 - voted transform used for training.
sub['voted'] = 2.0 - pd.Series(test_preds).rank(pct=True).values
sub.to_csv(f'{SUB_PATH}exp35_m7_final_sub_{total_auc:.5f}.csv', index=False)

Fold 1:   0%|          | 0/80 [00:00<?, ?it/s]

Fold 1 Best: 0.72558


Fold 2:   0%|          | 0/80 [00:00<?, ?it/s]

Fold 2 Best: 0.72708


Fold 3:   0%|          | 0/80 [00:00<?, ?it/s]

KeyboardInterrupt: 