In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm  # tqdm 적용
from datetime import datetime
import gc
import os

# 1. Paths and runtime environment
DATA_PATH = "../../data/raw/"
SUB_PATH = "./submissions/"
# Use the MPS backend when PyTorch reports it available, otherwise fall back to CPU.
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(SUB_PATH, exist_ok=True)

def seed_everything(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch, and
    requests deterministic cuDNN kernels.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Function-scope import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # cuDNN-specific flag; it is only relevant when the code runs on a CUDA device.
    torch.backends.cudnn.deterministic = True


seed_everything(42)

# 2. Data preprocessing
# Columns removed before modelling: the twenty Q?E columns plus 'index' and 'hand'.
drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE', 'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE', 'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']
# These columns are cast to str so that pd.get_dummies one-hot encodes them.
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

train_data = pd.read_csv(f'{DATA_PATH}train.csv')
test_data = pd.read_csv(f'{DATA_PATH}test_x.csv')
# Remove training rows whose familysize exceeds 50.
train_data = train_data.drop(train_data[train_data.familysize > 50].index)

# Target is 2 - voted (presumably voted is coded 1/2, giving 1.0/0.0 -- confirm against the data).
train_y = (2 - train_data['voted'].to_numpy()).astype(np.float32)
train_x = train_data.drop(drop_list + ['voted'], axis=1).astype(replace_dict)
test_x = test_data.drop(drop_list, axis=1).astype(replace_dict)

train_x = pd.get_dummies(train_x)
# Encode test separately, then force it onto the training column set and order.
# Without this, a category that appears in only one split would shift or change
# the number of feature columns, silently misaligning the positional scaling
# below and the model's input layer. Categories unseen in test become all-zero.
test_x = pd.get_dummies(test_x).reindex(columns=train_x.columns, fill_value=0)

train_x_t = torch.tensor(train_x.to_numpy().astype(np.float32), dtype=torch.float32)
train_y_t = torch.tensor(train_y, dtype=torch.float32)
test_x_t = torch.tensor(test_x.to_numpy().astype(np.float32), dtype=torch.float32)

# Manual scaling by column position (identical constants for train and test).
# NOTE(review): the offsets/divisors assume columns 0-19, 20 and 21-30 hold
# features with ranges of roughly 1-5, 1-9 and 0-7 respectively -- confirm
# against the column order produced by get_dummies.
train_x_t[:, :20] = (train_x_t[:, :20] - 3.) / 2.
test_x_t[:, :20] = (test_x_t[:, :20] - 3.) / 2.
train_x_t[:, 20] = (train_x_t[:, 20] - 5.) / 4.
test_x_t[:, 20] = (test_x_t[:, 20] - 5.) / 4.
train_x_t[:, 21:31] = (train_x_t[:, 21:31] - 3.5) / 3.5
test_x_t[:, 21:31] = (test_x_t[:, 21:31] - 3.5) / 3.5

# 3. Focal loss and SNN model definitions
class FocalLoss(nn.Module):
    """Focal-style loss computed on raw logits for binary targets.

    The per-element loss is ``alpha * (1 - p_t) ** gamma * BCE`` with
    ``p_t = exp(-BCE)``, averaged over all elements. ``alpha`` is a single
    scalar applied to every element regardless of its target value.

    Args:
        alpha: Scalar multiplier applied to every element's loss.
        gamma: Exponent of the ``(1 - p_t)`` modulating factor.
    """

    def __init__(self, alpha=0.3, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the scalar mean focal loss of logits ``inputs`` against ``targets``."""
        per_elem_bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        # exp(-BCE) recovers the probability the model assigns to the true label.
        prob_true = torch.exp(-per_elem_bce)
        focal_weight = self.alpha * torch.pow(1 - prob_true, self.gamma)
        return (focal_weight * per_elem_bce).mean()

class SNNModel(nn.Module):
    """SELU-activated MLP: ``input_dim -> 256 -> 128 -> 64 -> 1``.

    AlphaDropout (p=0.1, then p=0.05) follows the first two hidden activations.
    The forward pass returns one raw logit per input row (no sigmoid applied).

    Args:
        input_dim: Number of input features.
    """

    def __init__(self, input_dim=91):
        super().__init__()
        stack = [
            nn.Linear(input_dim, 256), nn.SELU(), nn.AlphaDropout(0.1),
            nn.Linear(256, 128), nn.SELU(), nn.AlphaDropout(0.05),
            nn.Linear(128, 64), nn.SELU(),
            nn.Linear(64, 1),
        ]
        self.net = nn.Sequential(*stack)
        # Re-draw every Linear weight with fan-in scaling and unit ('linear') gain;
        # biases keep nn.Linear's default initialisation.
        for layer in self.net:
            if isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='linear')

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the logits."""
        return self.net(x)

# 4. Training routine
def train_snn_focal_with_tqdm(n_repeat=5, n_splits=7, n_epochs=60, batch_size=128,
                              lr=1e-3, weight_decay=1e-2):
    """Train SNN + focal-loss models with repeated stratified K-fold CV.

    For every (repeat, fold) pair a fresh ``SNNModel`` is trained for
    ``n_epochs`` epochs with AdamW. After each epoch the validation ROC-AUC is
    computed; whenever it improves, the test-set predictions of that epoch are
    snapshotted. Each fold's best snapshot contributes equally to the returned
    average.

    Reads the module-level ``train_x_t``, ``train_y_t``, ``test_x_t`` and
    ``DEVICE``. The model's input width is taken from ``train_x_t.shape[1]``.

    Args:
        n_repeat: Number of times the K-fold split is repeated (seeds 42, 43, ...).
        n_splits: Number of stratified folds per repeat.
        n_epochs: Training epochs per fold.
        batch_size: Mini-batch size for both training and validation loaders.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.

    Returns:
        tuple: ``(final_preds, mean_auc)`` where ``final_preds`` is a float64
        array of shape ``(len(test_x_t), 1)`` holding the mean of
        ``2 - sigmoid(logit)`` over all folds, and ``mean_auc`` is the mean of
        each fold's best validation AUC.

    Raises:
        RuntimeError: If a fold finishes without any test-prediction snapshot
            (e.g. ``n_epochs == 0``).

    Note:
        The best epoch is selected on the same validation fold whose AUC is
        reported, so ``mean_auc`` is an optimistic estimate.
    """
    n_features = train_x_t.shape[1]  # derived from the data instead of a hard-coded 91
    n_models = n_repeat * n_splits
    final_preds = np.zeros((len(test_x_t), 1))
    total_val_aucs = []

    # Loop-invariant work: move the test set to the device once, build the loss once.
    test_x_dev = test_x_t.to(DEVICE)
    criterion = FocalLoss(alpha=0.3, gamma=2.0)

    # Context manager guarantees the bar is closed even if training raises.
    with tqdm(total=n_models, desc="Overall Progress") as overall_pbar:
        for r in range(n_repeat):
            skf = StratifiedKFold(n_splits=n_splits, random_state=r + 42, shuffle=True)

            for f, (t_idx, v_idx) in enumerate(skf.split(train_x_t, train_y_t)):
                t_loader = DataLoader(TensorDataset(train_x_t[t_idx], train_y_t[t_idx]),
                                      batch_size=batch_size, shuffle=True)
                v_loader = DataLoader(TensorDataset(train_x_t[v_idx], train_y_t[v_idx]),
                                      batch_size=batch_size, shuffle=False)
                v_labels = train_y_t[v_idx].numpy()

                model = SNNModel(n_features).to(DEVICE)
                optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

                best_auc = -np.inf
                best_p = None

                # Per-fold epoch progress (inner bar, removed when the fold finishes).
                epoch_pbar = tqdm(range(n_epochs), desc=f"R{r+1} F{f+1}", leave=False)
                for _ in epoch_pbar:
                    model.train()
                    for xx, yy in t_loader:
                        optimizer.zero_grad()
                        loss = criterion(model(xx.to(DEVICE)).view(-1), yy.to(DEVICE))
                        loss.backward()
                        optimizer.step()

                    model.eval()
                    with torch.no_grad():
                        v_preds = np.concatenate([
                            torch.sigmoid(model(vx.to(DEVICE)).view(-1)).cpu().numpy()
                            for vx, _ in v_loader
                        ])
                        auc = roc_auc_score(v_labels, v_preds)
                        if auc > best_auc:
                            best_auc = auc
                            # Snapshot test predictions as 2 - P(y=1), mirroring the
                            # 2 - voted transform applied to the training target.
                            best_p = (2. - torch.sigmoid(model(test_x_dev).view(-1, 1))).cpu().numpy()

                    epoch_pbar.set_postfix({'AUC': f'{auc:.4f}', 'Best': f'{best_auc:.4f}'})

                if best_p is None:
                    raise RuntimeError(
                        f"No test predictions were captured for repeat {r+1}, fold {f+1}; "
                        "check n_epochs and the validation AUC."
                    )

                total_val_aucs.append(best_auc)
                final_preds += best_p / n_models

                overall_pbar.update(1)
                overall_pbar.set_postfix({'Mean_AUC': f'{np.mean(total_val_aucs):.5f}'})

                # Drop the fold's model before building the next one.
                del model
                gc.collect()

    return final_preds, np.mean(total_val_aucs)

# Run training and persist the results
print(">>> Diversity Model Training Start (Focal Loss + SNN)")
snn_preds, mean_auc = train_snn_focal_with_tqdm()

print(f"\n[Final Results] Mean Validation AUC: {mean_auc:.5f}")

# Save this model's own submission; the file name embeds the AUC rounded to 4 decimals.
filename = f"{SUB_PATH}Diversity_Focal_SNN_AUC_{mean_auc:.4f}.csv"
sub = pd.read_csv(f"{DATA_PATH}sample_submission.csv")
sub['voted'] = snn_preds
sub.to_csv(filename, index=False)

# Blend immediately only when the mean validation AUC reaches 0.779.
if not (mean_auc >= 0.779):
    print(f"기준 미달로 앙상블을 중단합니다. (현재 AUC: {mean_auc:.5f})")
else:
    print("조건 충족(>=0.779). 즉시 앙상블을 수행합니다.")
    try:
        # Best-scoring submission (0.78116) weighted 0.7, this Focal Loss + SNN model weighted 0.3.
        m1_df = pd.read_csv(f"{SUB_PATH}0130-1923.csv")
        base_part = m1_df['voted'] * 0.7
        snn_part = sub['voted'] * 0.3
        ensemble_voted = base_part + snn_part

        final_sub = pd.read_csv(f"{DATA_PATH}sample_submission.csv")
        final_sub['voted'] = ensemble_voted
        ensemble_name = f"{SUB_PATH}Diversity_Ensemble_73.csv"
        final_sub.to_csv(ensemble_name, index=False)
        print(f"앙상블 완료: {ensemble_name}")
    except Exception as e:
        # Best effort: report the failure (e.g. missing base file) without stopping the cell.
        print(f"앙상블 실패 (파일 확인 필요): {e}")

>>> Diversity Model Training Start (Focal Loss + SNN)


Overall Progress:   0%|          | 0/35 [00:00<?, ?it/s]

R1 F1:   0%|          | 0/60 [00:00<?, ?it/s]

R1 F2:   0%|          | 0/60 [00:00<?, ?it/s]

R1 F3:   0%|          | 0/60 [00:00<?, ?it/s]

R1 F4:   0%|          | 0/60 [00:00<?, ?it/s]

R1 F5:   0%|          | 0/60 [00:00<?, ?it/s]

R1 F6:   0%|          | 0/60 [00:00<?, ?it/s]

R1 F7:   0%|          | 0/60 [00:00<?, ?it/s]

R2 F1:   0%|          | 0/60 [00:00<?, ?it/s]

R2 F2:   0%|          | 0/60 [00:00<?, ?it/s]

R2 F3:   0%|          | 0/60 [00:00<?, ?it/s]

R2 F4:   0%|          | 0/60 [00:00<?, ?it/s]

R2 F5:   0%|          | 0/60 [00:00<?, ?it/s]

R2 F6:   0%|          | 0/60 [00:00<?, ?it/s]

R2 F7:   0%|          | 0/60 [00:00<?, ?it/s]

R3 F1:   0%|          | 0/60 [00:00<?, ?it/s]

R3 F2:   0%|          | 0/60 [00:00<?, ?it/s]

R3 F3:   0%|          | 0/60 [00:00<?, ?it/s]

R3 F4:   0%|          | 0/60 [00:00<?, ?it/s]

R3 F5:   0%|          | 0/60 [00:00<?, ?it/s]

R3 F6:   0%|          | 0/60 [00:00<?, ?it/s]

R3 F7:   0%|          | 0/60 [00:00<?, ?it/s]

R4 F1:   0%|          | 0/60 [00:00<?, ?it/s]

R4 F2:   0%|          | 0/60 [00:00<?, ?it/s]

R4 F3:   0%|          | 0/60 [00:00<?, ?it/s]

R4 F4:   0%|          | 0/60 [00:00<?, ?it/s]

R4 F5:   0%|          | 0/60 [00:00<?, ?it/s]

R4 F6:   0%|          | 0/60 [00:00<?, ?it/s]

R4 F7:   0%|          | 0/60 [00:00<?, ?it/s]

R5 F1:   0%|          | 0/60 [00:00<?, ?it/s]

R5 F2:   0%|          | 0/60 [00:00<?, ?it/s]

R5 F3:   0%|          | 0/60 [00:00<?, ?it/s]

R5 F4:   0%|          | 0/60 [00:00<?, ?it/s]

R5 F5:   0%|          | 0/60 [00:00<?, ?it/s]

R5 F6:   0%|          | 0/60 [00:00<?, ?it/s]

R5 F7:   0%|          | 0/60 [00:00<?, ?it/s]


[Final Results] Mean Validation AUC: 0.77077
기준 미달로 앙상블을 중단합니다. (현재 AUC: 0.77077)


In [2]:
# Correlation between the 0.78116 model's predictions and the 0.77077 SNN model's predictions
m1_path = f"{SUB_PATH}0130-1923.csv"
m_snn_path = f"{SUB_PATH}Diversity_Focal_SNN_AUC_0.7708.csv"
m1 = pd.read_csv(m1_path)['voted']
m_snn = pd.read_csv(m_snn_path)['voted']

pred_corr = m1.corr(m_snn)
print(f"상관계수: {pred_corr:.4f}")

상관계수: 0.9751


In [4]:
# 1. Load both submission files
m1 = pd.read_csv(f"{SUB_PATH}0130-1923.csv")['voted']
m_snn = pd.read_csv(f"{SUB_PATH}Diversity_Focal_SNN_AUC_0.7708.csv")['voted']

# 2. Very conservative weights (0.92 : 0.08)
# Every row is a weighted average; the SNN model contributes only 8%.
base_share = m1 * 0.92
snn_share = m_snn * 0.08
ensemble_voted = base_share + snn_share

# 3. Save the blended submission
final_name = f"{SUB_PATH}Experimental_Blend_92_08.csv"
final_sub = pd.read_csv(f"{DATA_PATH}sample_submission.csv")
final_sub['voted'] = ensemble_voted
final_sub.to_csv(final_name, index=False)

print(f"앙상블 완료: {final_name}")

앙상블 완료: ./submissions/Experimental_Blend_92_08.csv
