In [1]:
# 라이브러리 로드
import numpy as np
import pandas as pd
from pathlib import Path

# 전처리
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import VarianceThreshold

# 딥러닝
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# 하이퍼파라미터 탐색
import optuna

# 재현성 확보
def seed_everything(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 데이터 로드
BASE_DIR = Path.cwd().parent.parent
DATA_DIR = BASE_DIR / 'data:raw'

train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test_x.csv')

# 타겟 인코딩
# 투표 안 함 = 1
train_df['voted'] = train_df['voted'].map({1: 0, 2: 1})

In [4]:
# 파생변수 생성
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    ''' 
    설문 응답 데이터를 
    '정치적 태도 / 성향 / 응답 행동' 관점에서 요약한 파생변수 생성

    존재하는 칼럼만 사용
    '''
    df = df.copy()

    # 태도 성향 요약 점수
    cynicism_items = ['Qc', 'Qh', 'Qj', 'Qm', 'Qo', 'Qs']
    morality_items = ['Qf', 'Qk', 'Qr']
    kindness_items = ['Qq']

    cyn_cols = [col for col in cynicism_items if col in df.columns]
    mor_cols = [col for col in morality_items if col in df.columns]
    kin_cols = [col for col in kindness_items if col in df.columns]

    if len(cyn_cols) > 0:
        df['cynicism_score'] = df[cyn_cols].mean(axis=1)

    if len(mor_cols) > 0:
        df['morality_score'] = df[mor_cols].mean(axis=1)
    
    if len(kin_cols) > 0:
        df['kindness_score'] = df[kin_cols].mean(axis=1)

    # 응답 극단성
    q_cols = [col for col in df.columns if col.startswith('Q') and len(col) == 2]
    if len(q_cols) > 0:
        df['extreme_response_ratio'] = (
            ((df[q_cols] == 1) | (df[q_cols] == 5)).sum(axis=1)
            / len(q_cols)
        )

    # 응답 시간 행동 특징
    time_cols = [col for col in df.columns if col.startswith('Q_E')]
    if len(time_cols) > 0:
        df['mean_response_time'] = df[time_cols].mean(axis=1)
        df['std_response_time'] = df[time_cols].std(axis=1)

    # 언어/인지 수준 proxy
    wr_cols = [col for col in df.columns if col.startswith('wr_')]
    wf_cols = [col for col in df.columns if col.startswith('wf_')]

    if len(wr_cols) > 0 and len(wf_cols) > 0:
        df['real_word_ratio'] = (
            df[wr_cols].sum(axis=1)
            / (df[wr_cols + wf_cols].sum(axis=1) + 1e-6)
        )

    return df

train_df = create_features(train_df)
test_df = create_features(test_df)

In [5]:
# feature / target 분리
TARGET = 'voted'
DROP_COLS = ['index', TARGET]

X_df = train_df.drop(columns=DROP_COLS, errors='ignore')
y = train_df[TARGET].values

X_test_df = test_df.drop(columns=['index'], errors='ignore')

# 문자열 칼럼 제거
X_df = X_df.select_dtypes(include=[np.number])
X_test_df = X_test_df.select_dtypes(include=[np.number])

In [6]:
# 결측치 처리 + 분산 0 제거

# 결측치 -> 중앙값
imputer = SimpleImputer(strategy='median')
X_df = pd.DataFrame(imputer.fit_transform(X_df), columns=X_df.columns)
X_test_df = pd.DataFrame(imputer.transform(X_test_df), columns=X_df.columns)

# 분산 0 제거
selector = VarianceThreshold(threshold=0.0)
X = selector.fit_transform(X_df)
X_test = selector.transform(X_test_df)

In [7]:
# 표준화
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [11]:
# Residual MLP 정의

class ResidualBlock(nn.Module):
    def __init__(self, dim, dropout):
        super().__init__()
        self.fc1 = nn.Linear(dim, dim)
        self.bn1 = nn.BatchNorm1d(dim)
        self.fc2 = nn.Linear(dim, dim)
        self.bn2 = nn.BatchNorm1d(dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        residual = x
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.bn2(self.fc2(x))
        return torch.relu(x + residual)
    
class ResidualMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout):
        super().__init__()
        self.input_layer = nn.Linear(input_dim, hidden_dim)
        self.act = nn.ReLU()

        self.block1 = ResidualBlock(hidden_dim, dropout)
        self.block2 = ResidualBlock(hidden_dim, dropout)

        self.output_layer = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.act(self.input_layer(x))
        x = self.block1(x)
        x = self.block2(x)
        return self.output_layer(x)

In [12]:
# Optuna Objective (5-Fold CV)
def objective(trial):

    hidden_dim = trial.suggest_int('hidden_dim', 128, 512)
    dropout = trial.suggest_float('dropout', 0.2, 0.5)
    lr = trial.suggest_float('lr', 1e-4, 3e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [256, 512, 1024])

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, val_idx in skf.split(X, y):

        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        train_ds = TensorDataset(
            torch.tensor(X_tr, dtype=torch.float32),        
            torch.tensor(y_tr, dtype=torch.float32)
        )
        val_ds = TensorDataset(
            torch.tensor(X_val, dtype=torch.float32),
            torch.tensor(y_val, dtype=torch.float32)
        )

        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=1024)

        model = ResidualMLP(X.shape[1], hidden_dim, dropout)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
        criterion = nn.BCEWithLogitsLoss()

        # 학습
        for _ in range(15):
            model.train()
            for xb, yb in train_loader:
                optimizer.zero_grad()
                loss = criterion(model(xb).squeeze(), yb)
                loss.backward()
                optimizer.step()

        # 검증
        model.eval()
        preds, trues = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                preds.extend(torch.sigmoid(model(xb)).numpy())
                trues.extend(yb.numpy())

        aucs.append(roc_auc_score(trues, preds))

    return np.mean(aucs)

In [13]:
# optuna 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print('Best AUC:', study.best_value)
print('Best Params:', study.best_params)

[32m[I 2026-01-30 12:25:11,406][0m A new study created in memory with name: no-name-e08fc17c-212d-4999-bbc7-1616f71dfb4a[0m
[32m[I 2026-01-30 12:27:13,774][0m Trial 0 finished with value: 0.7257196942392881 and parameters: {'hidden_dim': 452, 'dropout': 0.4756547756409684, 'lr': 0.00021119904112992262, 'batch_size': 256}. Best is trial 0 with value: 0.7257196942392881.[0m
[32m[I 2026-01-30 12:28:05,769][0m Trial 1 finished with value: 0.7266321280368462 and parameters: {'hidden_dim': 287, 'dropout': 0.2960257696984282, 'lr': 0.00013973145910537495, 'batch_size': 1024}. Best is trial 1 with value: 0.7266321280368462.[0m
[32m[I 2026-01-30 12:29:30,433][0m Trial 2 finished with value: 0.7143249607927983 and parameters: {'hidden_dim': 454, 'dropout': 0.38655120765088646, 'lr': 0.0003868546559017641, 'batch_size': 512}. Best is trial 1 with value: 0.7266321280368462.[0m
[32m[I 2026-01-30 12:30:13,477][0m Trial 3 finished with value: 0.725822037360365 and parameters: {'hidden_d

Best AUC: 0.7316705993059268
Best Params: {'hidden_dim': 317, 'dropout': 0.4820764271022981, 'lr': 0.0003444219732361436, 'batch_size': 1024}


In [16]:
# 최종 모델 학습 + 제출 생성
best = study.best_params

model = ResidualMLP(
    X.shape[1],
    best['hidden_dim'],
    best['dropout']
)

optimizer = torch.optim.AdamW(model.parameters(), lr=best['lr'])
criterion = nn.BCEWithLogitsLoss()

train_ds = TensorDataset(
    torch.tensor(X, dtype=torch.float32),
    torch.tensor(y, dtype=torch.float32)
)
train_loader = DataLoader(train_ds, batch_size=best['batch_size'], shuffle=True)

for _ in range(30):
    for xb, yb in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(xb).squeeze(), yb)
        loss.backward()
        optimizer.step()

model.eval()
with torch.no_grad():
    test_pred = torch.sigmoid(
        model(torch.tensor(X_test, dtype=torch.float32))
    ).numpy().ravel()

submission = pd.DataFrame({
    'index': test_df.index,
    'voted': test_pred
})

submission.to_csv('submit_20260130_hana00.csv', index=False)