In [17]:
# 라이브러리 로드

import numpy as np
import pandas as pd
from pathlib import Path

# 딥러닝
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# 전처리 & 평가
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import optuna

def seed_everything(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(42)

In [4]:
# 데이터 로드
BASE_DIR = Path.cwd().parent.parent
DATA_DIR = BASE_DIR / 'data:raw'

train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test_x.csv')

In [5]:
# 타겟 인코딩
# 1 -> no

TARGET = 'voted'
train_df[TARGET] = train_df[TARGET].map({1: 0, 2: 1})

In [11]:
# 파생변수 생성
# feature 요약

def safe_mean(df, cols):
    '''존재하는 칼럼만 평균'''
    valid = [col for col in cols if col in df.columns]
    if len(valid) == 0:
        return None
    return df[valid].mean(axis=1)

def create_features(df):
    df = df.copy()

    # 태도 / 도덕 요약 점수
    df['cynicism_score'] = safe_mean(df, ['Qc', 'Qh', 'Qj', 'Qm', 'Qo', 'Qs'])
    df['morality_score'] = safe_mean(df, ['Qf', 'Qk', 'Qr'])
    df['kindness_score'] = safe_mean(df, ['Qq'])

    # 응답 극단성 (정치적 확신 proxy)
    q_cols = [col for col in df.columns if col.startswith('Q') and len(col) == 2]
    if len(q_cols) > 0:
        df['extreme_response_ratio'] = (
            ((df[q_cols] == 1) | (df[q_cols] == 5)).sum(axis=1) / len(q_cols)
        )

    # 응답 시간
    time_cols = [col for col in df.columns if col.startswith('Q_E')]
    if len(time_cols) > 0:
        df['mean_response_time'] = df[time_cols].mean(axis=1)
        df['std_response_time'] = df[time_cols].std(axis=1)    

    # 응답 행동 (성실성 / 인지 부하)
    wr_cols = [col for col in df.columns if col.startswith('wr_')]
    wf_cols = [col for col in df.columns if col.startswith('wf_')]

    if wr_cols and wf_cols:
        total = df[wr_cols + wf_cols].sum(axis=1) + 1e-6
        df['real_word_ratio'] = df[wr_cols].sum(axis=1) / total
        df['fake_word_error_ratio'] = df[wf_cols].sum(axis=1) / total

    # 성격 (Big 5 요약)
    df['tp_extraversion']       = safe_mean(df, ['tp01', 'tp06'])
    df['tp_agreeableness']      = safe_mean(df, ['tp02', 'tp07'])
    df['tp_conscientiousness']  = safe_mean(df, ['tp03', 'tp08'])
    df['tp_neuroticism']        = safe_mean(df, ['tp04', 'tp09'])
    df['tp_openness']           = safe_mean(df, ['tp05', 'tp10'])

    return df

train_df = create_features(train_df)
test_df = create_features(test_df)

In [12]:
# 사용할 feature만 선별

FEATURE_CANDIDATES = [
    # Demographic
    'age_group', 'education', 'gender', 'married', 'urban',

    # Personality
    'tp_extraversion', 'tp_agreeableness',
    'tp_conscientiousness', 'tp_neuroticism', 'tp_openness',

    # Cognitive
    'real_word_ratio', 'fake_word_error_ratio',

    # Attitude
    'cynicism_score', 'morality_score',
    'kindness_score', 'extreme_response_ratio',

    # Behavior
    'mean_response_time', 'std_response_time'
]

# 실제 존재하는 칼럼만 선택
FEATURES = [col for col in FEATURE_CANDIDATES if col in train_df.columns]

print(f'사용 feature 수: {len(FEATURES)}')
print('제외된 feature:', set(FEATURE_CANDIDATES) - set(FEATURES))

사용 feature 수: 15
제외된 feature: {'std_response_time', 'extreme_response_ratio', 'mean_response_time'}


In [13]:
# 문자열 & NaN 안정
X = train_df[FEATURES].copy()
X_test = test_df[FEATURES].copy()
y = train_df['voted'].values

# 범주형 처리
X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)

# 칼럼 정렬
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# NaN 처리
X = X.fillna(X.median())
X_test = X_test.fillna(X.median())

In [14]:
# 딥러닝 모델 정의

class ResidualMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout):
        super().__init__()

        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)

        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)

        self.out = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h = torch.relu(self.bn1(self.fc1(x)))
        h = self.dropout(h)

        h2 = torch.relu(self.bn2(self.fc2(h)))
        h = h + h2 

        return self.out(h)
    

In [18]:
# Optuna 목적 함수 (AUC 기준)

def objective(trial):
    hidden_dim  = trial.suggest_int('hidden_dim', 128, 512)
    dropout     = trial.suggest_float('dropout', 0.2, 0.5)
    lr          = trial.suggest_float('lr', 1e-4, 3e-3, log=True)
    batch_size  = trial.suggest_categorical('batch_size', [256, 512, 1024])

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for tr_idx, val_idx in skf.split(X_scaled, y):
        X_tr, X_val = X_scaled[tr_idx], X_scaled[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        train_ds = TensorDataset(
            torch.tensor(X_tr, dtype=torch.float32),
            torch.tensor(y_tr, dtype=torch.float32)
        )
        val_ds = TensorDataset(
            torch.tensor(X_val, dtype=torch.float32),
            torch.tensor(y_val, dtype=torch.float32)
        )

        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=1024)

        model = ResidualMLP(X.shape[1], hidden_dim, dropout)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
        criterion = nn.BCEWithLogitsLoss()

        # 학습
        for _ in range(20):
            model.train()
            for xb, yb in train_loader:
                optimizer.zero_grad()
                loss = criterion(model(xb).squeeze(), yb)
                loss.backward()
                optimizer.step()

        # 검증
        model.eval()
        preds, trues = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                preds.extend(torch.sigmoid(model(xb)).numpy())
                trues.extend(yb.numpy())

        aucs.append(roc_auc_score(trues, preds))
    return np.mean(aucs)


In [20]:
# optuna 실행

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print('Best AUC:', study.best_value)
print('Best Params:', study.best_params)

[32m[I 2026-01-30 23:05:28,171][0m A new study created in memory with name: no-name-3be37afd-a533-45a2-927a-339668fe3d95[0m
[32m[I 2026-01-30 23:06:22,585][0m Trial 0 finished with value: 0.7416348176180304 and parameters: {'hidden_dim': 188, 'dropout': 0.21754993477613743, 'lr': 0.0006105302449322708, 'batch_size': 256}. Best is trial 0 with value: 0.7416348176180304.[0m
[32m[I 2026-01-30 23:07:07,053][0m Trial 1 finished with value: 0.7402119725663818 and parameters: {'hidden_dim': 397, 'dropout': 0.236632806629417, 'lr': 0.00033886327152569815, 'batch_size': 1024}. Best is trial 0 with value: 0.7416348176180304.[0m
[32m[I 2026-01-30 23:07:48,695][0m Trial 2 finished with value: 0.7417115485184788 and parameters: {'hidden_dim': 245, 'dropout': 0.20312860398898588, 'lr': 0.00010654099283913636, 'batch_size': 512}. Best is trial 2 with value: 0.7417115485184788.[0m
[32m[I 2026-01-30 23:08:22,271][0m Trial 3 finished with value: 0.7411124200090577 and parameters: {'hidden_

Best AUC: 0.7456019666240052
Best Params: {'hidden_dim': 183, 'dropout': 0.4469243995737826, 'lr': 0.00293543209928893, 'batch_size': 256}


In [22]:
# Best 모델로 전체 학습

best = study.best_params

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

model = ResidualMLP(
    input_dim=X.shape[1],
    hidden_dim=best['hidden_dim'],
    dropout=best['dropout']
)

optimizer = torch.optim.AdamW(model.parameters(), lr=best['lr'])
criterion = nn.BCEWithLogitsLoss()

train_ds = TensorDataset(
    torch.tensor(X_scaled, dtype=torch.float32),
    torch.tensor(y, dtype=torch.float32)
)

train_loader = DataLoader(train_ds, batch_size=best['batch_size'], shuffle=True)

for _ in range(30):
    for xb, yb in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(xb).squeeze(), yb)
        loss.backward()
        optimizer.step()

model.eval()
with torch.no_grad():
    test_pred = torch.sigmoid(
        model(torch.tensor(X_test_scaled, dtype=torch.float32))
    ).numpy().ravel()

submission = pd.DataFrame({
    'index': test_df.index,
    'voted': test_pred
})

submission.to_csv('submit_20260130_hana01.csv', index=False)