In [4]:
# 딥러닝 + 머신러닝 모델로 시도...
# 라이브러리 불러오기
import numpy as np
import pandas as pd
from pathlib import Path

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# 데이터 경로
BASE_DIR = Path.cwd().parent.parent
DATA_DIR = BASE_DIR / 'data:raw'

In [31]:
# 데이터 불러오기
train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test_x.csv')

# 타겟 설정 인코딩
TARGET = 'voted'
train_df[TARGET] = train_df['voted'].map({1: 0, 2: 1})

In [32]:
# 핵심 파생변수 생성
def create_features(df):
    '''
    설문 응답 기반 행동/태도 파생변수 생성 함수

    - 실제 존재하는 컬럼만 사용하여 KeyError 방지
    - train / test 동일 로직 적용 (데이터 누수 방지)
    '''
    df = df.copy()

# 태도 / 도덕 성향 점수
    cynicism_items = ['Qc', 'Qh', 'Qj', 'Qm', 'Qo', 'Qs']
    morality_items = ['Qf', 'Qk', 'Qr']
    kindness_items = ['Qq']

    cynicism_cols = [col for col in df.columns if col in cynicism_items]
    morality_cols = [col for col in df.columns if col in morality_items]
    kindness_cols = [col for col in df.columns if col in kindness_items]

    if len(cynicism_cols) > 0:
        df['cynicism_score'] = df[cynicism_cols].mean(axis=1)

    if len(morality_cols) > 0:
        df['morality_score'] = df[morality_cols].mean(axis=1)

    if len(kindness_cols) > 0:
        df['kindness_score'] = df[kindness_cols].mean(axis=1)

    # 응답 시간 기반 '성실성' 지표
    time_cols = [col for col in df.columns if col.startswith('Q_E')]

    if len(time_cols) > 0:
        df['mean_response_time'] = df[time_cols].mean(axis=1)
        df['std_response_time'] = df[time_cols].std(axis=1)
        df['long_response_ratio'] = (
            df[time_cols] 
            > df[time_cols].mean(axis=1).values.reshape(-1, 1)
        ). mean(axis=1)

    # 언어/인지 수준 proxy
    wr_cols = [col for col in df.columns if col.startswith('wr_')]
    wf_cols = [col for col in df.columns if col.startswith('wf_')]

    if len(wr_cols) > 0:
        df['real_word_ratio'] = (
            df[wr_cols].sum(axis=1) 
            / (df[wr_cols + wf_cols].sum(axis=1) + 1e-6)
        )
         
    return df

In [33]:
train_df = create_features(train_df)
test_df = create_features(test_df)

In [34]:
# 사용할 Feature 선택
DROP_COLS = ['index', 'voted']
FEATURES = [col for col in train_df.columns if col not in DROP_COLS]

X = train_df[FEATURES]
y = train_df['voted']
X_test = test_df[FEATURES]

In [35]:
# LightGBM 전용 칼럼 
lgb_cat_cols = ['age_group', 'gender', 'race', 'religion']

In [36]:
from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False
)

# 범주형 분리
X_cat_train = train_df[lgb_cat_cols]
X_cat_test = test_df[lgb_cat_cols]

# 수치형 분리
X_num_train = train_df.drop(columns=lgb_cat_cols + ['voted'])
X_num_test = test_df.drop(columns=lgb_cat_cols)

# 원핫인코딩
X_cat_train_onehot = onehot.fit_transform(X_cat_train)
X_cat_test_onehot = onehot.transform(X_cat_test)

# 다시 병합
X = np.hstack([X_num_train.values, X_cat_train_onehot])
X_test = np.hstack([X_num_test.values, X_cat_test_onehot])

y = train_df['voted'].values

In [37]:
# LightGBM
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.03,
    'num_leaves': 64,
    'max_depth': -1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'verbosity': -1,
    'random_state': 42
}

lgb_model = lgb.LGBMClassifier(
    **lgb_params, 
    n_estimators=1200
)

lgb_model.fit(X, y)


lgb_pred_train = lgb_model.predict_proba(X)[:, 1]
lgb_pred_test = lgb_model.predict_proba(X_test)[:, 1]



In [38]:
# MLP 모델
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.net(x)

In [39]:
# MLP 학습
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

train_ds = TensorDataset(
    torch.tensor(X_scaled, dtype=torch.float32),
    torch.tensor(y, dtype=torch.float32)
)

train_loader = DataLoader(train_ds, batch_size=512, shuffle=True)

device = 'cpu'
model = MLP(X.shape[1]).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(25):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb).squeeze(), yb)
        loss.backward()
        optimizer.step()

In [40]:
model.eval()
with torch.no_grad():
    dl_pred_train = torch.sigmoid(
        model(torch.tensor(X_scaled, dtype=torch.float32).to(device))
    ).cpu().numpy().ravel()

    dl_pred_test = torch.sigmoid(
        model(torch.tensor(X_test_scaled, dtype=torch.float32).to(device))
    ).cpu().numpy().ravel()

In [41]:
# AUC 기준 가중 앙상블 (DL 비중 up)

final_test_pred = 0.4 * lgb_pred_test + 0.6 * dl_pred_test

submission = pd.DataFrame({
    'index': test_df.index,
    'voted': final_test_pred
})

submission.to_csv('submit_20260129_hana.csv', index=False)

In [1]:
# 연습용 데이콘 제출 점수 : 퍼블릭 - 0.7791
# 머신러닝 + 딥러닝 모델 앙상블... 가능한가??