# 03_best_score_

목표
- **최종 제출은 딥러닝 모델만** 사용
- 다만, **피처(변수) 중요도 확인/선별**은 LightGBM을 도구로 사용(규칙 허용)
- 제출값은 **voted=2 확률**로 통일

구성(한 줄)
1) 설문 데이터를 요약 피처로 만들고 → 2) LGBM으로 중요한 피처만 고른 뒤 → 3) 그 피처로 딥러닝(MLP) 학습/제출


## 0) (필요 시) 설치
- 이미 설치돼 있으면 건너뛰세요.


In [None]:
# 필요할 때만 실행하세요.
# !pip install --default-timeout=300 lightgbm tensorflow scikit-learn


## 1) 데이터 로드 (팀 공용 경로)
- 프로젝트 루트를 자동으로 찾아서, 어디서 실행해도 동일하게 동작하도록 했습니다.


In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import re

# Raise the pandas display limits: up to 300 columns, 200 characters wide.
for _option, _value in (("display.max_columns", 300), ("display.width", 200)):
    pd.set_option(_option, _value)

def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that looks like the project root.

    A directory qualifies when it contains ``requirements.txt`` or ``README.md``.
    The search begins at the resolved ``start`` and walks up through its parents.

    Raises:
        FileNotFoundError: If no directory on the way up contains a marker file.
    """
    markers = ("requirements.txt", "README.md")
    origin = start.resolve()
    for candidate in (origin, *origin.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    raise FileNotFoundError("프로젝트 루트를 찾지 못했습니다. (requirements.txt 또는 README.md 위치 확인)")

# Resolve the shared data directory relative to the detected project root.
PROJECT_ROOT = find_project_root(Path.cwd())
DATA_DIR = PROJECT_ROOT.joinpath("data", "raw")

# Load the three competition files in a fixed order: train, test, sample submission.
train, test, sub = (
    pd.read_csv(DATA_DIR / file_name)
    for file_name in ("train.csv", "test_x.csv", "sample_submission.csv")
)

print("train:", train.shape, "test:", test.shape, "sub:", sub.shape)

# Target: 1 when voted == 2, else 0, so predictions are P(voted == 2).
y = train["voted"].eq(2).astype("int32")
X_raw = train.drop(columns=["voted"])

print("pos ratio (voted==2):", float(y.mean()))


train: (45532, 78) test: (11383, 77) sub: (11383, 2)
pos ratio (voted==2): 0.5468242115435298


## 2) 컬럼 그룹 자동 탐지
- 설문 응답(QA), 응답 시간(QE), 성격(tp), 단어(wr/wf) 그룹을 자동으로 잡습니다.


In [2]:
# Group the raw columns by naming pattern.
cols = X_raw.columns.tolist()
q_like = [name for name in cols if name.startswith("Q")]


def _columns_matching(pattern, names):
    """Return, sorted, the names that match ``pattern`` over their full length."""
    return sorted(name for name in names if re.fullmatch(pattern, name))


# Q<letter>A / Q<letter>E pairs among the Q-prefixed columns.
QA_cols = _columns_matching(r"Q[a-z]A", q_like)
QE_cols = _columns_matching(r"Q[a-z]E", q_like)

# tpNN, wr_NN / wrNN and wf_NN / wfNN columns.
TP_cols = _columns_matching(r"tp\d{2}", cols)
WR_cols = _columns_matching(r"wr_?\d{2}", cols)
WF_cols = _columns_matching(r"wf_?\d{2}", cols)

print("QA/QE/TP/WR/WF:", len(QA_cols), len(QE_cols), len(TP_cols), len(WR_cols), len(WF_cols))


QA/QE/TP/WR/WF: 20 20 10 13 3


## 3) Feature Engineering


# Feature
- 태도: 평균/흔들림/중립비율/극단비율/무응답비율
- 응답태도: 응답시간 합/표준편차/너무 빠름·느림 비율
- 성격: Big5 차이(diff)와 강도(|diff|)
- 단어: wr합, wf합, (wr-wf)
- 인구통계: 나이대/urban은 순서형 숫자로 변환 + 무응답 여부 표시

Reverse+Mach는 도움이 될 때가 있고 아닐 때가 있어서 스위치로 둡니다.


In [7]:
# Switch for the optional reverse-scoring + Mach features computed in build_features.
USE_REVERSE_MACH = False  # Try both True and False and keep whichever scores higher

# QA columns that build_features replaces with (6 - value) when USE_REVERSE_MACH is on.
FLIP_PUBLIC = ["QeA","QfA","QkA","QqA","QrA"]
FLIP_SECRET = ["QaA","QdA","QgA","QiA","QnA"]

def age_group_to_ord(v):
    """Convert an age-group label such as ``'10s'`` to its integer decade.

    Every ``"s"`` character is removed from ``str(v)`` and the remainder is
    parsed with ``int``; e.g. ``'10s' -> 10``, ``'20s' -> 20``.

    Returns:
        The parsed integer, or ``-1`` when the remainder is not a valid integer
        (e.g. ``None``, NaN, or free text).
    """
    try:
        return int(str(v).replace("s", ""))
    except ValueError:
        # Only parse failures map to the sentinel; anything else (for example
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return -1

def urban_to_ord(v):
    """Map an ``urban`` answer to an ordinal value.

    ``0`` is treated as "no response" and becomes ``-1``; any other value that
    ``int`` can convert is returned as that integer.

    Returns:
        ``-1`` for 0 or for values ``int`` cannot convert (``None``, NaN,
        infinities, non-numeric text); otherwise ``int(v)``.
    """
    try:
        iv = int(v)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types; ValueError: NaN or text;
        # OverflowError: +/- infinity.
        return -1
    return -1 if iv == 0 else iv

def build_features(df: pd.DataFrame, tp_means=None):
    """Build the numeric summary-feature table from a raw survey frame.

    Relies on the module-level column groups (QA_cols, QE_cols, TP_cols,
    WR_cols, WF_cols) and on the USE_REVERSE_MACH switch.

    Args:
        df: Raw frame. It is copied, so the caller's frame is left untouched.
        tp_means: Mapping of tp column -> fill value for missing answers.
            Pass None to compute the means from ``df`` itself (train), then
            pass the returned mapping back in when transforming test.

    Returns:
        Tuple ``(features, tp_means)``: the all-numeric feature frame with
        remaining NaNs replaced by 0, and the tp fill values that were used.
    """
    df = df.copy()

    # "index" is only needed for the submission file, not as a feature.
    if "index" in df.columns:
        df = df.drop(columns=["index"])

    # Coerce every grouped column to numeric; unparsable entries become NaN.
    for c in QA_cols + QE_cols + TP_cols + WR_cols + WF_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # (1) Demographics: ordinal encodings plus "no response" indicators.
    if "age_group" in df.columns:
        df["age_group_ord"] = df["age_group"].astype(str).apply(age_group_to_ord)
    if "urban" in df.columns:
        df["urban_ord"] = df["urban"].apply(urban_to_ord)

    for c in ["education", "urban", "hand", "married"]:
        if c in df.columns:
            # A value of 0, or anything non-numeric, counts as missing.
            tmp = pd.to_numeric(df[c], errors="coerce").fillna(0)
            df[f"{c}_is_missing"] = (tmp == 0).astype("int32")

    # (2) Optional reverse scoring of selected QA items (value -> 6 - value).
    if USE_REVERSE_MACH:
        for c in FLIP_PUBLIC + FLIP_SECRET:
            if c in df.columns:
                df[c] = 6 - df[c]

    # Attitude summaries over the (possibly reversed) QA answers.
    qa = df[QA_cols]
    df["att_mean"] = qa.mean(axis=1)
    df["att_std"] = qa.std(axis=1)
    df["ratio_neutral"] = (qa == 3).mean(axis=1)
    df["ratio_extreme"] = ((qa == 1) | (qa == 5)).mean(axis=1)
    df["ratio_qa_missing"] = qa.isna().mean(axis=1)

    if USE_REVERSE_MACH:
        # NOTE(review): these sums run on the already-flipped columns, and the
        # subtracted items are exactly the FLIP_PUBLIC columns, so those items
        # are reversed twice. Confirm this is intended.
        df["mach_T"] = df["QcA"] - df["QfA"] + df["QoA"] - df["QrA"] + df["QsA"]
        df["mach_V"] = df["QbA"] - df["QeA"] + df["QhA"] + df["QjA"] + df["QmA"] - df["QqA"]
        df["mach_M"] = -df["QkA"]
        # NOTE(review): computed from the same frame as att_mean, so the two
        # columns are identical.
        df["mach_mean"] = qa.mean(axis=1)

    # (3) Response-time summaries; negative times are clipped to 0 first.
    qe = df[QE_cols].clip(lower=0)
    qe_log = np.log1p(qe)
    df["time_sum"] = qe.sum(axis=1)
    df["time_std"] = qe.std(axis=1)
    df["ratio_fast"] = (qe_log < 1.0).mean(axis=1)
    df["ratio_slow"] = (qe_log > 4.0).mean(axis=1)

    # (4) Big5: a tp value of 0 is treated as "no response" and imputed.
    # Series.mask yields NaN without writing NaN into an integer column in
    # place, which pandas deprecates as an incompatible-dtype assignment.
    for c in TP_cols:
        df[c] = df[c].mask(df[c] == 0)

    if tp_means is None:
        tp_means = {c: float(df[c].mean()) for c in TP_cols}
    for c in TP_cols:
        df[c] = df[c].fillna(tp_means[c])

    # Signed difference and its magnitude for each tp pair.
    for trait, (left, right) in {
        "Ex": ("tp01", "tp06"),
        "Ag": ("tp07", "tp02"),
        "Con": ("tp03", "tp08"),
        "Es": ("tp09", "tp04"),
        "Op": ("tp05", "tp10"),
    }.items():
        df[f"{trait}_diff"] = df[left] - df[right]
        df[f"{trait}_strength"] = df[f"{trait}_diff"].abs()

    # (5) Word-check summaries: wr sum, wf sum and their difference.
    df["wr_sum"] = df[WR_cols].sum(axis=1) if len(WR_cols) else 0
    df["wf_sum"] = df[WF_cols].sum(axis=1) if len(WF_cols) else 0
    df["word_credibility"] = df["wr_sum"] - df["wf_sum"]

    # Drop the raw grouped columns and keep only the summaries.
    drop_cols = QA_cols + QE_cols + TP_cols + WR_cols + WF_cols
    df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

    # Encode the remaining string categoricals as integer codes.
    # NOTE(review): codes are derived per call from the values present in that
    # frame, so train and test codes only line up when both frames contain the
    # same set of values. Verify against the data.
    for c in ["gender", "race", "religion"]:
        if c in df.columns:
            df[c] = df[c].astype("category").cat.codes.astype("int32")

    # Final pass: force everything numeric and replace NaN with 0.
    # NOTE(review): any non-numeric column still present (e.g. the raw
    # "age_group" labels) is coerced to NaN and therefore becomes all zeros.
    df = df.apply(pd.to_numeric, errors="coerce").fillna(0)
    return df, tp_means

# Build the train features first so the tp fill values are computed from the
# training frame, then reuse those same values for the test frame.
X_feat, tp_means = build_features(X_raw, tp_means=None)
T_feat, _ = build_features(test, tp_means=tp_means)

print("X_feat:", X_feat.shape, "T_feat:", T_feat.shape)
X_feat.head()


X_feat: (45532, 38) T_feat: (11383, 38)


Unnamed: 0,age_group,education,engnat,familysize,gender,hand,married,race,religion,urban,age_group_ord,urban_ord,education_is_missing,urban_is_missing,hand_is_missing,married_is_missing,att_mean,att_std,ratio_neutral,ratio_extreme,ratio_qa_missing,time_sum,time_std,ratio_fast,ratio_slow,Ex_diff,Ex_strength,Ag_diff,Ag_strength,Con_diff,Con_strength,Es_diff,Es_strength,Op_diff,Op_strength,wr_sum,wf_sum,word_credibility
0,0.0,2,1,4,0,1,3,6,10,1,30,1,0,0,0,0,3.05,1.394538,0.05,0.3,0.0,17874,323.295058,0.0,1.0,1.0,1.0,5.0,5.0,-2.0,2.0,3.0,3.0,-1.0,1.0,7,0,7
1,0.0,4,2,3,0,1,1,1,7,3,20,3,0,0,0,0,3.2,1.794729,0.15,0.75,0.0,49565,1264.031057,0.0,1.0,-1.0,1.0,2.0,2.0,-1.628942,1.628942,-0.6306,0.6306,-3.0,3.0,8,0,8
2,0.0,3,1,3,1,1,2,6,10,2,30,2,0,0,0,0,2.8,1.641565,0.05,0.55,0.0,29661,693.931096,0.0,1.0,-2.0,2.0,-1.0,1.0,-5.0,5.0,-4.0,4.0,0.0,0.0,10,1,9
3,0.0,4,2,0,0,1,1,1,7,3,20,3,0,0,0,0,3.55,0.944513,0.3,0.15,0.0,72714,5389.724786,0.0,1.0,-1.0,1.0,-3.0,3.0,-2.0,2.0,0.0,0.0,-2.0,2.0,5,0,5
4,0.0,3,1,2,1,1,2,6,0,1,20,1,0,0,0,0,2.9,1.803505,0.0,0.7,0.0,18487,366.429756,0.0,1.0,-1.0,1.0,1.588953,1.588953,-5.0,5.0,-4.0,4.0,-3.956082,3.956082,11,1,10


## 4) LGBM으로 중요 피처 TopK 선택 
여기서 중요한 점:
- one-hot을 섞지 않고 **지금 만든 numeric 피처만**으로 importance를 뽑는다.
그래야 TopK가 정확히 딥러닝 입력으로 이어진다.


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Step 1: rank the engineered numeric features by LightGBM gain importance,
# accumulated over a 5-fold stratified CV.
X_lgb = X_feat.copy()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
importances = np.zeros(X_lgb.shape[1], dtype=float)
auc_list = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_lgb, y)):
    X_tr, X_va = X_lgb.iloc[tr_idx], X_lgb.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    model = LGBMClassifier(
        n_estimators=8000, learning_rate=0.015,
        num_leaves=255, subsample=0.8, colsample_bytree=0.8,
        reg_lambda=1.0, random_state=42+fold
    )
    # Stop once validation AUC has not improved for 400 rounds.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="auc",
        callbacks=[early_stopping(400), log_evaluation(0)]
    )
    pred = model.predict_proba(X_va)[:, 1]
    auc_list.append(roc_auc_score(y_va, pred))
    importances += model.booster_.feature_importance(importance_type="gain")

print("LGBM CV AUC mean:", float(np.mean(auc_list)), "std:", float(np.std(auc_list)))

imp_rank = pd.Series(importances, index=X_lgb.columns).sort_values(ascending=False)

# Step 2: pick how many top-ranked features to keep.
# Requested sizes. A size larger than the number of available columns cannot
# select anything: head(k) would return every column, so all oversized
# candidates would be the same feature set evaluated again. Clip each
# candidate to the column count and drop duplicates (order preserved) so each
# distinct feature set is fitted once and TOPK reports the real count.
# To make this step actually prune, use candidates below the column count.
K_CANDIDATES = [40, 60, 80, 120, 160]
n_features = X_lgb.shape[1]
k_grid = list(dict.fromkeys(min(k, n_features) for k in K_CANDIDATES))
best_k, best_auc = None, -1

skf2 = StratifiedKFold(n_splits=3, shuffle=True, random_state=777)
for k in k_grid:
    use = imp_rank.head(k).index.tolist()
    X_use = X_lgb[use]
    auc2 = []
    for f, (tr, va) in enumerate(skf2.split(X_use, y)):
        m = LGBMClassifier(n_estimators=3000, learning_rate=0.02, num_leaves=127, random_state=1000+f)
        m.fit(X_use.iloc[tr], y.iloc[tr])
        p = m.predict_proba(X_use.iloc[va])[:, 1]
        auc2.append(roc_auc_score(y.iloc[va], p))
    mean_auc2 = float(np.mean(auc2))
    print("K", k, "quick AUC", mean_auc2)
    # Strict ">" keeps the earliest candidate on ties.
    if mean_auc2 > best_auc:
        best_auc, best_k = mean_auc2, k

TOPK = best_k
selected_features = imp_rank.head(TOPK).index.tolist()
print("\n Selected TOPK =", TOPK)


[LightGBM] [Info] Number of positive: 19918, number of negative: 16507
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1228
[LightGBM] [Info] Number of data points in the train set: 36425, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.546822 -> initscore=0.187839
[LightGBM] [Info] Start training from score 0.187839
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[212]	valid_0's auc: 0.778883	valid_0's binary_logloss: 0.546514
[LightGBM] [Info] Number of positive: 19918, number of negative: 16507
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001745 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_c

## 5) 최종 딥러닝 모델(MLP) 학습 + 제출 생성
여기서부터는 **딥러닝만** 사용함


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Keep only the selected columns, in ranking order, for both train and test.
X_sel = X_feat.loc[:, selected_features].copy()
T_sel = T_feat.loc[:, selected_features].copy()

def make_ds(Xdf, yarr=None, batch_size=512, shuffle=False, seed=42):
    """Wrap a feature frame (and optional labels) in a batched tf.data pipeline.

    Args:
        Xdf: Feature frame; its values are cast to float32.
        yarr: Optional label array, cast to float32. When None the dataset
            yields features only.
        batch_size: Number of rows per batch.
        shuffle: When True, shuffle with a buffer of at most 10000 rows and
            reshuffle on every iteration.
        seed: Seed passed to the shuffle step.

    Returns:
        The batched dataset with prefetching enabled.
    """
    features = Xdf.values.astype("float32")
    tensors = features if yarr is None else (features, yarr.astype("float32"))
    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    if shuffle:
        buffer = min(len(Xdf), 10000)
        dataset = dataset.shuffle(buffer_size=buffer, seed=seed, reshuffle_each_iteration=True)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(tf.data.AUTOTUNE)

def build_mlp(input_dim, lr=1e-3, dropout=0.3):
    """Create and compile the binary-classification MLP.

    Layout: BatchNormalization -> Dense(256, relu) -> Dropout -> Dense(64, relu)
    -> Dense(1, sigmoid). Compiled with Adam, binary cross-entropy and an AUC
    metric named "auc".

    Args:
        input_dim: Number of input features.
        lr: Adam learning rate.
        dropout: Dropout rate after the first dense layer.

    Returns:
        The compiled Keras model.
    """
    inputs = keras.Input(shape=(input_dim,), dtype=tf.float32)

    # Layers are created and applied one at a time, in this fixed order.
    hidden = layers.BatchNormalization()(inputs)
    hidden = layers.Dense(256, activation="relu")(hidden)
    hidden = layers.Dropout(dropout)(hidden)
    hidden = layers.Dense(64, activation="relu")(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    network = keras.Model(inputs, probability)
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="binary_crossentropy",
        metrics=[keras.metrics.AUC(name="auc")],
    )
    return network

# Run configurations: FAST for a quick OOF check, SUBMIT for the heavier run.
FAST = dict(EPOCHS=12, N_SPLITS=3, SEEDS=[42], BATCH=512)
SUBMIT = dict(EPOCHS=25, N_SPLITS=5, SEEDS=[42,202,777], BATCH=256)

MODE = "FAST"   # Recommended: check OOF with FAST first, then switch to SUBMIT
CFG = FAST if MODE=="FAST" else SUBMIT
print("MODE:", MODE, "CFG:", CFG)

EPOCHS = CFG["EPOCHS"]; N_SPLITS = CFG["N_SPLITS"]; SEEDS = CFG["SEEDS"]; BATCH = CFG["BATCH"]
# The same splitter (fixed random_state) is reused for every seed below.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# One out-of-fold prediction vector per seed.
oof_by_seed = []
for seed in SEEDS:
    oof = np.zeros(len(X_sel), dtype="float32")
    for fold, (tr, va) in enumerate(skf.split(X_sel, y)):
        X_tr = X_sel.iloc[tr].reset_index(drop=True)
        y_tr = y.iloc[tr].reset_index(drop=True).values
        X_va = X_sel.iloc[va].reset_index(drop=True)
        y_va = y.iloc[va].reset_index(drop=True).values

        # Weight class 1 by neg/pos so both classes carry equal total weight.
        pos = float(y_tr.mean()); neg = 1.0 - pos
        class_weight = {0: 1.0, 1: neg/(pos+1e-9)}

        # Seed before building the model so each (seed, fold) pair is reproducible.
        tf.keras.utils.set_random_seed(seed + fold)
        model = build_mlp(X_tr.shape[1], lr=1e-3, dropout=0.3)

        tr_ds = make_ds(X_tr, y_tr, batch_size=BATCH, shuffle=True, seed=seed+fold)
        va_ds = make_ds(X_va, y_va, batch_size=BATCH, shuffle=False)

        # Both callbacks watch validation AUC: stop after 3 epochs without
        # improvement (restoring the best weights) and halve the learning rate
        # after 1 epoch without improvement, down to 1e-5.
        cb = [
            keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=3, restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max", factor=0.5, patience=1, min_lr=1e-5),
        ]

        model.fit(tr_ds, validation_data=va_ds, epochs=EPOCHS, verbose=0, callbacks=cb, class_weight=class_weight)

        pred = model.predict(make_ds(X_va, None, batch_size=BATCH), verbose=0).reshape(-1)
        # Replace any NaN prediction with 0.5.
        pred = np.nan_to_num(pred, nan=0.5)
        oof[va] = pred
        print(f"[seed {seed}] fold {fold} AUC={roc_auc_score(y_va, pred):.6f}")

    auc_seed = roc_auc_score(y, oof)
    oof_by_seed.append(oof)
    print(f"\n[seed {seed}] OOF AUC: {auc_seed:.6f}")
    print("-"*60)

# Average the per-seed OOF vectors and score the ensemble.
oof_ens = np.mean(np.vstack(oof_by_seed), axis=0)
print("\n✅ Ensemble OOF AUC:", roc_auc_score(y, oof_ens))

# ---- Final submission (recommended only in SUBMIT mode) ----
# These values are always taken from SUBMIT, regardless of the current MODE.
FINAL_SEEDS = SUBMIT["SEEDS"]
EPOCHS_SUB = SUBMIT["EPOCHS"]
BATCH_SUB = SUBMIT["BATCH"]

def train_full_predict(seed):
    """Train one MLP on all training rows and return its test predictions.

    Uses the module-level X_sel / y for training and T_sel for prediction,
    with class 1 weighted by neg/pos. Trains for max(10, EPOCHS_SUB // 2)
    epochs without a validation set.

    Args:
        seed: Seed for the global random state and the shuffle order.

    Returns:
        1-D array of predicted probabilities, with any NaN replaced by 0.5.
    """
    positive_rate = float(y.mean())
    negative_rate = 1.0 - positive_rate
    weights = {0: 1.0, 1: negative_rate / (positive_rate + 1e-9)}
    n_epochs = max(10, EPOCHS_SUB // 2)

    # Seed first, then build, so weight initialisation is reproducible.
    tf.keras.utils.set_random_seed(seed)
    net = build_mlp(X_sel.shape[1], lr=1e-3, dropout=0.3)

    train_batches = make_ds(X_sel, y.values, batch_size=BATCH_SUB, shuffle=True, seed=seed)
    net.fit(train_batches, epochs=n_epochs, verbose=0, class_weight=weights)

    test_batches = make_ds(T_sel, None, batch_size=BATCH_SUB)
    raw = net.predict(test_batches, verbose=0).reshape(-1)
    return np.nan_to_num(raw, nan=0.5)

# Average the per-seed test predictions into a single probability vector.
seed_preds = [train_full_predict(s) for s in FINAL_SEEDS]
pred_test = np.vstack(seed_preds).mean(axis=0).reshape(-1)

# Start from the sample submission and overwrite its prediction column.
submission = sub.copy()
submission["voted"] = pred_test

print("submission:", submission.shape, list(submission.columns))
print(submission["voted"].describe())

# Sanity checks on shape, column names and probability range before saving.
voted_min = float(submission["voted"].min())
voted_max = float(submission["voted"].max())
assert submission.shape == (11383, 2)
assert list(submission.columns) == ["index","voted"]
assert voted_min >= 0.0 and voted_max <= 1.0

out_path = PROJECT_ROOT / "submission_best_score.csv"
submission.to_csv(out_path, index=False)
print("Saved:", out_path)


MODE: FAST CFG: {'EPOCHS': 12, 'N_SPLITS': 3, 'SEEDS': [42], 'BATCH': 512}
[seed 42] fold 0 AUC=0.764611
[seed 42] fold 1 AUC=0.756534
[seed 42] fold 2 AUC=0.756867

[seed 42] OOF AUC: 0.759281
------------------------------------------------------------

✅ Ensemble OOF AUC: 0.7592811938191976
submission: (11383, 2) ['index', 'voted']
count    11383.000000
mean         0.520863
std          0.245189
min          0.087193
25%          0.329722
50%          0.431084
75%          0.724007
max          0.997267
Name: voted, dtype: float64
Saved: /Users/admin/Downloads/AI 헬스케어 수업/oz코딩 수업/해커톤 (1)/vote-AI/submission_best_score.csv


In [None]:
# OOF AUC: 0.759281 --> USE_REVERSE_MACH = True