# Best-Score Pipeline (leaderboard score: 0.775513375)
- LightGBM: used for feature selection only
- Final model: deep learning (MLP)
- Submission: P(voted == 2), written in the sample_submission format


In [None]:
# Run only when needed
# !pip install --default-timeout=300 lightgbm tensorflow scikit-learn


In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import re

# Let pandas print up to 300 columns and use a 200-character display width
# when rendering DataFrames.
pd.set_option('display.max_columns', 300)
pd.set_option('display.width', 200)

def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that looks like the project root.

    A directory qualifies when it contains ``requirements.txt`` or
    ``README.md``.  ``start`` is resolved first, then the search walks
    upward through its parents and returns the first match.

    Raises:
        FileNotFoundError: if no directory on the way up contains a marker file.
    """
    markers = ("requirements.txt", "README.md")
    origin = start.resolve()
    for candidate in (origin, *origin.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    raise FileNotFoundError("프로젝트 루트를 찾지 못했습니다. vote-AI 루트에 requirements.txt 또는 README.md가 있는지 확인하세요.")

# Resolve the repository root from the current working directory; the raw
# competition files are read from <root>/data/raw.
PROJECT_ROOT = find_project_root(Path.cwd())
DATA_DIR = PROJECT_ROOT / "data" / "raw"

# Labelled training set, unlabelled test set and the submission template.
train, test, sub = (
    pd.read_csv(DATA_DIR / fname)
    for fname in ("train.csv", "test_x.csv", "sample_submission.csv")
)

# Binary target: 1 where voted == 2, otherwise 0.
y = train["voted"].eq(2).astype("int32")
X_raw = train.drop(columns=["voted"])

# Bucket the raw columns into groups by naming pattern; fe() consumes these
# lists as module-level globals.
cols = X_raw.columns.tolist()
q_like = [c for c in cols if re.match(r"^Q", c)]
QA_cols = sorted(c for c in q_like if re.fullmatch(r"Q[a-z]A", c))
QE_cols = sorted(c for c in q_like if re.fullmatch(r"Q[a-z]E", c))
TP_cols = sorted(c for c in cols if re.fullmatch(r"tp\d{2}", c))
WR_cols = sorted(c for c in cols if re.fullmatch(r"wr_?\d{2}", c))
WF_cols = sorted(c for c in cols if re.fullmatch(r"wf_?\d{2}", c))

print(f"train/test: {train.shape} {test.shape} pos_ratio: {float(y.mean())}")
print(
    "groups:",
    "QA", len(QA_cols),
    "QE", len(QE_cols),
    "TP", len(TP_cols),
    "WR", len(WR_cols),
    "WF", len(WF_cols),
)


train/test: (45532, 78) (11383, 77) pos_ratio: 0.5468242115435298
groups: QA 20 QE 20 TP 10 WR 13 WF 3


In [2]:
# When True, fe() reverse-scores the QA columns listed below (x -> 6 - x) and
# adds the T / V / M / Mach_score feature columns.
USE_REVERSE_MACH = True

# QA columns that fe() reverse-scores; both lists are concatenated and
# treated identically there.
# NOTE(review): the PUBLIC/SECRET split presumably mirrors two item lists from
# the questionnaire key — confirm against the data description.
FLIP_PUBLIC = ["QeA","QfA","QkA","QqA","QrA"]
FLIP_SECRET = ["QaA","QdA","QgA","QiA","QnA"]

def age_group_to_ord(v):
    """Convert an age-bucket label such as ``"30s"`` to an integer.

    Every ``"s"`` character is removed from ``str(v)`` and the remainder is
    parsed with ``int`` (so ``"+70s"`` gives ``70``).

    Returns:
        The parsed integer, or the sentinel ``-1`` when the value does not
        parse (e.g. ``None``, ``NaN``, empty or free text).
    """
    try:
        return int(str(v).replace("s", ""))
    # Only parse failures map to the sentinel; a bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit.
    except (ValueError, TypeError):
        return -1

def urban_to_ord(v):
    """Convert an ``urban`` code to an integer, mapping 0 and bad values to ``-1``.

    Returns:
        ``int(v)`` when that is non-zero; ``-1`` when ``int(v)`` is 0 or when
        ``v`` cannot be converted (``None``, ``NaN``, infinity, non-numeric
        text).
    """
    try:
        code = int(v)
    # None -> TypeError, text / NaN -> ValueError, +/-inf -> OverflowError.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    except (TypeError, ValueError, OverflowError):
        return -1
    return -1 if code == 0 else code

def _numeric_frame(df, cols):
    """Return ``df[cols]`` as a new frame with each column coerced to numeric (unparseable -> NaN)."""
    return pd.DataFrame(
        {c: pd.to_numeric(df[c], errors="coerce") for c in cols},
        index=df.index,
    )


def fe(df: pd.DataFrame, tp_means=None):
    """Build the model feature matrix from a raw survey frame.

    The raw QA / QE / TP / WR / WF column groups (module-level ``*_cols``
    lists) are reduced to summary features and are not part of the output.
    The remaining input columns are kept, followed by the engineered columns
    in the order they are created below.

    Args:
        df: Raw frame containing every column named in the ``*_cols`` lists.
            It is not modified.  An ``index`` column, if present, is dropped.
        tp_means: Optional mapping ``TP column -> fill value`` for missing TP
            answers.  When ``None`` the per-column means of ``df`` (zeros
            treated as missing) are computed and used; pass the mapping
            returned from the training call when transforming other data.

    Returns:
        ``(features, tp_means)`` — the fully numeric feature frame (non-numeric
        leftovers coerced, NaN filled with 0) and the TP fill values used.

    Raises:
        KeyError: if a column listed in the ``*_cols`` groups is missing.
    """
    df = df.copy()
    if "index" in df.columns:
        df = df.drop(columns=["index"])

    # Numeric working copies of each raw group.  Features are collected in
    # ``new`` and attached with a single concat at the end: inserting ~100
    # columns one at a time fragments the frame and makes pandas emit a
    # PerformanceWarning.
    qa = _numeric_frame(df, QA_cols)
    qe = _numeric_frame(df, QE_cols)
    tp = _numeric_frame(df, TP_cols)
    wr = _numeric_frame(df, WR_cols)
    wf = _numeric_frame(df, WF_cols)

    new = {}  # insertion order == output column order

    if "age_group" in df.columns:
        new["age_group_ord"] = df["age_group"].astype(str).apply(age_group_to_ord)
    if "urban" in df.columns:
        new["urban_ord"] = df["urban"].apply(urban_to_ord)

    # A value of 0 (or anything non-numeric) is flagged as "missing".
    for c in ["education", "urban", "hand", "married"]:
        if c in df.columns:
            tmp = pd.to_numeric(df[c], errors="coerce").fillna(0)
            new[f"{c}_is_missing"] = (tmp == 0).astype("int32")

    # Reverse-score the listed QA items before any QA statistic is taken.
    if USE_REVERSE_MACH:
        for c in FLIP_PUBLIC + FLIP_SECRET:
            if c in qa.columns:
                qa[c] = 6 - qa[c]

    new["qa_mean"] = qa.mean(axis=1)
    new["qa_std"] = qa.std(axis=1)
    new["neutral_ratio"] = (qa == 3).mean(axis=1)
    new["confident_ratio"] = ((qa == 1) | (qa == 5)).mean(axis=1)
    new["qa_missing_ratio"] = qa.isna().mean(axis=1)

    if USE_REVERSE_MACH:
        # These sums use the already reverse-scored item values.
        new["T"] = qa["QcA"] - qa["QfA"] + qa["QoA"] - qa["QrA"] + qa["QsA"]
        new["V"] = qa["QbA"] - qa["QeA"] + qa["QhA"] + qa["QjA"] + qa["QmA"] - qa["QqA"]
        new["M"] = -qa["QkA"]
        # Same value as qa_mean; kept so the output column set is unchanged.
        new["Mach_score"] = qa.mean(axis=1)

    # QE summaries; negative values are clipped to 0 before aggregation.
    qe = qe.clip(lower=0)
    qe_log = np.log1p(qe)
    delay_sum = qe.sum(axis=1)
    new["delay_sum"] = delay_sum
    new["delay_log"] = np.log1p(delay_sum)
    new["delay_std"] = qe.std(axis=1)
    new["qe_fast_ratio"] = (qe_log < 1.0).mean(axis=1)
    new["qe_slow_ratio"] = (qe_log > 4.0).mean(axis=1)

    # TP: 0 is treated as "no answer" and imputed with the column mean.
    # ``mask`` upcasts integer columns to float itself, avoiding the
    # deprecated in-place assignment of NaN into an integer column.
    for c in TP_cols:
        tp[c] = tp[c].mask(tp[c] == 0)
    if tp_means is None:
        tp_means = {c: float(tp[c].mean()) for c in TP_cols}
    for c in TP_cols:
        tp[c] = tp[c].fillna(tp_means[c])

    # Signed difference and its magnitude for each TP item pair.
    tp_pairs = [
        ("Ex", "tp01", "tp06"),
        ("Ag", "tp07", "tp02"),
        ("Con", "tp03", "tp08"),
        ("Es", "tp09", "tp04"),
        ("Op", "tp05", "tp10"),
    ]
    for prefix, left, right in tp_pairs:
        diff = tp[left] - tp[right]
        new[f"{prefix}_diff"] = diff
        new[f"{prefix}_strength"] = diff.abs()

    # WR / WF totals; an empty group contributes a constant 0.
    wr_sum = wr.sum(axis=1) if len(WR_cols) > 0 else 0
    wf_sum = wf.sum(axis=1) if len(WF_cols) > 0 else 0
    new["wr_sum"] = wr_sum
    new["wf_sum"] = wf_sum
    new["word_credibility"] = wr_sum - wf_sum

    # drop raw groups (keep summaries only)
    raw_cols = set(QA_cols + QE_cols + TP_cols + WR_cols + WF_cols)
    base = df.drop(columns=[c for c in df.columns if c in raw_cols])

    # simple encoding for remaining categorical strings
    # NOTE(review): codes are derived from the categories present in *this*
    # frame, so train and test codes only agree if both contain the same set
    # of values — confirm, or pass shared category levels in a follow-up.
    for c in ["gender", "race", "religion"]:
        if c in base.columns:
            base[c] = base[c].astype("category").cat.codes.astype("int32")

    out = pd.concat([base, pd.DataFrame(new, index=df.index)], axis=1)
    # Anything still non-numeric becomes NaN, then every NaN becomes 0.
    out = out.apply(pd.to_numeric, errors="coerce").fillna(0)
    return out, tp_means

# Build train features first so the TP fill values are learned from train,
# then reuse those values when transforming the test set.
X_feat, tp_means_all = fe(X_raw.copy())
T_feat, _ = fe(test.copy(), tp_means=tp_means_all)

print(f"X_feat/T_feat: {X_feat.shape} {T_feat.shape}")


X_feat/T_feat: (45532, 43) (11383, 43)


  df["word_credibility"] = df["wr_sum"] - df["wf_sum"]
  df["word_credibility"] = df["wr_sum"] - df["wf_sum"]


In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# ---- Stage 1: rank features by LightGBM gain importance, summed over 5 folds ----
X_lgb = X_feat.copy()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

importances = np.zeros(X_lgb.shape[1], dtype=float)
auc_list = []

for fold,(tr,va) in enumerate(skf.split(X_lgb, y)):
    X_tr, X_va = X_lgb.iloc[tr], X_lgb.iloc[va]
    y_tr, y_va = y.iloc[tr], y.iloc[va]

    m = LGBMClassifier(n_estimators=8000, learning_rate=0.015, num_leaves=255,
                       subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
                       random_state=42+fold)
    # Stop once validation AUC has not improved for 400 rounds; per-iteration logging is off.
    m.fit(X_tr, y_tr, eval_set=[(X_va,y_va)], eval_metric="auc",
          callbacks=[early_stopping(400), log_evaluation(0)])
    p = m.predict_proba(X_va)[:,1]
    auc_list.append(roc_auc_score(y_va, p))
    importances += m.booster_.feature_importance(importance_type="gain")

print("LGBM mean AUC:", float(np.mean(auc_list)), "std:", float(np.std(auc_list)))

imp_rank = pd.Series(importances, index=X_lgb.columns).sort_values(ascending=False)

# ---- Stage 2: choose how many top-ranked features to keep (quick 3-fold check) ----
K_CANDIDATES = [30,40,60,80,120,160]
# Any candidate above the number of available features selects *all* of them,
# so such candidates would retrain identical models (same data, splits and
# seeds).  Cap each candidate at the feature count and drop the duplicates,
# keeping the original order.
n_features = X_lgb.shape[1]
k_grid = list(dict.fromkeys(min(k, n_features) for k in K_CANDIDATES))

best_k,best_auc=None,-1
skf2 = StratifiedKFold(n_splits=3, shuffle=True, random_state=777)
for k in k_grid:
    use = imp_rank.head(k).index.tolist()
    X_use = X_lgb[use]  # column subset is loop-invariant across folds
    auc2=[]
    for f,(tr2,va2) in enumerate(skf2.split(X_use, y)):
        mm = LGBMClassifier(n_estimators=3000, learning_rate=0.02, num_leaves=127, random_state=1000+f)
        mm.fit(X_use.iloc[tr2], y.iloc[tr2])
        pp = mm.predict_proba(X_use.iloc[va2])[:,1]
        auc2.append(roc_auc_score(y.iloc[va2], pp))
    mean_auc2=float(np.mean(auc2))
    print("K",k,"quick AUC",mean_auc2)
    # Strict '>' keeps the smallest K among ties.
    if mean_auc2>best_auc:
        best_auc, best_k = mean_auc2, k

TOPK = best_k
selected_features = imp_rank.head(TOPK).index.tolist()
print("Selected TOPK:", TOPK)


[LightGBM] [Info] Number of positive: 19918, number of negative: 16507
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1648
[LightGBM] [Info] Number of data points in the train set: 36425, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.546822 -> initscore=0.187839
[LightGBM] [Info] Start training from score 0.187839
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[164]	valid_0's auc: 0.779079	valid_0's binary_logloss: 0.550402
[LightGBM] [Info] Number of positive: 19918, number of negative: 16507
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1649
[LightGBM] [Inf

In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Restrict the train and test feature frames to the columns chosen by the
# LightGBM selection stage (same columns, same order, for both).
X_sel = X_feat.loc[:, selected_features].copy()
T_sel = T_feat.loc[:, selected_features].copy()

def make_ds(Xdf, yarr=None, batch_size=512, shuffle=False, seed=42):
    """Wrap a feature frame (and optional labels) in a batched ``tf.data.Dataset``.

    Args:
        Xdf: Feature frame; its values are cast to float32.
        yarr: Optional label array, cast to float32.  When ``None`` the
            dataset yields features only.
        batch_size: Number of rows per batch.
        shuffle: When True, shuffle with a buffer of ``min(len(Xdf), 10000)``
            rows, reshuffling on every iteration.
        seed: Seed passed to the shuffle op.

    Returns:
        The batched dataset with ``AUTOTUNE`` prefetching.
    """
    features = Xdf.values.astype("float32")
    tensors = features if yarr is None else (features, yarr.astype("float32"))
    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    if shuffle:
        buffer = min(len(Xdf), 10000)
        dataset = dataset.shuffle(buffer_size=buffer, seed=seed, reshuffle_each_iteration=True)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(tf.data.AUTOTUNE)

def build_mlp(input_dim, lr=1e-3, dropout=0.3):
    """Create and compile the binary-classification MLP.

    Architecture: BatchNormalization -> Dense(256, relu) -> Dropout ->
    Dense(64, relu) -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features.
        lr: Learning rate for the Adam optimizer.
        dropout: Dropout rate applied after the first dense layer.

    Returns:
        A compiled ``keras.Model`` using binary cross-entropy loss and an
        AUC metric named ``"auc"``.
    """
    inputs = keras.Input(shape=(input_dim,), dtype=tf.float32)
    hidden = layers.BatchNormalization()(inputs)
    hidden = layers.Dense(256, activation="relu")(hidden)
    hidden = layers.Dropout(dropout)(hidden)
    hidden = layers.Dense(64, activation="relu")(hidden)
    prob = layers.Dense(1, activation="sigmoid")(hidden)

    net = keras.Model(inputs, prob)
    adam = keras.optimizers.Adam(learning_rate=lr)
    auc_metric = keras.metrics.AUC(name="auc")
    net.compile(optimizer=adam, loss="binary_crossentropy", metrics=[auc_metric])
    return net

# Two run presets: FAST for a quick sanity check, SUBMIT for the full
# multi-seed configuration.  MODE picks which one drives this CV loop.
FAST = dict(EPOCHS=12, N_SPLITS=3, SEEDS=[42], BATCH=512)
SUBMIT = dict(EPOCHS=25, N_SPLITS=5, SEEDS=[42,202,777], BATCH=256)
MODE="FAST"
CFG = FAST if MODE=="FAST" else SUBMIT
print("MODE:", MODE, "CFG:", CFG)

EPOCHS=CFG["EPOCHS"]; N_SPLITS=CFG["N_SPLITS"]; SEEDS=CFG["SEEDS"]; BATCH=CFG["BATCH"]
# Fixed random_state: every seed below is evaluated on the same fold split;
# only model initialisation and shuffling vary with the seed.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# One out-of-fold (OOF) prediction vector per seed.
oof_by_seed=[]
for seed in SEEDS:
    oof=np.zeros(len(X_sel), dtype="float32")
    for fold,(tr,va) in enumerate(skf.split(X_sel, y)):
        X_tr=X_sel.iloc[tr].reset_index(drop=True)
        y_tr=y.iloc[tr].reset_index(drop=True).values
        X_va=X_sel.iloc[va].reset_index(drop=True)
        y_va=y.iloc[va].reset_index(drop=True).values

        # Weight class 1 by neg/pos so both classes carry equal total weight;
        # the 1e-9 guards against division by zero.
        pos=float(y_tr.mean()); neg=1.0-pos
        class_weight={0:1.0, 1:neg/(pos+1e-9)}

        # Seed before building the model so weight initialisation is reproducible.
        tf.keras.utils.set_random_seed(seed+fold)
        model = build_mlp(X_tr.shape[1], lr=1e-3, dropout=0.3)

        tr_ds=make_ds(X_tr, y_tr, batch_size=BATCH, shuffle=True, seed=seed+fold)
        va_ds=make_ds(X_va, y_va, batch_size=BATCH, shuffle=False)

        # Both callbacks watch validation AUC: halve the LR after 1 epoch
        # without improvement, stop after 3 and restore the best weights.
        cb=[
            keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=3, restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max", factor=0.5, patience=1, min_lr=1e-5),
        ]
        model.fit(tr_ds, validation_data=va_ds, epochs=EPOCHS, verbose=0, callbacks=cb, class_weight=class_weight)

        # NOTE(review): the fold AUC below is measured on the same fold that
        # drove early stopping, so it may be slightly optimistic.
        pred=model.predict(make_ds(X_va, None, batch_size=BATCH), verbose=0).reshape(-1)
        # Replace any NaN prediction with 0.5.
        pred=np.nan_to_num(pred, nan=0.5)
        oof[va]=pred
        print(f"[seed {seed}] fold {fold} AUC={roc_auc_score(y_va, pred):.6f}")

    auc_seed=roc_auc_score(y, oof)
    oof_by_seed.append(oof)
    print(f"\n[seed {seed}] OOF AUC: {auc_seed:.6f}")
    print('-'*60)

# Seed ensemble: average the OOF vectors across seeds, then score once.
oof_ens=np.mean(np.vstack(oof_by_seed), axis=0)
print("\n✅ Ensemble OOF AUC:", roc_auc_score(y, oof_ens))


MODE: FAST CFG: {'EPOCHS': 12, 'N_SPLITS': 3, 'SEEDS': [42], 'BATCH': 512}
[seed 42] fold 0 AUC=0.764733
[seed 42] fold 1 AUC=0.756909
[seed 42] fold 2 AUC=0.755138

[seed 42] OOF AUC: 0.758823
------------------------------------------------------------

✅ Ensemble OOF AUC: 0.7588231633801006


In [5]:
# ---- Final submission (deep learning only) ----
# The final fit always reads its settings from the SUBMIT preset, regardless
# of which MODE was used for the cross-validation run.
FINAL_SEEDS = SUBMIT["SEEDS"]
EPOCHS_SUB = SUBMIT["EPOCHS"]
BATCH_SUB = SUBMIT["BATCH"]

def train_full_predict(seed):
    """Fit one MLP on the full training set and return test-set probabilities.

    There is no validation split or early stopping here: the model trains for
    ``max(10, EPOCHS_SUB // 2)`` epochs with class 1 weighted by neg/pos.

    Args:
        seed: Seed for TensorFlow/Keras randomness and for dataset shuffling.

    Returns:
        1-D array of predicted probabilities for ``T_sel``, with any NaN
        replaced by 0.5.
    """
    tf.keras.utils.set_random_seed(seed)
    net = build_mlp(X_sel.shape[1], lr=1e-3, dropout=0.3)

    pos_rate = float(y.mean())
    neg_rate = 1.0 - pos_rate
    weights = {0: 1.0, 1: neg_rate / (pos_rate + 1e-9)}

    n_epochs = max(10, EPOCHS_SUB // 2)
    train_ds = make_ds(X_sel, y.values, batch_size=BATCH_SUB, shuffle=True, seed=seed)
    net.fit(train_ds, epochs=n_epochs, verbose=0, class_weight=weights)

    test_ds = make_ds(T_sel, None, batch_size=BATCH_SUB)
    probs = net.predict(test_ds, verbose=0).reshape(-1)
    return np.nan_to_num(probs, nan=0.5)

# Average the test-set probabilities over all final seeds.
pred_test = np.mean(np.vstack([train_full_predict(s) for s in FINAL_SEEDS]), axis=0).reshape(-1)

# Start from the sample submission so the column layout matches the template.
submission = sub.copy()
submission["voted"] = pred_test

print("submission:", submission.shape, submission.columns.tolist())
print(submission["voted"].describe())

# Validate before writing.  Explicit raises are used instead of ``assert``
# (which is stripped under ``python -O``), and the expected row count comes
# from the loaded test set rather than a hard-coded literal.
expected_shape = (len(test), 2)
if submission.shape != expected_shape:
    raise ValueError(f"submission shape {submission.shape} != expected {expected_shape}")
if submission.columns.tolist() != ["index","voted"]:
    raise ValueError(f"unexpected submission columns: {submission.columns.tolist()}")
if not (float(submission["voted"].min()) >= 0.0 and float(submission["voted"].max()) <= 1.0):
    raise ValueError("submission probabilities must lie within [0, 1]")

out_path = PROJECT_ROOT / "submission_best.csv"
submission.to_csv(out_path, index=False)
print("Saved:", out_path)


submission: (11383, 2) ['index', 'voted']
count    11383.000000
mean         0.520519
std          0.244247
min          0.087032
25%          0.331329
50%          0.432507
75%          0.716390
max          0.998735
Name: voted, dtype: float64
Saved: /Users/admin/Downloads/AI 헬스케어 수업/oz코딩 수업/해커톤 (1)/vote-AI/submission_best.csv
