# Best Score (Final DL) — LGBM Feature Selection + Embedding MLP (Auto-Choose)

목표: 가능한 한 높은 점수 (규칙 준수)

규칙 준수
- 최종 제출 모델: 딥러닝(Embedding + MLP)
- LightGBM: 피처 중요도/선별용 도구로만 사용

이 노트북이 자동으로 하는 것
1) Feature Engineering 2가지 옵션(A: reverse+mach, B: no reverse+mach)
2) 옵션별로 LightGBM 중요도 기반 Top-K numeric 선택
3) (빠른 검증) 옵션별 TopK는 LightGBM 3-fold AUC로, 옵션(A/B)은 딥러닝 3-fold(seed=42) quick OOF AUC로 자동 선택
4) (제출) 선택된 설정으로 5-fold OOF 검증(seed 42/202/777)을 수행한 뒤, 전체 학습 데이터로 seed별 재학습한 모델의 예측을 평균(seed 앙상블)하여 제출 파일 생성

제출
- voted 컬럼에는 P(voted==2)만 저장
- sample_submission.csv 형식(index+voted) 강제


In [None]:
# 필요할 때만 실행하세요.
# !pip install --default-timeout=300 lightgbm tensorflow scikit-learn


## 1) 데이터 로드


In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', 300)
pd.set_option('display.width', 200)

def find_project_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` that holds a root marker.

    A directory counts as the project root when it contains either
    ``requirements.txt`` or ``README.md``. ``start`` is resolved first, then
    checked itself before each of its parents in turn.

    Raises:
        FileNotFoundError: If no directory up to the filesystem root has a marker.
    """
    markers = ("requirements.txt", "README.md")
    origin = start.resolve()
    for candidate in (origin, *origin.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    raise FileNotFoundError("프로젝트 루트를 찾지 못했습니다. vote-AI 루트에 requirements.txt 또는 README.md가 있는지 확인하세요.")

# Locate the repository root from the current working directory; the raw CSVs
# are read from <root>/data/raw.
PROJECT_ROOT = find_project_root(Path.cwd())
DATA_DIR = PROJECT_ROOT / "data" / "raw"

train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test_x.csv")
sub   = pd.read_csv(DATA_DIR / "sample_submission.csv")

# Binary target: 1 where voted == 2, 0 otherwise, so model outputs are P(voted == 2).
y = (train["voted"] == 2).astype("int32")
# Feature frame: the training rows without the label column.
X_raw = train.drop(columns=["voted"])

print("train/test/sub:", train.shape, test.shape, sub.shape)
print("pos_ratio(voted==2):", float(y.mean()))


train/test/sub: (45532, 78) (11383, 77) (11383, 2)
pos_ratio(voted==2): 0.5468242115435298


## 2) 컬럼 그룹 탐지


In [2]:
# Bucket the raw column names into survey groups purely by naming pattern.
cols = X_raw.columns.tolist()
q_like = [name for name in cols if name.startswith("Q")]

# Q<letter>A and Q<letter>E columns, matched on the full name.
QA_cols = sorted(name for name in q_like if re.fullmatch(r"Q[a-z]A", name))
QE_cols = sorted(name for name in q_like if re.fullmatch(r"Q[a-z]E", name))

# tpNN, wr_NN / wrNN and wf_NN / wfNN columns (two digits, underscore optional for wr/wf).
TP_cols = sorted(name for name in cols if re.fullmatch(r"tp\d{2}", name))
WR_cols = sorted(name for name in cols if re.fullmatch(r"wr_?\d{2}", name))
WF_cols = sorted(name for name in cols if re.fullmatch(r"wf_?\d{2}", name))

print("QA/QE/TP/WR/WF:", *(len(group) for group in (QA_cols, QE_cols, TP_cols, WR_cols, WF_cols)))


QA/QE/TP/WR/WF: 20 20 10 13 3


## 3) Feature Engineering (옵션 A/B)
- 공통: 태도(QA) 요약 + 응답시간(QE) 요약 + Big5(diff/strength) + 단어(wr/wf) 요약 + 결측 indicator
- 옵션 A: reverse+mach 포함
- 옵션 B: reverse+mach 제외

주의: 학습/선별에 필요한 정보는 남기되, 노이즈 큰 원본 그룹(QE, TP, WR/WF)은 요약 후 제거합니다.
QA 원본은 (옵션 A에서 reverse 반영 후) LGBM이 중요 문항을 고를 수 있게 'numeric 후보군'으로 남깁니다.


In [3]:
# QA items that build_features replaces with (6 - value) when use_reverse_mach=True.
FLIP_PUBLIC = ["QeA","QfA","QkA","QqA","QrA"]
FLIP_SECRET = ["QaA","QdA","QgA","QiA","QnA"]
# Names tp01..tp10. NOTE(review): not referenced elsewhere in the visible code
# (TP_cols is used instead) — confirm whether this is still needed.
TPS = [f"tp{i:02d}" for i in range(1, 11)]

# Columns returned as strings in the categorical frame (fed to embeddings).
CAT_COLS = ["age_group","gender","race","religion","education","engnat","married","urban","hand"]
# Columns for which a value of 0 (or an unparsable value) yields a "<col>_is_missing" flag.
MISS0_COLS = ["education","urban","hand","married"]

def age_group_to_ord(v):
    """Convert an age-group label such as ``"10s"`` into an integer.

    Every ``"s"`` character is removed from ``str(v)`` and the remainder is
    parsed with ``int``.

    Returns:
        The parsed integer, or ``-1`` when the remainder is not a valid integer.
    """
    try:
        return int(str(v).replace("s", ""))
    except ValueError:
        # Only a failed parse maps to the sentinel; anything else (for example
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return -1

def urban_to_ord(v):
    """Convert an ``urban`` code to an integer, mapping 0 and bad values to -1.

    Returns:
        ``int(v)`` when it is non-zero; ``-1`` when ``int(v)`` is 0 or when
        ``v`` cannot be converted (``None``, NaN, infinity, non-numeric text).
    """
    try:
        iv = int(v)
    except (TypeError, ValueError, OverflowError):
        # None -> TypeError, NaN / text -> ValueError, +/-inf -> OverflowError.
        return -1
    return -1 if iv == 0 else iv

def build_features(df: pd.DataFrame, *, tp_means=None, use_reverse_mach=False):
    """Turn a raw survey frame into categorical and numeric model inputs.

    Relies on the module-level column groups (``QA_cols``, ``QE_cols``,
    ``TP_cols``, ``WR_cols``, ``WF_cols``) and on ``CAT_COLS``, ``MISS0_COLS``,
    ``FLIP_PUBLIC`` and ``FLIP_SECRET``. The caller's frame is not modified.

    Args:
        df: Raw rows; must contain the grouped survey columns and every
            column named in ``CAT_COLS``.
        tp_means: Per-column fill values for the ``tp`` items. When ``None``
            they are computed from ``df`` itself (zeros excluded). Pass the
            mapping returned for the training frame when transforming other
            data so both use the same imputation.
        use_reverse_mach: When true, the items in ``FLIP_PUBLIC`` and
            ``FLIP_SECRET`` are replaced by ``6 - value`` and the ``mach_*``
            columns are added.

    Returns:
        ``(cat_df, num_df, tp_means)``: the ``CAT_COLS`` columns as strings,
        every remaining column coerced to numeric with NaN replaced by 0, and
        the ``tp`` fill values that were used.
    """
    df = df.copy()

    # The row identifier must not become a feature.
    if "index" in df.columns:
        df = df.drop(columns=["index"])

    # Coerce every survey group to numeric; unparsable cells become NaN.
    for c in QA_cols + QE_cols + TP_cols + WR_cols + WF_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # Missing indicators: a value of 0 (or an unparsable value) counts as missing.
    for c in MISS0_COLS:
        if c in df.columns:
            tmp = pd.to_numeric(df[c], errors="coerce").fillna(0)
            df[f"{c}_is_missing"] = (tmp == 0).astype("int32")

    # Ordinal encodings kept alongside the categorical originals.
    if "age_group" in df.columns:
        df["age_group_ord"] = df["age_group"].astype(str).apply(age_group_to_ord)
    if "urban" in df.columns:
        df["urban_ord"] = df["urban"].apply(urban_to_ord)

    # Optional reverse scoring of the listed QA items.
    if use_reverse_mach:
        for c in FLIP_PUBLIC + FLIP_SECRET:
            if c in df.columns:
                df[c] = 6 - df[c]

    # Response-style summaries over the (possibly reversed) QA answers.
    qa = df[QA_cols]
    df["qa_mean"] = qa.mean(axis=1)
    df["qa_std"] = qa.std(axis=1)
    df["neutral_ratio"] = (qa == 3).mean(axis=1)
    df["extreme_ratio"] = ((qa == 1) | (qa == 5)).mean(axis=1)
    df["qa_missing_ratio"] = qa.isna().mean(axis=1)

    # Optional Mach sub-scores, built from the already-reversed items.
    if use_reverse_mach:
        df["mach_T"] = df["QcA"] - df["QfA"] + df["QoA"] - df["QrA"] + df["QsA"]
        df["mach_V"] = df["QbA"] - df["QeA"] + df["QhA"] + df["QjA"] + df["QmA"] - df["QqA"]
        df["mach_M"] = -df["QkA"]
        # NOTE(review): computed from the same `qa` frame as qa_mean, so this
        # column duplicates it — confirm whether another aggregate was intended.
        df["mach_mean"] = qa.mean(axis=1)

    # Response-time summaries; negatives are clipped to 0 and raw QE columns dropped.
    qe = df[QE_cols].clip(lower=0)
    qe_log = np.log1p(qe)
    df["delay_sum"] = qe.sum(axis=1)
    df["delay_log"] = np.log1p(df["delay_sum"])
    df["delay_std"] = qe.std(axis=1)
    df["qe_fast_ratio"] = (qe_log < 1.0).mean(axis=1)
    df["qe_slow_ratio"] = (qe_log > 4.0).mean(axis=1)
    df = df.drop(columns=QE_cols, errors="ignore")

    # Word-list summaries; raw wr/wf columns are dropped afterwards.
    df["wr_sum"] = df[WR_cols].sum(axis=1) if len(WR_cols) else 0
    df["wf_sum"] = df[WF_cols].sum(axis=1) if len(WF_cols) else 0
    df["word_credibility"] = df["wr_sum"] - df["wf_sum"]
    df = df.drop(columns=WR_cols + WF_cols, errors="ignore")

    # tp items: 0 is treated as missing and imputed with the column mean.
    for c in TP_cols:
        # mask() produces a float column by itself, whereas writing NaN into an
        # integer column through .loc depends on implicit upcasting in setitem.
        df[c] = df[c].mask(df[c] == 0)
    if tp_means is None:
        tp_means = {c: float(df[c].mean()) for c in TP_cols}
    for c in TP_cols:
        df[c] = df[c].fillna(tp_means[c])

    # Paired tp differences and their magnitudes; raw tp columns dropped afterwards.
    df["Ex_diff"] = df["tp01"] - df["tp06"]
    df["Ex_strength"] = df["Ex_diff"].abs()
    df["Ag_diff"] = df["tp07"] - df["tp02"]
    df["Ag_strength"] = df["Ag_diff"].abs()
    df["Con_diff"] = df["tp03"] - df["tp08"]
    df["Con_strength"] = df["Con_diff"].abs()
    df["Es_diff"] = df["tp09"] - df["tp04"]
    df["Es_strength"] = df["Es_diff"].abs()
    df["Op_diff"] = df["tp05"] - df["tp10"]
    df["Op_strength"] = df["Op_diff"].abs()
    df = df.drop(columns=TP_cols, errors="ignore")

    # Categorical columns are returned as strings (for embedding lookups).
    cat_df = df[CAT_COLS].copy()
    for c in CAT_COLS:
        cat_df[c] = cat_df[c].astype(str)

    # Everything else is numeric; remaining NaN becomes 0.
    num_df = df.drop(columns=CAT_COLS, errors="ignore").apply(pd.to_numeric, errors="coerce").fillna(0)

    return cat_df, num_df, tp_means

# Build both feature options for the training rows. The tp fill values are
# computed once (option A call) and reused everywhere else.
catA, numA, tp_means = build_features(X_raw, tp_means=None, use_reverse_mach=True)
catB, numB, _ = build_features(X_raw, tp_means=tp_means, use_reverse_mach=False)  # reuse the train-derived tp_means

# Test rows are transformed with the train-derived tp_means as well.
tcatA, tnumA, _ = build_features(test, tp_means=tp_means, use_reverse_mach=True)
tcatB, tnumB, _ = build_features(test, tp_means=tp_means, use_reverse_mach=False)

print("OptionA numeric dim:", numA.shape, "OptionB numeric dim:", numB.shape)


OptionA numeric dim: (45532, 54) OptionB numeric dim: (45532, 50)


## 4) LGBM으로 numeric-only 중요도 → TopK 선택 (옵션별)


In [4]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def lgb_select(num_df: pd.DataFrame, y: pd.Series, k_candidates=(40,60,80,120,160)):
    """Rank numeric features by LightGBM gain and pick the best top-K size.

    Step 1 fits a 5-fold early-stopped LightGBM on all columns and sums the
    per-fold gain importances. Step 2 evaluates each candidate K with a 3-fold
    LightGBM trained on the K highest-ranked columns and keeps the K with the
    highest mean AUC (the first one wins ties).

    Args:
        num_df: Numeric feature frame.
        y: Binary target aligned with ``num_df``.
        k_candidates: Candidate top-K sizes. Values above the number of
            columns are clamped to it and duplicates are dropped, because they
            all select the same full column set and would only repeat the
            identical evaluation. Non-positive values are ignored.

    Returns:
        ``(imp, best_k, mean_auc)``: importances sorted descending, the chosen
        K (never larger than the column count, so ``imp.head(best_k)`` has
        exactly ``best_k`` rows), and the mean AUC of the step-1 folds.

    Raises:
        ValueError: If ``k_candidates`` contains no positive value.
    """
    n_features = num_df.shape[1]
    # dict.fromkeys de-duplicates while keeping the caller's order.
    ks = list(dict.fromkeys(min(int(k), n_features) for k in k_candidates if k > 0))
    if not ks:
        raise ValueError("k_candidates must contain at least one positive value")

    # Step 1: gain importances accumulated over 5 stratified folds.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    importances = np.zeros(n_features, dtype=float)
    aucs = []
    for fold, (tr, va) in enumerate(skf.split(num_df, y)):
        X_tr, X_va = num_df.iloc[tr], num_df.iloc[va]
        y_tr, y_va = y.iloc[tr], y.iloc[va]
        m = LGBMClassifier(
            n_estimators=6000, learning_rate=0.02, num_leaves=127,
            subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
            random_state=42+fold
        )
        m.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric="auc",
              callbacks=[early_stopping(250), log_evaluation(0)])
        p = m.predict_proba(X_va)[:, 1]
        aucs.append(roc_auc_score(y_va, p))
        importances += m.booster_.feature_importance(importance_type="gain")
    imp = pd.Series(importances, index=num_df.columns).sort_values(ascending=False)

    # Step 2: quick 3-fold comparison of the distinct candidate sizes.
    skf2 = StratifiedKFold(n_splits=3, shuffle=True, random_state=777)
    best_k, best_auc = None, -1
    for k in ks:
        X_use = num_df[imp.head(k).index.tolist()]
        a2 = []
        for f, (tr, va) in enumerate(skf2.split(X_use, y)):
            mm = LGBMClassifier(n_estimators=2500, learning_rate=0.03, random_state=1000+f)
            mm.fit(X_use.iloc[tr], y.iloc[tr])
            pp = mm.predict_proba(X_use.iloc[va])[:, 1]
            a2.append(roc_auc_score(y.iloc[va], pp))
        mean_a2 = float(np.mean(a2))
        if mean_a2 > best_auc:
            best_auc, best_k = mean_a2, k
    return imp, best_k, float(np.mean(aucs))

# Run the importance ranking / top-K search separately for each feature option.
impA, topkA, lgbA = lgb_select(numA, y, k_candidates=(40,60,80,120,160))
impB, topkB, lgbB = lgb_select(numB, y, k_candidates=(40,60,80,120,160))

# Selected column names per option; head() returns at most the available columns,
# so len(sel*) can be smaller than the reported TopK.
selA = impA.head(topkA).index.tolist()
selB = impB.head(topkB).index.tolist()

print("LGB A mean AUC:", lgbA, "TopK:", topkA, "selected:", len(selA))
print("LGB B mean AUC:", lgbB, "TopK:", topkB, "selected:", len(selB))


[LightGBM] [Info] Number of positive: 19918, number of negative: 16507
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001866 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 36425, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.546822 -> initscore=0.187839
[LightGBM] [Info] Start training from score 0.187839
Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[170]	valid_0's auc: 0.745379	valid_0's binary_logloss: 0.583377
[LightGBM] [Info] Number of positive: 19918, number of negative: 16507
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001822 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_c

## 5) 딥러닝 모델 (Embedding + MLP) — 옵션 A/B quick OOF 비교 후 자동 선택


In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def df_to_ds(cat_df, num_df, y_=None, batch=512, shuffle=False, seed=202):
    """Wrap categorical/numeric frames (and optional labels) in a batched tf.data pipeline.

    Each categorical column becomes a string feature keyed by its column name;
    the whole numeric frame becomes one float32 matrix under the key ``"num"``.
    With ``y_`` given, elements are ``(features, label)`` pairs, otherwise just
    the feature dict. Shuffling (when requested) uses a buffer of at most
    10000 rows and reshuffles every epoch.
    """
    features = {}
    for col in cat_df.columns:
        features[col] = cat_df[col].astype(str).values
    features["num"] = num_df.values.astype("float32")

    tensors = features if y_ is None else (features, y_.values.astype("float32"))
    dataset = tf.data.Dataset.from_tensor_slices(tensors)

    if shuffle:
        buffer = min(len(num_df), 10000)
        dataset = dataset.shuffle(buffer_size=buffer, seed=seed, reshuffle_each_iteration=True)
    return dataset.batch(batch).prefetch(tf.data.AUTOTUNE)

def build_embed_mlp(cat_df_train, num_train, emb_dim=16, lr=3e-4, dropout=0.3):
    """Build and compile the embedding + MLP binary classifier.

    One string input per categorical column goes through a ``StringLookup``
    adapted on ``cat_df_train`` and an embedding whose width is
    ``min(emb_dim, max(2, 2 * ceil(vocab ** 0.25)))``. The numeric input
    ``"num"`` goes through a ``Normalization`` layer adapted on ``num_train``.
    The concatenated branches feed Dense(256) and Dense(128) blocks (each with
    batch norm, ReLU and dropout), then Dense(64, relu) and a sigmoid output.

    Returns:
        A Keras model compiled with AdamW, binary cross-entropy and an AUC
        metric named ``"auc"``.
    """
    model_inputs = {}
    branches = []

    # One lookup + embedding branch per categorical column.
    for col in cat_df_train.columns:
        col_input = keras.Input(shape=(1,), name=col, dtype=tf.string)
        lookup = layers.StringLookup(output_mode="int")
        lookup.adapt(cat_df_train[col].astype(str).values)  # vocabulary from the given frame only
        vocab_size = lookup.vocabulary_size()
        width = min(emb_dim, max(2, int(np.ceil(vocab_size**0.25)*2)))
        ids = lookup(col_input)
        embedded = layers.Embedding(vocab_size, width)(ids)
        flat = layers.Reshape((width,))(embedded)
        model_inputs[col] = col_input
        branches.append(flat)

    # Numeric branch, standardised with statistics from the given frame only.
    num_input = keras.Input(shape=(num_train.shape[1],), name="num", dtype=tf.float32)
    normalizer = layers.Normalization()
    normalizer.adapt(num_train.values.astype("float32"))
    scaled = normalizer(num_input)
    model_inputs["num"] = num_input
    branches.append(scaled)

    hidden = layers.Concatenate()(branches)
    for units in (256, 128):
        hidden = layers.Dense(units)(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Activation("relu")(hidden)
        hidden = layers.Dropout(dropout)(hidden)
    hidden = layers.Dense(64, activation="relu")(hidden)
    prob = layers.Dense(1, activation="sigmoid")(hidden)

    model = keras.Model(inputs=model_inputs, outputs=prob)
    model.compile(
        optimizer=keras.optimizers.AdamW(learning_rate=lr, weight_decay=3e-4),
        loss="binary_crossentropy",
        metrics=[keras.metrics.AUC(name="auc")],
    )
    return model

def quick_oof(cat_df, num_df, y, epochs=12, splits=3, seed=42, batch=512):
    """Return the out-of-fold ROC AUC of the embedding MLP under stratified K-fold.

    Args:
        cat_df: Categorical feature frame (string columns).
        num_df: Numeric feature frame, row-aligned with ``cat_df``.
        y: Binary target aligned with the frames.
        epochs: Maximum epochs per fold (early stopping may end sooner).
        splits: Number of stratified folds.
        seed: Base seed for model initialisation and shuffling (``seed + fold``).
            The fold split itself always uses ``random_state=42``.
        batch: Batch size for training and prediction.

    Returns:
        ROC AUC computed over the concatenated validation-fold predictions.
    """
    # Fold assignment is fixed (random_state=42) regardless of `seed`.
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    oof = np.zeros(len(num_df), dtype="float32")
    for fold,(tr,va) in enumerate(skf.split(num_df, y)):
        cat_tr, cat_va = cat_df.iloc[tr].reset_index(drop=True), cat_df.iloc[va].reset_index(drop=True)
        num_tr, num_va = num_df.iloc[tr].reset_index(drop=True), num_df.iloc[va].reset_index(drop=True)
        y_tr, y_va = y.iloc[tr].reset_index(drop=True), y.iloc[va].reset_index(drop=True)

        # Weight the positive class by neg/pos of the training fold.
        pos=float(y_tr.mean()); neg=1.0-pos
        class_weight={0:1.0, 1:neg/(pos+1e-9)}

        # Seed before building so initialisation is reproducible per fold;
        # lookup/normalisation layers are adapted on the training fold only.
        tf.keras.utils.set_random_seed(seed+fold)
        model = build_embed_mlp(cat_tr, num_tr)

        tr_ds = df_to_ds(cat_tr, num_tr, y_tr, batch=batch, shuffle=True, seed=seed+fold)
        va_ds = df_to_ds(cat_va, num_va, y_va, batch=batch, shuffle=False)

        # Stop on validation AUC (best weights restored) and halve the LR on plateaus.
        cb = [
            keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=3, restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max", factor=0.5, patience=1, min_lr=1e-5),
        ]
        model.fit(tr_ds, validation_data=va_ds, epochs=epochs, verbose=0, callbacks=cb, class_weight=class_weight)

        pred = model.predict(df_to_ds(cat_va, num_va, None, batch=batch), verbose=0).reshape(-1)
        # NaN predictions are replaced by 0.5 so the AUC can still be computed.
        oof[va] = np.nan_to_num(pred, nan=0.5)

    return roc_auc_score(y, oof)

# Restrict each option's numeric frame to its LightGBM-selected columns.
numA_sel = numA[selA].copy()
numB_sel = numB[selB].copy()

# Quick 3-fold out-of-fold AUC of the deep model for each option.
aucA = quick_oof(catA, numA_sel, y, epochs=12, splits=3, seed=42, batch=512)
aucB = quick_oof(catB, numB_sel, y, epochs=12, splits=3, seed=42, batch=512)

print("DL quick AUC A:", aucA, "DL quick AUC B:", aucB)
# Option A wins ties.
USE_A = (aucA >= aucB)
print("✅ Selected option:", "A(reverse+mach)" if USE_A else "B(no reverse+mach)")


DL quick AUC A: 0.7566092269622803 DL quick AUC B: 0.7565507923680754
✅ Selected option: A(reverse+mach)


## 6) 최종 학습 + 제출 (5-fold OOF 검증 + 전체 데이터 seed 앙상블)


In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Final-run configuration: seeds to ensemble, max epochs, batch size, OOF folds.
FINAL_SEEDS = [42, 202, 777]
EPOCHS_SUB = 25
BATCH_SUB = 256
N_SPLITS_SUB = 5

# Pick the train/test frames of the option chosen by the quick comparison,
# keeping only that option's selected numeric columns.
if USE_A:
    cat_final = catA
    num_final = numA[selA].copy()
    tcat_final = tcatA
    tnum_final = tnumA[selA].copy()
else:
    cat_final = catB
    num_final = numB[selB].copy()
    tcat_final = tcatB
    tnum_final = tnumB[selB].copy()

# Fixed random_state, so every seed below is evaluated on the same fold split.
skf = StratifiedKFold(n_splits=N_SPLITS_SUB, shuffle=True, random_state=42)

# Out-of-fold sanity check per seed (can be slow). These fold models are only
# used for the OOF scores; the test predictions come from train_full_predict.
oof_by_seed = []
for seed in FINAL_SEEDS:
    oof = np.zeros(len(num_final), dtype="float32")
    for fold,(tr,va) in enumerate(skf.split(num_final, y)):
        cat_tr, cat_va = cat_final.iloc[tr].reset_index(drop=True), cat_final.iloc[va].reset_index(drop=True)
        num_tr, num_va = num_final.iloc[tr].reset_index(drop=True), num_final.iloc[va].reset_index(drop=True)
        y_tr, y_va = y.iloc[tr].reset_index(drop=True), y.iloc[va].reset_index(drop=True)

        # Weight the positive class by neg/pos of the training fold.
        pos=float(y_tr.mean()); neg=1.0-pos
        class_weight={0:1.0, 1:neg/(pos+1e-9)}

        # Seed before building so each (seed, fold) model is reproducible.
        tf.keras.utils.set_random_seed(seed+fold)
        model = build_embed_mlp(cat_tr, num_tr)

        tr_ds = df_to_ds(cat_tr, num_tr, y_tr, batch=BATCH_SUB, shuffle=True, seed=seed+fold)
        va_ds = df_to_ds(cat_va, num_va, y_va, batch=BATCH_SUB, shuffle=False)

        # Stop on validation AUC (best weights restored) and halve the LR on plateaus.
        cb = [
            keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=3, restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max", factor=0.5, patience=1, min_lr=1e-5),
        ]
        model.fit(tr_ds, validation_data=va_ds, epochs=EPOCHS_SUB, verbose=0, callbacks=cb, class_weight=class_weight)
        pred = model.predict(df_to_ds(cat_va, num_va, None, batch=BATCH_SUB), verbose=0).reshape(-1)
        # NaN predictions are replaced by 0.5 so the AUC can still be computed.
        oof[va] = np.nan_to_num(pred, nan=0.5)

    oof_by_seed.append(oof)
    print("seed OOF done:", seed, "AUC:", roc_auc_score(y, oof))

# AUC of the per-row average of the seed-wise OOF predictions.
oof_ens = np.mean(np.vstack(oof_by_seed), axis=0)
print("✅ Ensemble OOF AUC:", roc_auc_score(y, oof_ens))

# Fit on every training row for one seed, then score the test rows.
def train_full_predict(seed):
    """Train the embedding MLP on the full training set and return test probabilities.

    Uses the module-level ``cat_final`` / ``num_final`` / ``y`` for training and
    ``tcat_final`` / ``tnum_final`` for prediction. Training runs for a fixed
    ``max(10, EPOCHS_SUB // 2)`` epochs with no validation data or callbacks.
    NaN predictions are replaced by 0.5.
    """
    tf.keras.utils.set_random_seed(seed)
    net = build_embed_mlp(cat_final, num_final)

    # Positive class weighted by neg/pos over the whole training set.
    pos_rate = float(y.mean())
    neg_rate = 1.0 - pos_rate
    weights = {0: 1.0, 1: neg_rate / (pos_rate + 1e-9)}

    n_epochs = max(10, EPOCHS_SUB // 2)
    train_ds = df_to_ds(cat_final, num_final, y, batch=BATCH_SUB, shuffle=True, seed=seed)
    net.fit(train_ds, epochs=n_epochs, verbose=0, class_weight=weights)

    test_ds = df_to_ds(tcat_final, tnum_final, None, batch=BATCH_SUB)
    scores = net.predict(test_ds, verbose=0).reshape(-1)
    return np.nan_to_num(scores, nan=0.5)

# Train one full-data model per seed and average their test probabilities.
preds = []
for s in FINAL_SEEDS:
    preds.append(train_full_predict(s))
    print("seed done:", s)

pred_test = np.mean(np.vstack(preds), axis=0).reshape(-1)

# Reuse the sample submission layout; only the `voted` column is overwritten.
submission = sub.copy()
submission["voted"] = pred_test

print("submission:", submission.shape, submission.columns.tolist())
print(submission["voted"].describe())

# Validate the file before writing it. Explicit raises are used instead of
# `assert` so the checks still run under `python -O`, and the expected row
# count comes from the loaded test set rather than a hard-coded number.
expected_shape = (len(test), 2)
if submission.shape != expected_shape:
    raise ValueError(f"unexpected submission shape {submission.shape}; expected {expected_shape}")
if submission.columns.tolist() != ["index", "voted"]:
    raise ValueError(f"unexpected submission columns {submission.columns.tolist()}")
voted_min = float(submission["voted"].min())
voted_max = float(submission["voted"].max())
if not (voted_min >= 0.0 and voted_max <= 1.0):
    raise ValueError(f"voted must lie in [0, 1]; got range [{voted_min}, {voted_max}]")

out_path = PROJECT_ROOT / "submission_best.csv"
submission.to_csv(out_path, index=False)
print("Saved:", out_path)


seed OOF done: 42 AUC: 0.7617914482578695
seed OOF done: 202 AUC: 0.7619252811040627
seed OOF done: 777 AUC: 0.7610051715272811
✅ Ensemble OOF AUC: 0.7633358963541882
seed done: 42
seed done: 202
seed done: 777
submission: (11383, 2) ['index', 'voted']
count    11383.000000
mean         0.525780
std          0.235688
min          0.116620
25%          0.344356
50%          0.434459
75%          0.723689
max          0.995433
Name: voted, dtype: float64
Saved: /Users/admin/Downloads/AI 헬스케어 수업/oz코딩 수업/해커톤 (1)/vote-AI/submission_best.csv
