# Feature Engineering + Seed Ensemble


## 0) (필요 시) 설치


In [None]:
# 필요할 때만 실행하세요.
# !pip -q install tensorflow scikit-learn


## 1)  공용 로더 + 데이터 로드


In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 220)
pd.set_option('display.width', 180)

def find_project_root(start: Path) -> Path:
    start = start.resolve()
    for p in [start] + list(start.parents):
        if (p / "requirements.txt").exists() or (p / "README.md").exists():
            return p
    raise FileNotFoundError("프로젝트 루트를 찾지 못했습니다. vote-AI 루트에 requirements.txt 또는 README.md가 있는지 확인하세요.")

PROJECT_ROOT = find_project_root(Path.cwd())
DATA_DIR = PROJECT_ROOT / "data" / "raw"

train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test_x.csv")
sub   = pd.read_csv(DATA_DIR / "sample_submission.csv")

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_DIR:", DATA_DIR)
print("train:", train.shape, "test:", test.shape, "sub:", sub.shape)
train.head()


PROJECT_ROOT: /Users/admin/Downloads/AI 헬스케어 수업/oz코딩 수업/해커톤 (1)/vote-AI
DATA_DIR: /Users/admin/Downloads/AI 헬스케어 수업/oz코딩 수업/해커톤 (1)/vote-AI/data/raw
train: (45532, 78) test: (11383, 77) sub: (11383, 2)


Unnamed: 0,index,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,QfA,QfE,QgA,QgE,QhA,QhE,QiA,QiE,QjA,QjE,QkA,QkE,QlA,QlE,QmA,QmE,QnA,QnE,QoA,QoE,QpA,QpE,QqA,QqE,QrA,QrE,QsA,QsE,QtA,QtE,age_group,education,engnat,familysize,gender,hand,married,race,religion,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10,urban,voted,wf_01,wf_02,wf_03,wr_01,wr_02,wr_03,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13
0,0,3.0,363,4.0,1370,5.0,997,1.0,1024,2.0,1577,5.0,539,2.0,586,4.0,1095,5.0,1142,4.0,1287,4.0,883,4.0,851,2.0,851,5.0,816,2.0,579,2.0,924,2.0,366,2.0,876,2.0,633,1.0,1115,30s,2,1,4,Female,1,3,White,Other,2,2,2,1,2,1,7,4,4,3,1,2,0,0,0,0,1,0,0,1,0,1,1,0,1,0,1,1
1,1,5.0,647,5.0,1313,3.0,3387,5.0,2969,1.0,4320,3.0,2190,1.0,826,1.0,4082,5.0,1867,3.0,1264,5.0,2943,4.0,3927,1.0,4329,5.0,1828,1.0,1214,5.0,2414,5.0,1356,1.0,3039,4.0,4304,1.0,1346,20s,4,2,3,Female,1,1,Asian,Hindu,1,1,0,0,1,2,3,4,0,4,3,2,0,0,0,0,1,0,1,1,0,1,1,0,1,0,1,1
2,2,4.0,1623,1.0,1480,1.0,1021,4.0,3374,5.0,1333,1.0,531,4.0,1167,1.0,1016,3.0,2653,2.0,1569,5.0,998,5.0,2547,2.0,918,4.0,2153,2.0,1304,1.0,1131,5.0,937,4.0,1327,1.0,1170,1.0,1409,30s,3,1,3,Male,1,2,White,Other,2,3,1,5,3,4,2,6,1,3,2,1,0,0,1,1,1,0,1,1,0,1,1,1,1,0,1,1
3,3,3.0,504,3.0,2311,4.0,992,3.0,3245,1.0,357,2.0,1519,4.0,159,3.0,2275,5.0,2809,4.0,5614,3.0,3219,4.0,1296,4.0,9046,4.0,1216,4.0,1169,4.0,23868,3.0,581,4.0,8830,4.0,2392,5.0,1312,20s,4,2,0,Female,1,1,Asian,Hindu,2,4,1,1,1,3,1,3,1,3,3,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,1
4,4,1.0,927,1.0,707,5.0,556,2.0,1062,1.0,1014,2.0,628,1.0,991,1.0,1259,5.0,1153,5.0,1388,5.0,740,5.0,1181,4.0,547,2.0,575,1.0,754,4.0,1140,5.0,323,5.0,1070,1.0,583,2.0,1889,20s,3,1,2,Male,1,2,White,Agnostic,1,1,1,6,0,2,0,6,2,6,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1


## 2) 스키마 검증


In [2]:
train_cols = [c for c in train.columns if c != 'voted']
test_cols  = list(test.columns)

print('train(no voted) cols:', len(train_cols))
print('test cols          :', len(test_cols))
print('same columns set   :', set(train_cols) == set(test_cols))
print('same column order  :', train_cols == test_cols)


train(no voted) cols: 77
test cols          : 77
same columns set   : True
same column order  : True


## 3) 타깃(y) 인코딩


In [3]:
y = (train['voted'] == 1).astype('int32')
X = train.drop(columns=['voted'])

print("raw voted unique:", sorted(train['voted'].unique()))
print("y counts (1=Yes):")
print(y.value_counts())
print("positive ratio:", float(y.mean()))


raw voted unique: [np.int64(1), np.int64(2)]
y counts (1=Yes):
voted
0    24898
1    20634
Name: count, dtype: int64
positive ratio: 0.45317578845647016


## 4) 그룹 컬럼 탐지


In [8]:
import re

cols = list(X.columns)

# Q 계열: QaA/QaE 형태로 20개씩 존재
q_like = [c for c in cols if re.match(r"^Q", c)]

QA_cols = sorted([c for c in q_like if re.fullmatch(r"Q[a-z]A", c)])
QE_cols = sorted([c for c in q_like if re.fullmatch(r"Q[a-z]E", c)])

TP_cols = sorted([c for c in cols if re.fullmatch(r"tp\d{2}", c)])
WR_cols = sorted([c for c in cols if re.fullmatch(r"wr_?\d{2}", c)])
WF_cols = sorted([c for c in cols if re.fullmatch(r"wf_?\d{2}", c)])

print("Q_A:", len(QA_cols), QA_cols[:10], "..." if len(QA_cols)>10 else "")
print("Q_E:", len(QE_cols), QE_cols[:10], "..." if len(QE_cols)>10 else "")
print("TP :", len(TP_cols), TP_cols)
print("WR :", len(WR_cols), WR_cols[:5],  "..." if len(WR_cols)>5 else "")
print("WF :", len(WF_cols), WF_cols[:5],  "..." if len(WF_cols)>5 else "")


Q_A: 20 ['QaA', 'QbA', 'QcA', 'QdA', 'QeA', 'QfA', 'QgA', 'QhA', 'QiA', 'QjA'] ...
Q_E: 20 ['QaE', 'QbE', 'QcE', 'QdE', 'QeE', 'QfE', 'QgE', 'QhE', 'QiE', 'QjE'] ...
TP : 10 ['tp01', 'tp02', 'tp03', 'tp04', 'tp05', 'tp06', 'tp07', 'tp08', 'tp09', 'tp10']
WR : 13 ['wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05'] ...
WF : 3 ['wf_01', 'wf_02', 'wf_03'] 


## 5) 문자열 섞임 자동 탐지(예: '30s')


In [None]:
import pandas as pd
import numpy as np

def find_mixed_type_columns(df: pd.DataFrame, numeric_cols: list[str]):
    mixed_cols = []
    examples = {}

    for c in numeric_cols:
        s = df[c]
        coerced = pd.to_numeric(s, errors='coerce')

        if coerced.isna().sum() > s.isna().sum():
            mixed_cols.append(c)
            mask = coerced.isna() & s.notna()
            examples[c] = s.loc[mask].astype(str).head(5).tolist()

    return mixed_cols, examples


# 1️ dtype 기준 1차 분류
object_cols = X.select_dtypes(include='object').columns.tolist()
numeric_cols = [c for c in X.columns if c not in object_cols]

# 2️ 숫자 컬럼에 문자열 섞임 점검
mixed_cols, mixed_examples = find_mixed_type_columns(X, numeric_cols)

# 3️ 섞임이 있으면 범주형으로 승격
cat_cols_base = sorted(set(object_cols + mixed_cols))
num_cols_base = [c for c in X.columns if c not in cat_cols_base]

# 4️ 확인 출력
print("object cols:", object_cols)
print("mixed  cols:", mixed_cols)
if mixed_cols:
    print("mixed examples:")
    for k, v in list(mixed_examples.items())[:5]:
        print(" -", k, ":", v)

print("\n#cat_base:", len(cat_cols_base), " #num_base:", len(num_cols_base))


object cols: ['age_group', 'gender', 'race', 'religion']
mixed  cols: []

#cat_base: 4  #num_base: 73


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  object_cols = X.select_dtypes(include='object').columns.tolist()


## 6) 파생변수 생성(누수 없는 row-wise)


In [None]:
ZERO_AS_MISSING = ['education', 'engnat', 'hand', 'urban']

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # 0=무응답 처리 -> 결측
    for c in ZERO_AS_MISSING:
        if c in df.columns:
            df.loc[df[c] == 0, c] = np.nan

    # 범주형 통일  = 범주형(문자열)은 딥러닝에서 StringLookup → Embedding으로 처리할 거라 타입을 문자열로 통일합니다.
    for c in cat_cols_base:
        if c in df.columns:
            df[c] = df[c].astype(str)

    # 수치형 강제 변환 	=	딥러닝 입력에서 “숫자여야 하는데 문자열이 섞인 사고”를 원천 차단합니다.
    for c in num_cols_base:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

    # Q_E: log1p + 요약
    if len(QE_cols) > 0:
        qe = df[QE_cols].copy()
        qe = np.log1p(qe.clip(lower=0))

        df["qe_sum"]    = qe.sum(axis=1)
        df["qe_mean"]   = qe.mean(axis=1)
        df["qe_std"]    = qe.std(axis=1)
        df["qe_max"]    = qe.max(axis=1)
        df["qe_min"]    = qe.min(axis=1)
        df["qe_median"] = qe.median(axis=1)

        df["qe_fast_ratio"] = (qe < 1.0).mean(axis=1)
        df["qe_slow_ratio"] = (qe > 4.0).mean(axis=1)

        df[QE_cols] = qe

    # Q_A: 요약 + 스타일
    if len(QA_cols) > 0:
        qa = df[QA_cols].copy().apply(pd.to_numeric, errors='coerce')

        df["qa_mean"] = qa.mean(axis=1)
        df["qa_std"]  = qa.std(axis=1)
        df["qa_min"]  = qa.min(axis=1)
        df["qa_max"]  = qa.max(axis=1)

        df["qa_extreme_ratio"] = ((qa == 1) | (qa == 5)).mean(axis=1)
        df["qa_neutral_ratio"] = (qa == 3).mean(axis=1)

    # TP: 요약 + 간단 차이
    if len(TP_cols) > 0:
        tp = df[TP_cols].copy().apply(pd.to_numeric, errors='coerce')
        df["tp_mean"] = tp.mean(axis=1)
        df["tp_std"]  = tp.std(axis=1)

        if "tp01" in df.columns and "tp06" in df.columns:
            df["tp_extro_minus_intro"] = pd.to_numeric(df["tp01"], errors='coerce') - pd.to_numeric(df["tp06"], errors='coerce')
        if "tp09" in df.columns and "tp04" in df.columns:
            df["tp_stable_minus_anx"]  = pd.to_numeric(df["tp09"], errors='coerce') - pd.to_numeric(df["tp04"], errors='coerce')

    # WR/WF: 요약
    if len(WR_cols) > 0:
        wr = df[WR_cols].copy().apply(pd.to_numeric, errors='coerce')
        df["wr_yes_count"] = wr.sum(axis=1)
        df["wr_yes_ratio"] = wr.mean(axis=1)

    if len(WF_cols) > 0:
        wf = df[WF_cols].copy().apply(pd.to_numeric, errors='coerce')
        df["wf_yes_count"] = wf.sum(axis=1)
        df["wf_yes_ratio"] = wf.mean(axis=1)

    if ("wr_yes_count" in df.columns) and ("wf_yes_count" in df.columns):
        df["wr_minus_wf"] = df["wr_yes_count"] - df["wf_yes_count"]

    # 결측 인디케이터 + NaN → 0
    numeric_cols_now = [c for c in df.columns if df[c].dtype != 'object']
    df["num_missing_count"] = df[numeric_cols_now].isna().sum(axis=1)
    df["num_missing_ratio"] = df[numeric_cols_now].isna().mean(axis=1)
    df[numeric_cols_now] = df[numeric_cols_now].fillna(0)

    return df

X_fe = add_features(X)
T_fe = add_features(test)

print("After FE shapes:", X_fe.shape, T_fe.shape)
print(X_fe.dtypes.value_counts())


After FE shapes: (45532, 102) (11383, 102)
float64    63
int64      35
str         4
Name: count, dtype: int64


## 7) 최종 입력 컬럼 정의


In [15]:
cat_cols = X_fe.select_dtypes(include='object').columns.tolist()
num_cols = [c for c in X_fe.columns if c not in cat_cols]

print("#cat_cols:", len(cat_cols), " #num_cols:", len(num_cols))


#cat_cols: 4  #num_cols: 98


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X_fe.select_dtypes(include='object').columns.tolist()


## 8) 모델: Embedding + MLP (BatchNorm)


In [16]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score


In [17]:
def build_model_from_fold_train(
    X_fold_train: pd.DataFrame,
    cat_cols: list[str],
    num_cols: list[str],
    *,
    emb_dim=24,
    hidden1=512,
    hidden2=256,
    hidden3=128,
    dropout=0.25,
    lr=1e-3,
):
    inputs = {}
    encoded = []

    for c in cat_cols:
        inp = keras.Input(shape=(1,), name=c, dtype=tf.string)
        lookup = layers.StringLookup(output_mode='int')
        lookup.adapt(X_fold_train[c].astype(str).values)  # fold-train only

        vocab_size = lookup.vocabulary_size()
        dim = min(emb_dim, max(2, int(np.ceil(vocab_size**0.25) * 2)))

        x = lookup(inp)
        x = layers.Embedding(vocab_size, dim)(x)
        x = layers.Reshape((dim,))(x)

        inputs[c] = inp
        encoded.append(x)

    if len(num_cols) > 0:
        num_inp = keras.Input(shape=(len(num_cols),), name="num", dtype=tf.float32)
        norm = layers.Normalization()
        norm.adapt(X_fold_train[num_cols].values.astype('float32'))  # fold-train only
        x_num = norm(num_inp)

        inputs["num"] = num_inp
        encoded.append(x_num)

    x = layers.Concatenate()(encoded) if len(encoded) > 1 else encoded[0]

    x = layers.Dense(hidden1)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(dropout)(x)

    x = layers.Dense(hidden2)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(dropout)(x)

    x = layers.Dense(hidden3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(dropout)(x)

    out = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs=inputs, outputs=out)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=[keras.metrics.AUC(name='auc')],
    )
    return model


### 8-1) tf.data 변환


In [18]:
def df_to_dataset(df: pd.DataFrame, y=None, cat_cols=None, num_cols=None, batch_size=512, shuffle=False, seed=42):
    cat_cols = cat_cols or []
    num_cols = num_cols or []

    features = {}
    for c in cat_cols:
        features[c] = df[c].astype(str).values
    if len(num_cols) > 0:
        features["num"] = df[num_cols].values.astype('float32')

    if y is None:
        ds = tf.data.Dataset.from_tensor_slices(features)
    else:
        ds = tf.data.Dataset.from_tensor_slices((features, y.values.astype('float32')))

    if shuffle:
        ds = ds.shuffle(buffer_size=min(len(df), 10000), seed=seed, reshuffle_each_iteration=True)

    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


## 9) 5-Fold CV + Seed 앙상블


In [19]:
EPOCHS = 25
BATCH  = 512
SEEDS = [41, 42, 43]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_seed_list = []

for seed in SEEDS:
    oof = np.zeros(len(X_fe), dtype='float32')
    scores = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_fe, y)):
        X_tr = X_fe.iloc[tr_idx].reset_index(drop=True)
        y_tr = y.iloc[tr_idx].reset_index(drop=True)
        X_va = X_fe.iloc[va_idx].reset_index(drop=True)
        y_va = y.iloc[va_idx].reset_index(drop=True)

        tf.keras.utils.set_random_seed(seed + fold)

        model = build_model_from_fold_train(X_tr, cat_cols=cat_cols, num_cols=num_cols, lr=1e-3, dropout=0.25)

        tr_ds = df_to_dataset(X_tr, y_tr, cat_cols, num_cols, batch_size=BATCH, shuffle=True, seed=seed+fold)
        va_ds = df_to_dataset(X_va, y_va, cat_cols, num_cols, batch_size=BATCH, shuffle=False)

        callbacks = [
            keras.callbacks.EarlyStopping(monitor='val_auc', mode='max', patience=4, restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.5, patience=2, min_lr=1e-5),
        ]

        model.fit(tr_ds, validation_data=va_ds, epochs=EPOCHS, verbose=0, callbacks=callbacks)

        pred_va = model.predict(df_to_dataset(X_va, None, cat_cols, num_cols, batch_size=BATCH), verbose=0).reshape(-1)
        pred_va = np.nan_to_num(pred_va, nan=0.5)

        auc = roc_auc_score(y_va, pred_va)
        oof[va_idx] = pred_va
        scores.append(auc)

        print(f"[seed {seed}] fold {fold} AUC={auc:.6f}")

    oof_seed_list.append(oof)
    print(f"\n[seed {seed}] OOF AUC:", roc_auc_score(y, oof))
    print("-"*60)

oof_ens = np.mean(np.vstack(oof_seed_list), axis=0)
print("\n✅ Ensemble OOF AUC:", roc_auc_score(y, oof_ens))


[seed 41] fold 0 AUC=0.772476
[seed 41] fold 1 AUC=0.762707
[seed 41] fold 2 AUC=0.750513
[seed 41] fold 3 AUC=0.753663
[seed 41] fold 4 AUC=0.761588

[seed 41] OOF AUC: 0.7600277397751615
------------------------------------------------------------
[seed 42] fold 0 AUC=0.770866
[seed 42] fold 1 AUC=0.761768
[seed 42] fold 2 AUC=0.748859
[seed 42] fold 3 AUC=0.753030
[seed 42] fold 4 AUC=0.759847

[seed 42] OOF AUC: 0.7586434605307519
------------------------------------------------------------
[seed 43] fold 0 AUC=0.773247
[seed 43] fold 1 AUC=0.760214
[seed 43] fold 2 AUC=0.751659
[seed 43] fold 3 AUC=0.752975
[seed 43] fold 4 AUC=0.761520

[seed 43] OOF AUC: 0.7595963596998678
------------------------------------------------------------

✅ Ensemble OOF AUC: 0.7620204138419305


## 10) 전체 학습 → test 예측 → 제출 파일 생성


In [None]:
# ===============================
# 최종 제출 파일 생성 (형식 보정)
# ===============================

# pred_test는 이미 올바른 Yes 확률
assert pred_test.ndim == 1

submission = pd.DataFrame({
    "voted": pred_test
})

# 절대 index 저장하지 말 것
out_path = PROJECT_ROOT / "submission_FINAL.csv"
submission.to_csv(out_path, index=False)

print("Saved:", out_path)
submission.head()

Saved: /Users/admin/Downloads/AI 헬스케어 수업/oz코딩 수업/해커톤 (1)/vote-AI/submission_FINAL.csv


Unnamed: 0,voted
0,0.343607
1,0.09642
2,0.571807
3,0.755422
4,0.297271
