In [1]:
import numpy as np
import pandas as pd

SEED = 42
np.random.seed(SEED)

In [29]:
train = pd.read_csv("../../data/raw/train.csv")
test  = pd.read_csv("../../data/raw/test_x.csv")

train["voted_bin"] = (train["voted"] == 2).astype(int)

In [30]:
# Í≥µÌÜµ Ïú†Ìã∏ Ìï®Ïàò (bin, ÏïàÏ†Ñ Ï≤òÎ¶¨)
def three_bin_diff(x):
    return np.where(x <= -2, 0, np.where(x >= 2, 2, 1))

In [31]:
train.columns = train.columns.str.strip()
test.columns  = test.columns.str.strip()

# QxA Ïª¨ÎüºÎßå QxÎ°ú Î¶¨ÎÑ§ÏûÑ (QxEÎäî Í∑∏ÎåÄÎ°ú Îë†)
rename_map = {c: c[:-1] for c in train.columns if c.startswith("Q") and c.endswith("A")}

train = train.rename(columns=rename_map)
test  = test.rename(columns=rename_map)

# 3) ÌôïÏù∏
[c for c in train.columns if c.startswith("Qb")]


['Qb', 'QbE']

In [32]:
# Q_EÏö© ÏÇ¨Ï†Ñ Í≥ÑÏÇ∞ (train Í∏∞Ï§Ä)
qe_cols = [c for c in train.columns if c.endswith("E")]
len(qe_cols), qe_cols[:5]

train_log_time = np.log1p(train[qe_cols])

P10 = train_log_time.stack().quantile(0.10)
P90 = train_log_time.stack().quantile(0.90)

P10, P90

(np.float64(6.124683390894205), np.float64(7.802618063442671))

### 1. Feature Engineering Ìï®Ïàò Ï†ïÏùò (Ïó∞ÏÜçÌòï + Ïù¥ÏßÑÌòï)

##### Q_A (ÌÉúÎèÑ)

In [33]:
def add_qa_features(df):
    df = df.copy()

    df["neg_att"] = df[["Qb","Qc","Qj","Qm","Qo","Qs"]].mean(axis=1)
    df["pos_att"] = df[["Qk","Qq"]].mean(axis=1)
    df["att_gap"] = df["pos_att"] - df["neg_att"]

    cert_cols = ["Qe","Qf","Qh","Qr"]
    df["certainty"] = (df[cert_cols] - 3).abs().mean(axis=1)
    df["mid_ratio"] = (df[cert_cols] == 3).sum(axis=1) / len(cert_cols)

    df["consistency"] = df[
        ["Qb","Qc","Qj","Qm","Qo","Qs","Qk","Qq"]
    ].std(axis=1)

    return df

##### Q_E (ÏùëÎãµÏãúÍ∞Ñ)

In [34]:
def add_qe_features(df, qe_cols, p10, p90):
    df = df.copy()
    log_time = np.log1p(df[qe_cols])

    df["time_mean"]   = log_time.mean(axis=1)
    df["time_median"] = log_time.median(axis=1)
    df["time_std"]    = log_time.std(axis=1)

    df["fast_ratio"] = (log_time < p10).mean(axis=1)
    df["slow_ratio"] = (log_time > p90).mean(axis=1)
    df["outlier_ratio"] = (df[qe_cols] > 10000).mean(axis=1)

    return df

##### TP (Big Five)

In [35]:
def add_tp_features(df):
    df = df.copy()

    tp_cols = [f"tp{i:02d}" for i in range(1, 11)]
    df[tp_cols] = df[tp_cols].replace(0, np.nan)
    df[tp_cols] = df[tp_cols].fillna(df[tp_cols].mean())

    df["extraversion"]      = df["tp01"] - df["tp06"]
    df["agreeableness"]     = df["tp07"] - df["tp02"]
    df["conscientiousness"] = df["tp03"] - df["tp08"]
    df["neuroticism"]       = df["tp04"] - df["tp09"]
    df["openness"]          = df["tp05"] - df["tp10"]

    for col in [
        "extraversion","agreeableness",
        "conscientiousness","neuroticism","openness"
    ]:
        df[col + "_bin"] = three_bin_diff(df[col].values)

    return df

##### Ïù∏Íµ¨ÌÜµÍ≥Ñ (DEMO)

In [36]:
def add_demo_features(df):
    df = df.copy()

    age_map = {
        "10s":1,"20s":2,"30s":3,"40s":4,
        "50s":5,"60s":6,"70s+":7
    }
    df["age_ord"] = df["age_group"].map(age_map)
    df["adult_flag"] = (df["age_group"] != "10s").astype(int)

    df["education_ord"] = df["education"].replace(0, np.nan)
    df["engnat_bin"] = df["engnat"].replace({1:1, 2:0, 0:np.nan})
    df["gender_bin"] = df["gender"].map({"Male":0, "Female":1})

    df["urban_ord"] = df["urban"].replace(0, np.nan)

    fs = df["familysize"].copy()
    fs = fs.mask((fs < 0) | (fs > 1000))
    df["familysize_clip"] = fs.clip(upper=12)
    df["familysize_outlier"] = (fs > 12).fillna(False).astype(int)

    return df


##### Îã®Ïñ¥ Ïù∏ÏßÄ (WR / WF)

In [37]:
def add_word_features(df):
    df = df.copy()

    wr_cols = [f"wr_{i:02d}" for i in range(1,14)]
    wf_cols = [f"wf_{i:02d}" for i in range(1,4)]

    df["wr_sum"] = df[wr_cols].sum(axis=1)
    df["wf_sum"] = df[wf_cols].sum(axis=1)
    df["word_credibility"] = df["wr_sum"] - df["wf_sum"]

    df["wr_bin"] = pd.cut(
        df["wr_sum"], bins=[-1,4,9,13], labels=[0,1,2]
    ).astype(int)

    return df


##### Ï°∞Ìï© Î≥ÄÏàò (Interaction)

In [38]:
def add_interaction_features(df):
    df = df.copy()

    df["age_edu"] = df["age_ord"] * df["education_ord"]
    df["adult_high_edu"] = (
        (df["adult_flag"]==1) & (df["education_ord"]>=3)
    ).astype(int)

    df["edu_cred"] = df["education_ord"] * df["word_credibility"]
    df["att_strength"] = df["att_gap"] * df["certainty"]
    df["conscientious_att"] = df["conscientiousness"] * df["att_gap"]

    return df


In [None]:
### Feature Engineering Ìïú Î≤àÏóê Ï†ÅÏö©

def build_features(df):
    df = add_demo_features(df)       
    df = add_qa_features(df)
    df = add_qe_features(df, qe_cols, P10, P90)
    df = add_tp_features(df)
    df = add_word_features(df)
    df = add_interaction_features(df)  
    return df

train_fe = build_features(train)
test_fe  = build_features(test)

# word_credibility ÍπåÎ®πÏñ¥ÏÑú Ï∂îÍ∞Ä,,
q1, q2 = train_fe["word_credibility"].quantile([1/3, 2/3]).values

train_fe["cred_bin"] = np.where(
    train_fe["word_credibility"] <= q1, 0,
    np.where(train_fe["word_credibility"] <= q2, 1, 2)
)

test_fe["cred_bin"] = np.where(
    test_fe["word_credibility"] <= q1, 0,
    np.where(test_fe["word_credibility"] <= q2, 1, 2)
)


### 2. Ïó∞ÏÜçÌòï Î≥ÄÏàò Îã®Î≥ÄÎüâ Î∂ÑÏÑù

In [None]:
from scipy.stats import spearmanr

# Ïó∞ÏÜçÌòï ÌîºÏ≤òÎßå ÏßÄÏ†ï
cont_features = [
    # Q_A
    "neg_att","pos_att","att_gap","certainty","mid_ratio","consistency",
    
    # Q_E
    "time_mean","time_median","time_std","fast_ratio","slow_ratio","outlier_ratio",
    
    # TP
    "extraversion","agreeableness","conscientiousness",
    "neuroticism","openness",
    
    # Word
    "wr_sum","wf_sum","word_credibility",
    
    # Demo (Ïó∞ÏÜç/ÏàúÏÑúÌòï Ï∑®Í∏â)
    "age_ord","education_ord","urban_ord","familysize_clip",
    
    # Interaction
    "age_edu","edu_cred","att_strength","conscientious_att"
]

In [43]:
# spearman & mean diff Íµ¨ÌïòÍ∏∞
rows = []

for col in cont_features:
    x = train_fe[col]
    y = train_fe["voted_bin"]

    # Spearman (ÏàúÏÑú/ÎπÑÏÑ†Ìòï ÎåÄÏùë)
    corr, pval = spearmanr(x, y, nan_policy="omit")

    # Í∑∏Î£πÎ≥Ñ ÌèâÍ∑† (Ìï¥ÏÑùÏö©)
    mean_0 = x[y == 0].mean()  # Ìà¨Ìëú Ìï®
    mean_1 = x[y == 1].mean()  # Ìà¨Ìëú Ïïà Ìï®
    diff = mean_1 - mean_0

    rows.append({
        "feature": col,
        "spearman_corr": corr,
        "p_value": pval,
        "mean_vote_yes": mean_0,
        "mean_vote_no": mean_1,
        "mean_diff(no-yes)": diff
    })

univar_cont = pd.DataFrame(rows).sort_values(
    by="spearman_corr", key=lambda x: x.abs(), ascending=False
)

univar_cont.head(15)

Unnamed: 0,feature,spearman_corr,p_value,mean_vote_yes,mean_vote_no,mean_diff(no-yes)
24,age_edu,-0.406469,0.0,8.594548,5.050484,-3.544063
20,age_ord,-0.379336,0.0,2.883339,1.96344,-0.919899
21,education_ord,-0.338819,0.0,2.910804,2.295792,-0.615011
25,edu_cred,-0.33249,0.0,27.207487,20.147957,-7.05953
17,wr_sum,-0.158898,3.464809e-255,9.646312,8.969917,-0.676395
19,word_credibility,-0.154508,3.1424110000000004e-241,9.263691,8.625713,-0.637978
2,att_gap,-0.13207,3.0301989999999997e-176,0.223935,-0.288892,-0.512827
26,att_strength,-0.128953,4.714876e-168,0.201985,-0.537203,-0.739188
1,pos_att,-0.128612,3.625164e-167,3.48975,3.192485,-0.297265
0,neg_att,0.10638,1.049847e-114,3.265815,3.481377,0.215562


üî•ÏµúÏÉÅÏúÑ (Î¨¥Ï°∞Í±¥ Ïú†ÏßÄ)

age_edu

age_ord

education_ord

edu_cred

‚úÖ Í∞ïÌïú 2Íµ∞

wr_sum

word_credibility

att_gap

att_strength

pos_att

üü° Î≥¥Ï°∞

neg_att

fast_ratio

TP ÏÑ±Í≤© Î≥ÄÏàòÎì§

### 2. Ïù¥ÏßÑÌòï Î≥ÄÏàò Îã®Î≥ÄÎüâ Î∂ÑÏÑù

In [44]:
bin_features = [
    "adult_flag",
    "adult_high_edu",
    "gender_bin",
    "engnat_bin",
    "familysize_outlier",
    "cred_bin"  # 0/1
]

rows = []

for col in bin_features:
    grp = train_fe.groupby(col)["voted_bin"].agg(
        voted_no_rate="mean",
        n="size"
    )
    
    # 0 vs 1 Ï∞®Ïù¥ (1 - 0)
    rate0 = grp.loc[0, "voted_no_rate"] if 0 in grp.index else np.nan
    rate1 = grp.loc[1, "voted_no_rate"] if 1 in grp.index else np.nan
    diff = rate1 - rate0
    
    rows.append({
        "feature": col,
        "voted_no_rate_0": rate0,
        "voted_no_rate_1": rate1,
        "diff(1-0)": diff,
        "n_0": grp.loc[0, "n"] if 0 in grp.index else np.nan,
        "n_1": grp.loc[1, "n"] if 1 in grp.index else np.nan,
    })

univar_bin = pd.DataFrame(rows).sort_values(
    by="diff(1-0)", key=lambda x: x.abs(), ascending=False
)

univar_bin


Unnamed: 0,feature,voted_no_rate_0,voted_no_rate_1,diff(1-0),n_0,n_1
0,adult_flag,0.837214,0.415014,-0.4222,14215,31317
1,adult_high_edu,0.693837,0.386684,-0.307153,23739,21793
4,familysize_outlier,0.546925,0.444444,-0.102481,45487,45
5,cred_bin,0.631933,0.535516,-0.096417,15739,20216
3,engnat_bin,0.571394,0.537397,-0.033997,12431,33024
2,gender_bin,0.553206,0.539573,-0.013633,24217,21315


‚úÖ Í∞ïÎ†• Ïú†ÏßÄ (ÌôïÏ†ï)

adult_flag

adult_high_edu

üü° Î≥¥Î•ò (importance Î≥¥Í≥† Í≤∞Ï†ï)

familysize_outlier

cred_bin

‚ùå Ï†úÍ±∞ ÌõÑÎ≥¥

gender_bin

engnat_bin

### 3. Ïó∞ÏÜçÌòï + Ïù¥ÏßÑÌòï Î≥ÄÏàò feature importance Íµ¨ÌïòÍ∏∞
- Autogluon Ïù¥Ïö©

Ïù¥ÏßÑÌòï

adult_flag adult_high_edu 

familysize_outlier cred_bin 

Ïó∞ÏÜçÌòï

age_edu age_ord education_ord edu_cred 

wr_sum word_credibility att_gap att_strength pos_att


In [46]:
from autogluon.tabular import TabularPredictor

features = [
    # Ïù¥ÏßÑÌòï
    "adult_flag", "adult_high_edu", "familysize_outlier", "cred_bin",
    # Ïó∞ÏÜçÌòï
    "age_edu", "age_ord", "education_ord", "edu_cred",
    "wr_sum", "word_credibility",
    "att_gap", "att_strength", "pos_att"
]

train_ag = train_fe[features + ["voted_bin"]].copy()


# ÌïôÏäµ
predictor = TabularPredictor(
    label="voted_bin",
    problem_type="binary",
    eval_metric="roc_auc"
).fit(
    train_data=train_ag,
    presets="good_quality"
)

# Î¶¨ÎçîÎ≥¥Îìú ÌôïÏù∏
leaderboard = predictor.leaderboard(train_ag, silent=True)
leaderboard.head(15)

  from .autonotebook import tqdm as notebook_tqdm
No path specified. Models will be saved in: "AutogluonModels/ag-20260129_042724"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.6.0: Mon Jul 14 11:30:29 PDT 2025; root:xnu-11417.140.69~1/RELEASE_ARM64_T6000
CPU Count:          10
Pytorch Version:    2.9.1
CUDA Version:       CUDA is not available
Memory Avail:       2.83 GB / 16.00 GB (17.7%)
Disk Space Avail:   226.13 GB / 460.43 GB (49.1%)
Presets specified: ['good_quality']
Using hyperparameters preset: hyperparameters='light'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs 

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestGini_BAG_L1,0.900433,0.724399,roc_auc,0.297063,0.850035,1.066522,0.297063,0.850035,1.066522,1,True,3
1,RandomForestGini_BAG_L1_FULL,0.900433,,roc_auc,0.301837,0.850035,1.066522,0.301837,0.850035,1.066522,1,True,15
2,RandomForestEntr_BAG_L1,0.892128,0.725654,roc_auc,0.283214,0.859766,1.001825,0.283214,0.859766,1.001825,1,True,4
3,RandomForestEntr_BAG_L1_FULL,0.892128,,roc_auc,0.309712,0.859766,1.001825,0.309712,0.859766,1.001825,1,True,16
4,ExtraTreesGini_BAG_L1,0.866419,0.729071,roc_auc,0.297885,0.849369,0.71579,0.297885,0.849369,0.71579,1,True,6
5,ExtraTreesGini_BAG_L1_FULL,0.866419,,roc_auc,0.3307,0.849369,0.71579,0.3307,0.849369,0.71579,1,True,18
6,ExtraTreesEntr_BAG_L1,0.864973,0.729039,roc_auc,0.325135,0.844699,0.662934,0.325135,0.844699,0.662934,1,True,7
7,ExtraTreesEntr_BAG_L1_FULL,0.864973,,roc_auc,0.332374,0.844699,0.662934,0.332374,0.844699,0.662934,1,True,19
8,LightGBMLarge_BAG_L1_FULL,0.797957,,roc_auc,0.058741,,3.040264,0.058741,,3.040264,1,True,23
9,LightGBM_BAG_L1_FULL,0.758754,,roc_auc,0.028101,,0.753925,0.028101,,0.753925,1,True,14


In [47]:
# Feature importance ÎΩëÍ∏∞ (TopÎßå)
fi = predictor.feature_importance(train_ag)
fi.sort_values("importance", ascending=False)

Computing feature importance via permutation shuffling for 13 features using 5000 rows with 5 shuffle sets...
	5.07s	= Expected runtime (1.01s per shuffle set)
	3.51s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
adult_flag,0.028832,0.005046,0.000108,5,0.039223,0.018442
wr_sum,0.026156,0.004076,6.9e-05,5,0.034549,0.017764
age_ord,0.019679,0.003748,0.000151,5,0.027397,0.011962
education_ord,0.019185,0.003328,0.000105,5,0.026038,0.012331
age_edu,0.015421,0.00638,0.002837,5,0.028558,0.002285
word_credibility,0.008819,0.001857,0.000222,5,0.012642,0.004996
att_gap,0.002349,0.001352,0.008878,5,0.005131,-0.000434
edu_cred,0.002185,0.000611,0.000663,5,0.003444,0.000927
att_strength,0.001678,0.000598,0.00165,5,0.002909,0.000446
cred_bin,0.001616,0.000749,0.004247,5,0.003159,7.4e-05


### 3. Î≤îÏ£ºÌòï Î≥ÄÏàò Îã®Î≥ÄÎüâ Î∂ÑÏÑù Î∞è Î¶¨ÏΩîÎî© & feature engineering

In [49]:
# raw ÏÉÅÌÉúÎ°ú Îã®Î≥ÄÎüâ Î∂ÑÏÑù
raw_cat_cols = [
    "engnat",     # 1,2,0
    "gender",     # Male, Female
    "hand",       # 1,2,3,0
    "married",    # 1,2,3,0
    "race",       # string
    "religion",   # string
    "urban"       # 1,2,3,0
]

def cat_univariate_raw(df, col, target="voted_bin"):
    return (
        df
        .groupby(col)[target]
        .agg(
            voted_no_rate="mean",
            n="count"
        )
        .reset_index()
        .sort_values("voted_no_rate", ascending=False)
    )

for col in raw_cat_cols:
    print(f"\n=== {col} ===")
    display(cat_univariate_raw(train, col))




=== engnat ===


Unnamed: 0,engnat,voted_no_rate,n
0,0,0.623377,77
2,2,0.571394,12431
1,1,0.537397,33024



=== gender ===


Unnamed: 0,gender,voted_no_rate,n
1,Male,0.553206,24217
0,Female,0.539573,21315



=== hand ===


Unnamed: 0,hand,voted_no_rate,n
0,0,0.63354,161
3,3,0.592844,1621
1,1,0.545317,39058
2,2,0.540494,4692



=== married ===


Unnamed: 0,married,voted_no_rate,n
0,0,0.677419,93
1,1,0.629572,31550
3,3,0.422454,3830
2,2,0.333433,10059



=== race ===


Unnamed: 0,race,voted_no_rate,n
0,Arab,0.709402,351
1,Asian,0.704565,6834
5,Other,0.622402,4330
2,Black,0.597786,2168
4,Native American,0.587591,548
3,Indigenous Australian,0.54717,53
6,White,0.495776,31248



=== religion ===


Unnamed: 0,religion,voted_no_rate,n
9,Muslim,0.687081,1192
11,Sikh,0.641026,117
2,Buddhist,0.621176,850
4,Christian_Mormon,0.619159,428
7,Hindu,0.613016,1429
10,Other,0.582809,4770
1,Atheist,0.572704,10192
0,Agnostic,0.542602,9624
5,Christian_Other,0.539615,5137
3,Christian_Catholic,0.526512,6431



=== urban ===


Unnamed: 0,urban,voted_no_rate,n
3,3,0.580402,17767
2,2,0.541545,18534
1,1,0.492872,8909
0,0,0.490683,322


engnat
married
race
religion
urban
Ïù¥Ïô∏ hand, gender ÏÇ¨Ïö© X

In [50]:
# ÏÇ¨Ïö©Ìï† Ïª¨Îüº Î¶¨ÏΩîÎî©
def recode_married(x):
    # original: 0=NA/Other, 1=Single, 2=Married, 3=Divorced/Widowed
    if pd.isna(x):
        return "NA"
    try:
        x = int(x)
    except:
        return "NA"
    if x == 1:
        return "Single"
    if x == 2:
        return "Married"
    if x == 3:
        return "Divorced_Widowed"
    return "Other_NA"   # 0 or anything else


def recode_race(x):
    # original: strings like White, Asian, Black, Arab, Native American, Indigenous Australian, Other
    if pd.isna(x):
        return "NA"
    x = str(x).strip()
    if x in ["White", "Asian", "Black"]:
        return x
    # everything else -> Other (Arab, Native American, Indigenous Australian, Other, etc.)
    return "Other"


def recode_religion(x):
    # original: strings like Christian_Catholic, Christian_Protestant, Jewish, Muslim, Atheist, Agnostic, etc.
    if pd.isna(x):
        return "NA"
    x = str(x).strip()

    # Christian bucket
    if x.startswith("Christian"):
        return "Christian"

    # Atheist / Agnostic bucket
    if x in ["Atheist", "Agnostic"]:
        return "Atheist_Agnostic"

    # Keep major religions separately
    if x in ["Jewish", "Muslim"]:
        return x

    # Everything else -> Other (Buddhist, Hindu, Sikh, Mormon, Other, etc.)
    return "Other"


# Apply to train/test (raw or fe; both are fine as long as columns exist)
for df in [train, test]:
    df["married_cat"]  = df["married"].apply(recode_married)
    df["race_cat"]     = df["race"].apply(recode_race)
    df["religion_cat"] = df["religion"].apply(recode_religion)

# quick check (train)
display(train.groupby("married_cat")["voted_bin"].agg(voted_no_rate="mean", n="count").sort_values("voted_no_rate", ascending=False))
display(train.groupby("race_cat")["voted_bin"].agg(voted_no_rate="mean", n="count").sort_values("voted_no_rate", ascending=False))
display(train.groupby("religion_cat")["voted_bin"].agg(voted_no_rate="mean", n="count").sort_values("voted_no_rate", ascending=False))

Unnamed: 0_level_0,voted_no_rate,n
married_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
Other_NA,0.677419,93
Single,0.629572,31550
Divorced_Widowed,0.422454,3830
Married,0.333433,10059


Unnamed: 0_level_0,voted_no_rate,n
race_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
Asian,0.704565,6834
Other,0.623817,5282
Black,0.597786,2168
White,0.495776,31248


Unnamed: 0_level_0,voted_no_rate,n
religion_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
Muslim,0.687081,1192
Other,0.594334,7166
Atheist_Agnostic,0.558084,19816
Christian,0.506313,16871
Jewish,0.449692,487


In [80]:
# ÎÇòÎ®∏ÏßÄ 
df["engnat_bin"] = df["engnat"].replace({1: 1, 2: 0, 0: np.nan})
df["urban_ord"] = df["urban"].replace(0, np.nan)



In [56]:
# Ïó∞ÏÜçÌòï + Ïù¥ÏßÑÌòï + Î≤îÏ£ºÌòï Ïãπ Î™®ÏïÑÏ£ºÍ∏∞
def add_cat_features(df):
    df = df.copy()

    df["married_cat"]  = df["married"].apply(recode_married)
    df["race_cat"]     = df["race"].apply(recode_race)
    df["religion_cat"] = df["religion"].apply(recode_religion)

    return df

def build_features(df):
    df = add_qa_features(df)
    df = add_qe_features(df, qe_cols, P10, P90)
    df = add_tp_features(df)
    df = add_demo_features(df)
    df = add_word_features(df)
    df = add_interaction_features(df)
    df = add_cat_features(df)
    return df

### 4. ÏµúÏ¢Ö feature importance Íµ¨ÌïòÍ∏∞

In [58]:
train_fe = build_features(train)
test_fe  = build_features(test)

feature_cols = [
    # continuous / binary
    "age_ord", "education_ord", "age_edu",
    "wr_sum", "word_credibility",
    "att_gap", "edu_cred", "att_strength", "pos_att",
    "adult_flag", "adult_high_edu", "engnat_bin",

    # categorical
    "married_cat", "race_cat", "religion_cat",

    # ordinal
    "urban_ord"
]

train_ag = train_fe[feature_cols + ["voted_bin"]].copy()

from autogluon.tabular import TabularPredictor

predictor = TabularPredictor(
    label="voted_bin",
    problem_type="binary",
    eval_metric="roc_auc"
).fit(
    train_data=train_ag,
    presets="good_quality"
)



No path specified. Models will be saved in: "AutogluonModels/ag-20260129_051055"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.6.0: Mon Jul 14 11:30:29 PDT 2025; root:xnu-11417.140.69~1/RELEASE_ARM64_T6000
CPU Count:          10
Pytorch Version:    2.9.1
CUDA Version:       CUDA is not available
Memory Avail:       3.72 GB / 16.00 GB (23.3%)
Disk Space Avail:   222.43 GB / 460.43 GB (48.3%)
Presets specified: ['good_quality']
Using hyperparameters preset: hyperparameters='light'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model re

In [59]:
# Feature importance ÎΩëÍ∏∞ (TopÎßå)
fi = predictor.feature_importance(train_ag)
fi.sort_values("importance", ascending=False)


Computing feature importance via permutation shuffling for 16 features using 5000 rows with 5 shuffle sets...
	20.12s	= Expected runtime (4.02s per shuffle set)
	7.17s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
race_cat,0.03464,0.001954,1.20952e-06,5,0.038663,0.030617
education_ord,0.029697,0.004186,4.61243e-05,5,0.038316,0.021079
adult_flag,0.018253,0.002777,6.2316e-05,5,0.02397,0.012536
married_cat,0.012661,0.003189,0.0004448789,5,0.019228,0.006094
engnat_bin,0.011717,0.000786,2.419311e-06,5,0.013336,0.010098
wr_sum,0.010936,0.002376,0.0002512322,5,0.015828,0.006045
word_credibility,0.010024,0.001699,9.538608e-05,5,0.013522,0.006525
religion_cat,0.009909,0.002035,0.0002021416,5,0.0141,0.005718
age_edu,0.009239,0.001953,0.0002257677,5,0.01326,0.005219
age_ord,0.008382,0.00146,0.000106155,5,0.011388,0.005376


Tier 1 (ÌïµÏã¨ Í≥®Í≤©)

race_cat

education_ord

adult_flag

married_cat

engnat_bin

wr_sum

word_credibility

Tier 2 (ÏÑ§Î™Ö Î≥¥Í∞ï)

religion_cat

age_edu

age_ord

edu_cred

pos_att

Tier 3 (Îî•Îü¨ÎãùÏóêÏÑú Ïû¨ÌèâÍ∞Ä)

att_gap

att_strength

adult_high_edu

urban_ord

In [81]:
"""
üèÜ DCN-V2 ÏµúÏ¢Ö Ï†úÏ∂úÏö© (Î≤ÑÍ∑∏/ÎàÑÏàò/ÏïàÏ†ïÏÑ± ÏàòÏ†ï ÏôÑÎ£å)

Fix Summary
1) Optuna param key mismatch Ìï¥Í≤∞: suggest Ïù¥Î¶ÑÍ≥º best_params ÌÇ§ ÌÜµÏùº
2) FoldÎ≥Ñ StandardScaler Ï†ÅÏö© (ÎàÑÏàò Ï†úÍ±∞)
3) DCN CrossLayer ÏàòÏãù ÏàòÏ†ï (biasÍ∞Ä x0Ïóê Í≥±Ìï¥ÏßÄÎèÑÎ°ù)
4) Loss ÏòµÏÖòÌôî: BCEWithLogitsLoss(pos_weight) vs FocalLoss (OptunaÏóêÏÑú ÏÑ†ÌÉù)
5) BatchNorm ÏïàÏ†ïÌôî: train loader drop_last=True
"""

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings("ignore")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BASE_SEED = 42
np.random.seed(BASE_SEED)
torch.manual_seed(BASE_SEED)

print("="*80)
print("üèÜ DCN-V2 ÏµúÏ¢Ö Ï†úÏ∂úÏö© (Î≤ÑÍ∑∏/ÎàÑÏàò/ÏïàÏ†ïÏÑ± ÏàòÏ†ï ÏôÑÎ£å)")
print("="*80)

# ==================== 1. Îç∞Ïù¥ÌÑ∞ Î°úÎìú & Feature Engineering ====================

print("\nüìÇ Îç∞Ïù¥ÌÑ∞ Î°úÎìú...")

train = pd.read_csv("../../data/raw/train.csv")
test  = pd.read_csv("../../data/raw/test_x.csv")

train["voted_bin"] = (train["voted"] == 2).astype(int)

print(f"Train: {train.shape}, Test: {test.shape}")
pos_rate = train["voted_bin"].mean()
print(f"Ìà¨Ìëú ÏïàÌï®(positive) ÎπÑÏú®: {pos_rate:.3f}")
print(f"ÌÅ¥ÎûòÏä§ ÎπÑÏú® (neg:pos) ‚âà {(1-pos_rate)/(pos_rate+1e-9):.2f}:1")

train.columns = train.columns.str.strip()
test.columns  = test.columns.str.strip()

rename_map = {c: c[:-1] for c in train.columns if c.startswith("Q") and c.endswith("A")}
train = train.rename(columns=rename_map)
test  = test.rename(columns=rename_map)

qe_cols = [c for c in train.columns if c.endswith("E")]
train_log_time = np.log1p(train[qe_cols])
P10 = train_log_time.stack().quantile(0.10)
P90 = train_log_time.stack().quantile(0.90)

def add_qa_features(df):
    df = df.copy()
    df["neg_att"] = df[["Qb","Qc","Qj","Qm","Qo","Qs"]].mean(axis=1)
    df["pos_att"] = df[["Qk","Qq"]].mean(axis=1)
    df["att_gap"] = df["pos_att"] - df["neg_att"]
    cert_cols = ["Qe","Qf","Qh","Qr"]
    df["certainty"] = (df[cert_cols] - 3).abs().mean(axis=1)
    df["mid_ratio"] = (df[cert_cols] == 3).sum(axis=1) / len(cert_cols)
    df["consistency"] = df[["Qb","Qc","Qj","Qm","Qo","Qs","Qk","Qq"]].std(axis=1)
    return df

def add_qe_features(df, qe_cols, p10, p90):
    df = df.copy()
    log_time = np.log1p(df[qe_cols])
    df["time_mean"] = log_time.mean(axis=1)
    df["time_median"] = log_time.median(axis=1)
    df["time_std"] = log_time.std(axis=1)
    df["fast_ratio"] = (log_time < p10).mean(axis=1)
    df["slow_ratio"] = (log_time > p90).mean(axis=1)
    df["outlier_ratio"] = (df[qe_cols] > 10000).mean(axis=1)
    return df

def add_tp_features(df):
    df = df.copy()
    tp_cols = [f"tp{i:02d}" for i in range(1, 11)]
    df[tp_cols] = df[tp_cols].replace(0, np.nan)
    df[tp_cols] = df[tp_cols].fillna(df[tp_cols].mean())
    df["extraversion"] = df["tp01"] - df["tp06"]
    df["agreeableness"] = df["tp07"] - df["tp02"]
    df["conscientiousness"] = df["tp03"] - df["tp08"]
    df["neuroticism"] = df["tp04"] - df["tp09"]
    df["openness"] = df["tp05"] - df["tp10"]
    return df

def add_demo_features(df):
    df = df.copy()
    age_map = {"10s":1,"20s":2,"30s":3,"40s":4,"50s":5,"60s":6,"70s+":7}
    df["age_ord"] = df["age_group"].map(age_map)
    df["adult_flag"] = (df["age_group"] != "10s").astype(int)
    df["education_ord"] = df["education"].replace(0, np.nan)
    df["engnat_bin"] = df["engnat"].replace({1:1, 2:0, 0:np.nan})
    df["urban_ord"] = df["urban"].replace(0, np.nan)
    return df

def add_word_features(df):
    df = df.copy()
    wr_cols = [f"wr_{i:02d}" for i in range(1,14)]
    wf_cols = [f"wf_{i:02d}" for i in range(1,4)]
    df["wr_sum"] = df[wr_cols].sum(axis=1)
    df["wf_sum"] = df[wf_cols].sum(axis=1)
    df["word_credibility"] = df["wr_sum"] - df["wf_sum"]
    return df

def add_interaction_features(df):
    df = df.copy()
    df["age_edu"] = df["age_ord"] * df["education_ord"]
    df["adult_high_edu"] = ((df["adult_flag"]==1) & (df["education_ord"]>=3)).astype(int)
    df["edu_cred"] = df["education_ord"] * df["word_credibility"]
    return df

def recode_married(x):
    if pd.isna(x): return "NA"
    try: x = int(x)
    except: return "NA"
    if x == 1: return "Single"
    if x == 2: return "Married"
    if x == 3: return "Divorced_Widowed"
    return "Other_NA"

def recode_race(x):
    if pd.isna(x): return "NA"
    x = str(x).strip()
    if x in ["White", "Asian", "Black"]: return x
    return "Other"

def recode_religion(x):
    if pd.isna(x): return "NA"
    x = str(x).strip()
    if x.startswith("Christian"): return "Christian"
    if x in ["Atheist", "Agnostic"]: return "Atheist_Agnostic"
    if x in ["Jewish", "Muslim"]: return x
    return "Other"

def add_cat_features(df):
    df = df.copy()
    df["married_cat"] = df["married"].apply(recode_married)
    df["race_cat"] = df["race"].apply(recode_race)
    df["religion_cat"] = df["religion"].apply(recode_religion)
    return df

def build_features(df):
    df = add_qa_features(df)
    df = add_qe_features(df, qe_cols, P10, P90)
    df = add_tp_features(df)
    df = add_demo_features(df)
    df = add_word_features(df)
    df = add_interaction_features(df)
    df = add_cat_features(df)
    return df

print("üî® Feature Engineering...")
train_fe = build_features(train)
test_fe  = build_features(test)
print(f"‚úÖ ÏôÑÎ£å: {train_fe.shape}, {test_fe.shape}")

# ==================== 2. Feature ÏÑ†ÌÉù ====================

NUM_COLS = [
    "age_edu", "education_ord", "adult_flag",
    "wr_sum", "word_credibility", "edu_cred",
    "pos_att", "neg_att", "certainty", "consistency",
    "time_mean", "time_std",
    "urban_ord", "engnat_bin"
]
CAT_COLS = ["race_cat", "married_cat", "religion_cat"]
TARGET = "voted_bin"

print(f"\nüìä ÏµúÏ¢Ö ÌîºÏ≤ò: {len(NUM_COLS)} numeric + {len(CAT_COLS)} cat = {len(NUM_COLS)+len(CAT_COLS)} total")

# ==================== 3. ÏïàÏ†ÑÌïú Categorical Encoding ====================

print("\nüîß Categorical Encoding (Unseen ÎåÄÏùë)...")

for df in [train_fe, test_fe]:
    for c in NUM_COLS:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    for c in CAT_COLS:
        df[c] = df[c].astype(str).fillna("NA")

cat_maps = {}
cat_dims = []
for c in CAT_COLS:
    all_values = set(train_fe[c].unique()) | set(test_fe[c].unique())
    vocab = ["<UNK>"] + sorted(all_values)
    cat_maps[c] = {v: i for i, v in enumerate(vocab)}
    cat_dims.append(len(vocab))
    print(f"   {c}: {len(vocab)} categories")

def safe_encode_cat(series, mapping):
    return series.map(lambda x: mapping.get(x, 0)).astype(int).values

# Ï†ÑÏó≠ median (foldÎ≥Ñ ÎàÑÏàòÎäî scalerÏóêÏÑú ÎßâÍ≥†, medianÏùÄ ÏÉÅÎåÄÏ†ÅÏúºÎ°ú ÏòÅÌñ• Ï†ÅÏùå)
# Îçî ÏóÑÍ≤©Ìûà ÌïòÎ†§Î©¥ foldÎßàÎã§ medianÎèÑ train foldÏóêÏÑú Í≥ÑÏÇ∞ÌïòÎèÑÎ°ù Î∞îÍøÄ Ïàò ÏûàÏùå.
global_num_median = train_fe[NUM_COLS].median()

X_num_all = train_fe[NUM_COLS].fillna(global_num_median).values
X_cat_all = np.stack([safe_encode_cat(train_fe[c], cat_maps[c]) for c in CAT_COLS], axis=1)
y_all = train_fe[TARGET].values.astype(np.float32)

test_num_all = test_fe[NUM_COLS].fillna(global_num_median).values
test_cat_all = np.stack([safe_encode_cat(test_fe[c], cat_maps[c]) for c in CAT_COLS], axis=1)

# ==================== 4. DCN-V2 Î™®Îç∏ ====================

class CrossLayer(nn.Module):
    """DCN-V2 Cross Layer (ÏàòÏãù ÏàòÏ†ï: biasÍ∞Ä x0Ïóê Í≥±Ìï¥ÏßÄÎèÑÎ°ù)"""
    def __init__(self, input_dim):
        super().__init__()
        self.weight = nn.Linear(input_dim, input_dim, bias=False)
        self.bias = nn.Parameter(torch.zeros(input_dim))

    def forward(self, x0, x):
        # x_{l+1} = x0 ‚äô (W x + b) + x
        return x0 * (self.weight(x) + self.bias) + x

class DeepCrossNetworkV2(nn.Module):
    def __init__(self, num_dim, cat_dims, hidden=256, n_cross=3, dropout=0.3):
        super().__init__()

        self.embs = nn.ModuleList()
        emb_out_dim = 0
        for d in cat_dims:
            e = min(50, max(8, int(round(2.0 * (d ** 0.56)))))
            self.embs.append(nn.Embedding(d, e))
            emb_out_dim += e

        input_dim = num_dim + emb_out_dim

        self.input_bn = nn.BatchNorm1d(input_dim)

        self.cross_layers = nn.ModuleList([CrossLayer(input_dim) for _ in range(n_cross)])

        self.deep = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.BatchNorm1d(hidden),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden, hidden // 2),
            nn.BatchNorm1d(hidden // 2),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden // 2, hidden // 4),
            nn.BatchNorm1d(hidden // 4),
            nn.ReLU(),
            nn.Dropout(dropout / 2),
        )

        self.output = nn.Linear(input_dim + hidden // 4, 1)

    def forward(self, x_num, x_cat):
        cat_emb = torch.cat([emb(x_cat[:, i]) for i, emb in enumerate(self.embs)], dim=1)
        x = torch.cat([x_num, cat_emb], dim=1)
        x = self.input_bn(x)

        x0 = x
        x_cross = x
        for cross in self.cross_layers:
            x_cross = cross(x0, x_cross)

        x_deep = self.deep(x)
        x_final = torch.cat([x_cross, x_deep], dim=1)
        return self.output(x_final).squeeze(1)

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.3, gamma=2.5):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets):
        bce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
        pt = torch.exp(-bce)
        loss = self.alpha * (1 - pt) ** self.gamma * bce
        return loss.mean()

class TabDataset(Dataset):
    def __init__(self, X_num, X_cat, y=None):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X_num)

    def __getitem__(self, idx):
        if self.y is None:
            return self.X_num[idx], self.X_cat[idx]
        return self.X_num[idx], self.X_cat[idx], self.y[idx]

# ==================== 5. Optuna ÌäúÎãù (FoldÎ≥Ñ scaler Ï†ÅÏö© + loss ÏòµÏÖòÌôî) ====================

print("\n" + "="*80)
print("üîç Optuna ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÌäúÎãù (25 trials)")
print("="*80)

def compute_pos_weight(y_train_np):
    pos = float(y_train_np.sum())
    neg = float(len(y_train_np) - pos)
    # pos_weight = neg/pos
    return torch.tensor([neg / (pos + 1e-9)], device=DEVICE)

def objective(trial):
    params = {
        # Íµ¨Ï°∞
        "hidden": trial.suggest_categorical("hidden", [192, 256, 384]),
        "n_cross": trial.suggest_int("n_cross", 2, 4),
        "dropout": trial.suggest_float("dropout", 0.2, 0.4),

        # ÏµúÏ†ÅÌôî
        "lr": trial.suggest_float("lr", 3e-4, 3e-2, log=True),
        "wd": trial.suggest_float("wd", 1e-6, 1e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [256, 512]),

        # loss
        "loss_type": trial.suggest_categorical("loss_type", ["bce", "focal"]),
        "focal_alpha": trial.suggest_float("focal_alpha", 0.2, 0.4),
        "focal_gamma": trial.suggest_float("focal_gamma", 2.0, 3.0),
    }

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=BASE_SEED)
    fold_scores = []

    for fold, (tr_idx, va_idx) in enumerate(kfold.split(X_num_all, y_all), 1):
        X_tr_raw, X_va_raw = X_num_all[tr_idx], X_num_all[va_idx]
        Xc_tr, Xc_va = X_cat_all[tr_idx], X_cat_all[va_idx]
        y_tr, y_va = y_all[tr_idx], y_all[va_idx]

        # ‚úÖ FoldÎ≥Ñ scaler (ÎàÑÏàò Ï†úÍ±∞)
        scaler = StandardScaler()
        X_tr = scaler.fit_transform(X_tr_raw)
        X_va = scaler.transform(X_va_raw)

        dl_tr = DataLoader(
            TabDataset(X_tr, Xc_tr, y_tr),
            batch_size=params["batch_size"],
            shuffle=True,
            drop_last=True,   # ‚úÖ BN ÏïàÏ†ïÌôî
        )
        dl_va = DataLoader(
            TabDataset(X_va, Xc_va, y_va),
            batch_size=512,
            shuffle=False,
        )

        model = DeepCrossNetworkV2(
            num_dim=len(NUM_COLS),
            cat_dims=cat_dims,
            hidden=params["hidden"],
            n_cross=params["n_cross"],
            dropout=params["dropout"],
        ).to(DEVICE)

        optimizer = torch.optim.AdamW(model.parameters(), lr=params["lr"], weight_decay=params["wd"])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)

        if params["loss_type"] == "bce":
            pos_weight = compute_pos_weight(y_tr)
            criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        else:
            criterion = FocalLoss(alpha=params["focal_alpha"], gamma=params["focal_gamma"])

        best_auc, patience = 0.0, 0

        for epoch in range(50):
            model.train()
            for xb_num, xb_cat, yb in dl_tr:
                xb_num = xb_num.to(DEVICE)
                xb_cat = xb_cat.to(DEVICE)
                yb = yb.to(DEVICE)

                optimizer.zero_grad()
                logits = model(xb_num, xb_cat)
                loss = criterion(logits, yb)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            scheduler.step()

            # val
            model.eval()
            preds, tgts = [], []
            with torch.no_grad():
                for xb_num, xb_cat, yb in dl_va:
                    xb_num = xb_num.to(DEVICE)
                    xb_cat = xb_cat.to(DEVICE)
                    p = torch.sigmoid(model(xb_num, xb_cat)).cpu().numpy()
                    preds.append(p)
                    tgts.append(yb.numpy())

            val_auc = roc_auc_score(np.concatenate(tgts), np.concatenate(preds))

            if val_auc > best_auc + 1e-6:
                best_auc = val_auc
                patience = 0
            else:
                patience += 1
                if patience >= 7:
                    break

        fold_scores.append(best_auc)

    return float(np.mean(fold_scores))

study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=BASE_SEED))
study.optimize(objective, n_trials=25, show_progress_bar=True)

best_params = study.best_params
print("\n‚úÖ Optuna ÏôÑÎ£å!")
print(f"üèÜ Best CV AUC: {study.best_value:.5f}")
print("üîß Best Parameters:")
for k, v in best_params.items():
    print(f"   {k}: {v}")

# ==================== 6. ÏµúÏ¢Ö ÌïôÏäµ: 5-Fold √ó 3 Seeds ÏïôÏÉÅÎ∏î ====================

print("\n" + "="*80)
print("üöÄ ÏµúÏ¢Ö ÌïôÏäµ: 5-Fold √ó 3 Seeds = 15 Models ÏïôÏÉÅÎ∏î")
print("="*80)

SEEDS = [42, 123, 777]

all_oof_preds = []
all_test_preds = []
all_fold_aucs = []

for seed_idx, seed in enumerate(SEEDS, 1):
    print(f"\n{'='*80}")
    print(f"üå± Seed {seed_idx}/3 (seed={seed})")
    print(f"{'='*80}")

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    oof_preds = np.zeros(len(X_num_all), dtype=np.float32)
    test_preds = np.zeros(len(test_num_all), dtype=np.float32)

    for fold, (tr_idx, va_idx) in enumerate(kfold.split(X_num_all, y_all), 1):
        print(f"   Fold {fold}/5...")

        X_tr_raw, X_va_raw = X_num_all[tr_idx], X_num_all[va_idx]
        Xc_tr, Xc_va = X_cat_all[tr_idx], X_cat_all[va_idx]
        y_tr, y_va = y_all[tr_idx], y_all[va_idx]

        # ‚úÖ FoldÎ≥Ñ scaler (ÎàÑÏàò Ï†úÍ±∞)
        scaler = StandardScaler()
        X_tr = scaler.fit_transform(X_tr_raw)
        X_va = scaler.transform(X_va_raw)
        X_te = scaler.transform(test_num_all)

        dl_tr = DataLoader(
            TabDataset(X_tr, Xc_tr, y_tr),
            batch_size=best_params["batch_size"],
            shuffle=True,
            drop_last=True,   # ‚úÖ BN ÏïàÏ†ïÌôî
        )
        dl_va = DataLoader(
            TabDataset(X_va, Xc_va, y_va),
            batch_size=512,
            shuffle=False,
        )

        model = DeepCrossNetworkV2(
            num_dim=len(NUM_COLS),
            cat_dims=cat_dims,
            hidden=best_params["hidden"],
            n_cross=best_params["n_cross"],
            dropout=best_params["dropout"],
        ).to(DEVICE)

        optimizer = torch.optim.AdamW(model.parameters(), lr=best_params["lr"], weight_decay=best_params["wd"])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)

        if best_params["loss_type"] == "bce":
            pos_weight = compute_pos_weight(y_tr)
            criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        else:
            criterion = FocalLoss(alpha=best_params["focal_alpha"], gamma=best_params["focal_gamma"])

        best_auc, best_state, patience = 0.0, None, 0

        for epoch in range(80):
            model.train()
            for xb_num, xb_cat, yb in dl_tr:
                xb_num = xb_num.to(DEVICE)
                xb_cat = xb_cat.to(DEVICE)
                yb = yb.to(DEVICE)

                optimizer.zero_grad()
                logits = model(xb_num, xb_cat)
                loss = criterion(logits, yb)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            scheduler.step()

            if epoch % 5 == 0 or epoch > 50:
                model.eval()
                preds, tgts = [], []
                with torch.no_grad():
                    for xb_num, xb_cat, yb in dl_va:
                        xb_num = xb_num.to(DEVICE)
                        xb_cat = xb_cat.to(DEVICE)
                        p = torch.sigmoid(model(xb_num, xb_cat)).cpu().numpy()
                        preds.append(p)
                        tgts.append(yb.numpy())

                val_auc = roc_auc_score(np.concatenate(tgts), np.concatenate(preds))

                if val_auc > best_auc + 1e-6:
                    best_auc = val_auc
                    best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
                    patience = 0
                else:
                    patience += 1
                    if patience >= 10:
                        break

        model.load_state_dict(best_state)
        model.eval()

        # OOF
        with torch.no_grad():
            va_num = torch.tensor(X_va, dtype=torch.float32).to(DEVICE)
            va_cat = torch.tensor(Xc_va, dtype=torch.long).to(DEVICE)
            oof_preds[va_idx] = torch.sigmoid(model(va_num, va_cat)).cpu().numpy()

        # Test
        with torch.no_grad():
            te_num = torch.tensor(X_te, dtype=torch.float32).to(DEVICE)
            te_cat = torch.tensor(test_cat_all, dtype=torch.long).to(DEVICE)
            test_preds += torch.sigmoid(model(te_num, te_cat)).cpu().numpy() / 5.0

        fold_auc = roc_auc_score(y_va, oof_preds[va_idx])
        all_fold_aucs.append(fold_auc)
        print(f"   ‚úÖ Fold {fold} AUC: {fold_auc:.5f}")

    seed_oof_auc = roc_auc_score(y_all, oof_preds)
    print(f"\n   üèÜ Seed {seed_idx} OOF AUC: {seed_oof_auc:.5f}")

    all_oof_preds.append(oof_preds)
    all_test_preds.append(test_preds)

final_oof = np.mean(all_oof_preds, axis=0)
final_test = np.mean(all_test_preds, axis=0)
final_auc = roc_auc_score(y_all, final_oof)

print("\n" + "="*80)
print("üéâ ÏµúÏ¢Ö Í≤∞Í≥º")
print("="*80)
print(f"üèÜ ÏµúÏ¢Ö OOF AUC: {final_auc:.5f}")
print(f"üìä Ï†ÑÏ≤¥ Fold AUC: {np.mean(all_fold_aucs):.5f} ¬± {np.std(all_fold_aucs):.5f}")
print(f"üìä ÏµúÍ≥† Fold AUC: {np.max(all_fold_aucs):.5f}")
print(f"üìä ÏµúÏ†Ä Fold AUC: {np.min(all_fold_aucs):.5f}")
print(f"üìä Fold AUC Î≤îÏúÑ: {np.max(all_fold_aucs) - np.min(all_fold_aucs):.5f}")
print("="*80)

submission = pd.DataFrame({
    "index": test_fe["index"] if "index" in test_fe.columns else np.arange(len(test_fe)),
    "voted": final_test
})
submission.to_csv("submission_dcn_final_fixed.csv", index=False)

print(f"\nüíæ Ï†úÏ∂ú ÌååÏùº: submission_dcn_final_fixed.csv")
print(f"üìä ÏòàÏ∏°Í∞í Î≤îÏúÑ: [{final_test.min():.4f}, {final_test.max():.4f}]")
print(f"üìä ÏòàÏ∏°Í∞í ÌèâÍ∑†: {final_test.mean():.4f}")
print(f"üìä ÏòàÏ∏°Í∞í std: {final_test.std():.4f}")

print("\n" + "="*80)
print("üèÜ ÏôÑÎ£å!")
print("="*80)


üèÜ DCN-V2 ÏµúÏ¢Ö Ï†úÏ∂úÏö© (Î≤ÑÍ∑∏/ÎàÑÏàò/ÏïàÏ†ïÏÑ± ÏàòÏ†ï ÏôÑÎ£å)

üìÇ Îç∞Ïù¥ÌÑ∞ Î°úÎìú...
Train: (45532, 79), Test: (11383, 77)
Ìà¨Ìëú ÏïàÌï®(positive) ÎπÑÏú®: 0.547
ÌÅ¥ÎûòÏä§ ÎπÑÏú® (neg:pos) ‚âà 0.83:1
üî® Feature Engineering...


[32m[I 2026-01-29 15:33:43,492][0m A new study created in memory with name: no-name-2ef9fe5a-951d-45cd-9e17-cde845bfd3c2[0m


‚úÖ ÏôÑÎ£å: (45532, 110), (11383, 108)

üìä ÏµúÏ¢Ö ÌîºÏ≤ò: 14 numeric + 3 cat = 17 total

üîß Categorical Encoding (Unseen ÎåÄÏùë)...
   race_cat: 5 categories
   married_cat: 5 categories
   religion_cat: 6 categories

üîç Optuna ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÌäúÎãù (25 trials)


Best trial: 0. Best value: 0.764005:   4%|‚ñç         | 1/25 [01:12<28:49, 72.07s/it]

[32m[I 2026-01-29 15:34:55,560][0m Trial 0 finished with value: 0.7640053879388116 and parameters: {'hidden': 256, 'n_cross': 3, 'dropout': 0.23120372808848733, 'lr': 0.0006153331256530192, 'wd': 1.493656855461763e-06, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.3939819704323989, 'focal_gamma': 2.832442640800422}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:   8%|‚ñä         | 2/25 [02:41<31:31, 82.25s/it]

[32m[I 2026-01-29 15:36:24,945][0m Trial 1 finished with value: 0.762821064630377 and parameters: {'hidden': 192, 'n_cross': 2, 'dropout': 0.3049512863264476, 'lr': 0.0021928619507738728, 'wd': 7.476312062252308e-06, 'batch_size': 256, 'loss_type': 'focal', 'focal_alpha': 0.2912139968434072, 'focal_gamma': 2.785175961393014}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  12%|‚ñà‚ñè        | 3/25 [04:14<32:00, 87.31s/it]

[32m[I 2026-01-29 15:37:58,279][0m Trial 2 finished with value: 0.7639425773568747 and parameters: {'hidden': 384, 'n_cross': 2, 'dropout': 0.3215089703802877, 'lr': 0.0006579145666993107, 'wd': 1.5673095467235414e-06, 'batch_size': 512, 'loss_type': 'bce', 'focal_alpha': 0.21953442280127677, 'focal_gamma': 2.684233026512157}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  16%|‚ñà‚ñå        | 4/25 [05:27<28:34, 81.63s/it]

[32m[I 2026-01-29 15:39:11,187][0m Trial 3 finished with value: 0.763704079317906 and parameters: {'hidden': 384, 'n_cross': 2, 'dropout': 0.3818640804157564, 'lr': 0.0009878277403270854, 'wd': 9.717775305059631e-05, 'batch_size': 512, 'loss_type': 'bce', 'focal_alpha': 0.39391692555291175, 'focal_gamma': 2.7751328233611146}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  20%|‚ñà‚ñà        | 5/25 [06:55<27:57, 83.86s/it]

[32m[I 2026-01-29 15:40:39,018][0m Trial 4 finished with value: 0.7586525121194146 and parameters: {'hidden': 192, 'n_cross': 4, 'dropout': 0.2176985004103839, 'lr': 0.0007397534164346214, 'wd': 1.3667272915456224e-06, 'batch_size': 512, 'loss_type': 'focal', 'focal_alpha': 0.2713506653387179, 'focal_gamma': 2.2809345096873805}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  24%|‚ñà‚ñà‚ñç       | 6/25 [08:01<24:38, 77.81s/it]

[32m[I 2026-01-29 15:41:45,086][0m Trial 5 finished with value: 0.7630089785156249 and parameters: {'hidden': 384, 'n_cross': 2, 'dropout': 0.39737738732010347, 'lr': 0.01051019547347606, 'wd': 3.945908811099999e-06, 'batch_size': 512, 'loss_type': 'focal', 'focal_alpha': 0.35425406933718917, 'focal_gamma': 2.0740446517340905}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  28%|‚ñà‚ñà‚ñä       | 7/25 [09:49<26:20, 87.79s/it]

[32m[I 2026-01-29 15:43:33,412][0m Trial 6 finished with value: 0.7616245379879556 and parameters: {'hidden': 384, 'n_cross': 3, 'dropout': 0.26617960497052984, 'lr': 0.00040201101730064396, 'wd': 8.569331925053988e-06, 'batch_size': 512, 'loss_type': 'focal', 'focal_alpha': 0.29444298503238986, 'focal_gamma': 2.119594245938302}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  32%|‚ñà‚ñà‚ñà‚ñè      | 8/25 [10:36<21:11, 74.78s/it]

[32m[I 2026-01-29 15:44:20,353][0m Trial 7 finished with value: 0.7608944977100391 and parameters: {'hidden': 256, 'n_cross': 4, 'dropout': 0.2987591192728782, 'lr': 0.003331094193916386, 'wd': 1.9170041589170666e-05, 'batch_size': 512, 'loss_type': 'focal', 'focal_alpha': 0.26287119621526533, 'focal_gamma': 2.508570691164703}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  36%|‚ñà‚ñà‚ñà‚ñå      | 9/25 [11:52<19:58, 74.91s/it]

[32m[I 2026-01-29 15:45:35,547][0m Trial 8 finished with value: 0.7622297697886578 and parameters: {'hidden': 192, 'n_cross': 4, 'dropout': 0.2457596330983245, 'lr': 0.00042764271132082304, 'wd': 7.400385759087375e-06, 'batch_size': 512, 'loss_type': 'bce', 'focal_alpha': 0.3742921180375436, 'focal_gamma': 2.8036720768991144}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  40%|‚ñà‚ñà‚ñà‚ñà      | 10/25 [12:42<16:51, 67.45s/it]

[32m[I 2026-01-29 15:46:26,296][0m Trial 9 finished with value: 0.7605324073400023 and parameters: {'hidden': 256, 'n_cross': 4, 'dropout': 0.37921825998469866, 'lr': 0.0012975622576159589, 'wd': 2.138729075414893e-06, 'batch_size': 512, 'loss_type': 'focal', 'focal_alpha': 0.20139042610623814, 'focal_gamma': 2.510747302577566}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 11/25 [13:35<14:39, 62.82s/it]

[32m[I 2026-01-29 15:47:18,608][0m Trial 10 finished with value: 0.7630551895053962 and parameters: {'hidden': 256, 'n_cross': 3, 'dropout': 0.20150946386325863, 'lr': 0.02196351972975781, 'wd': 0.00045390921584440366, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.3379949613316222, 'focal_gamma': 2.9682380234043237}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 12/25 [15:05<15:24, 71.09s/it]

[32m[I 2026-01-29 15:48:48,621][0m Trial 11 finished with value: 0.7624779760168441 and parameters: {'hidden': 256, 'n_cross': 3, 'dropout': 0.32888786590033564, 'lr': 0.0003310611456567973, 'wd': 1.302351808341558e-06, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.2148579373428467, 'focal_gamma': 2.6539989106470596}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 13/25 [16:17<14:18, 71.55s/it]

[32m[I 2026-01-29 15:50:01,218][0m Trial 12 finished with value: 0.7632560917767215 and parameters: {'hidden': 384, 'n_cross': 2, 'dropout': 0.3465298018258697, 'lr': 0.004330571625749953, 'wd': 6.717715238348049e-05, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.2439270837469243, 'focal_gamma': 2.9922514570539236}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 14/25 [17:30<13:11, 71.99s/it]

[32m[I 2026-01-29 15:51:14,233][0m Trial 13 finished with value: 0.7629899577871331 and parameters: {'hidden': 256, 'n_cross': 3, 'dropout': 0.26316360920364246, 'lr': 0.0007335673594070618, 'wd': 1.0317840912152478e-06, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.32934469713899767, 'focal_gamma': 2.6446736852680854}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 15/25 [18:57<12:44, 76.43s/it]

[32m[I 2026-01-29 15:52:40,964][0m Trial 14 finished with value: 0.76394319489131 and parameters: {'hidden': 384, 'n_cross': 2, 'dropout': 0.29242295168022564, 'lr': 0.0017995874107817624, 'wd': 0.0009355903554079245, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.2331112562770756, 'focal_gamma': 2.320850178619817}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 16/25 [19:59<10:48, 72.06s/it]

[32m[I 2026-01-29 15:53:42,853][0m Trial 15 finished with value: 0.7632599701961892 and parameters: {'hidden': 384, 'n_cross': 3, 'dropout': 0.2324637654987361, 'lr': 0.0017933398696491445, 'wd': 0.0007086372936028797, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.32185199124210256, 'focal_gamma': 2.322576260515312}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 17/25 [20:57<09:03, 67.97s/it]

[32m[I 2026-01-29 15:54:41,322][0m Trial 16 finished with value: 0.7636386101874312 and parameters: {'hidden': 256, 'n_cross': 2, 'dropout': 0.28034106536240305, 'lr': 0.0053594115069491586, 'wd': 0.00019554295990246637, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.39743674524519557, 'focal_gamma': 2.3384834454681886}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 18/25 [21:52<07:27, 63.87s/it]

[32m[I 2026-01-29 15:55:35,633][0m Trial 17 finished with value: 0.7636545550262885 and parameters: {'hidden': 256, 'n_cross': 3, 'dropout': 0.24045831877019366, 'lr': 0.007105972441935078, 'wd': 3.3302381560872376e-05, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.25023092472139963, 'focal_gamma': 2.175085040579309}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 19/25 [23:03<06:36, 66.01s/it]

[32m[I 2026-01-29 15:56:46,643][0m Trial 18 finished with value: 0.7638213339595905 and parameters: {'hidden': 384, 'n_cross': 3, 'dropout': 0.2868491473847308, 'lr': 0.001550705024755782, 'wd': 0.00025601264248343474, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.36314596746190864, 'focal_gamma': 2.3992946483188065}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 20/25 [24:18<05:43, 68.72s/it]

[32m[I 2026-01-29 15:58:01,667][0m Trial 19 finished with value: 0.7633173550045644 and parameters: {'hidden': 192, 'n_cross': 2, 'dropout': 0.20412394369743492, 'lr': 0.0022079534246549114, 'wd': 2.4878539518778382e-05, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.3073263473316586, 'focal_gamma': 2.454500202631525}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 0. Best value: 0.764005:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 21/25 [25:38<04:48, 72.06s/it]

[32m[I 2026-01-29 15:59:21,529][0m Trial 20 finished with value: 0.7616955323694735 and parameters: {'hidden': 384, 'n_cross': 4, 'dropout': 0.35536710896088436, 'lr': 0.0010799469301872484, 'wd': 0.000987485397383764, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.22981021675571447, 'focal_gamma': 2.195780426026256}. Best is trial 0 with value: 0.7640053879388116.[0m


Best trial: 21. Best value: 0.764245:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 22/25 [27:16<04:00, 80.12s/it]

[32m[I 2026-01-29 16:01:00,428][0m Trial 21 finished with value: 0.7642452778577706 and parameters: {'hidden': 384, 'n_cross': 2, 'dropout': 0.3215959950219359, 'lr': 0.0005948625155744105, 'wd': 2.8232625723379355e-06, 'batch_size': 512, 'loss_type': 'bce', 'focal_alpha': 0.22441917000690262, 'focal_gamma': 2.64411161074485}. Best is trial 21 with value: 0.7642452778577706.[0m


Best trial: 21. Best value: 0.764245:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 23/25 [29:04<02:56, 88.48s/it]

[32m[I 2026-01-29 16:02:48,413][0m Trial 22 finished with value: 0.7641328083386313 and parameters: {'hidden': 384, 'n_cross': 2, 'dropout': 0.32988329182816795, 'lr': 0.000475523957762377, 'wd': 3.63495390399594e-06, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.27358283493793467, 'focal_gamma': 2.8684780444982247}. Best is trial 21 with value: 0.7642452778577706.[0m


Best trial: 21. Best value: 0.764245:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 24/25 [30:38<01:30, 90.04s/it]

[32m[I 2026-01-29 16:04:22,102][0m Trial 23 finished with value: 0.7632052163337345 and parameters: {'hidden': 384, 'n_cross': 2, 'dropout': 0.32112604095430886, 'lr': 0.0005278760842468562, 'wd': 3.2914161586956905e-06, 'batch_size': 512, 'loss_type': 'bce', 'focal_alpha': 0.28621633863256424, 'focal_gamma': 2.8703825409596897}. Best is trial 21 with value: 0.7642452778577706.[0m


Best trial: 21. Best value: 0.764245: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [32:43<00:00, 78.55s/it] 


[32m[I 2026-01-29 16:06:27,343][0m Trial 24 finished with value: 0.7636934472669935 and parameters: {'hidden': 384, 'n_cross': 2, 'dropout': 0.34034415651795896, 'lr': 0.0005413568118415262, 'wd': 3.38625548255424e-06, 'batch_size': 256, 'loss_type': 'bce', 'focal_alpha': 0.2701020580488975, 'focal_gamma': 2.904074248596353}. Best is trial 21 with value: 0.7642452778577706.[0m

‚úÖ Optuna ÏôÑÎ£å!
üèÜ Best CV AUC: 0.76425
üîß Best Parameters:
   hidden: 384
   n_cross: 2
   dropout: 0.3215959950219359
   lr: 0.0005948625155744105
   wd: 2.8232625723379355e-06
   batch_size: 512
   loss_type: bce
   focal_alpha: 0.22441917000690262
   focal_gamma: 2.64411161074485

üöÄ ÏµúÏ¢Ö ÌïôÏäµ: 5-Fold √ó 3 Seeds = 15 Models ÏïôÏÉÅÎ∏î

üå± Seed 1/3 (seed=42)
   Fold 1/5...
   ‚úÖ Fold 1 AUC: 0.77548
   Fold 2/5...
   ‚úÖ Fold 2 AUC: 0.76460
   Fold 3/5...
   ‚úÖ Fold 3 AUC: 0.76079
   Fold 4/5...
   ‚úÖ Fold 4 AUC: 0.75676
   Fold 5/5...
   ‚úÖ Fold 5 AUC: 0.76069

   üèÜ Seed 1 OOF AUC: 0.7