# 01_MLP_v1_baseline 요약

- 모델: VotingMLP/Residual MLP
- 피처: neg/pos/neutral/confident, Big5 diff/strength, wr/wf/cred, demo ordinal, hand/married/race/religion
- 학습/평가: 단일 split, BCEWithLogits + pos_weight, ReduceLROnPlateau
- 제출파일: submission_01_MLP_v1_baseline.csv


In [202]:
import pandas as pd
import numpy as np

In [203]:
df_model = pd.read_csv("../../data/raw/train.csv")

### 1. 공통 전처리/피처 생성 + train/test 일괄 적용


In [204]:
# =========================
# Unified train/test pipeline
# =========================
from sklearn.preprocessing import LabelEncoder

# -----------------------------
# 공통 전처리/피처 생성 함수
# -----------------------------
def build_features(df_raw, cfg=None, is_train=True):
    df = df_raw.copy()

    # 0) voted -> voted_bin (train에만 존재)
    if is_train and "voted" in df.columns:
        df["voted_bin"] = (df["voted"] == 2).astype(int)

    # 1) age_group -> ordinal
    if "age_group" in df.columns:
        age_map = {"10s":1, "20s":2, "30s":3, "40s":4, "50s":5, "60s":6, "+70s":7}
        df["age_group_ord"] = df["age_group"].map(age_map).astype("float32")

    # 2) education (0=무응답 -> NaN -> train 평균으로 대치)
    if "education" in df.columns:
        df["education"] = pd.to_numeric(df["education"], errors="coerce")
        df.loc[df["education"] == 0, "education"] = np.nan
        if cfg is not None:
            df["education"] = df["education"].fillna(cfg["education_mean"])
        df["education"] = df["education"].astype("float32")

    # 3) married_bin (1=미혼 vs 기타, 0=무응답 NaN)
    if "married" in df.columns:
        df["married"] = pd.to_numeric(df["married"], errors="coerce")
        df.loc[df["married"] == 0, "married"] = np.nan
        df["married_bin"] = (df["married"] == 1).astype("float32")

    # 4) hand_bin (3=양손 vs 기타, 0=무응답 NaN)
    if "hand" in df.columns:
        df["hand"] = pd.to_numeric(df["hand"], errors="coerce")
        df.loc[df["hand"] == 0, "hand"] = np.nan
        df["hand_bin"] = (df["hand"] == 3).astype("float32")

    # 5) urban_ord (0=무응답 NaN)
    if "urban" in df.columns:
        df["urban"] = pd.to_numeric(df["urban"], errors="coerce")
        df.loc[df["urban"] == 0, "urban"] = np.nan
        df["urban_ord"] = df["urban"].astype("float32")

    # 6) race/religion 단순화: train에서 뽑은 top-k만 살리고 나머지 Other
    def simplify_major_other(series, majors):
        return series.apply(lambda x: x if x in majors else "Other")

    if "race" in df.columns and cfg is not None:
        df["race_simple"] = simplify_major_other(df["race"], cfg["race_majors"]).astype(str)

    if "religion" in df.columns and cfg is not None:
        df["religion_simple"] = simplify_major_other(df["religion"], cfg["religion_majors"]).astype(str)

    # 7) Q_A: neg_att / pos_att / confident_ratio / neutral_ratio
    neg_cols = ["QbA","QcA","QjA","QmA","QoA","QsA"]
    pos_cols = ["QkA","QqA"]
    other_cols = ["QeA","QfA","QhA","QrA"]

    for col in neg_cols + pos_cols + other_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    if all(c in df.columns for c in neg_cols):
        df["neg_att"] = df[neg_cols].mean(axis=1)

    if all(c in df.columns for c in pos_cols):
        df["pos_att"] = df[pos_cols].mean(axis=1)

    if all(c in df.columns for c in other_cols):
        other = df[other_cols]
        df["neutral_ratio"] = (other == 3).mean(axis=1).astype("float32")
        df["confident_ratio"] = ((other <= 2) | (other >= 4)).mean(axis=1).astype("float32")

    # 8) TP Big5 diff/strength
    tp_pairs = {
        "extraversion": ("tp01", "tp06"),
        "agreeableness": ("tp07", "tp02"),
        "conscientiousness": ("tp03", "tp08"),
        "neuroticism": ("tp04", "tp09"),
        "openness": ("tp05", "tp10"),
    }
    for trait, (a, b) in tp_pairs.items():
        if a in df.columns and b in df.columns:
            df[a] = pd.to_numeric(df[a], errors="coerce")
            df[b] = pd.to_numeric(df[b], errors="coerce")
            df[f"{trait}_diff"] = (df[a] - df[b]).astype("float32")
            df[f"{trait}_strength"] = df[f"{trait}_diff"].abs().astype("float32")

    # 9) 단어 인지: wr_sum / wf_sum / word_credibility
    wr_cols = [f"wr_{i:02d}" for i in range(1, 14)]
    wf_cols = [f"wf_{i:02d}" for i in range(1, 4)]

    if all(c in df.columns for c in wr_cols):
        df["wr_sum"] = df[wr_cols].sum(axis=1).astype("float32")

    if all(c in df.columns for c in wf_cols):
        df["wf_sum"] = df[wf_cols].sum(axis=1).astype("float32")

    if "wr_sum" in df.columns and "wf_sum" in df.columns:
        df["word_credibility"] = (df["wr_sum"] - df["wf_sum"]).astype("float32")

    # 9-1) 단어 인지 구간화 (EDA 기준)
    if "word_credibility" in df.columns:
        df["cred_bin"] = pd.cut(
            df["word_credibility"],
            bins=[-3, 1, 6, 13],
            labels=["Low", "Mid", "High"]
        )

    return df


# -----------------------------
# (train에서) cfg 만들기: education 평균 + race/religion top-k
# -----------------------------
train = pd.read_csv("../../data/raw/train.csv")
test  = pd.read_csv("../../data/raw/test_x.csv")

cfg = {}
edu = pd.to_numeric(train["education"], errors="coerce").replace(0, np.nan)
cfg["education_mean"] = float(edu.mean())

cfg["race_majors"] = set(train["race"].value_counts(dropna=True).head(5).index)
cfg["religion_majors"] = set(train["religion"].value_counts(dropna=True).head(5).index)

# -----------------------------
# train/test 동일 파이프라인 적용
# -----------------------------
df_train_final = build_features(train, cfg=cfg, is_train=True)
df_test_final  = build_features(test,  cfg=cfg, is_train=False)

# -----------------------------
# 모델 입력 컬럼
# -----------------------------
target = "voted_bin"
feature_cols = [
    "neg_att", "pos_att", "neutral_ratio", "confident_ratio",
    "extraversion_diff", "extraversion_strength",
    "agreeableness_diff", "agreeableness_strength",
    "conscientiousness_diff", "conscientiousness_strength",
    "neuroticism_diff", "neuroticism_strength",
    "openness_diff", "openness_strength",
    "wr_sum", "wf_sum", "word_credibility",
    "age_group_ord", "education", "urban_ord",
    "hand_bin", "married_bin",
    "race_simple", "religion_simple",
]

cat_cols = ["race_simple", "religion_simple"]

# -----------------------------
# 범주형 인코딩 (train 기준) + 결측치 대치 (train 평균 기준)
# -----------------------------
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df_train_final[col] = le.fit_transform(df_train_final[col].astype(str))
    df_test_final[col] = le.transform(df_test_final[col].astype(str))
    label_encoders[col] = le

num_cols = [c for c in feature_cols if c not in cat_cols]
numeric_train = df_train_final[num_cols].apply(pd.to_numeric, errors="coerce")
numeric_test = df_test_final[num_cols].apply(pd.to_numeric, errors="coerce")
train_means = numeric_train.mean()

df_train_final[num_cols] = numeric_train.fillna(train_means)
df_test_final[num_cols] = numeric_test.fillna(train_means)

X = df_train_final[feature_cols].values
y = df_train_final[target].values

print(df_train_final.shape)
df_train_final.head()


(45532, 103)


Unnamed: 0,index,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,...,conscientiousness_diff,conscientiousness_strength,neuroticism_diff,neuroticism_strength,openness_diff,openness_strength,wr_sum,wf_sum,word_credibility,cred_bin
0,0,3.0,363,4.0,1370,5.0,997,1.0,1024,2.0,...,-2.0,2.0,-3.0,3.0,-1.0,1.0,7.0,0.0,7.0,High
1,1,5.0,647,5.0,1313,3.0,3387,5.0,2969,1.0,...,-4.0,4.0,0.0,0.0,-3.0,3.0,8.0,0.0,8.0,High
2,2,4.0,1623,1.0,1480,1.0,1021,4.0,3374,5.0,...,-5.0,5.0,4.0,4.0,0.0,0.0,10.0,1.0,9.0,High
3,3,3.0,504,3.0,2311,4.0,992,3.0,3245,1.0,...,-2.0,2.0,0.0,0.0,-2.0,2.0,5.0,0.0,5.0,Mid
4,4,1.0,927,1.0,707,5.0,556,2.0,1062,1.0,...,-5.0,5.0,4.0,4.0,-6.0,6.0,11.0,1.0,10.0,High


In [205]:
df_train_final.isna().mean().sort_values(ascending=False).head(50)


urban                         0.007072
hand                          0.003536
married                       0.002043
cred_bin                      0.000176
wr_02                         0.000000
wr_11                         0.000000
wr_10                         0.000000
wr_09                         0.000000
wr_08                         0.000000
wr_07                         0.000000
wr_06                         0.000000
wr_05                         0.000000
wr_04                         0.000000
wr_03                         0.000000
wf_03                         0.000000
wr_01                         0.000000
tp04                          0.000000
wf_02                         0.000000
wf_01                         0.000000
voted                         0.000000
tp10                          0.000000
tp09                          0.000000
tp08                          0.000000
tp07                          0.000000
tp06                          0.000000
wr_12                    

In [206]:
df_train_final["voted_bin"].value_counts(normalize=True)


voted_bin
1    0.546824
0    0.453176
Name: proportion, dtype: float64

### 2. 결측/분포 체크


### 3. Train/Validation 분리


In [207]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

X = df_train_final[feature_cols].values
y = df_train_final[target].values

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# scale numeric features (train 기준)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


### 4. 딥러닝 모델 정의 (PyTorch MLP)


In [208]:
import torch
import torch.nn as nn

class ResidualMLPBlock(nn.Module):
    def __init__(self, dim, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(dim, dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(dim, dim)
        self.norm = nn.LayerNorm(dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        x = x + residual
        x = self.norm(x)
        return x

class VotingMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=512, num_blocks=4, dropout=0.3):
        super().__init__()
        self.in_proj = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout),
        )
        self.blocks = nn.Sequential(*[ResidualMLPBlock(hidden_dim, dropout=dropout) for _ in range(num_blocks)])
        self.head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1)
        )

    def forward(self, x):
        x = self.in_proj(x)
        x = self.blocks(x)
        return self.head(x)


### 5. 학습 세팅


In [209]:
model = VotingMLP(input_dim=X.shape[1], hidden_dim=512, num_blocks=4, dropout=0.3)

# 불균형 보정 (pos_weight)
pos_weight = (len(y_train) - y_train.sum()) / (y_train.sum() + 1e-6)
pos_weight_t = torch.tensor([pos_weight], dtype=torch.float32)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_t)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=3e-4)

# val loss 기준으로 lr 감소
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-5
)

X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

X_val_t = torch.tensor(X_val, dtype=torch.float32)
y_val_t = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)


### 6. 학습 루프 (20~30 epoch)


In [210]:
best_val = float('inf')
patience = 6
patience_ctr = 0

for epoch in range(50):
    model.train()
    optimizer.zero_grad()

    logits = model(X_train_t)
    loss = criterion(logits, y_train_t)

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_logits = model(X_val_t)
        val_loss = criterion(val_logits, y_val_t)

    scheduler.step(val_loss)

    if val_loss.item() < best_val - 1e-4:
        best_val = val_loss.item()
        patience_ctr = 0
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
    else:
        patience_ctr += 1

    if epoch % 5 == 0:
        print(f"Epoch {epoch} | train {loss.item():.4f} | val {val_loss.item():.4f}")

    if patience_ctr >= patience:
        print("Early stopping")
        break

# best model restore
if 'best_state' in locals():
    model.load_state_dict(best_state)


Epoch 0 | train 0.6333 | val 0.5729
Epoch 5 | train 0.5496 | val 0.5498
Epoch 10 | train 0.5364 | val 0.5222
Epoch 15 | train 0.5284 | val 0.5217
Epoch 20 | train 0.5245 | val 0.5199
Epoch 25 | train 0.5235 | val 0.5173
Epoch 30 | train 0.5209 | val 0.5158
Epoch 35 | train 0.5202 | val 0.5150
Epoch 40 | train 0.5202 | val 0.5139
Epoch 45 | train 0.5195 | val 0.5142
Early stopping


### 7. Validation 성능평가


In [211]:
from sklearn.metrics import accuracy_score, roc_auc_score

model.eval()
with torch.no_grad():
    val_logits = model(X_val_t)
    val_probs = torch.sigmoid(val_logits).cpu().numpy().ravel()
    val_pred = (val_probs >= 0.5).astype(int)

acc = accuracy_score(y_val, val_pred)
auc = roc_auc_score(y_val, val_probs)

print(f"Validation Accuracy: {acc:.4f}")
print(f"Validation ROC-AUC: {auc:.4f}")


Validation Accuracy: 0.6916
Validation ROC-AUC: 0.7620


### 8. Test 예측 & 제출 파일 생성


In [187]:
# test 예측
X_test = df_test_final[feature_cols].values
X_test = scaler.transform(X_test)

model.eval()
X_test_t = torch.tensor(X_test, dtype=torch.float32)
with torch.no_grad():
    test_logits = model(X_test_t)
    test_probs = torch.sigmoid(test_logits).cpu().numpy().ravel()

# index 기준으로 제출 파일 생성
test_pred = pd.DataFrame({
    "index": df_test_final["index"].values,
    "voted": test_probs
})

sub = pd.read_csv("../../data/raw/sample_submission.csv")

sub["voted"] = test_probs

sub.to_csv("submission_01_MLP_v1_baseline.csv", index=False)
print("saved submission_01_MLP_v1_baseline.csv")



saved submission_prob_class2.csv


In [188]:
print(sub.head())
print(sub["voted"].describe())
print("rows:", len(sub), "cols:", sub.columns.tolist())

   index     voted
0      0  0.892855
1      1  0.781640
2      2  0.400190
3      3  0.219252
4      4  0.664711
count    11383.000000
mean         0.525277
std          0.260404
min          0.125817
25%          0.310033
50%          0.422660
75%          0.766478
max          0.997147
Name: voted, dtype: float64
rows: 11383 cols: ['index', 'voted']


In [163]:
auc_pos1 = roc_auc_score(y_val, val_probs)        # 지금 방식 (pos=1)
auc_pos2 = roc_auc_score(1 - y_val, val_probs)    # pos를 반대로 본 것과 동일

print(auc_pos1, auc_pos2, auc_pos1 + auc_pos2)


0.7626353001051943 0.23736469989480577 1.0


### 전체 흐름 요약
1. **데이터 로드**: train/test 원본을 불러옴
2. **공통 전처리/파생 변수 생성**: Q_A 태도 요약, Big5 diff/strength, 단어 인지, 인구통계 파생, 범주 단순화
3. **결측/분포 확인**: 결측 비율과 클래스 분포 확인
4. **학습/검증 분리**: stratified split
5. **스케일링**: train 기준 표준화 후 val/test에 동일 적용
6. **모델 정의**: Residual MLP
7. **학습 세팅**: AdamW, pos_weight, LR scheduler, early stopping
8. **학습/평가**: validation accuracy/AUC 확인
9. **테스트 예측**: 확률 출력
10. **제출 파일 생성**: 확률값을 submission에 저장


### 사용한 모델/학습 요약
- 모델: VotingMLP (Residual MLP)
- 구조: hidden_dim=512, num_blocks=4, dropout=0.3, GELU + LayerNorm
- 손실: BCEWithLogitsLoss + pos_weight
- 최적화: AdamW (lr=3e-4, weight_decay=3e-4)
- 스케줄러: ReduceLROnPlateau (val loss 기준)
- 기타: StandardScaler, gradient clipping, early stopping
