# 성별 추론

### 사용자의 로그인 전 행동 패턴을 보고 사용자의 성별을 추론하는 실습입니다.

## 1) 환경 구성
- 실습을 원활하게 진행하기 위해 환경을 구성합니다. 

In [None]:
# Cell 1
import os
import json
import warnings
import unicodedata
import subprocess
from pathlib import Path

from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from joblib import dump
from matplotlib.colors import ListedColormap

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (classification_report, confusion_matrix, f1_score, roc_curve, roc_auc_score, precision_recall_curve, average_precision_score,)

# Silence every warning for the rest of the notebook session.
warnings.filterwarnings("ignore")

# Widen pandas' display limits so printed frames are not truncated early.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

# ---------- Paths / constants ----------
DATASETS_DIR  = Path("datasets") / "gender"
INPUT_PATH    = "datasets/gender/processed_user_behavior.joined.csv" # original table joined with the Search_keyword -> Category and Product_id -> Category mapping tables
OUTPUT_DIR    = DATASETS_DIR
ARTIFACT_DIR  = Path("models") / "gender"

# Create the data and artifact directories up front (no error if they already exist).
DATASETS_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# Minimum number of pre-login rows a session needs to be kept (applied in Cell 2).
MIN_PRELOGIN_EVENTS = 2

# Categories that receive their own count / proportion / log feature columns (Cell 4).
KNOWN_CATS = ["Books", "Electronics", "Gaming", "Home", "Fashion"]

RANDOM_STATE = 42
# Probability cut-off applied to the calibrated baseline model in Cell 8.
PRODUCTION_THRESHOLD = 0.500

def norm(s: pd.Series) -> pd.Series:
    """Strip surrounding whitespace and apply Unicode NFKC normalization (NA-safe)."""
    def _nfkc(value):
        # Missing values pass through untouched; everything else is normalized.
        if pd.isna(value):
            return value
        return unicodedata.normalize("NFKC", value)

    stripped = s.astype("string").str.strip()
    return stripped.apply(_nfkc)

# Signal that the environment-setup cell finished running.
print("[Done]")

## 2) 원본 데이터 로드 & 로그인 전 구간 추출
- 원본과 매핑 테이블을 조인한 데이터를 읽어와 로그인 전 구간을 추출합니다.

In [None]:
# Cell 2
# Load the joined behaviour log from the path configured in Cell 1.
df = pd.read_csv(INPUT_PATH, low_memory=False)

# Parse timestamps (unparseable values become NaT), then drop rows that lack a
# session_id or a valid timestamp.
df["ts"] = pd.to_datetime(df["timestamp"], errors="coerce")
df = df.dropna(subset=["session_id", "ts"]).copy()

# Sort events chronologically within each session; event_id, when present, is an
# additional sort key for rows sharing a timestamp.
sort_cols = ["session_id", "ts"]
if "event_id" in df.columns:
    sort_cols.append("event_id")
df = df.sort_values(sort_cols).reset_index(drop=True)

print("원본 행 수:", len(df))

# A row is treated as "logged in" when user_id is non-null and is not one of the
# placeholder strings listed in anon_like.
uid_str  = df["user_id"].astype("string").str.strip()
anon_like = {"", "0", "-1", "None", "none", "NULL", "null", "NaN", "nan"}
has_uid = uid_str.notna() & ~uid_str.isin(anon_like)

# Per-session running maximum: True from the first logged-in row onwards, so
# ~appeared selects only the rows that precede the first logged-in row.
appeared = has_uid.groupby(df["session_id"]).cummax()
pre = df.loc[~appeared].copy()

# Keep only sessions that have at least MIN_PRELOGIN_EVENTS pre-login rows.
prelogin_counts = pre.groupby("session_id").size()
valid_sessions = prelogin_counts.index[prelogin_counts >= MIN_PRELOGIN_EVENTS]
pre = pre[pre["session_id"].isin(valid_sessions)].copy()

print("로그인 전 로그 필터 후 행 수:", len(pre))
print("로그인 전 유일 세션 수:", pre["session_id"].nunique())


## 3) Target Label 생성 & 로그인 전 구간 수치 집계
- 정답지 역할인 Target Label을 생성하고, 로그인 완료한 세션 수를 집계합니다.

In [None]:
# Cell 3
# Normalise the gender column on a copy so df itself is left unchanged.
df_gender = df.copy()
df_gender["gender_norm"] = (
    df_gender["gender"]
      .astype("string").str.strip().str.upper()
      .replace({"FEMALE":"F", "MALE":"M"})
)
# Session label = the first M/F value found in the session (rows were sorted by
# session_id and ts in Cell 2).
lab_full = (
    df_gender[df_gender["gender_norm"].isin(["M","F"])] # only post-login rows (assumes gender is populated only after login — TODO confirm)
      .groupby("session_id")["gender_norm"].agg(lambda s: s.iloc[0])
)

# Numeric per-session aggregates computed from pre-login rows only; aggregates
# that come out as NaN are replaced with 0.
agg_num = pre.groupby("session_id").agg(
    n_events=("session_id", "size"),
    search_count_sum=("search_count", "sum"),
    cart_item_count_sum=("cart_item_count", "sum"),
    page_depth_mean=("page_depth", "mean"),
    last_elapsed_mean=("last_action_elapsed", "mean"),
    unique_pages=("current_state", "nunique"),
    unique_categories=("resolved_category", "nunique"),
).fillna(0.0)

# Hour and weekday of the earliest pre-login event of each session.
first_ts = pre.groupby("session_id")["ts"].min()
agg_num["start_hour"] = first_ts.dt.hour
agg_num["start_weekday"] = first_ts.dt.weekday

# Only sessions that have both pre-login features and a gender label are usable.
keep_sessions = agg_num.index.intersection(lab_full.index)
X_num = agg_num.loc[keep_sessions].copy()
y = lab_full.loc[keep_sessions].rename("gender").copy()

print("로그인 완료한 세션 수:", len(keep_sessions))
print("성별 분포:\n", y.value_counts(dropna=False))


## 4) 카테고리 카운트 & 파생 컬럼 생성 (Feature Engineering)
- 카테고리 등장 횟수를 저장하는 카운트 컬럼, 횟수를 비율로 치환하는 파생 컬럼을 생성합니다.

In [None]:
# Cell 4
# Normalised (stripped + NFKC) category names, plus a map back to the original spelling.
KNOWN_CATS_NORM = [unicodedata.normalize("NFKC", c.strip()) for c in KNOWN_CATS]
norm_to_orig = {unicodedata.normalize("NFKC", c.strip()): c for c in KNOWN_CATS}

pre_cat_norm = norm(pre["resolved_category"])

# Keep only pre-login rows whose category is one of the known categories.
mask = pre_cat_norm.isin(KNOWN_CATS_NORM)
pre_kept = pre[mask].copy()
pre_kept["cat_norm"] = pre_cat_norm[mask].values
pre_kept["one"] = 1

# Session x category count matrix (summing the constant 1 counts rows).
cat_cnt = pre_kept.pivot_table(
    index="session_id",
    columns="cat_norm",
    values="one",
    aggfunc="sum",
    fill_value=0
)

# Guarantee one column per known category, in KNOWN_CATS order, even when a
# category never occurs in the data.
cat_cnt = cat_cnt.reindex(columns=KNOWN_CATS_NORM, fill_value=0)
cat_cnt.columns = [norm_to_orig[c] for c in cat_cnt.columns]

cat_cnt.columns = [f"cat_cnt::{c}" for c in cat_cnt.columns]
# Align to the labelled sessions; sessions without any known category get zeros.
cat_cnt = cat_cnt.reindex(X_num.index).fillna(0).astype(int)

cat_cnt_cols = list(cat_cnt.columns)

# Share of a session's pre-login events per category (a zero n_events is replaced
# by 1 to avoid division by zero).
cat_prop = cat_cnt.div(X_num["n_events"].replace(0, 1), axis=0)
cat_prop.columns = [c.replace("cat_cnt::", "cat_prop::") for c in cat_cnt_cols]

# log(1 + count) version of the counts.
cat_log = np.log1p(cat_cnt)
cat_log.columns = [c.replace("cat_cnt::", "cat_log::") for c in cat_cnt_cols]

print("카테고리 카운트 컬럼:", list(cat_cnt.columns))
print("카테고리 파생 컬럼:", list(cat_prop.columns[:3]) + list(cat_log.columns[:3]))


## 5) 파생 컬럼 결합 및 저장 (Feature Engineering)

In [None]:
# Cell 5
# Combine the numeric aggregates with the count / proportion / log category features.
X = (
    X_num
    .join(cat_cnt, how="left")
    .join(cat_prop, how="left")
    .join(cat_log, how="left")
).fillna(0)

# Make sure the index is named session_id so reset_index() yields that column.
X_ = X.copy()
if X_.index.name != "session_id":
    X_.index.name = "session_id"

# Attach the gender label and turn the index into a regular column.
dataset = X_.join(y).reset_index()

print("dataset shape:", dataset.shape)
print(dataset.head(10))

# Persist the feature table as both CSV and Parquet under OUTPUT_DIR.
FEATURES_CSV = f"{OUTPUT_DIR}/prelogin_gender_features.csv"
FEATURES_PARQUET = f"{OUTPUT_DIR}/prelogin_gender_features.parquet"

dataset.to_csv(FEATURES_CSV, index=False)
dataset.to_parquet(FEATURES_PARQUET, index=False)

print(f"\nSaved: {FEATURES_CSV}")
print(f"Saved: {FEATURES_PARQUET}")

## 6) 학습용 / 검증용 데이터셋 분할
- 데이터를 학습용 / 검증용 데이터셋으로 분할합니다.

In [None]:
# Cell 6
# Re-normalise the label column to the two codes "F" / "M".
dataset["gender"] = (
    dataset["gender"].astype("string").str.strip().str.upper()
    .replace({"FEMALE":"F","MALE":"M"})
)

# Every column except the session key and the label is a model feature.
non_feature = {"session_id","gender"}
feature_cols = [c for c in dataset.columns if c not in non_feature]
X = dataset[feature_cols].copy()
# Binary target: F -> 0, M -> 1.
y_bin = dataset["gender"].map({"F":0, "M":1}).astype(int)

# Single stratified 80/20 split; the indices returned are positional.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, valid_idx = next(sss.split(X, y_bin))
X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
y_train, y_valid = y_bin.iloc[train_idx], y_bin.iloc[valid_idx]

print("Train:", X_train.shape, " Valid:", X_valid.shape)
print("Label dist (train):\n", y_train.value_counts(normalize=True).rename({0:"F",1:"M"}))
print("Label dist (valid):\n", y_valid.value_counts(normalize=True).rename({0:"F",1:"M"}))


## 7) 모델 학습
- 학습용 데이터 셋으로 모델을 학습합니다.

In [None]:
# Cell 7
def tune_threshold(model, X_va, y_va, search=(0.2,0.8,61)):
    """Search for the probability threshold that maximises macro-F1.

    The model must expose ``predict_proba``. ``search`` is ``(low, high, count)``
    and defines an evenly spaced grid of candidate thresholds.

    Returns ``(best_threshold, best_macro_f1, positive_class_probabilities)``;
    the first grid point reaching the maximum wins ties.
    """
    lo, hi, n = search
    candidates = np.linspace(lo, hi, n)
    proba = model.predict_proba(X_va)[:,1]

    scores = [
        float(f1_score(y_va, (proba >= cut).astype(int), average="macro"))
        for cut in candidates
    ]
    if not scores:
        # Empty grid: fall back to the initial values.
        return 0.5, -1, proba

    top = int(np.argmax(scores))  # argmax returns the first maximum
    return float(candidates[top]), scores[top], proba

def evaluate(model, X_va, y_va, threshold=0.5, name="Model"):
    """Print a classification report and confusion matrix for ``model``.

    When the model has ``predict_proba`` the positive-class probability is cut at
    ``threshold``; otherwise ``model.predict`` is used and no probabilities are
    returned.

    Returns ``(predictions, probabilities_or_None)``.
    """
    proba = None
    if not hasattr(model, "predict_proba"):
        y_hat = model.predict(X_va)
    else:
        proba = model.predict_proba(X_va)[:,1]
        y_hat = (proba >= threshold).astype(int)

    print(f"\n=== {name} @ threshold={threshold:.3f} ===")
    report = classification_report(y_va, y_hat, target_names=["F","M"], digits=4)
    print(report)
    matrix = confusion_matrix(y_va, y_hat)
    print("Confusion matrix [rows=true F,M | cols=pred F,M]:\n", matrix)
    return y_hat, proba

def print_scores(name, y_true, proba, t):
    """Print macro-F1 and accuracy of ``proba`` cut at threshold ``t``.

    ``y_true`` must offer ``.values`` (e.g. a pandas Series).
    """
    predicted = (proba >= t).astype(int)
    f1_macro = f1_score(y_true, predicted, average="macro")
    hits = y_true.values == predicted
    accuracy = hits.mean()
    print(f"[{name}] Macro-F1 : {f1_macro:.4f} / Accuracy : {accuracy:.4f}")

# Calibration settings; a value already present in the notebook's globals takes
# precedence over the defaults given here.
CALIB_METHOD = globals().get("CALIB_METHOD", "isotonic")
CALIB_CV     = globals().get("CALIB_CV", 5)

# Baseline model: standardised features -> L2-regularised logistic regression.
log_best = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        solver="saga", penalty="l2", C=1.0,
        max_iter=5000, random_state=RANDOM_STATE,
    ))
])
log_best.fit(X_train, y_train)

# Try the `estimator` keyword first and fall back to `base_estimator` on TypeError —
# presumably to support scikit-learn versions that name the argument differently.
try:
    cal_log = CalibratedClassifierCV(estimator=log_best, method=CALIB_METHOD, cv=CALIB_CV)
except TypeError:
    cal_log = CalibratedClassifierCV(base_estimator=log_best, method=CALIB_METHOD, cv=CALIB_CV)

cal_log.fit(X_train, y_train)
print(f"[Calibration] done: method={CALIB_METHOD}, cv={CALIB_CV}")

# Summary of the baseline configuration, intended for metadata output.
best_params_jsonable = {
    "scaler": {"name": "StandardScaler"},
    "clf": {"name": "LogisticRegression", "params": log_best.named_steps["clf"].get_params(deep=False)}
}
# No cross-validated score is computed in this cell.
best_cv_f1_macro = None

## 8) 학습 평가
- 검증용 데이터 셋으로 모델을 검증하여 평가합니다.

In [None]:
# Cell 8

# Evaluate the calibrated baseline on the validation split at the fixed
# production threshold.
yhat_prod, _ = evaluate(
    cal_log, X_valid, y_valid,
    threshold=PRODUCTION_THRESHOLD,
    name=f"LogisticCal (prod={PRODUCTION_THRESHOLD:.3f})"
)
proba_v = cal_log.predict_proba(X_valid)[:, 1]
print_scores(f"LogisticCal (prod@{PRODUCTION_THRESHOLD:.3f})", y_valid, proba_v, PRODUCTION_THRESHOLD)
print("\n" + "-"*82)

# Session ids of the validation rows (valid_idx is positional, hence iloc).
sess_ids = (
    dataset.iloc[valid_idx]["session_id"].values
    if "session_id" in dataset.columns else np.arange(len(valid_idx))
)
pred_bin   = (proba_v >= PRODUCTION_THRESHOLD).astype(int)
pred_col   = f"y_pred@{PRODUCTION_THRESHOLD:.3f}"

# One row per validation session: truth, thresholded prediction, calibrated probability.
out_df = pd.DataFrame({
    "session_id": sess_ids,
    "y_true": y_valid.values,
    pred_col: pred_bin,
    "proba_cal": proba_v,
})

# Bring in age / age_group when the feature table already carries them.
extra_cols = [c for c in ["age", "age_group"] if c in dataset.columns]
if extra_cols:
    out_df = out_df.merge(
        dataset[["session_id"] + extra_cols].drop_duplicates("session_id"),
        on="session_id", how="left"
    )

metrics_df = out_df.copy()
if "age_group" not in metrics_df.columns:
    # Otherwise derive the age group from the rows at/after the first logged-in
    # row of each session (the `appeared` mask from Cell 2).
    post = df.loc[appeared].copy()
    if "age" in post.columns:
        # First non-null numeric age per session (NaN when the session has none).
        age_post_by_sess = (
            pd.to_numeric(post["age"], errors="coerce")
              .groupby(post["session_id"])
              .apply(lambda s: s.dropna().iloc[0] if len(s.dropna()) else np.nan)
              .rename("age")
        )
        age_map = age_post_by_sess.to_frame()
        def to_age_bucket(x):
            # <25 -> "young", <50 -> "middle", otherwise "old"; NaN stays NaN.
            if pd.isna(x): return np.nan
            x = float(x)
            return "young" if x < 25 else ("middle" if x < 50 else "old")
        age_map["age_group"] = age_map["age"].apply(to_age_bucket)
        age_map = age_map.reset_index()  # restore session_id as a column
        metrics_df = metrics_df.merge(age_map, on="session_id", how="left")

# Sessions without a known age group are reported as "unknown"; the default
# Series covers the case where no age_group column could be built at all.
metrics_df["age_group"] = (
    metrics_df.get("age_group", pd.Series([np.nan]*len(metrics_df)))
    .astype("string")
    .fillna("unknown")
)

# Accuracy and macro-F1 per age group; macro-F1 is NaN when a group contains
# only one true class.
rows = []
for g, sub in metrics_df.groupby("age_group", dropna=False):
    y_true = sub["y_true"].astype(int).to_numpy()
    y_hat  = sub[pred_col].astype(int).to_numpy()
    acc = float((y_true == y_hat).mean())
    f1  = float(f1_score(y_true, y_hat, average="macro")) if len(np.unique(y_true)) > 1 else np.nan
    rows.append({"age_group": str(g), "acc": acc, "macro_f1": f1})

# Present the groups in the order young, middle, old, unknown.
scores_by_age = pd.DataFrame(rows)
order = pd.Categorical(scores_by_age["age_group"], categories=["young","middle","old","unknown"], ordered=True)
scores_by_age = (scores_by_age.assign(_ord=order).sort_values("_ord").drop(columns="_ord").reset_index(drop=True))
scores_by_age[["acc","macro_f1"]] = scores_by_age[["acc","macro_f1"]].round(4)

print("\n[Age-group metrics]")
print(scores_by_age[["age_group","acc","macro_f1"]].to_string(index=False))


## 9) Katib 하이퍼파라미터 튜닝 & 추출값 확인
- Katib으로 하이퍼파라미터를 튜닝한 결과값을 확인합니다.

In [None]:
# Cell 9

# Katib namespace and experiment name; environment variables override the defaults.
KATIB_NS  = os.environ.get("KATIB_NS",  "kbm-u-kubeflow-tutorial")
KATIB_EXP = os.environ.get("KATIB_EXP", "gender-logistic-random")

def fetch_katib_params(ns: str, exp: str):
    """Return the parameter assignments of a Katib experiment's optimal trial.

    Runs ``kubectl -n <ns> get experiment <exp> -o json`` and reads the first
    non-empty entry among ``status.currentOptimalTrial``, ``status.optimalTrial``
    and ``status.bestTrial``.

    Args:
        ns: Kubernetes namespace of the experiment.
        exp: Experiment name.

    Returns:
        The ``parameterAssignments`` list of the selected trial.

    Raises:
        subprocess.CalledProcessError: kubectl exited with a non-zero status
            (its stderr is available on the exception).
        RuntimeError: no parameter assignments were found.
    """
    # stderr is captured separately: kubectl can print warnings there, and mixing
    # them into stdout would corrupt the JSON document parsed below.
    proc = subprocess.run(
        ["kubectl", "-n", ns, "get", "experiment", exp, "-o", "json"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )
    data = json.loads(proc.stdout.decode("utf-8"))

    # `or {}` also covers an explicit null status, not just a missing key.
    status = data.get("status") or {}
    trial = (
        status.get("currentOptimalTrial") or
        status.get("optimalTrial") or
        status.get("bestTrial") or
        {}
    )
    params = trial.get("parameterAssignments", [])
    if not params:
        raise RuntimeError("Katib parameterAssignments를 찾지 못했습니다.")
    return params

# Fetch the optimal trial's assignments and flatten them into a name -> value map
# (values are kept as stripped strings and parsed in the next cell).
KATIB_PARAMS_LIST = fetch_katib_params(KATIB_NS, KATIB_EXP)
KATIB_PARAM_MAP   = {d["name"]: str(d["value"]).strip() for d in KATIB_PARAMS_LIST}

print(f"[Katib] {KATIB_NS}/{KATIB_EXP} 최적 하이퍼파라미터:")
# `display` is not imported in this file — presumably the IPython/Jupyter builtin.
display(pd.DataFrame(KATIB_PARAMS_LIST))
print("\nparam map:", KATIB_PARAM_MAP)


## 10) Katib 최적 하이퍼파라미터로 모델 학습
- Katib에서 찾은 최적의 하이퍼파라미터로 모델을 재학습합니다.

In [None]:
# Cell 10

def _get_str(m, k, default=None):
    v = m.get(k, default)
    return None if v is None else str(v).strip()

def _get_float(m, k, default=None):
    v = m.get(k, None)
    try: return float(v)
    except: return default

def _get_int(m, k, default=None):
    v = m.get(k, None)
    try: return int(float(v))
    except: return default

# ---- Parse the Katib parameters ----
# Model hyperparameters; the defaults apply when Katib did not tune the parameter.
penalty   = _get_str(KATIB_PARAM_MAP, "penalty", "l2").lower()
C         = _get_float(KATIB_PARAM_MAP, "C", 1.0)
l1_ratio  = _get_float(KATIB_PARAM_MAP, "l1_ratio", None)
scaler_nm = _get_str(KATIB_PARAM_MAP, "scaler", "standard").lower()
cw_nm     = _get_str(KATIB_PARAM_MAP, "class_weight", None)
# Missing / "none" -> None, "balanced" -> "balanced"; any other string is passed
# through unchanged.
class_weight = None if (cw_nm is None or cw_nm.lower()=="none") else ("balanced" if cw_nm.lower()=="balanced" else cw_nm)

# Calibration settings; an unrecognised method falls back to isotonic.
calib_method = _get_str(KATIB_PARAM_MAP, "calib_method", "isotonic").lower()
if calib_method not in {"isotonic","sigmoid"}:
    calib_method = "isotonic"
calib_cv   = _get_int(KATIB_PARAM_MAP, "calib_cv", 5)

# Threshold search grid; the bounds are swapped if given in the wrong order.
t_lo    = _get_float(KATIB_PARAM_MAP, "tune_t_lo", 0.2)
t_hi    = _get_float(KATIB_PARAM_MAP, "tune_t_hi", 0.8)
t_steps = _get_int (KATIB_PARAM_MAP, "tune_t_steps", 61)
if t_lo is not None and t_hi is not None and t_lo > t_hi:
    t_lo, t_hi = t_hi, t_lo

# Minimum pre-login event count: the Katib value wins over the notebook constant.
min_prelogin_events_k = _get_int(KATIB_PARAM_MAP, "min_prelogin_events", None)
base_min = globals().get("MIN_PRELOGIN_EVENTS", None)
min_events_used = min_prelogin_events_k if min_prelogin_events_k is not None else base_min

# ---- (필요시) 피처 재구성 및 학습/검증 분할 보장 ----
def _norm(series: pd.Series) -> pd.Series:
    s = series.astype("string").str.strip()
    return s.apply(lambda x: unicodedata.normalize("NFKC", x) if pd.notna(x) else x)

def _rebuild_features_with_min(df_raw: pd.DataFrame, min_events: int):
    """Rebuild the feature table and train/validation split for a new minimum event count.

    Repeats the steps of Cells 2-6 on ``df_raw``, keeping only sessions with at
    least ``min_events`` pre-login rows.

    Args:
        df_raw: Event-level frame; must contain ``timestamp``, ``session_id`` and
            ``user_id``. Other feature columns are optional (see the fallbacks below).
        min_events: Minimum number of pre-login rows a session needs to be kept.

    Returns:
        dict with keys ``dataset`` (feature table including ``session_id`` and
        ``gender``), ``X_train``, ``X_valid``, ``y_train`` and ``y_valid``.
    """
    # Parse timestamps, drop unusable rows and sort events within each session.
    df2 = df_raw.copy()
    df2["ts"] = pd.to_datetime(df2["timestamp"], errors="coerce")
    df2 = df2.dropna(subset=["session_id","ts"]).copy()
    sort_cols = ["session_id","ts"] + (["event_id"] if "event_id" in df2.columns else [])
    df2 = df2.sort_values(sort_cols).reset_index(drop=True)

    # Pre-login rows = rows before the first row of the session that carries a
    # real (non-placeholder) user_id.
    uid_str  = df2["user_id"].astype("string").str.strip()
    anon_like = {"", "0", "-1", "None", "none", "NULL", "null", "NaN", "nan"}
    has_uid = uid_str.notna() & ~uid_str.isin(anon_like)
    appeared_local = has_uid.groupby(df2["session_id"]).cummax()
    pre_local = df2.loc[~appeared_local].copy()

    # Keep sessions with at least min_events pre-login rows.
    pre_cnt = pre_local.groupby("session_id").size()
    valid_sessions = pre_cnt.index[pre_cnt >= int(min_events)]
    pre_local = pre_local[pre_local["session_id"].isin(valid_sessions)].copy()

    # Session label = first M/F value in the session; a missing gender column
    # yields an empty label set.
    df_gender = df2.copy()
    df_gender["gender_norm"] = (
        df_gender.get("gender", pd.Series(index=df_gender.index, dtype="object"))
                 .astype("string").str.strip().str.upper()
                 .replace({"FEMALE":"F","MALE":"M"})
    )
    lab_full = (
        df_gender[df_gender["gender_norm"].isin(["M","F"])]
          .groupby("session_id")["gender_norm"].agg(lambda s: s.iloc[0])
    )

    # Numeric aggregates over pre-login rows.
    # NOTE(review): when a source column is missing, the fallback ("session_id","size")
    # fills that feature with the session's row count — confirm this is intended.
    agg_num = pre_local.groupby("session_id").agg(
        n_events=("session_id","size"),
        search_count_sum=("search_count","sum") if "search_count" in pre_local.columns else ("session_id","size"),
        cart_item_count_sum=("cart_item_count","sum") if "cart_item_count" in pre_local.columns else ("session_id","size"),
        page_depth_mean=("page_depth","mean") if "page_depth" in pre_local.columns else ("session_id","size"),
        last_elapsed_mean=("last_action_elapsed","mean") if "last_action_elapsed" in pre_local.columns else ("session_id","size"),
        unique_pages=("current_state","nunique") if "current_state" in pre_local.columns else ("session_id","size"),
        unique_categories=("resolved_category","nunique") if "resolved_category" in pre_local.columns else ("session_id","size"),
    ).fillna(0.0)
    # Hour and weekday of the earliest pre-login event of each session.
    first_ts = pre_local.groupby("session_id")["ts"].min()
    agg_num["start_hour"] = first_ts.dt.hour
    agg_num["start_weekday"] = first_ts.dt.weekday

    # Per-category counts for the known categories (one column per category, in
    # list order, zero-filled for sessions without any known category).
    KNOWN_CATS_LOCAL = globals().get("KNOWN_CATS", ["Books","Electronics","Gaming","Home","Fashion"])
    KNOWN_CATS_NORM = [unicodedata.normalize("NFKC", c.strip()) for c in KNOWN_CATS_LOCAL]
    norm_to_orig = {unicodedata.normalize("NFKC", c.strip()): c for c in KNOWN_CATS_LOCAL}
    pre_cat_norm = _norm(pre_local.get("resolved_category", pd.Series(index=pre_local.index, dtype="object")))
    mask = pre_cat_norm.isin(KNOWN_CATS_NORM)
    pre_kept = pre_local[mask].copy()
    pre_kept["cat_norm"] = pre_cat_norm[mask].values
    pre_kept["one"] = 1
    cat_cnt = pre_kept.pivot_table(index="session_id", columns="cat_norm", values="one", aggfunc="sum", fill_value=0)
    cat_cnt = cat_cnt.reindex(columns=KNOWN_CATS_NORM, fill_value=0)
    cat_cnt.columns = [norm_to_orig[c] for c in cat_cnt.columns]
    cat_cnt.columns = [f"cat_cnt::{c}" for c in cat_cnt.columns]
    cat_cnt = cat_cnt.reindex(agg_num.index).fillna(0).astype(int)

    # Derived category features: share of the session's events and log(1 + count).
    cat_prop = cat_cnt.div(agg_num["n_events"].replace(0,1), axis=0)
    cat_prop.columns = [c.replace("cat_cnt::","cat_prop::") for c in cat_cnt.columns]
    cat_log = np.log1p(cat_cnt)
    cat_log.columns = [c.replace("cat_cnt::","cat_log::") for c in cat_cnt.columns]

    # Join all features and keep only sessions that also have a label.
    X = (agg_num.join(cat_cnt, how="left").join(cat_prop, how="left").join(cat_log, how="left")).fillna(0)
    keep_sessions = X.index.intersection(lab_full.index)
    X = X.loc[keep_sessions].copy()
    y = lab_full.loc[keep_sessions].rename("gender").copy()

    # Flatten into one table with session_id and gender columns.
    X_ = X.copy(); X_.index.name = "session_id"
    dataset_local = X_.join(y).reset_index()
    dataset_local["gender"] = (
        dataset_local["gender"].astype("string").str.strip().str.upper()
        .replace({"FEMALE":"F","MALE":"M"})
    )
    non_feature = {"session_id","gender"}
    feature_cols = [c for c in dataset_local.columns if c not in non_feature]
    X_all = dataset_local[feature_cols].copy()
    # Binary target: F -> 0, M -> 1.
    y_bin = dataset_local["gender"].map({"F":0,"M":1}).astype(int)

    # Single stratified 80/20 split, seeded with the notebook's RANDOM_STATE (42 if unset).
    RAND = globals().get("RANDOM_STATE", 42)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=RAND)
    train_idx_local, valid_idx_local = next(sss.split(X_all, y_bin))
    return {
        "dataset": dataset_local,
        "X_train": X_all.iloc[train_idx_local], "X_valid": X_all.iloc[valid_idx_local],
        "y_train": y_bin.iloc[train_idx_local], "y_valid": y_bin.iloc[valid_idx_local],
    }

# Rebuild the features only when Katib chose a minimum event count that differs
# from the one used to build the current dataset.
need_rebuild = (
    min_prelogin_events_k is not None and
    "MIN_PRELOGIN_EVENTS" in globals() and
    int(min_prelogin_events_k) != int(MIN_PRELOGIN_EVENTS)
)
if need_rebuild:
    assert "df" in globals(), "원본 df가 필요합니다(df)."
    rebuilt = _rebuild_features_with_min(df, int(min_prelogin_events_k))
    dataset = rebuilt["dataset"]
    X_train, X_valid = rebuilt["X_train"], rebuilt["X_valid"]
    y_train, y_valid = rebuilt["y_train"], rebuilt["y_valid"]
else:
    # Reuse the split from Cell 6 when it exists; otherwise recreate it from
    # `dataset` with the same stratified 80/20 procedure.
    if not all(v in globals() for v in ["X_train","X_valid","y_train","y_valid"]):
        assert "dataset" in globals(), "dataset이 필요합니다."
        non_feature = {"session_id","gender"}
        feature_cols = [c for c in dataset.columns if c not in non_feature]
        X_all = dataset[feature_cols].copy()
        y_bin = dataset["gender"].map({"F":0,"M":1}).astype(int)
        RAND = globals().get("RANDOM_STATE", 42)
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=RAND)
        tr_idx, va_idx = next(sss.split(X_all, y_bin))
        X_train, X_valid = X_all.iloc[tr_idx], X_all.iloc[va_idx]
        y_train, y_valid = y_bin.iloc[tr_idx], y_bin.iloc[va_idx]

# ---- Repeated-training configuration ----
RANDOM_STATE = globals().get("RANDOM_STATE", 42)
# Any scaler name other than "standard" selects RobustScaler.
ScalerClass  = StandardScaler if scaler_nm == "standard" else RobustScaler

# Keyword arguments shared by every LogisticRegression built in the loop below.
clf_kwargs = dict(solver="saga", penalty=penalty, C=float(C), max_iter=5000, random_state=RANDOM_STATE)
# l1_ratio is forwarded only for the elasticnet penalty, and only when Katib supplied it.
if penalty == "elasticnet" and l1_ratio is not None:
    clf_kwargs["l1_ratio"] = float(l1_ratio)
if class_weight is not None:
    clf_kwargs["class_weight"] = class_weight

# Repeat parameters.
# NOTE(review): the original comment said these can be overridden from globals,
# but they are assigned unconditionally here — confirm the intent.
N_REPEATS = 20
PATIENCE = 10
BOOTSTRAP = True

# Re-imported here, presumably so this cell can run without Cell 1 — TODO confirm.
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
import numpy as np
from copy import deepcopy

def _make_scaler():
    """Instantiate the configured scaler class with centering and scaling enabled."""
    if ScalerClass is not StandardScaler:
        # The non-standard scaler takes differently named keyword arguments.
        return ScalerClass(with_centering=True, with_scaling=True)
    return ScalerClass(with_mean=True, with_std=True)

def _build_calibrated_model():
    """Build an unfitted calibrated classifier around a scaler + logistic-regression pipeline."""
    steps = [("scaler", _make_scaler()), ("clf", LogisticRegression(**clf_kwargs))]
    base = Pipeline(steps)
    try:
        return CalibratedClassifierCV(estimator=base, method=calib_method, cv=calib_cv)
    except TypeError:
        # Fall back to the `base_estimator` keyword when `estimator` is rejected.
        return CalibratedClassifierCV(base_estimator=base, method=calib_method, cv=calib_cv)

def _bootstrap_xy(X, y, seed):
    """Return a seeded bootstrap resample of (X, y), or the inputs unchanged when BOOTSTRAP is off."""
    if BOOTSTRAP:
        n_rows = len(X)
        # Draw n_rows positional indices with replacement.
        picks = np.random.RandomState(seed).choice(n_rows, size=n_rows, replace=True)
        return X.iloc[picks], y.iloc[picks]
    return X, y

def _tune_threshold_from_proba(y_true, proba, lo=0.2, hi=0.8, steps=61):
    """Pick the threshold on an evenly spaced grid that maximises macro-F1.

    Returns ``(best_threshold, best_macro_f1, accuracy_at_best_threshold)``.
    The first grid point reaching the maximum wins ties; an empty grid yields
    threshold 0.5 and F1 -1.0.
    """
    grid = np.linspace(lo, hi, steps)
    truth = np.asarray(y_true).astype(int)

    scores = [
        float(f1_score(truth, (proba >= cut).astype(int), average="macro"))
        for cut in grid
    ]
    if scores:
        top = int(np.argmax(scores))  # argmax returns the first maximum
        chosen_t, chosen_f1 = float(grid[top]), scores[top]
    else:
        chosen_t, chosen_f1 = 0.5, -1.0

    hits = truth == (proba >= chosen_t).astype(int)
    return chosen_t, chosen_f1, float(hits.mean())

class AvgEnsemble:
    """Average the positive-class probability of several models behind ``predict_proba``."""

    def __init__(self, models):
        self.models = models

    def predict_proba(self, X):
        """Return an (n_samples, 2) array: column 1 is the mean positive probability, column 0 its complement."""
        stacked = np.vstack([member.predict_proba(X)[:, 1] for member in self.models])
        positive = stacked.mean(axis=0)
        return np.column_stack((1.0 - positive, positive))

# ---- Repeated-training loop ----
# Resolve the threshold grid once. For the bounds only None falls back to the
# default, so an explicit 0.0 from Katib is honoured (`t_lo or 0.2` discarded it,
# while the evaluation cell uses the value as given).
_search_lo = 0.2 if t_lo is None else t_lo
_search_hi = 0.8 if t_hi is None else t_hi
# A step count of None or 0 cannot define a grid, so it falls back to 61 points.
_search_steps = t_steps if t_steps else 61

rng_master = np.random.RandomState(RANDOM_STATE)
models = []
probas_valid_each = []
best_score = -1.0
no_improve = 0
best_state = {"n_models": 0, "threshold": 0.5, "f1": -1.0, "acc": -1.0}

for r in range(N_REPEATS):
    # Each round trains a fresh calibrated model on its own bootstrap resample.
    seed = int(rng_master.randint(0, 10_000_000))
    X_tr, y_tr = _bootstrap_xy(X_train, y_train, seed)

    cal_model = _build_calibrated_model()
    cal_model.fit(X_tr, y_tr)

    # Score the single model of this round on the validation split.
    p_valid = cal_model.predict_proba(X_valid)[:, 1]
    t1, f1_1, acc_1 = _tune_threshold_from_proba(y_valid, p_valid, lo=_search_lo, hi=_search_hi, steps=_search_steps)
    print(f"[{r+1:02d}/{N_REPEATS}] Single  F1={f1_1:.4f}  ACC={acc_1:.4f}  (t*={t1:.3f})")

    models.append(deepcopy(cal_model))
    probas_valid_each.append(p_valid)

    # Score the ensemble made of all models trained so far (mean probability).
    ens_proba = np.mean(np.vstack(probas_valid_each), axis=0)
    tE, f1_E, acc_E = _tune_threshold_from_proba(y_valid, ens_proba, lo=_search_lo, hi=_search_hi, steps=_search_steps)
    print(f"             Ensemble({len(models)}) F1={f1_E:.4f}  ACC={acc_E:.4f}  (t*={tE:.3f})")

    # Track the best ensemble size; stop after PATIENCE rounds without improvement.
    if f1_E > best_score:
        best_score = f1_E
        best_state = {"n_models": len(models), "threshold": tE, "f1": f1_E, "acc": acc_E}
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= PATIENCE:
            print(f"Early stop: no improvement for {PATIENCE} rounds.")
            break

# ---- Bind the final objects (used by the following cells) ----
# log_katib: the last member of the best ensemble prefix (single model, for dumping).
log_katib = models[best_state["n_models"] - 1] if best_state["n_models"] >= 1 else models[-1]
# cal_katib: ensemble wrapper over the first n_models members (used as-is by Cells 11/12).
cal_katib = AvgEnsemble(models[:best_state["n_models"]]) if best_state["n_models"] >= 1 else AvgEnsemble(models)

print(f"\n[OK] 반복 학습 완료 → min_prelogin_events={min_events_used}, scaler={scaler_nm}, "
      f"penalty={penalty}, C={C}, class_weight={class_weight}, calibration=({calib_method}, cv={calib_cv})")
print(f"[BEST] Ensemble k={best_state['n_models']}  macro-F1={best_state['f1']:.4f}  acc={best_state['acc']:.4f}  "
      f"(t*={best_state['threshold']:.3f}, repeats={N_REPEATS}, bootstrap={BOOTSTRAP}, patience={PATIENCE})")


## 11) Katib 모델 평가 및 저장
- 재학습한 모델을 평가하고 저장합니다.

In [None]:
# Cell 11

# Check that the objects produced by the earlier cells exist before continuing.
assert all(k in globals() for k in [
    "tune_threshold","evaluate","print_scores",
    "X_valid","y_valid","dataset","df","ARTIFACT_DIR",
    "cal_katib","log_katib"  # log_katib is also needed for saving / metadata
]), \
    "필요 변수/함수(tune_threshold/evaluate/print_scores, X_valid/y_valid/dataset/df/ARTIFACT_DIR/cal_katib/log_katib)가 필요합니다."

# ----- 헬퍼들 -----
def _unwrap_pipeline(obj):
    """Pipeline(또는 그 안의 Pipeline)을 찾아 반환. 없으면 None."""
    # 1) 이미 Pipeline
    if hasattr(obj, "named_steps"):
        return obj
    # 2) CalibratedClassifierCV(estimator|base_estimator=Pipeline)
    for attr in ("estimator", "base_estimator"):
        inner = getattr(obj, attr, None)
        if inner is not None and hasattr(inner, "named_steps"):
            return inner
    # 3) CalibratedClassifierCV 내부 fold에서 접근
    if hasattr(obj, "calibrated_classifiers_") and obj.calibrated_classifiers_:
        try:
            inner = obj.calibrated_classifiers_[0].classifier
            if hasattr(inner, "named_steps"):
                return inner
            base = getattr(inner, "base_estimator", None)
            if base is not None and hasattr(base, "named_steps"):
                return base
        except Exception:
            pass
    return None

def _to_jsonable(v):
    if isinstance(v, (str, int, float, bool)) or v is None:
        return v
    if hasattr(v, "get_params"):
        try:
            return {"name": v.__class__.__name__, "params": v.get_params(deep=False)}
        except Exception:
            return {"name": v.__class__.__name__}
    return str(v)

def _safe_dump(obj, path):
    """Write ``obj`` to ``path`` with joblib; return True on success, False (after a warning) on failure."""
    from joblib import dump
    succeeded = True
    try:
        dump(obj, path)
    except Exception as e:
        succeeded = False
        print(f"[WARN] dump 실패: {path} ({type(obj).__name__}) → {e}")
    return succeeded

# ----- Threshold search range -----
# Reuse the grid parsed in Cell 10 when available; otherwise use the defaults.
t_lo    = float(globals().get("t_lo", 0.2))
t_hi    = float(globals().get("t_hi", 0.8))
t_steps = int(globals().get("t_steps", 61))
if t_lo > t_hi:
    t_lo, t_hi = t_hi, t_lo

# ----- Re-search the best threshold with cal_katib (AvgEnsemble included), then evaluate / print -----
best_t, best_f1, _ = tune_threshold(cal_katib, X_valid, y_valid, search=(t_lo, t_hi, t_steps))
print(f"[Katib HP/Ensemble] Best macro-F1 within [{t_lo:.3f}, {t_hi:.3f}] (steps={t_steps}): t={best_t:.3f}, F1={best_f1:.4f}")

_ , _ = evaluate(
    cal_katib, X_valid, y_valid,
    threshold=best_t,
    name=f"KatibCal/Ensemble (best_t={best_t:.3f})"
)

# Get probabilities in a way that works for both an ensemble and a single model.
if hasattr(cal_katib, "predict_proba"):
    proba_k = cal_katib.predict_proba(X_valid)[:, 1]
else:
    # Without predict_proba, evaluate() only produces y_hat, so the
    # probability-based steps below cannot run.
    raise RuntimeError("cal_katib이 predict_proba를 제공하지 않습니다.")

print_scores(f"KatibCal/Ensemble (best@{best_t:.3f})", y_valid, proba_k, best_t)

# ----- Build the result table -----
# X_valid keeps the row labels of `dataset`, so they can be used to look up session ids.
if "session_id" in dataset.columns:
    sess_ids = dataset.loc[X_valid.index, "session_id"].to_numpy()
else:
    sess_ids = np.arange(len(X_valid))

pred_bin = (proba_k >= best_t).astype(int)
pred_col = f"y_pred@{best_t:.3f}"

# One row per validation session: truth, thresholded prediction, calibrated probability.
out_df = pd.DataFrame({
    "session_id": sess_ids,
    "y_true": y_valid.values,
    pred_col: pred_bin,
    "proba_cal": proba_k,
})

# Merge extra columns (age / age_group) when the feature table carries them.
extra_cols = [c for c in ["age", "age_group"] if c in dataset.columns]
if extra_cols:
    out_df = out_df.merge(
        dataset[["session_id"] + extra_cols].drop_duplicates("session_id"),
        on="session_id", how="left"
    )

# Rows at/after the first logged-in row of each session; used to derive a
# per-session age when age_group is not available.
uid_str  = df.get("user_id", pd.Series([""] * len(df))).astype("string").str.strip()
anon_like = {"", "0", "-1", "None", "none", "NULL", "null", "NaN", "nan"}
has_uid = uid_str.notna() & ~uid_str.isin(anon_like)
appeared_local = has_uid.groupby(df["session_id"]).cummax()
post = df.loc[appeared_local].copy()

metrics_df = out_df.copy()
if "age_group" not in metrics_df.columns and "age" in post.columns:
    # First non-null numeric age per session (NaN when the session has none).
    age_post_by_sess = (
        pd.to_numeric(post["age"], errors="coerce")
          .groupby(post["session_id"])
          .apply(lambda s: s.dropna().iloc[0] if len(s.dropna()) else np.nan)
          .rename("age")
    )
    age_map = age_post_by_sess.to_frame()

    def _to_age_bucket(x):
        # <25 -> "young", <50 -> "middle", otherwise "old"; NaN stays NaN.
        if pd.isna(x): return np.nan
        x = float(x)
        return "young" if x < 25 else ("middle" if x < 50 else "old")

    age_map["age_group"] = age_map["age"].apply(_to_age_bucket)
    metrics_df = metrics_df.merge(age_map.reset_index(), on="session_id", how="left")

# When neither the dataset nor the raw log provides age information there is no
# age_group column at all; create an all-missing one so every session is
# reported as "unknown" instead of failing on a missing column.
if "age_group" not in metrics_df.columns:
    metrics_df["age_group"] = np.nan
metrics_df["age_group"] = metrics_df["age_group"].astype("string").fillna("unknown")

# Accuracy and macro-F1 per age group; macro-F1 is NaN when a group contains
# only one true class.
rows = []
for g, sub in metrics_df.groupby("age_group", dropna=False):
    y_true = sub["y_true"].astype(int).to_numpy()
    y_hat  = sub[pred_col].astype(int).to_numpy()
    acc = float((y_true == y_hat).mean())
    f1  = float(f1_score(y_true, y_hat, average="macro")) if len(np.unique(y_true)) > 1 else np.nan
    rows.append({"age_group": str(g), "acc": acc, "macro_f1": f1})

scores_by_age = pd.DataFrame(rows).sort_values("age_group")
scores_by_age[["acc","macro_f1"]] = scores_by_age[["acc","macro_f1"]].round(4)
print("\n[Age-group metrics — Katib/Ensemble @ best_t]")
print(scores_by_age[["age_group","acc","macro_f1"]].to_string(index=False))

# ----- Save artifacts -----
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# 1) Save the base (reference) model: log_katib.
#    log_katib may not be a Pipeline, so try to unwrap it before saving.
# NOTE(review): depending on what _unwrap_pipeline returns for a cross-validated
# calibrator, log_ref may be an unfitted pipeline — confirm model.joblib is
# usable for inference.
log_ref = _unwrap_pipeline(log_katib) or log_katib
saved_model_ok = _safe_dump(log_ref, os.path.join(ARTIFACT_DIR, "model.joblib"))

# 2) Save the calibrated / ensemble model.
ensemble_info = None
if hasattr(cal_katib, "models"):  # AvgEnsemble
    # Save each member separately and record only metadata (the wrapper itself
    # may fail to serialise).
    ens_dir = Path(ARTIFACT_DIR) / "ensemble"
    ens_dir.mkdir(parents=True, exist_ok=True)
    member_paths = []
    for i, m in enumerate(cal_katib.models, 1):
        p = ens_dir / f"member_{i:02d}.joblib"
        _safe_dump(m, p)
        member_paths.append(str(p))
    ensemble_info = {
        "type": "AvgEnsemble",
        "k": len(cal_katib.models),
        "members": member_paths
    }
    # Also try to save the wrapper itself (usable directly if it succeeds).
    _safe_dump(cal_katib, os.path.join(ARTIFACT_DIR, "model_calibrated.joblib"))
else:
    # A single CalibratedClassifierCV or Pipeline is saved as-is.
    _safe_dump(cal_katib, os.path.join(ARTIFACT_DIR, "model_calibrated.joblib"))

# 3) Save the metadata.
#    - best_params_jsonable: records the scaler / clf parameters of the inner
#      Pipeline when one can be found.
pipe_for_meta = _unwrap_pipeline(log_ref)
if pipe_for_meta is not None:
    best_params_jsonable = {
        "scaler": _to_jsonable(pipe_for_meta.named_steps["scaler"]),
        "clf": _to_jsonable(pipe_for_meta.named_steps["clf"]),
    }
else:
    # Not a pipeline: record only minimal information.
    best_params_jsonable = {
        "model_repr": str(type(log_ref)),
        "model_params": _to_jsonable(getattr(log_ref, "get_params", lambda: {})())
    }

# Calibration settings and Katib parameters fall back to defaults when the
# corresponding globals are missing.
calib_method_meta = globals().get("calib_method", "isotonic")
calib_cv_meta     = int(globals().get("calib_cv", 5))

katib_param_map_meta = globals().get("KATIB_PARAM_MAP", {})
with open(os.path.join(ARTIFACT_DIR, "model_meta.json"), "w", encoding="utf-8") as f:
    json.dump({
        "source": "katib",
        "best_params": best_params_jsonable,
        "best_cv_f1_macro": None,
        "production_threshold": float(best_t),
        "calibration": {"method": calib_method_meta, "cv": calib_cv_meta},
        "threshold_tuning": {"lo": float(t_lo), "hi": float(t_hi), "steps": int(t_steps)},
        "katib_param_map": katib_param_map_meta,
        "ensemble": ensemble_info
    }, f, ensure_ascii=False, indent=2)

# NOTE(review): only model.joblib's dump status is reported; the results of the
# other _safe_dump calls are not checked before this message is printed.
print(f"\n[Saved/Overwritten] model.joblib (ok={saved_model_ok}), model_calibrated.joblib, model_meta.json → {ARTIFACT_DIR}")

# Kept for compatibility with later cells.
valid_idx = X_valid.index


## 12) 성별 추론 결과값 시각화
- Katib 하이퍼파라미터로 추론한 결과값을 그래프로 시각화합니다.

In [None]:
# Cell 12

from sklearn.metrics import f1_score

# Check that the required values exist.
assert all(k in globals() for k in ["best_t", "proba_k", "y_valid", "scores_by_age"]), \
    "Cell 11을 먼저 실행해 best_t, proba_k, y_valid, scores_by_age를 준비하세요."

# Recompute the validation predictions at the tuned threshold.
t = float(best_t)
y_true = np.asarray(y_valid).astype(int)
y_prob = np.asarray(proba_k).astype(float)
y_pred = (y_prob >= t).astype(int)

# Overall performance.
total_f1 = f1_score(y_true, y_pred, average="macro")
total_acc = (y_true == y_pred).mean()

# Age-group performance (young, middle, old); a group missing from
# scores_by_age is plotted as NaN.
labels = ["total", "young", "middle", "old"]
f1_vals = [total_f1]
acc_vals = [total_acc]

sba = scores_by_age.set_index("age_group")
for g in ["young", "middle", "old"]:
    if g in sba.index:
        f1_vals.append(float(sba.loc[g, "macro_f1"]))
        acc_vals.append(float(sba.loc[g, "acc"]))
    else:
        f1_vals.append(np.nan)
        acc_vals.append(np.nan)

# Grouped bar chart: one F1 bar and one accuracy bar per label.
x = np.arange(len(labels))
width = 0.35

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, f1_vals, width, label="F1 (macro)", color="green")
rects2 = ax.bar(x + width/2, acc_vals, width, label="Accuracy", color="blue")

ax.set_ylim(0, 1.0)
ax.set_xticks(x, labels)
ax.set_ylabel("Score")
ax.set_title(f"Validation Scores by Age Group @ t={t:.3f}")
ax.legend()
ax.grid(axis="y", linestyle="--", alpha=0.4)

# 막대 위에 수치 표시
def autolabel(rects):
    """Write each bar's height (4 decimals) just above it; bars with NaN height are skipped."""
    for bar in rects:
        height = bar.get_height()
        if not np.isnan(height):
            ax.annotate(
                f"{height:.4f}",
                xy=(bar.get_x() + bar.get_width()/2, height),
                xytext=(0, 3), textcoords="offset points",
                ha="center", va="bottom", fontsize=9,
            )

# Annotate both bar series with their values, then render the figure.
autolabel(rects1)
autolabel(rects2)

plt.tight_layout()
plt.show()
