# 성별 추론

사용자의 로그인 전 행동 패턴을 보고 사용자의 성별을 추론하는 실습입니다.

## 1) 공통값 설정

In [None]:
import os, json, warnings, unicodedata
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report, confusion_matrix, f1_score
)
from sklearn.calibration import CalibratedClassifierCV
from joblib import dump

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

# NOTE(review): this silences every warning category for the whole session,
# which also hides anything the later model-fitting cells may emit.
warnings.filterwarnings("ignore")

# Wider pandas display limits so the wide feature table prints readably.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

# ---------- Paths / constants ----------
DATASETS_DIR = Path("datasets") / "gender"
# Derived from DATASETS_DIR so the dataset folder is spelled in one place only;
# as_posix() keeps it the same forward-slash string as before.
INPUT_PATH = (DATASETS_DIR / "processed_user_behavior.joined.csv").as_posix()
OUTPUT_DIR = DATASETS_DIR
ARTIFACT_DIR = Path("models") / "gender"

# Create the output folders up front; existing folders are left as they are.
DATASETS_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# Sessions with fewer pre-login events than this are dropped.
MIN_PRELOGIN_EVENTS = 2

# Categories that get their own count / proportion / log-count feature columns.
KNOWN_CATS = ["Books", "Electronics", "Gaming", "Home", "Fashion"]

# Seed shared by the train/valid split, the CV folds and the classifier.
RANDOM_STATE = 42
# Probability cut-off at or above which a session is predicted as class 1 ("M").
PRODUCTION_THRESHOLD = 0.444

def norm(s: pd.Series) -> pd.Series:
    """Strip surrounding whitespace and apply Unicode NFKC normalization.

    Values are first cast to pandas' nullable string dtype, so non-string
    inputs are stringified and missing values stay missing.

    Args:
        s: Series of raw labels.

    Returns:
        A string-dtype Series aligned with ``s``.
    """
    # The vectorized .str accessor propagates NA by itself, so no per-element
    # Python lambda is needed.
    return s.astype("string").str.strip().str.normalize("NFKC")


## 2) 원본 데이터 로드 & 로그인 전 구간 추출

In [None]:
# Load the joined behavior log.
df = pd.read_csv(INPUT_PATH, low_memory=False)

# Unparseable timestamps become NaT; rows without a session id or a usable
# timestamp cannot be placed on a session timeline and are dropped.
df["ts"] = pd.to_datetime(df["timestamp"], errors="coerce")
df = df.dropna(subset=["session_id", "ts"]).copy()

# Order events chronologically inside each session; event_id (when the column
# exists) breaks ties between events that share a timestamp.
sort_cols = ["session_id", "ts"]
if "event_id" in df.columns:
    sort_cols.append("event_id")
df = df.sort_values(sort_cols).reset_index(drop=True)

print("원본 행 수:", len(df))

# A row counts as "logged in" when it carries a real user id. The comparison
# key is lower-cased and a trailing ".0" is removed, because a numeric id
# column that contains blanks is parsed as float and would otherwise show up
# as "0.0" / "-1.0" and slip past the placeholder list.
uid_str = df["user_id"].astype("string").str.strip()
uid_key = uid_str.str.lower().str.replace(r"^(-?\d+)\.0+$", r"\1", regex=True)
anon_like = {"", "0", "-1", "none", "null", "nan", "<na>"}
has_uid = uid_key.notna() & ~uid_key.isin(anon_like)

# The running maximum within a session flips to True at the first row that has
# a user id and stays True afterwards, so ~appeared selects exactly the events
# that precede the first identified row.
appeared = has_uid.groupby(df["session_id"]).cummax()
pre = df.loc[~appeared].copy()

# Keep only sessions with at least MIN_PRELOGIN_EVENTS pre-login events.
prelogin_counts = pre.groupby("session_id").size()
valid_sessions = prelogin_counts.index[prelogin_counts >= MIN_PRELOGIN_EVENTS]
pre = pre[pre["session_id"].isin(valid_sessions)].copy()

print("프리로그인 필터 후 행 수:", len(pre))
print("유지된 프리로그인 세션 수:", pre["session_id"].nunique())


## 3) Target Label 생성 & 로그인 전 구간 수치 집계

In [None]:
# Only the two columns needed for labelling are copied, not the whole event
# frame. Labels are taken from all rows of df, not just the pre-login slice.
df_gender = df[["session_id", "gender"]].copy()
df_gender["gender_norm"] = (
    df_gender["gender"]
      .astype("string").str.strip().str.upper()
      .replace({"FEMALE": "F", "MALE": "M"})
)
# Session label = the first M/F value of the session in row order. Rows are
# already restricted to M/F, so .first() (first non-null) is the first row.
lab_full = (
    df_gender[df_gender["gender_norm"].isin(["M", "F"])]
      .groupby("session_id")["gender_norm"].first()
)

# Per-session numeric summary of the pre-login events; aggregates that come
# out as NaN are replaced by 0.
agg_num = pre.groupby("session_id").agg(
    n_events=("session_id", "size"),
    search_count_sum=("search_count", "sum"),
    cart_item_count_sum=("cart_item_count", "sum"),
    page_depth_mean=("page_depth", "mean"),
    last_elapsed_mean=("last_action_elapsed", "mean"),
    unique_pages=("current_state", "nunique"),
    unique_categories=("resolved_category", "nunique"),
).fillna(0.0)

# Hour and weekday of the earliest pre-login event of each session.
first_ts = pre.groupby("session_id")["ts"].min()
agg_num["start_hour"] = first_ts.dt.hour
agg_num["start_weekday"] = first_ts.dt.weekday

# Keep only sessions that have both pre-login features and an M/F label.
keep_sessions = agg_num.index.intersection(lab_full.index)
X_num = agg_num.loc[keep_sessions].copy()
y = lab_full.loc[keep_sessions].rename("gender").copy()

print("교집합 세션 수:", len(keep_sessions))
print("라벨 분포:\n", y.value_counts(dropna=False))


## 4) 카테고리 카운트 & 파생 컬럼 생성 (Feature Engineering)

In [None]:
# Normalized spelling of every known category, plus the way back to the
# original spelling that is used in the feature column names.
KNOWN_CATS_NORM = [unicodedata.normalize("NFKC", c.strip()) for c in KNOWN_CATS]
norm_to_orig = dict(zip(KNOWN_CATS_NORM, KNOWN_CATS))

# Normalize the event categories the same way and keep only known ones.
pre_cat_norm = norm(pre["resolved_category"])
mask = pre_cat_norm.isin(KNOWN_CATS_NORM)
pre_kept = pre[mask].assign(cat_norm=pre_cat_norm[mask].values, one=1)

# Session x category event counts.
cat_cnt = (
    pre_kept.groupby(["session_id", "cat_norm"])["one"]
    .sum()
    .unstack(fill_value=0)
)

# Force one column per known category (missing ones become all-zero), then
# label the columns with the original category spelling.
cat_cnt = cat_cnt.reindex(columns=KNOWN_CATS_NORM, fill_value=0)
cat_cnt.columns = [f"cat_cnt::{norm_to_orig[c]}" for c in cat_cnt.columns]

# Align with the labelled sessions; sessions without any known-category event
# get zero counts.
cat_cnt = cat_cnt.reindex(X_num.index).fillna(0).astype(int)
cat_cnt_cols = list(cat_cnt.columns)

# Share of the session's pre-login events per category (guarding against a
# zero denominator).
events_per_session = X_num["n_events"].replace(0, 1)
cat_prop = cat_cnt.div(events_per_session, axis=0)
cat_prop.columns = [name.replace("cat_cnt::", "cat_prop::") for name in cat_cnt_cols]

# log(1 + count) version of the same counts.
cat_log = np.log1p(cat_cnt)
cat_log.columns = [name.replace("cat_cnt::", "cat_log::") for name in cat_cnt_cols]

print("카테고리 카운트 컬럼:", list(cat_cnt.columns))
print("카테고리 파생 컬럼:", list(cat_prop.columns[:3]) + list(cat_log.columns[:3]))


## 5) 파생 컬럼 결합 및 저장 (Feature Engineering)

In [None]:
# Attach the three category feature families to the numeric features; any
# value still missing after the joins is filled with 0.
X = X_num
for extra_features in (cat_cnt, cat_prop, cat_log):
    X = X.join(extra_features, how="left")
X = X.fillna(0)

# Work on a copy whose index is guaranteed to be called "session_id", so that
# reset_index() below yields a column with that name.
X_ = X.copy()
X_.index.name = "session_id"

dataset = X_.join(y).reset_index()

print("dataset shape:", dataset.shape)
print(dataset.head(10))

FEATURES_CSV = f"{OUTPUT_DIR}/prelogin_gender_features.csv"
FEATURES_PARQUET = f"{OUTPUT_DIR}/prelogin_gender_features.parquet"
X_only_csv = f"{OUTPUT_DIR}/prelogin_features_only.csv"
X_only_parquet = f"{OUTPUT_DIR}/prelogin_features_only.parquet"
y_only_csv = f"{OUTPUT_DIR}/prelogin_labels_only.csv"
y_only_parquet = f"{OUTPUT_DIR}/prelogin_labels_only.parquet"

# Each table is written twice: once as CSV and once as Parquet.
features_only = X_.reset_index()
labels_only = y.to_frame("gender").reset_index()
for table, csv_path, parquet_path in (
    (dataset, FEATURES_CSV, FEATURES_PARQUET),
    (features_only, X_only_csv, X_only_parquet),
    (labels_only, y_only_csv, y_only_parquet),
):
    table.to_csv(csv_path, index=False)
    table.to_parquet(parquet_path, index=False)

print(f"\nSaved: {FEATURES_CSV}")
print(f"Saved: {FEATURES_PARQUET}")
print("Saved (X only):", X_only_csv, "|", X_only_parquet)
print("Saved (y only):", y_only_csv, "|", y_only_parquet)


## 6) 학습용 / 검증용 데이터셋 분할

In [None]:
# Re-normalize the label column to the two codes "F" / "M".
gender_clean = dataset["gender"].astype("string").str.strip().str.upper()
dataset["gender"] = gender_clean.replace({"FEMALE":"F","MALE":"M"})

# Everything except the session key and the label is a model feature.
non_feature = {"session_id","gender"}
feature_cols = [col for col in dataset.columns if col not in non_feature]
X = dataset.loc[:, feature_cols].copy()
# Binary target: F -> 0, M -> 1.
y_bin = dataset["gender"].map({"F":0, "M":1}).astype(int)

# Single stratified 80/20 split; n_splits=1 means the splitter yields exactly
# one (train, valid) pair of positional indices.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
(train_idx, valid_idx), = sss.split(X, y_bin)
X_train, y_train = X.iloc[train_idx], y_bin.iloc[train_idx]
X_valid, y_valid = X.iloc[valid_idx], y_bin.iloc[valid_idx]

print("Train:", X_train.shape, " Valid:", X_valid.shape)
for split_name, split_labels in (("train", y_train), ("valid", y_valid)):
    print(
        f"Label dist ({split_name}):\n",
        split_labels.value_counts(normalize=True).rename({0:"F",1:"M"}),
    )


## 7) 임계값 튜닝 및 평가 함수 정의

In [None]:
def tune_threshold(model, X_va, y_va, search=(0.2,0.8,61)):
    """Search for the decision threshold that maximizes macro-F1.

    The model must expose ``predict_proba``; column 1 of its output is
    thresholded.

    Args:
        model: Fitted classifier with ``predict_proba``.
        X_va: Validation features.
        y_va: Validation labels.
        search: ``(low, high, count)`` passed to ``np.linspace`` to build the
            candidate thresholds.

    Returns:
        ``(best_threshold, best_macro_f1, proba)`` where ``proba`` holds the
        column-1 probabilities for ``X_va``. If there are no candidates the
        result is ``(0.5, -1, proba)``.
    """
    start, stop, count = search
    proba = model.predict_proba(X_va)[:,1]
    scored = [
        (float(f1_score(y_va, (proba >= cut).astype(int), average="macro")), float(cut))
        for cut in np.linspace(start, stop, count)
    ]
    # max() keeps the first of several equally good candidates, i.e. the
    # lowest threshold wins a tie.
    best_f1, best_t = max(scored, key=lambda pair: pair[0], default=(-1, 0.5))
    return best_t, best_f1, proba

def evaluate(model, X_va, y_va, threshold=0.5, name="Model"):
    """Print a classification report and confusion matrix for ``model``.

    When the model has ``predict_proba``, column 1 of its output is compared
    against ``threshold``; otherwise ``model.predict`` is used as is.

    Args:
        model: Fitted classifier.
        X_va: Validation features.
        y_va: Validation labels, assumed to be encoded 0 = "F", 1 = "M".
        threshold: Probability at or above which class 1 is predicted.
        name: Label printed in the report header.

    Returns:
        ``(y_hat, proba)``; ``proba`` is ``None`` when the model has no
        ``predict_proba``.
    """
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_va)[:,1]
        y_hat = (proba >= threshold).astype(int)
    else:
        y_hat = model.predict(X_va)
        proba = None

    # Pin both classes explicitly: without `labels`, a validation slice (or a
    # threshold) that yields a single class makes classification_report reject
    # the two target_names and shrinks the confusion matrix to 1x1.
    class_ids = [0, 1]

    print(f"\n=== {name} @ threshold={threshold:.3f} ===")
    print(classification_report(
        y_va, y_hat, labels=class_ids, target_names=["F","M"], digits=4
    ))
    print(
        "Confusion matrix [rows=true F,M | cols=pred F,M]:\n",
        confusion_matrix(y_va, y_hat, labels=class_ids),
    )
    return y_hat, proba

def print_scores(name, y_true, proba, t):
    """Print macro-F1 and accuracy of ``proba`` thresholded at ``t``.

    Args:
        name: Label printed in front of the scores.
        y_true: True 0/1 labels; a Series, array or list.
        proba: Class-1 probabilities, same length as ``y_true``.
        t: Probability at or above which class 1 is predicted.
    """
    # np.asarray accepts Series, arrays and plain lists alike, so the caller
    # is not forced to pass a pandas object.
    truth = np.asarray(y_true)
    y_pred = (np.asarray(proba) >= t).astype(int)
    macro_f1 = f1_score(truth, y_pred, average="macro")
    acc = (truth == y_pred).mean()
    print(f"[{name}] Macro-F1 : {macro_f1:.4f} / Accuracy : {acc:.4f}")


## 8) Logistic 하이퍼파라미터 튜닝 및 확률 보정(Calibration)

In [None]:
# Probability calibration settings used after the grid search.
CALIB_METHOD = "isotonic"
CALIB_CV = 5

# Scaler + logistic regression; the scaler step is itself a search dimension.
log_pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        solver="saga", max_iter=5000, random_state=RANDOM_STATE
    ))
])

# Settings common to every penalty family.
_shared_grid = {
    "scaler": [
        StandardScaler(with_mean=True, with_std=True),
        RobustScaler(with_centering=True, with_scaling=True),
    ],
    "clf__C": [0.2, 0.5, 1.0, 2.0, 5.0],
    "clf__class_weight": [None, "balanced"],
}

# One sub-grid per penalty, in the order l2, l1, elasticnet; only elasticnet
# additionally searches over l1_ratio.
log_grid = [
    {**_shared_grid, "clf__penalty": ["l2"]},
    {**_shared_grid, "clf__penalty": ["l1"]},
    {**_shared_grid, "clf__penalty": ["elasticnet"], "clf__l1_ratio": [0.2, 0.5, 0.8]},
]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Exhaustive search scored by macro-F1 on the training split only.
log_search = GridSearchCV(
    estimator=log_pipe,
    param_grid=log_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
log_search.fit(X_train, y_train)
log_best = log_search.best_estimator_
print("Logistic best params:", log_search.best_params_, " | cv f1_macro:", round(log_search.best_score_, 4))

# The keyword for the wrapped model is `estimator` in some scikit-learn
# versions and `base_estimator` in others; fall back on TypeError.
_calib_options = {"method": CALIB_METHOD, "cv": CALIB_CV}
try:
    cal_log = CalibratedClassifierCV(estimator=log_best, **_calib_options)
except TypeError:
    cal_log = CalibratedClassifierCV(base_estimator=log_best, **_calib_options)

cal_log.fit(X_train, y_train)
print(f"[Calibration] done: method={CALIB_METHOD}, cv={CALIB_CV}")


## 9) 학습 평가 및 저장

In [None]:
# evaluate() already computes the class-1 probabilities of the calibrated
# model on the validation split and returns them, so they are reused here
# instead of running predict_proba a second time.
yhat_prod, proba_v = evaluate(
    cal_log, X_valid, y_valid,
    threshold=PRODUCTION_THRESHOLD,
    name=f"LogisticCal (prod={PRODUCTION_THRESHOLD:.3f})"
)
print_scores(f"LogisticCal (prod@{PRODUCTION_THRESHOLD:.3f})", y_valid, proba_v, PRODUCTION_THRESHOLD)

# Persist both the raw grid-search winner and its calibrated wrapper.
dump(log_best, os.path.join(ARTIFACT_DIR, "logistic_best.joblib"))
dump(cal_log,  os.path.join(ARTIFACT_DIR, "logistic_calibrated.joblib"))

def _to_jsonable(v):
    """Convert ``v`` into something ``json.dump`` can serialize.

    Handles, recursively:
      * ``None`` / ``str`` / ``bool`` / ``int`` / ``float`` -> returned as is
      * numpy scalars -> the equivalent Python scalar
      * ``dict`` -> dict with string keys and converted values
      * ``list`` / ``tuple`` -> list of converted items
      * objects with ``get_params`` (e.g. an estimator used as a grid value)
        -> ``{"name": <class name>, "params": <converted shallow params>}``
      * anything else -> ``str(v)``
    """
    if v is None or isinstance(v, (str, int, float, bool)):
        return v
    if isinstance(v, np.generic):
        # e.g. np.int64 is not an `int` subclass; unwrap it so it is written
        # as a number rather than falling through to str().
        return v.item()
    if isinstance(v, dict):
        return {str(key): _to_jsonable(val) for key, val in v.items()}
    if isinstance(v, (list, tuple)):
        return [_to_jsonable(item) for item in v]
    if hasattr(v, "get_params"):
        # Parameter values may themselves be tuples, numpy scalars or nested
        # objects, so they go through the same conversion.
        return {
            "name": v.__class__.__name__,
            "params": _to_jsonable(v.get_params(deep=False)),
        }
    return str(v)

# best_params_ can hold estimator objects (the chosen scaler), so every value
# is converted to a JSON-friendly form first.
best_params_jsonable = {}
for param_name, param_value in log_search.best_params_.items():
    best_params_jsonable[param_name] = _to_jsonable(param_value)

# Metadata describing how the saved model was selected and calibrated.
model_meta = {
    "best_params": best_params_jsonable,
    "best_cv_f1_macro": float(log_search.best_score_),
    "production_threshold": float(PRODUCTION_THRESHOLD),
    "calibration": {"method": CALIB_METHOD, "cv": CALIB_CV}
}
with open(os.path.join(ARTIFACT_DIR, "logistic_meta.json"), "w", encoding="utf-8") as f:
    json.dump(model_meta, f, ensure_ascii=False, indent=2)

# Session ids of the validation rows; positional numbers are the fallback
# when the dataset has no session_id column.
if "session_id" in dataset.columns:
    valid_session_ids = dataset.iloc[valid_idx]["session_id"].values
else:
    valid_session_ids = np.arange(len(valid_idx))

# Per-session validation predictions at the production threshold.
pred_col = f"y_pred@{PRODUCTION_THRESHOLD:.3f}"
out_df = pd.DataFrame({
    "session_id": valid_session_ids,
    "y_true": y_valid.values,
    pred_col: (proba_v >= PRODUCTION_THRESHOLD).astype(int),
    "proba_cal": proba_v
})
out_df.to_csv(os.path.join(ARTIFACT_DIR, "logistic_valid_predictions.csv"), index=False)

print("\n[Saved] logistic_best.joblib, logistic_calibrated.joblib, logistic_meta.json, logistic_valid_predictions.csv")
print(f"[Info] production_threshold={PRODUCTION_THRESHOLD:.3f}")
