# 세션 이탈
### 사용자의 첫 활동부터 K개의 활동까지 분석하여 세션이 완료될지 중간에 이탈할지 추론하는 실습입니다.

# 0) 패키지 설치
- 실습에 필요한 패키지를 설치합니다.

In [None]:
!pip install imblearn

## 1) 환경 구성
- 실습을 원활하게 진행하기 위해 환경을 구성합니다.

In [None]:
# === Cell 1. 환경 구성 ===
import os, re, json, inspect
from collections import Counter
from itertools import tee

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import dump
from io import StringIO
from joblib import load

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix, average_precision_score
)
from sklearn.utils import check_random_state
from matplotlib.patches import Patch

# Optional dependency: if imbalanced-learn cannot be imported, `_imb_ok` is set
# to False; `_build_sampler` then returns None and oversampling is disabled.
_imb_ok = True
try:
    from imblearn.over_sampling import RandomOverSampler, SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
except Exception as e:
    print("[warn] imbalanced-learn를 불러오지 못했습니다. 오버샘플링 비활성화:", e)
    _imb_ok = False

RANDOM_STATE = 42                        # global seed (splits, samplers, default model seed)
rng = check_random_state(RANDOM_STATE)   # RandomState built from the global seed
TOPK_BIGRAMS = 50                        # number of most frequent state bigrams kept as bg_* features
K_LIST       = [3, 5, 7]                 # prefix lengths: one dataset/model per K
TERMINALS_FINAL = {"/logout", "/delete_user"}   # states whose presence labels a session as 1

# Operating-threshold policy used in the evaluation cell:
#   "balanced"  -> threshold maximizing the balanced score on the validation set
#   otherwise   -> T_OPER_DEFAULT
# OPER_T_OVERRIDE, when a dict {K: threshold}, takes precedence over both.
OPER_POLICY = "balanced" 
T_OPER_DEFAULT = 0.730
OPER_T_OVERRIDE = None

# === Epoch / retraining knobs ===
XGB_EPOCHS = 1200              # XGBoost "epochs" == n_estimators
EARLY_STOPPING_ROUNDS = 100    # XGBoost early-stopping patience (rounds)
LR_MAX_ITER = 8000             # LogisticRegression iteration cap (pseudo-epochs)
N_RESTARTS = 3                  # number of multi-seed retraining runs
SEED_SEQ = [42, 1337, 2027, 7777, 2025][:N_RESTARTS]

# Oversampling options - use when one class has too few samples
OVERSAMPLING = {
    "enable": False,          # global default (True = on / False = off)
    "method": "smote",        # "random" | "smote"
    "ratio": "auto",          # 1.0 = fully balanced, 0.5 = minority/majority = 0.5, "auto" = automatic balancing
    "k_neighbors": 5,         # SMOTE only
    "random_state": RANDOM_STATE,
}

# Per-K oversampling overrides (these replace the global defaults key by key)
OVERSAMPLING_BY_K = {
    3: {"enable": False, "method": "smote", "ratio": "auto", "k_neighbors": 5}, # K=3
    5: {"enable": True, "method": "smote", "ratio": "auto", "k_neighbors": 5}, # K=5
    7: {"enable": False, "method": "smote", "ratio": "auto", "k_neighbors": 5}, # K=7
}

def _merge_oversampling_cfg(global_cfg: dict, per_k_cfg: dict) -> dict:
    """Overlay per-K oversampling overrides on top of the global settings.

    Returns a new dict; neither argument is mutated. A falsy ``per_k_cfg``
    (``None`` or empty) yields a plain copy of ``global_cfg``.
    """
    merged = dict(global_cfg)
    merged.update(per_k_cfg or {})
    return merged

def _normalize_sampling_strategy(val):
    """Normalize a user-supplied ratio into a ``sampling_strategy`` value for imblearn.

    - callables are returned untouched
    - ``None`` becomes ``"auto"``
    - numbers (Python or NumPy) become ``float``
    - dicts keep their keys; NumPy-integer and bool values are cast to ``int``
    - strings are stripped/lower-cased; known keywords are returned as-is and
      anything else must parse as a float

    Raises:
        ValueError: a string that is neither a known keyword nor a number.
        TypeError: any other unsupported type.
    """
    keywords = {"auto", "minority", "not minority", "not majority", "all"}

    if callable(val):
        return val
    if val is None:
        return "auto"
    if isinstance(val, (int, float, np.integer, np.floating)):
        return float(val)  # e.g. 0.5
    if isinstance(val, dict):
        normalized = {}
        for key, count in val.items():
            normalized[key] = int(count) if isinstance(count, (np.integer, bool)) else count
        return normalized
    if not isinstance(val, str):
        raise TypeError(f"Unsupported sampling_strategy type: {type(val)}")

    token = val.strip().lower()
    if token in keywords:
        return token
    try:
        return float(token)
    except ValueError:
        raise ValueError(f"Invalid sampling_strategy (ratio): {val!r}")

def _build_sampler(cfg: dict):
    """Create the oversampler described by ``cfg`` (used only inside the pipeline).

    Returns ``None`` when oversampling is disabled in ``cfg``, when
    imbalanced-learn could not be imported, or when ``cfg["method"]`` is not
    one of ``"random"`` / ``"smote"`` (a warning is printed in that last case).
    """
    if not (cfg.get("enable") and _imb_ok):
        return None

    method = str(cfg.get("method", "random")).lower()
    seed = cfg.get("random_state", RANDOM_STATE)
    strategy = _normalize_sampling_strategy(cfg.get("ratio", "auto"))

    if method == "smote":
        neighbors = int(cfg.get("k_neighbors", 5))
        return SMOTE(sampling_strategy=strategy, random_state=seed, k_neighbors=neighbors)
    if method == "random":
        return RandomOverSampler(sampling_strategy=strategy, random_state=seed)

    print(f"[warn] 알 수 없는 오버샘플링 방법: {method}. 비활성화합니다.")
    return None

# Input / output paths
INPUT_PATH = "datasets/processed_user_behavior.sorted.csv"
OUT_DIR = "datasets/sessionDrop"
os.makedirs(OUT_DIR, exist_ok=True)  # create the output directory if it does not exist

# Feature table computed over whole sessions (no prefix truncation)
OUT_ALL = os.path.join(OUT_DIR, "sessionDrop.features.csv")
def OUT_PREFIXF_PATH(k:int) -> str:
    """Return the CSV path under ``OUT_DIR`` for the prefix-``k`` feature table."""
    filename = f"sessionDrop.prefix{k}.features.filtered.csv"
    return os.path.join(OUT_DIR, filename)

# Directory that receives the per-K model files and their meta JSON
MODEL_OUT_DIR = "models/sessionDrop"
os.makedirs(MODEL_OUT_DIR, exist_ok=True)
def MODEL_PATH_K(k:int) -> str:
    """Return the joblib path under ``MODEL_OUT_DIR`` for the prefix-``k`` model."""
    filename = f"sessionDrop_model.k{k}.joblib"
    return os.path.join(MODEL_OUT_DIR, filename)
def META_PATH_K(k:int) -> str:
    """Return the meta-JSON path under ``MODEL_OUT_DIR`` for the prefix-``k`` model."""
    filename = f"sessionDrop_model.k{k}.meta.json"
    return os.path.join(MODEL_OUT_DIR, filename)

# Event-level columns handed to agg_features() for each session group
FEATURE_COLS = ["timestamp","current_state","page_depth",
                "delta_search_count","delta_cart_item_count","gap_sec"]

def make_param_dist(seed: int):
    """Draw 20 candidate values per hyperparameter from a seeded RandomState.

    The keys match the XGBClassifier arguments set in ``_fixed_xgb``.

    Args:
        seed: Seed passed to ``check_random_state``; the same seed yields the
            same candidate arrays.

    Returns:
        dict mapping hyperparameter name -> NumPy array of 20 sampled values.
    """
    r = check_random_state(seed)
    # NOTE: the draws consume the RandomState in dict-literal order, so
    # reordering the entries changes every sampled value.
    return {
        "n_estimators":      r.randint(600, 1601, size=20),   # integers in [600, 1600]
        "learning_rate":     r.choice([0.02, 0.03, 0.05, 0.07, 0.1], size=20),
        "max_depth":         r.choice([3,4,5,6,7], size=20),
        "min_child_weight":  r.choice([1,2,3,5], size=20),
        "subsample":         r.uniform(0.6, 1.0, size=20),
        "colsample_bytree":  r.uniform(0.6, 1.0, size=20),
        "reg_lambda":        r.choice([0.5,1.0,1.5,2.0], size=20),
        "reg_alpha":         r.choice([0.0,0.1,0.3,0.5], size=20),
    }

print("[Done]")


## 2) 원본 데이터 로드
- 원본 데이터를 로드하고 전체 세션의 개수를 집계합니다.

In [None]:
# === Cell 2. Load data ===
df = pd.read_csv(INPUT_PATH)

# Fail fast if any column the later cells read is absent.
required = ["session_id","timestamp","current_state","next_state","page_depth","search_count","cart_item_count"]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"필수 컬럼 누락: {missing}")

# Unparseable timestamps become NaT and are dropped together with rows that
# have no session_id; session ids are compared as strings from here on.
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
df = df.dropna(subset=["session_id","timestamp"]).copy()
df["session_id"] = df["session_id"].astype(str)
# Chronological order within each session (the diff()-based cells rely on it).
df = df.sort_values(["session_id","timestamp"])

def norm_state(s: str) -> str:
    """Canonicalize a page/state string.

    Missing values map to ``""``. Otherwise the value is stringified,
    stripped, lower-cased, cut at the first ``?`` (query string), and one
    trailing ``/`` is removed unless the result is just ``"/"``.
    """
    if pd.isna(s):
        return ""
    cleaned = re.sub(r"\?.*$", "", str(s).strip().lower())
    has_trailing_slash = cleaned.endswith("/") and len(cleaned) > 1
    return cleaned[:-1] if has_trailing_slash else cleaned

# Normalize both state columns so later membership tests (e.g. against
# TERMINALS_FINAL) compare canonical strings.
df["current_state"] = df["current_state"].map(norm_state)
df["next_state"]    = df["next_state"].map(norm_state)

print("Loaded rows:", len(df), "sessions:", df["session_id"].nunique())
print("[Done]")

## 3) 간격/증분 계산
- 세션 내 이벤트 간 시간차와 누적 지표의 증가분을 만들어 이후 피처 집계에 쓰기 좋게 정규화합니다.

In [None]:
# === Cell 3. Build gaps / increments ===
# Seconds since the previous event of the same session; the first event of a
# session gets 0 and negative gaps are clipped to 0.
df["gap_sec"] = (
    df.groupby("session_id")["timestamp"]
      .diff()
      .dt.total_seconds()
      .fillna(0)
      .clip(lower=0)
)

# Per-event increase of each counter column. The first event of a session has
# no predecessor, so its delta falls back to the raw counter value; decreases
# are clipped to 0.
for col in ["page_depth","search_count","cart_item_count"]:
    dcol = f"delta_{col}"
    df[dcol] = df.groupby("session_id")[col].diff()
    df[dcol] = df[dcol].fillna(df[col]).clip(lower=0)

print("[Done]")

## 4) Target Label 생성 & 누수 세션 식별
- 실제 결과인 Label을 생성하고, 모델 학습을 위해 첫 활동부터 K 활동 내에 실제 결과 값이 포함된 세션을 식별합니다.

In [None]:
# === Cell 4. Session labels + prefix-K and leaking-session detection ===
# A session is labelled 1 when any of its events has a terminal state as
# current_state or next_state, otherwise 0.
is_final = df["current_state"].isin(TERMINALS_FINAL) | df["next_state"].isin(TERMINALS_FINAL)
labels = (
    df.assign(is_final=is_final)
      .groupby("session_id", sort=False)["is_final"]
      .any()
      .astype("int8")
      .reset_index(name="label")
)

prefix_by_k    = {}   # K -> leak-sanitized prefix-K event table
term_sid_by_k  = {}   # K -> session ids whose first K events already contain a terminal

# Leak handling mode: 'drop' | 'truncate' | 'mask' | 'mask_pad'
LEAK_MODE  = "mask_pad"
PAD_TOKEN  = ""     # neutral token used for padding/masking (an empty string matches none of the state-substring features)
PAD_DEPTH  = None   # None = keep the previous value (ffill); a number = use that fixed value (e.g. 0)

def _pad_prefix_rows(g: pd.DataFrame, K: int) -> pd.DataFrame:
    """Append neutral rows until ``g`` has ``K`` rows (no-op if it already does).

    Padding rows reuse the last row's session_id and timestamp (no time
    advance), carry ``PAD_TOKEN`` states and zero deltas/gap; ``page_depth``
    repeats the last row's value unless ``PAD_DEPTH`` is set.
    """
    n_missing = K - len(g)
    if n_missing <= 0:
        return g
    last = g.iloc[-1]
    pad = {
        "session_id": last["session_id"],
        "timestamp":  last["timestamp"],
        "current_state": PAD_TOKEN,
        "next_state":    PAD_TOKEN,
        "page_depth":    last["page_depth"] if PAD_DEPTH is None else PAD_DEPTH,
        "delta_search_count": 0.0,
        "delta_cart_item_count": 0.0,
        "gap_sec": 0.0,
    }
    # One concat for all padding rows instead of one concat per row.
    return pd.concat([g, pd.DataFrame([pad] * n_missing)], ignore_index=True)


def _sanitize_prefix_group(g: pd.DataFrame, K: int) -> pd.DataFrame:
    """Remove label leakage from one session's prefix-K events.

    A row "leaks" when its ``current_state`` or ``next_state`` is in
    ``TERMINALS_FINAL``. Handling depends on ``LEAK_MODE``:

    - ``"drop"``: leaking rows are removed.
    - ``"truncate"``: only the rows before the first leaking row are kept.
    - ``"mask"`` / ``"mask_pad"``: leaking rows are neutralized in place:
      states become ``PAD_TOKEN``, deltas and gap become 0, ``page_depth`` and
      ``timestamp`` take the value of the closest preceding non-leaking row.
      ``"mask_pad"`` additionally pads the group to ``K`` rows.

    Groups without leaking rows are returned unchanged, except that modes
    ending in ``"pad"`` pad them to ``K`` rows.

    Args:
        g: Events of a single session, in chronological order.
        K: Prefix length (target row count for the padding modes).

    Returns:
        The sanitized events; may be empty for ``"drop"`` / ``"truncate"``.
    """
    mask = g["current_state"].isin(TERMINALS_FINAL) | g["next_state"].isin(TERMINALS_FINAL)
    if not mask.any():
        # Nothing leaks; only padding may be needed.
        if LEAK_MODE.endswith("pad"):
            return _pad_prefix_rows(g, K)
        return g

    if LEAK_MODE == "drop":
        # An emptied session simply disappears downstream.
        return g.loc[~mask].copy()

    if LEAK_MODE == "truncate":
        # Positional slice: index labels need not be monotonic within a group.
        first_pos = int(mask.to_numpy().argmax())
        return g.iloc[:first_pos].copy()

    # === 'mask' / 'mask_pad': neutralize the leaking rows ===
    g2 = g.copy()

    g2.loc[mask, "current_state"] = PAD_TOKEN
    g2.loc[mask, "next_state"]    = PAD_TOKEN

    g2.loc[mask, ["delta_search_count", "delta_cart_item_count", "gap_sec"]] = 0.0

    if PAD_DEPTH is None:
        # Blank the leaking rows first; otherwise ffill has nothing to fill and
        # the terminal row's own depth would survive.
        depth = g2["page_depth"].where(~mask).ffill()
        # Leaking rows with no earlier value to inherit fall back to 0.
        depth.loc[mask & depth.isna()] = 0.0
        g2["page_depth"] = depth
    else:
        g2.loc[mask, "page_depth"] = float(PAD_DEPTH)

    # Leaking rows take the timestamp of the closest preceding non-leaking row
    # (blank + ffill), so consecutive leaking rows do not pass each other's
    # original time along. A leaking first row keeps the session start time.
    first_ts = g2["timestamp"].iloc[0]
    clean_ts = g2["timestamp"].where(~mask).ffill().fillna(first_ts)
    g2.loc[mask, "timestamp"] = clean_ts.loc[mask]

    if LEAK_MODE == "mask_pad":
        g2 = _pad_prefix_rows(g2, K)

    return g2

for K in K_LIST:
    # Raw prefix-K: the first K events of every session
    prefix_k = (
        df.sort_values(["session_id","timestamp"])
          .groupby("session_id", group_keys=False)
          .head(K)
    )

    # For reference only (used for the statistics printed below):
    # sessions whose first K events already contain a terminal state
    term_sid_k = prefix_k.loc[
        prefix_k["current_state"].isin(TERMINALS_FINAL) | prefix_k["next_state"].isin(TERMINALS_FINAL),
        "session_id"
    ].unique()

    # Prefix-K with leak handling applied, session by session
    prefix_clean_k = pd.concat(
        (_sanitize_prefix_group(g.copy(), K) for _, g in prefix_k.groupby("session_id", sort=False)),
        ignore_index=True
    )

    prefix_by_k[K]   = prefix_clean_k
    term_sid_by_k[K] = term_sid_k  # kept for statistics / preview filtering

print(f"[info] total sessions: {df['session_id'].nunique():,}")
for K in K_LIST:
    print(f"[info] K={K}: sessions reaching terminals in first {K} events (detected): {len(term_sid_by_k[K]):,}")

def bigrams(seq):
    """Lazily yield consecutive pairs ``(seq[i], seq[i+1])`` of any iterable.

    Yields nothing for inputs with fewer than two items.
    """
    lead, follow = tee(seq)
    next(follow, None)  # shift the second iterator one step ahead
    yield from zip(lead, follow)

# Count state bigrams over full sessions, skipping any bigram that touches a
# terminal state so the vocabulary itself cannot encode the label.
bg_counter = Counter()
TERMINALS = TERMINALS_FINAL
for _, g in df.groupby("session_id", sort=False):
    seq = list(g["current_state"].fillna("").values)
    for bg in bigrams(seq):
        if (bg[0] in TERMINALS) or (bg[1] in TERMINALS):
            continue
        bg_counter[bg] += 1

# The TOPK_BIGRAMS most frequent bigrams; each one's rank is the index of its
# bg_<idx> feature column.
top_bigrams = [bg for bg, _ in bg_counter.most_common(TOPK_BIGRAMS)]
bigram_to_idx = {bg:i for i,bg in enumerate(top_bigrams)}

print("[Done]")

## 5) Feature Engineering
- 모델을 학습하기 위해 데이터를 전처리합니다.

In [None]:
# === Cell 5. 세션 피처 집계 함수 ===
from IPython.display import display

def agg_features(g: pd.DataFrame) -> pd.Series:
    """Aggregate one session's events into a flat feature vector.

    Args:
        g: Events of a single session with the columns listed in
            ``FEATURE_COLS`` (timestamp, current_state, page_depth,
            delta_search_count, delta_cart_item_count, gap_sec).

    Returns:
        A Series with volume/timing statistics, page-depth statistics,
        search/cart counters, calendar features of the first event,
        time-to-first search/cart (``-1.0`` when never seen), flags over the
        last five states, one ``bg_<idx>`` count per bigram in
        ``bigram_to_idx``, state diversity, and start/last/any-hit page flags.
    """
    # Stable sort: masked and padded rows deliberately share the previous
    # row's timestamp, and an unstable sort could reorder those ties and
    # change every order-dependent feature (bigrams, last_*, slopes).
    g = g.sort_values("timestamp", kind="mergesort")
    states = g["current_state"].fillna("").tolist()
    gaps   = g["gap_sec"].values

    feats = {}
    feats["n_events"] = len(g)
    feats["session_duration_sec"] = (
        (g["timestamp"].iloc[-1] - g["timestamp"].iloc[0]).total_seconds()
        if len(g)>1 else 0.0
    )
    # The 1e-9 floor avoids division by zero for zero-length sessions.
    feats["events_per_min"] = feats["n_events"] / max(feats["session_duration_sec"]/60.0, 1e-9)

    if len(gaps)>0:
        feats["gap_mean"] = float(np.mean(gaps))
        feats["gap_std"]  = float(np.std(gaps))
        feats["gap_p95"]  = float(np.quantile(gaps, 0.95))
    else:
        feats["gap_mean"]=feats["gap_std"]=feats["gap_p95"]=0.0

    feats["page_depth_max"]   = float(g["page_depth"].max())
    feats["page_depth_slope"] = float((g["page_depth"].iloc[-1] - g["page_depth"].iloc[0]) / max(len(g)-1,1))
    feats["depth_backtracks"] = int((g["page_depth"].diff()<0).sum())

    feats["search_delta_sum"] = float(g["delta_search_count"].sum())
    feats["search_rate"]      = feats["search_delta_sum"] / max(feats["n_events"],1)

    cart_delta = g["delta_cart_item_count"].clip(lower=0)
    feats["cart_adds"]        = float(cart_delta.sum())
    feats["cart_touch_count"] = int((cart_delta>0).sum())
    feats["cart_adds_per_event"] = feats["cart_adds"] / max(feats["n_events"],1)
    # +1 smoothing on both sides keeps the ratio finite.
    feats["search_to_cart_ratio"] = (feats["search_delta_sum"] + 1.0) / (feats["cart_adds"] + 1.0)

    first_ts = g["timestamp"].iloc[0]
    feats["first_event_hour"] = int(first_ts.hour)
    feats["is_weekend"]       = int(first_ts.weekday()>=5)

    def idx_of(pred_list, key):
        """Position of the first state containing ``key``, or None."""
        for i, s in enumerate(pred_list):
            if key in s: return i
        return None
    i_search = idx_of(states, "/search")
    i_cart   = idx_of(states, "/cart")
    # -1.0 is the "never happened" sentinel.
    feats["time_to_first_search"] = float((g["timestamp"].iloc[i_search] - first_ts).total_seconds()) if i_search is not None else -1.0
    feats["time_to_first_cart"]   = float((g["timestamp"].iloc[i_cart]   - first_ts).total_seconds()) if i_cart   is not None else -1.0

    lastk = states[-5:]
    joined_lastk = " ".join(lastk)
    feats["last5_has_search"]   = int("search" in joined_lastk)
    feats["last5_has_cart"]     = int("cart" in joined_lastk)
    feats["last5_has_checkout"] = int("checkout" in joined_lastk)

    # Counts of the globally selected top bigrams within this session.
    bg_counts = Counter(list(bigrams(states)))
    for bg, idx in bigram_to_idx.items():
        feats[f"bg_{idx}"] = int(bg_counts.get(bg, 0))

    uniq = pd.Series(states).value_counts(normalize=True)
    feats["unique_states"] = int(uniq.size)
    # Entropy of the state distribution; 1e-12 guards log(0).
    feats["entropy_state"] = float(-(uniq*np.log(uniq+1e-12)).sum())

    st0 = states[0] if states else ""
    stL = states[-1] if states else ""
    for name, st in [("start", st0), ("last", stL)]:
        feats[f"{name}_is_root"]      = int(st == "/")
        feats[f"{name}_is_search"]    = int("/search" in st)
        feats[f"{name}_is_products"]  = int("/products" in st)
        feats[f"{name}_is_cart"]      = int("/cart" in st)
        feats[f"{name}_is_checkout"]  = int("/checkout" in st)

    all_states = " ".join(states)
    feats["hit_search"]   = int("/search" in all_states)
    feats["hit_products"] = int("/products" in all_states)
    feats["hit_cart"]     = int("/cart" in all_states)
    feats["hit_checkout"] = int("/checkout" in all_states)

    return pd.Series(feats)

print("[Done]")

# --- (교체) 전처리 & Feature Engineering 미리보기 ---
from IPython.display import display

def preview_engineered(
    n_sessions=5, K=None, drop_terminals=True, random=False, seed=42,
    include_bigrams=False, topn_bg=None
):
    """Display and return engineered features for a handful of sessions.

    Args:
        n_sessions: Number of sessions to show.
        K: ``None`` uses the full sessions (ALL); an integer uses the prefix-K
            table from ``prefix_by_k``.
        drop_terminals: With a prefix ``K``, exclude sessions whose first K
            events contained a terminal state (ignored when ``K`` is None).
        random: ``True`` samples sessions at random, ``False`` takes the first ones.
        seed: Random state for the sampling.
        include_bigrams: ``True`` keeps the ``bg_*`` columns, ``False`` hides them.
        topn_bg: If an int, keep only the first n ``bg_*`` columns by index.

    Returns:
        The preview DataFrame (one row per session, label merged in).
    """
    if K is None:
        df_src = df
        term_sids = set()
    else:
        df_src = prefix_by_k[K]
        term_sids = set(term_sid_by_k[K]) if drop_terminals else set()

    sids_series = df_src["session_id"].drop_duplicates()
    # Exclude terminal sessions BEFORE picking, so the preview still shows
    # n_sessions rows whenever enough eligible sessions exist.
    if term_sids:
        sids_series = sids_series[~sids_series.isin(term_sids)]
    if random:
        sids = sids_series.sample(n=min(n_sessions, len(sids_series)), random_state=seed).tolist()
    else:
        sids = sids_series.head(n_sessions).tolist()

    preview = (
        df_src[df_src["session_id"].isin(sids)]
          .groupby("session_id", sort=False)[FEATURE_COLS]
          .apply(agg_features)
          .reset_index()
          .merge(labels, on="session_id", how="left")
    )

    # Bigram column handling
    bg_cols = [c for c in preview.columns if c.startswith("bg_")]
    if not include_bigrams:
        # Hide all of them
        preview = preview.drop(columns=bg_cols, errors="ignore")
    elif isinstance(topn_bg, int) and topn_bg >= 0:
        def _bg_idx(col):
            """Numeric suffix of a bg_<idx> column; malformed names sort last."""
            try:
                return int(col.split("_", 1)[1])
            except (IndexError, ValueError):
                return 1_000_000
        keep_bg = sorted(bg_cols, key=_bg_idx)[:topn_bg]
        drop_bg = [c for c in bg_cols if c not in keep_bg]
        preview = preview.drop(columns=drop_bg, errors="ignore")

    # Put the well-known columns first, in a readable order
    key_cols = ["session_id", "label", "n_events", "session_duration_sec", "events_per_min",
                "gap_mean", "gap_std", "gap_p95",
                "page_depth_max", "page_depth_slope", "depth_backtracks",
                "search_delta_sum", "search_rate",
                "cart_adds", "cart_touch_count", "cart_adds_per_event", "search_to_cart_ratio",
                "first_event_hour", "is_weekend",
                "time_to_first_search", "time_to_first_cart",
                "last5_has_search", "last5_has_cart", "last5_has_checkout",
                "unique_states", "entropy_state",
                "start_is_root", "start_is_search", "start_is_products", "start_is_cart", "start_is_checkout",
                "last_is_root", "last_is_search", "last_is_products", "last_is_cart", "last_is_checkout",
                "hit_search", "hit_products", "hit_cart", "hit_checkout"]
    ordered = [c for c in key_cols if c in preview.columns] + [c for c in preview.columns if c not in key_cols]
    preview = preview[ordered]

    title = "ALL" if K is None else f"prefix-K={K}"
    bg_count_total = sum(1 for c in preview.columns if c.startswith("bg_"))
    print(f"[Preview] Engineered features ({title}) — {len(preview)} rows")
    display(preview)
    print(f"  - Total columns shown: {len(preview.columns)} (bigrams shown: {bg_count_total})")
    return preview


_ = preview_engineered(n_sessions=5, K=None)

## 6) 전처리 데이터 저장
- Feature Engineering 한 데이터를 K별로 저장합니다.

In [None]:
# === Cell 6. Build & save the session feature tables ===
# Full-session feature table (one row per session) with the label merged in.
X_all = (
    df.groupby("session_id", sort=False)[FEATURE_COLS]
      .apply(agg_features)
      .reset_index()
)
dataset_all = X_all.merge(labels, on="session_id", how="left")
dataset_all.to_csv(OUT_ALL, index=False)

datasets_by_k = {}   # K -> prefix-K feature table (used by the split cell)
print("Saved CSVs:")
print(" -", OUT_ALL)

for K in K_LIST:
    # Same aggregation, applied to the leak-sanitized prefix-K events.
    X_prefix_k = (
        prefix_by_k[K].groupby("session_id", sort=False)[FEATURE_COLS]
                      .apply(agg_features)
                      .reset_index()
    )
    dataset_prefix_k = X_prefix_k.merge(labels, on="session_id", how="left")
    # No rows are filtered here; only the index is reset.
    dataset_prefix_filt_k = dataset_prefix_k.reset_index(drop=True)
    datasets_by_k[K] = dataset_prefix_filt_k
    out_path_k = OUT_PREFIXF_PATH(K)
    dataset_prefix_filt_k.to_csv(out_path_k, index=False)
    print(" -", out_path_k, "| shape:", tuple(dataset_prefix_filt_k.shape))
    
print("[Done]")

## 7) 학습용 / 검증용 데이터셋 분할
- 데이터를 학습용 / 검증용 데이터셋으로 분할합니다.

In [None]:
# === Cell 7. Train/validation split (per K) ===
splits_by_k = {}   # K -> (X_tr, X_va, y_tr, y_va, feature_names)
for K in K_LIST:
    dfp = datasets_by_k[K].copy()
    y = dfp["label"].astype(int)
    # Defensive: drop any feature column whose name mentions a terminal state.
    leak_cols = [c for c in dfp.columns if ("logout" in c.lower()) or ("delete" in c.lower())]
    X = dfp.drop(columns=["session_id","label"] + leak_cols, errors="ignore").fillna(0.0)

    # 80/20 split, stratified on the label, fixed seed.
    X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE)
    splits_by_k[K] = (X_tr, X_va, y_tr, y_va, X.columns.tolist())

    print(f"[K={K}] Shape: {X.shape}  PosRate: {float(y.mean()):.4f}")
    print(f"[K={K}] Train: {X_tr.shape}  Valid: {X_va.shape}")

print("[Done]")

## 8) 모델 선택 & 학습
- 모델을 선택한 후 학습용 데이터 셋으로 학습합니다.

In [None]:
# === Cell 8. Model selection & training ===
# Prefer XGBoost; if it cannot be imported, the cell falls back to LogisticRegression.
use_xgb = False
try:
    from xgboost import XGBClassifier
    use_xgb = True
except Exception as e:
    print("[warn] xgboost 불가, LogisticRegression으로 폴백:", e)

# Training log options
SHOW_XGB_LOG = False   # True prints the metric for every boosting round (very verbose!)
USE_TQDM     = True    # True wraps the loops in tqdm progress bars

# Safe tqdm import: without tqdm, use a pass-through that ignores its keyword arguments
try:
    from tqdm.auto import tqdm
except Exception:
    def tqdm(x, **k): return x

# ---------- 고정 모델 빌더 ----------
def _fixed_xgb(sampler_enabled: bool, y_tr, seed=RANDOM_STATE):
    """Build the fixed-hyperparameter XGBClassifier for one seed.

    ``scale_pos_weight`` is the negative/positive ratio of ``y_tr`` unless an
    oversampler is active, in which case it is 1.0. ``n_estimators`` comes
    from ``XGB_EPOCHS``. No ``eval_metric`` is set here; ``safe_fit_xgb``
    assigns it before fitting.
    """
    n_pos = int(y_tr.sum())
    n_neg = int(len(y_tr) - n_pos)
    if sampler_enabled:
        pos_weight = 1.0
    else:
        pos_weight = n_neg / max(n_pos, 1)

    params = {
        "objective": "binary:logistic",
        "n_estimators": XGB_EPOCHS,      # controlled like an epoch count
        "learning_rate": 0.05,
        "max_depth": 4,
        "min_child_weight": 2,
        "subsample": 0.85,
        "colsample_bytree": 0.80,
        "reg_lambda": 1.0,
        "reg_alpha": 0.1,
        "n_jobs": -1,
        "random_state": seed,
        "scale_pos_weight": pos_weight,
        "verbosity": 0,
    }
    return XGBClassifier(**params)

def _fixed_logreg(sampler_enabled: bool, seed=RANDOM_STATE):
    """Build the fixed-hyperparameter LogisticRegression fallback for one seed.

    ``max_iter`` comes from ``LR_MAX_ITER``; classes are re-weighted with
    ``"balanced"`` only when no oversampler is active.
    """
    from sklearn.linear_model import LogisticRegression

    weighting = None if sampler_enabled else "balanced"
    settings = {
        "solver": "saga",
        "penalty": "l2",
        "C": 1.0,
        "max_iter": LR_MAX_ITER,         # iteration cap
        "class_weight": weighting,
        "n_jobs": -1,
        "random_state": seed,
    }
    return LogisticRegression(**settings)

# ---------- 커스텀 평가함수(feval) & XGB 학습 헬퍼 ----------
from sklearn.metrics import f1_score

_FEVAL_NUM_THRESHOLDS = 61  # thresholds scanned per metric evaluation; speed/accuracy trade-off, 31~101 recommended

def _feval_macro_f1_balanced(y_true, y_score, sample_weight=None):
    """Best macro-F1 over a grid of decision thresholds.

    Signature follows the sklearn-style metric convention
    ``(y_true, y_score[, sample_weight]) -> float``. The scores are
    thresholded at ``_FEVAL_NUM_THRESHOLDS`` evenly spaced values in
    [0.01, 0.99]; a threshold whose F1 computation raises counts as 0.0.
    """
    def _macro_f1_at(threshold):
        preds = (y_score >= threshold).astype(np.int32)
        try:
            return f1_score(y_true, preds, average="macro", sample_weight=sample_weight)
        except Exception:
            return 0.0

    grid = np.linspace(0.01, 0.99, _FEVAL_NUM_THRESHOLDS, dtype=float)
    top = 0.0
    for threshold in grid:
        score = _macro_f1_at(threshold)
        top = score if score > top else top
    return float(top)

def _macro_f1_cost(y_true, y_score, sample_weight=None):
    """Cost form of the macro-F1 metric: ``1 - best macro-F1`` (lower is better).

    XGBoost's sklearn wrapper assumes a callable ``eval_metric`` is a cost and
    minimizes it during early stopping, so the score is inverted here.
    """
    return 1.0 - _feval_macro_f1_balanced(y_true, y_score, sample_weight=sample_weight)


def safe_fit_xgb(model, Xtr, ytr, Xva, yva):
    """Fit an XGBClassifier (bare or as the ``clf`` step of a pipeline) with early stopping.

    - ``eval_metric`` is set as a model parameter to ``_macro_f1_cost``, so
      early stopping keeps the round with the highest validation macro-F1.
      As a consequence the estimator's ``best_score`` attribute holds
      ``1 - macro-F1``.
    - ``early_stopping_rounds`` is set as an estimator parameter when the
      estimator exposes it, otherwise passed to ``fit`` when ``fit`` accepts it.
    - ``eval_set`` / ``verbose`` are passed to ``fit`` when it accepts them;
      ``verbose`` follows ``SHOW_XGB_LOG``.

    Args:
        model: XGBClassifier, or a pipeline whose final step is named ``clf``.
        Xtr, ytr: Training data.
        Xva, yva: Validation data used as the early-stopping eval set (it is
            handed to the classifier as-is, not passed through pipeline steps).

    Returns:
        The fitted ``model`` (same object).
    """
    is_pipe = hasattr(model, "named_steps") and ("clf" in getattr(model, "named_steps", {}))
    clf = model.named_steps["clf"] if is_pipe else model
    prefix = "clf__" if is_pipe else ""

    # The custom metric is a model parameter, not a fit kwarg.
    clf.set_params(eval_metric=_macro_f1_cost)

    fit_kwargs = {}
    fit_params = inspect.signature(clf.fit).parameters

    # Validation set
    if "eval_set" in fit_params:
        fit_kwargs[f"{prefix}eval_set"] = [(Xva, yva)]

    # Early stopping: prefer the estimator parameter; fall back to the fit
    # kwarg for versions that only accept it there.
    if "early_stopping_rounds" in clf.get_params(deep=False):
        clf.set_params(early_stopping_rounds=EARLY_STOPPING_ROUNDS)
    elif "early_stopping_rounds" in fit_params:
        fit_kwargs[f"{prefix}early_stopping_rounds"] = EARLY_STOPPING_ROUNDS

    # Per-round log output
    if "verbose" in fit_params:
        fit_kwargs[f"{prefix}verbose"] = bool(SHOW_XGB_LOG)

    t0 = time.time()
    model.fit(Xtr, ytr, **fit_kwargs)
    dt = time.time() - t0

    # Log best iteration / score when the estimator exposes them.
    best_iter = getattr(clf, "best_iteration", None)
    if best_iter is None:
        best_iter = getattr(clf, "best_ntree_limit", None)
    best_cost = getattr(clf, "best_score", None)
    best_f1 = None if best_cost is None else 1.0 - float(best_cost)

    print(f"    ↳ fit done in {dt:.2f}s | best_iter={best_iter} | best_macro_f1={best_f1}")
    return model

# ---------- Actual training: fit the candidate models only ----------
best_by_k = {}                 # parameter summary per K (stored in the meta file)
use_pipeline = _imb_ok         # wrap in an imblearn pipeline when imbalanced-learn is available
trained_candidates_by_k = {}   # K -> list of fitted candidate models; selection happens in Cell 9

SEP = "-" * 86  # separator line between K runs

_k_iter = tqdm(K_LIST, desc="K sweep (train)") if USE_TQDM else K_LIST

for K in _k_iter:
    print(f"\n{SEP}\n[TRAIN START] K={K}\n{SEP}")

    X_tr, X_va, y_tr, y_va, _ = splits_by_k[K]

    # Effective oversampling config = global defaults overlaid with the K override
    cfg_k = _merge_oversampling_cfg(OVERSAMPLING, OVERSAMPLING_BY_K.get(K, {}))
    sampler = _build_sampler(cfg_k)
    sampler_enabled = sampler is not None

    # Parameter summary (built once, with the representative seed)
    base_for_summary = _fixed_xgb(sampler_enabled, y_tr, seed=RANDOM_STATE) if use_xgb else _fixed_logreg(sampler_enabled, seed=RANDOM_STATE)
    clf_params = base_for_summary.get_params(deep=False)
    summary_keys = (
        ["n_estimators", "learning_rate", "max_depth", "min_child_weight",
         "subsample", "colsample_bytree", "reg_lambda", "reg_alpha", "random_state"]
        if use_xgb else
        ["penalty", "C", "class_weight", "solver", "max_iter", "random_state"]
    )
    used_params = {k: clf_params.get(k) for k in summary_keys if k in clf_params}
    best_by_k[K] = (None, used_params)
    print(f"[K={K}] Using {'XGBClassifier' if use_xgb else 'LogisticRegression'} Params: {used_params}")

    # Train one candidate per seed
    models = []
    _seed_iter = tqdm(SEED_SEQ, desc=f"K={K} train seeds", leave=False) if USE_TQDM else SEED_SEQ
    for seed in _seed_iter:
        if use_xgb:
            base_clf = _fixed_xgb(sampler_enabled, y_tr, seed=seed)
            model_name = "XGBClassifier"
        else:
            base_clf = _fixed_logreg(sampler_enabled, seed=seed)
            model_name = "LogisticRegression"

        if use_pipeline:
            steps = []
            if sampler_enabled:
                steps.append(("sampler", sampler))
            steps.append(("clf", base_clf))
            estimator = ImbPipeline(steps)
        else:
            estimator = base_clf

        print(f"[K={K}] [Seed={seed}] model={model_name} | sampler={'ON' if sampler_enabled else 'OFF'} "
              f"| n_est={getattr(base_clf, 'n_estimators', 'NA')} | max_iter={getattr(base_clf, 'max_iter', 'NA')}")

        if use_xgb:
            estimator = safe_fit_xgb(estimator, X_tr, y_tr, X_va, y_va)
        else:
            t0 = time.time()
            estimator.fit(X_tr, y_tr)
            dt = time.time() - t0
            print(f"    ↳ fit done in {dt:.2f}s")

        models.append(estimator)

    # Registered and logged once per K, after every seed has been trained
    # (previously this ran inside the seed loop and logged TRAIN END per seed).
    trained_candidates_by_k[K] = models
    print(f"[TRAIN END]   K={K} | candidates={len(models)}")
    print(SEP)

print("[Done] 트레이닝 완료 — 후보 모델들은 trained_candidates_by_k[K]에 저장되었습니다.")


## 9) 모델 검증 & 평가
- 선택한 모델을 검증용 데이터셋으로 검증하고 평가합니다.

In [None]:
# === Cell 9. 검증 & 평가  ===

# 검증/평가 유틸
def balanced_score(y_true, p, t):
    """Mean of accuracy and macro-F1 for the predictions ``p >= t``."""
    labels_hat = (p >= t).astype(int)
    acc = accuracy_score(y_true, labels_hat)
    macro_f1 = f1_score(y_true, labels_hat, average="macro")
    return 0.5 * (acc + macro_f1)

SHOW_05 = False   # True also prints the metrics at the fixed 0.50 threshold

trained_models_by_k = {}   # K -> the single model finally selected
eval_by_k = {}             # K -> evaluation summary (for meta / reporting)
T_OPER_BY_K = {}           # K -> operating threshold
summary_rows = []
SEP = "-" * 86

for K in K_LIST:
    X_tr, X_va, y_tr, y_va, feat_names = splits_by_k[K]
    models = trained_candidates_by_k[K]  # candidates trained in Cell 8

    # Validation probability of the positive class for every candidate
    p_list = [m.predict_proba(X_va)[:, 1] for m in models]

    # -- The threshold search runs on the ensemble-mean probability --
    p_va_mean = np.mean(np.vstack(p_list), axis=0)
    ths = np.linspace(0.01, 0.99, 197)
    t_best_ensemble, s_best_ensemble = max(
        ((t, balanced_score(y_va, p_va_mean, t)) for t in ths),
        key=lambda x: x[1]
    )

    # Operating threshold (override > balanced > default)
    if isinstance(OPER_T_OVERRIDE, dict) and (K in OPER_T_OVERRIDE):
        op_t = float(OPER_T_OVERRIDE[K])
    elif OPER_POLICY == "balanced":
        op_t = float(t_best_ensemble)
    else:
        op_t = float(T_OPER_DEFAULT)
    T_OPER_BY_K[K] = op_t

    # -- Final single model: the candidate with the highest macro-F1 at op_t --
    single_scores_f1 = [
        f1_score(y_va, (p_list[i] >= op_t).astype(int), average="macro")
        for i in range(len(models))
    ]
    best_idx = int(np.argmax(single_scores_f1))
    final_model = models[best_idx]
    p_va_single = p_list[best_idx]

    # -- Everything printed/recorded below refers to that single model --
    # NOTE(review): threshold and model are both chosen on the same validation
    # set they are scored on, so these numbers are optimistic.
    yhat_oper_single = (p_va_single >= op_t).astype(int)
    acc_oper_single  = accuracy_score(y_va, yhat_oper_single)
    f1_oper_single   = f1_score(y_va, yhat_oper_single, average="macro")

    if SHOW_05:
        yhat05 = (p_va_single >= 0.5).astype(int)
        print(f"[K={K}] [Valid(single) @0.50] Acc={accuracy_score(y_va, yhat05):.4f} "
              f"Macro-F1={f1_score(y_va, yhat05, average='macro'):.4f} "
              f"PR-AUC={average_precision_score(y_va, p_va_single):.4f}")
        print(confusion_matrix(y_va, yhat05))
        print(classification_report(y_va, yhat05, digits=4))

    print(f"[K={K}] [Valid(single) @t={op_t:.3f}] Acc={acc_oper_single:.4f} Macro-F1={f1_oper_single:.4f}")
    print(confusion_matrix(y_va, yhat_oper_single))
    print(classification_report(y_va, yhat_oper_single, digits=4))
    print(SEP, "\n")

    # Record the results for saving / meta (single-model performance)
    trained_models_by_k[K] = final_model
    eval_by_k[K] = {
        # Same single-model numbers as printed above
        "Valid@Oper.Acc": float(acc_oper_single),
        "Valid@Oper.MacroF1": float(f1_oper_single),

        # For reference: the threshold search was ensemble-based, kept separately
        "Balanced.t": float(t_best_ensemble),
        "Balanced.Score": float(s_best_ensemble),

        "Oper.t": float(op_t),
        "features": feat_names,

        # Retraining information
        "n_restarts": int(N_RESTARTS),
        "seeds": list(SEED_SEQ),
        "single_best_f1_at_oper": float(single_scores_f1[best_idx]),
        "selected_model_index": int(best_idx),
    }

    summary_rows.append({
        "K": K,
        "Oper.t": float(op_t),
        # The summary also reports single-model values
        "Valid@Oper.Acc": float(acc_oper_single),
        "Valid@Oper.MacroF1": float(f1_oper_single),

        # For reference (ensemble-based search result)
        "Balanced.t": float(t_best_ensemble),
        "Balanced.Score": float(s_best_ensemble),
        "Restarts": int(N_RESTARTS),
    })

print("=== Summary (K별, Single Final Model on Valid) ===")
for r in sorted(summary_rows, key=lambda x: x["K"]):
    print(f"K={r['K']} | Oper.t={r['Oper.t']:.3f} | Acc@Oper={r['Valid@Oper.Acc']:.4f} "
          f"| MacroF1@Oper={r['Valid@Oper.MacroF1']:.4f} | t_bal(ens)={r['Balanced.t']:.3f} "
          f"| Score_bal(ens)={r['Balanced.Score']:.4f} | Restarts={r['Restarts']}")


## 10) 모델 저장
- K별로 모델을 저장합니다.

In [None]:
# === Cell 10. 모델 & 메타 저장 (K별) ===

def to_builtin(v):
    """Convert a NumPy scalar to the matching Python builtin; return anything else as-is."""
    conversions = ((np.integer, int), (np.floating, float), (np.bool_, bool))
    for numpy_type, caster in conversions:
        if isinstance(v, numpy_type):
            return caster(v)
    return v

def _jsonable_sampling_strategy(val):
    """Serialize a ``sampling_strategy`` value for the meta JSON.

    - callables -> ``"<callable:NAME>"``
    - bools (Python or NumPy) -> ``bool``
    - other numbers -> ``float``
    - dicts -> keys stringified, values passed through ``to_builtin``
    - ``None`` -> ``None``; anything else (e.g. "auto", "minority") -> ``str``
    """
    if callable(val):
        return f"<callable:{getattr(val, '__name__', 'anonymous')}>"
    # bool must be tested before the numeric branch: bool is a subclass of
    # int, so the numeric branch would otherwise turn True into 1.0.
    if isinstance(val, (np.bool_, bool)):
        return bool(val)
    if isinstance(val, (np.integer, int, np.floating, float)):
        return float(val)
    if isinstance(val, dict):
        return {str(k): to_builtin(v) for k, v in val.items()}
    return None if val is None else str(val)

def _extract_selected_model_info(model):
    """Collect key facts about the selected model (pipeline or bare estimator).

    For a pipeline with a ``clf`` step the facts are read from that step.
    Attributes the estimator does not have are reported as ``None``;
    ``best_iteration`` falls back to ``best_ntree_limit``.
    """
    if hasattr(model, "named_steps") and "clf" in model.named_steps:
        est = model.named_steps["clf"]
    else:
        est = model

    info = {"estimator": est.__class__.__name__}
    for attr in ("random_state", "n_estimators", "max_iter"):
        info[attr] = to_builtin(getattr(est, attr, None))
    fallback_iter = getattr(est, "best_ntree_limit", None)
    info["best_iteration"] = to_builtin(getattr(est, "best_iteration", fallback_iter))
    info["best_score"] = to_builtin(getattr(est, "best_score", None))
    return info

def _threshold_policy_for_k(K):
    """Describe, for the meta file, which Cell 9 threshold policy applies to ``K``.

    Precedence: per-K override, then the "balanced" policy (value taken from
    ``eval_by_k[K]["Balanced.t"]``), then the default threshold.
    """
    has_override = isinstance(OPER_T_OVERRIDE, dict) and (K in OPER_T_OVERRIDE)
    if has_override:
        policy, value = "override", OPER_T_OVERRIDE[K]
    elif OPER_POLICY == "balanced":
        policy, value = "balanced", eval_by_k[K]["Balanced.t"]
    else:
        policy, value = "default", T_OPER_DEFAULT
    return {"policy": policy, "value": float(value)}

def _prepare_for_dump(model):
    """Replace a callable ``eval_metric`` with the string "logloss" before dumping.

    A callable ``eval_metric`` can make the model unpicklable, so it is swapped
    for a string metric (chosen as a safe option for binary:logistic). For a
    pipeline the estimator in the ``"clf"`` step is inspected; otherwise the
    model itself is.

    The model is modified in place and returned. This is best-effort: if the
    replacement fails, a warning is printed and the model is returned as-is so
    the caller can still attempt to save it.
    """
    try:
        if hasattr(model, "named_steps") and "clf" in model.named_steps:
            est = model.named_steps["clf"]
            if callable(getattr(est, "eval_metric", None)):
                model.set_params(clf__eval_metric="logloss")
        elif callable(getattr(model, "eval_metric", None)):
            model.set_params(eval_metric="logloss")
    except Exception as exc:
        # Keep going (saving is still attempted) but do not hide the failure.
        print("[warn] could not replace callable eval_metric before dump:", exc)
    return model

print("Saving models & meta...")
for K in K_LIST:
    # Final selected single model and its evaluation results for this K.
    model = trained_models_by_k[K]
    feat_names = eval_by_k[K]["features"]
    oper_t = float(eval_by_k[K]["Oper.t"])

    # Parameter snapshot (summary only): the second element of the best_by_k
    # entry when that entry is a tuple, otherwise an empty dict.
    best_entry = best_by_k.get(K)
    best_params_ = best_entry[1] if isinstance(best_entry, tuple) else {}
    best_params_clean = {k: to_builtin(v) for k, v in (best_params_ or {}).items()}

    # Record the oversampling config actually in effect (global + per-K override).
    eff_os_cfg = _merge_oversampling_cfg(OVERSAMPLING, OVERSAMPLING_BY_K.get(K, {}))
    oversampling_enabled = bool(eff_os_cfg.get("enable", False) and _imb_ok)

    # Strip a callable eval_metric before dumping so the model can be pickled.
    model_for_dump = _prepare_for_dump(model)

    # Persist the model.
    model_path = MODEL_PATH_K(K)
    dump(model_for_dump, model_path)

    # Threshold policy and selected-model details.
    th_policy = _threshold_policy_for_k(K)
    selected_info = _extract_selected_model_info(model)

    # A missing (or NaN) score is stored as null: json.dump would otherwise
    # emit the bare token NaN, which is not valid JSON for strict parsers.
    single_best_f1 = eval_by_k[K].get("single_best_f1_at_oper")
    if single_best_f1 is not None:
        single_best_f1 = float(single_best_f1)
        if np.isnan(single_best_f1):
            single_best_f1 = None

    # Build the meta JSON payload.
    meta = {
        "variant": (
            f"Multi-seed (prefix{K}); leakage-safe; MacroF1-earlystop(feval during train); "
            f"threshold=balanced_on_ensemble(mean)"
        ),
        "features": [str(c) for c in feat_names],
        "threshold": oper_t,
        "threshold_policy": th_policy,  # override/balanced/default + value
        "best_params": best_params_clean,  # summary (representative seed)
        "random_state": int(RANDOM_STATE),

        "preprocess": {
            "PREFIX_K": int(K),
            "TOPK_BIGRAMS": int(TOPK_BIGRAMS),
            "terminals": sorted(TERMINALS_FINAL),
            "note": "prefix-K 내 terminal 등장 세션 제외, terminal 전이 제외 bigram 사전",
        },

        "oversampling": {
            "enabled": oversampling_enabled,
            "method": eff_os_cfg.get("method"),
            "ratio": _jsonable_sampling_strategy(eff_os_cfg.get("ratio", "auto")),
            "k_neighbors": int(eff_os_cfg.get("k_neighbors", 5)),
        },

        "artifacts_csv": {
            "all": OUT_ALL,
            "prefix_filtered": OUT_PREFIXF_PATH(K),
        },

        "validation": {
            "Acc@Oper": float(eval_by_k[K]["Valid@Oper.Acc"]),
            "MacroF1@Oper": float(eval_by_k[K]["Valid@Oper.MacroF1"]),
            "Balanced_t": float(eval_by_k[K]["Balanced.t"]),
            "Balanced_Score": float(eval_by_k[K]["Balanced.Score"]),
            "single_best_f1_at_oper": single_best_f1,
        },

        # Training / ensemble information.
        "training": {
            "n_restarts": int(eval_by_k[K].get("n_restarts", 1)),
            "seeds": list(eval_by_k[K].get("seeds", [RANDOM_STATE])),
            "candidate_count": len(eval_by_k[K].get("seeds", [RANDOM_STATE])),
            "ensemble": {"method": "mean", "threshold_search_grid": 197},
            "feval": {"name": "macro_f1_balanced", "num_thresholds": int(globals().get("_FEVAL_NUM_THRESHOLDS", 61))},
            "early_stopping_rounds": int(EARLY_STOPPING_ROUNDS) if use_xgb else None,
            "xgb_epochs": int(XGB_EPOCHS) if use_xgb else None,
            "lr_max_iter": int(LR_MAX_ITER) if not use_xgb else None,
            "selected_model": selected_info,  # info on the single model that was saved
        },
    }

    meta_path = META_PATH_K(K)
    # Explicit encoding keeps the file independent of the platform default.
    # json.dump escapes non-ASCII by default, so the content stays ASCII-only.
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print(f" - Saved model: {model_path}")
    print(f" - Saved meta : {meta_path}")


## 11) 세션 이탈 추론 결과 값 시각화
- 검증한 모델을 평가한 결과 값을 그래프로 시각화합니다.

In [None]:
# === Cell 11. 시각화 (K별 최종 단일 모델 @운영 임계값) ===

import numpy as np  # <<< 추가: bar 차트용

# Class ids (in confusion-matrix order) and the tick text shown for each.
LABELS = [0, 1]
LABEL_NAMES = dict(zip(LABELS, ("0 = Drop", "1 = Complete")))

def _prob_single_for_k(K, X_va):
    """Return the positive-class probabilities of the final single model for ``K``.

    Args:
        K: key into ``trained_models_by_k`` / ``eval_by_k``.
        X_va: feature matrix to score.

    Returns:
        ``(p, model_str)`` where ``p`` is the probability vector for class 1
        and ``model_str`` is a display string: the estimator class name, plus
        ``(seed=...)`` when ``eval_by_k[K]`` records a valid
        ``selected_model_index`` into its ``seeds`` list.
    """
    model = trained_models_by_k[K]

    # Display name / seed: look inside the "clf" step when the model is a pipeline.
    est = model.named_steps["clf"] if hasattr(model, "named_steps") and "clf" in model.named_steps else model
    model_name = est.__class__.__name__
    sel_idx = eval_by_k[K].get("selected_model_index", None)
    seeds = eval_by_k[K].get("seeds", None)
    if sel_idx is not None and seeds and 0 <= sel_idx < len(seeds):
        model_str = f"{model_name} (seed={seeds[sel_idx]})"
    else:
        model_str = model_name

    # Probability prediction.
    if hasattr(model, "predict_proba"):
        p = model.predict_proba(X_va)[:, 1]
    else:
        # Fallback when only decision_function exists: squash margins with a
        # sigmoid. exp(-log(1 + exp(-z))) equals 1 / (1 + exp(-z)) but does not
        # overflow for large negative margins.
        z = np.asarray(model.decision_function(X_va), dtype=float)
        p = np.exp(-np.logaddexp(0.0, -z))
    return p, model_str

# Per-K metric accumulators; filled by the loop below.
ks, accs, f1s = [], [], []

for K in K_LIST:
    # Validation split and operating threshold for this K.
    X_tr, X_va, y_tr, y_va, _ = splits_by_k[K]
    t_oper = T_OPER_BY_K.get(K, eval_by_k[K]["Oper.t"])
    t_bal = eval_by_k[K].get("Balanced.t", None)
    # Mention Balanced.t in the title only when it differs from the operating threshold.
    t_label = f"@t={t_oper:.3f}"
    if t_bal is not None and not (abs(t_oper - t_bal) < 1e-12):
        t_label = f"@t={t_oper:.3f} (Balanced.t={t_bal:.3f})"

    # Score with the single final model and binarize at the operating threshold.
    p, model_str = _prob_single_for_k(K, X_va)
    yhat = (p >= t_oper).astype(int)

    # Confusion matrix, plus a row-normalized copy (rows with zero support stay 0).
    cm = confusion_matrix(y_va, yhat, labels=LABELS)
    row_sums = cm.sum(axis=1, keepdims=True).astype(float)
    cm_norm = np.divide(cm, row_sums, out=np.zeros_like(cm, dtype=float), where=row_sums != 0)

    acc = accuracy_score(y_va, yhat)
    macro_f1 = f1_score(y_va, yhat, average="macro")
    n_va = len(y_va)

    # Collect this K's metrics.
    ks.append(K)
    accs.append(acc)
    f1s.append(macro_f1)

    # Draw the confusion matrix: one tinted square per cell (green on the
    # diagonal, red off it) with the raw count and the row-normalized rate.
    fig, ax = plt.subplots(figsize=(6.2, 5.2))

    for i, j in ((r, c) for r in range(2) for c in range(2)):
        correct = (i == j)
        ax.add_patch(plt.Rectangle(
            (j - 0.5, i - 0.5), 1, 1,
            facecolor=("green" if correct else "red"),
            edgecolor="black", alpha=0.30
        ))
        ax.text(
            j, i,
            f"{cm[i, j]}\n({cm_norm[i, j]:.2f})",
            ha="center", va="center", fontsize=12, fontweight="bold"
        )

    ax.set_xticks([0, 1])
    ax.set_xticklabels([LABEL_NAMES[0], LABEL_NAMES[1]], rotation=15, ha="right")
    ax.set_yticks([0, 1])
    ax.set_yticklabels([LABEL_NAMES[0], LABEL_NAMES[1]])
    ax.set_xlim(-0.5, 1.5)
    ax.set_ylim(1.5, -0.5)  # inverted y-range so row 0 is drawn at the top
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"K={K} | {t_label} — Acc={acc:.4f}, Macro-F1={macro_f1:.4f}")

    legend_elems = [
        Patch(facecolor="green", edgecolor="black", label="Correct (TP/TN)"),
        Patch(facecolor="red", edgecolor="black", label="Incorrect (FP/FN)")
    ]
    ax.legend(handles=legend_elems, loc="upper right", frameon=True)

    plt.tight_layout()
    plt.show()

# === Grouped bar chart: ACC vs Macro-F1 per K ===
# The x-axis is K (e.g. 3, 5, 7); ACC bars are blue, Macro-F1 bars are green.
# Reorder the collected metrics by ascending K.
order = np.argsort(ks)
ks_sorted, accs_sorted, f1s_sorted = (
    [series[idx] for idx in order] for series in (ks, accs, f1s)
)

x = np.arange(len(ks_sorted))
width = 0.36

fig, ax = plt.subplots(figsize=(7.2, 4.6))
# Each group is centred on its tick: ACC to the left, Macro-F1 to the right.
bars_acc = ax.bar(x - width/2, accs_sorted, width, label="ACC", color="blue")
bars_f1 = ax.bar(x + width/2, f1s_sorted, width, label="Macro-F1", color="green")

ax.set_xticks(x)
ax.set_xticklabels(ks_sorted)
ax.set_xlabel("K")
ax.set_ylabel("Score")
ax.set_title("Score (ACC & Macro-F1)")
ax.set_ylim(0, 1.0)
ax.grid(axis="y", linestyle="--", alpha=0.35)
ax.legend(loc="best")

# Print the numeric value above every bar.
for bars in (bars_acc, bars_f1):
    ax.bar_label(bars, fmt="%.4f", padding=3)

plt.tight_layout()
plt.show()


## 12) 추론 테스트
- 검증된 모델에 샘플 데이터를 입력하여 추론 테스트를 진행합니다.

In [None]:
# === Inference Cell (K=7): paste TSV → load model → align features → predict ===
import os, json, io, numpy as np, pandas as pd
from joblib import load
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ---- Config ----
# Prefix length whose saved artifacts are used in this cell.
K = 7
MODEL_DIR = "models/sessionDrop"
# Both artifacts live in MODEL_DIR and share the same per-K file stem.
MODEL_PATH, META_PATH = (
    os.path.join(MODEL_DIR, file_name)
    for file_name in (
        f"sessionDrop_model.k{K}.joblib",
        f"sessionDrop_model.k{K}.meta.json",
    )
)

# ---- Helpers ----
def _sigmoid(z):
    """Return the logistic sigmoid of ``z`` element-wise.

    Computed as ``exp(-log(1 + exp(-z)))`` via ``np.logaddexp``. This is
    mathematically ``1 / (1 + exp(-z))`` but does not overflow when ``z`` is a
    large negative number.
    """
    z = np.asarray(z, dtype=float)
    return np.exp(-np.logaddexp(0.0, -z))

def _load_threshold(default=0.5):
    """Resolve the operating threshold as a float.

    Sources are tried in order:
      1. the ``"threshold"`` entry of the meta JSON at ``META_PATH``
         (``default`` when the key is absent),
      2. ``eval_by_k[K]["Oper.t"]``, which may still be in notebook memory,
      3. ``default``.
    Any failure in one source silently falls through to the next.
    """
    try:
        with open(META_PATH, "r") as fh:
            stored = json.load(fh).get("threshold", default)
        return float(stored)
    except Exception:
        try:
            return float(eval_by_k[K]["Oper.t"])
        except Exception:
            return float(default)

def _load_feature_list():
    """Recover the training-time feature order (bigram columns included).

    Reads the ``"features"`` list from the meta JSON at ``META_PATH`` and
    returns its entries as strings when it is a non-empty list. Otherwise, or
    on any error, falls back to ``eval_by_k[K]["features"]`` from notebook
    memory. Returns ``None`` when neither source is available.
    """
    try:
        with open(META_PATH, "r") as fh:
            feats = json.load(fh).get("features", None)
        if isinstance(feats, list) and feats:
            return [str(name) for name in feats]
    except Exception:
        pass

    # In-memory fallback.
    try:
        return list(eval_by_k[K]["features"])
    except Exception:
        return None

def _ensure_numeric(df):
    """Return a copy of ``df`` with object columns coerced to numbers.

    Object-dtype columns go through ``pd.to_numeric(errors="coerce")``, so
    unparseable entries become NaN. Every NaN in the result is then replaced
    by 0.0. The input frame is not modified.
    """
    converted = df.copy()
    object_cols = [name for name in converted.columns if converted[name].dtype == "O"]
    for name in object_cols:
        converted[name] = pd.to_numeric(converted[name], errors="coerce")
    return converted.fillna(0.0)

# ---- Paste your TSV here (session_id \t <features...> [\t label]) ----
RAW_TSV = """0012eec7-d6c0-4eff-b4c4-727e03e46b0c	7.0	0.0	7000000000.0	0.0	0.0	0.0	7.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	17.0	0.0	-1.0	-1.0	0.0	0.0	0.0	0.0	0.0	2.0	1.0	0.0	0.0	2.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	3.0	0.9556998911095345	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
002071d2-83a3-4cc1-ac33-b6a98353cd13	7.0	0.0	7000000000.0	0.0	0.0	0.0	7.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	-1.0	-1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	5.0	1.4750763110496952	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1
a6d0ab18-6102-42d5-ad36-5c02efec6caf	7.0	60.0	7.0	8.571428571428571	20.99562636671296	41.99999999999996	7.0	1.0	0.0	1.0	0.14285714285714285	0.0	0.0	0.0	2.0	3.0	0.0	60.0	-1.0	1.0	0.0	0.0	1.0	0.0	2.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	4.0	1.1537419426970903	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	1.0	0.0	0.0	0
010f7eb5-8c35-4e69-bd9d-c0dacef95068	7.0	0.0	7000000000.0	0.0	0.0	0.0	7.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	16.0	0.0	-1.0	-1.0	0.0	0.0	0.0	1.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	1.0	2.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	3.0	1.0042424730510766	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1
c8d03998-8f4e-4c89-85c9-da22f5bc0e64	7.0	0.0	7000000000.0	0.0	0.0	0.0	7.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	0.0	-1.0	-1.0	0.0	0.0	0.0	2.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	5.0	1.5498260458732018	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0
c5b8c2db-e7c8-436e-96fd-da273dd148b7	7.0	0.0	7000000000.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	17.0	0.0	-1.0	-1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2.0	0.41011631828640904	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
51298911-140e-4357-8b66-f9acf0217700	7.0	0.0	7000000000.0	0.0	0.0	0.0	4.0	0.5	0.0	0.0	0.0	0.0	0.0	0.0	1.0	6.0	0.0	-1.0	-1.0	0.0	0.0	0.0	2.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	3.0	1.0789922078745835	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0
504045be-f550-4bd7-bae4-0b736fc61dfb	7.0	0.0	7000000000.0	0.0	0.0	0.0	7.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	16.0	0.0	-1.0	-1.0	0.0	0.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	5.0	1.5498260458732018	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1
5032cacd-9fc5-4bc5-90b1-9ac4e6bd3416	7.0	0.0	7000000000.0	0.0	0.0	0.0	7.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	-1.0	-1.0	0.0	0.0	0.0	1.0	1.0	0.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	6.0	1.7478680974607577	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1
fffe1852-ab15-4b57-8aa3-cd0fb92c3ccc	7.0	0.0	7000000000.0	0.0	0.0	0.0	7.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	17.0	0.0	-1.0	-1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	1.0	1.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	4.0	1.1537419426970903	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1
"""

# ---- Read pasted data (whitespace-separated tolerant) ----
df_raw = pd.read_csv(io.StringIO(RAW_TSV.strip()), sep=r"\s+", header=None, engine="python")

# ---- Load feature order & model ----
feature_list = _load_feature_list()
if feature_list is None:
    raise RuntimeError("학습 시 feature 목록(features)을 찾지 못했습니다. Cell 10에서 생성된 meta JSON이 있는지 확인하세요.")

expected_no_label = 1 + len(feature_list)     # session_id + features
expected_with_label = expected_no_label + 1   # + label

if df_raw.shape[1] not in (expected_no_label, expected_with_label):
    raise RuntimeError(
        f"열 수가 예상과 다릅니다. got={df_raw.shape[1]} expected={expected_no_label} or {expected_with_label} "
        f"(features={len(feature_list)})"
    )

# Name the columns positionally: session_id, the features in training order,
# and a trailing label column when one extra column is present.
columns = ["session_id"] + feature_list
has_label = (df_raw.shape[1] == expected_with_label)
if has_label:
    columns = columns + ["label"]
df_raw.columns = columns

# Keep the reference label if it exists; it is not part of X.
y_ref = None
if has_label:
    y_ref = pd.to_numeric(df_raw["label"], errors="coerce").fillna(0).astype(int)
X = _ensure_numeric(df_raw[feature_list])

# Load the model: saved file first, then the in-memory model from this kernel.
loaded_from = None
try:
    model = load(MODEL_PATH)
    loaded_from = f"[file] {MODEL_PATH}"
except Exception:
    # fallback to memory (if you already trained in this kernel)
    try:
        model = trained_models_by_k[K]
        loaded_from = "[memory] trained_models_by_k"
    except Exception as e:
        raise RuntimeError(f"모델을 찾을 수 없습니다. '{MODEL_PATH}'가 존재하거나, 먼저 학습 셀(Cell 8~10)을 실행해 주세요. err={e}")

t_oper = _load_threshold(default=0.5)

print(f"[info] Using K={K} model from {loaded_from}")
print(f"[info] threshold (Oper.t) = {t_oper:.3f}")
print(f"[info] samples = {len(X)}, features = {X.shape[1]}")

# ---- Predict ----
if hasattr(model, "predict_proba"):
    proba = model.predict_proba(X)[:, 1]
else:
    z = model.decision_function(X)
    proba = _sigmoid(z)
pred = (proba >= t_oper).astype(int)

# ---- Display with rows + clear separators ----
from IPython.display import display, HTML

if y_ref is not None:
    truth_vals = y_ref.values
    mark = np.where(pred == truth_vals, "✓", "✗")
else:
    truth_vals = ["-"] * len(pred)
    mark = [""] * len(pred)

headers = ["case", "truth", "pred", "✓/✗", "note"]
# Cast the id column once instead of rebuilding it for every row.
session_ids = df_raw["session_id"].astype(str).tolist()
rows = [
    [sid, int(truth) if truth in (0, 1) else truth, int(guess), flag, ""]
    for sid, truth, guess, flag in zip(session_ids, truth_vals, pred, mark)
]

df_view = pd.DataFrame(rows, columns=headers)

display(HTML("<hr style='margin:6px 0;'>"))
# Show the configured K rather than a hard-coded value.
display(HTML(f"<h4 style='margin:4px 0;'>Inference Result (K={K})</h4>"))
display(df_view)

if y_ref is not None:
    acc = accuracy_score(truth_vals, pred)
    f1m = f1_score(truth_vals, pred, average="macro")
    display(HTML(
        f"<div><b>[Done]</b> 정확도(샘플 {len(truth_vals)}개): {acc:.3f}"
        f" | Macro-F1: {f1m:.3f}</div>"
    ))

