# imports + seed


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from catboost import CatBoostClassifier
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report,
)

# One seed reused by NumPy's global RNG and by the seeded calls further down.
RANDOM_STATE = 719
np.random.seed(RANDOM_STATE)

# Show every column and every row when pandas renders a frame.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)


In [None]:
# Load the input table from a parquet file (path is relative to the working directory).
df = pd.read_parquet('kkbox_train_feature_v1.parquet')


In [None]:
# Preview the first five rows.
df.head(5)


In [None]:
# 1) "bd" duplicates "bd_clean": drop the raw column when both are present.
if {"bd", "bd_clean"}.issubset(df.columns):
    df = df.drop(columns=["bd"])

# 2) Turn the registration timestamp into a numeric year (nullable integer).
_registration_time = df["registration_init_time"]
df["reg_year"] = _registration_time.dt.year.astype("Int64")

# Remove the original time columns (prevents Period/Datetime errors downstream).
df = df.drop(columns=["registration_init_time", "registration_month"])


# train/valid/test split (stratify 유지)


In [None]:
# Both the identifier and the label column are required for the split below.
assert {"msno", "is_churn"}.issubset(df.columns)

# Step 1: hold out 15% of all rows as the test set, stratified on the label.
trainval_df, test_df = train_test_split(
    df, test_size=0.15, random_state=RANDOM_STATE, stratify=df["is_churn"]
)

# Step 2: 0.15 / 0.85 of the remainder is 15% of the full data, which gives
# a 70 / 15 / 15 train / valid / test split overall.
valid_size = 0.15 / 0.85
train_df, valid_df = train_test_split(
    trainval_df,
    test_size=valid_size,
    random_state=RANDOM_STATE,
    stratify=trainval_df["is_churn"],
)

# Every column except the id and the target is used as a feature.
_non_feature_cols = {"msno", "is_churn"}
feature_cols = [col for col in df.columns if col not in _non_feature_cols]

# Build the (features, integer label) pair for each split in one pass.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = [
    (part[feature_cols], part["is_churn"].astype(int))
    for part in (train_df, valid_df, test_df)
]

print("churn rate:", *(labels.mean() for labels in (y_train, y_valid, y_test)))
print(*(features.shape for features in (X_train, X_valid, X_test)))


# column split + preprocess (OHE)


In [None]:
from pandas.api.types import is_numeric_dtype

# Partition the feature columns by dtype: numeric vs. everything else.
num_cols, cat_cols = [], []
for _col in X_train.columns:
    (num_cols if is_numeric_dtype(X_train[_col]) else cat_cols).append(_col)

print("num:", len(num_cols), "cat:", len(cat_cols))
print("cat example:", cat_cols[:10])

# Numeric branch: fill missing values with the median.
_num_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (sparse output; unknown categories are ignored).
_cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
])

# Columns not listed in either branch are dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", _num_pipeline, num_cols),
        ("cat", _cat_pipeline, cat_cols),
    ],
    remainder="drop",
)


# 공통 평가 함수


In [None]:
def eval_binary(y_true, p_pred, prefix="", thr=0.5):
    """Print and return binary-classification metrics for predicted scores.

    Args:
        y_true: True binary labels.
        p_pred: Predicted positive-class scores; any array-like is accepted
            (list, tuple, ndarray, Series).
        prefix: Text prepended to every printed line.
        thr: Decision threshold; scores ``>= thr`` are predicted as class 1.

    Returns:
        Tuple ``(roc, ap, cm)``: the ROC-AUC, the average precision, and the
        confusion matrix computed at ``thr``.
    """
    # Coerce once so the element-wise comparison below also works for plain
    # Python sequences, which do not support ``>=`` against a float.
    scores = np.asarray(p_pred)

    # Threshold-free ranking metrics.
    roc = roc_auc_score(y_true, scores)
    ap  = average_precision_score(y_true, scores)

    # Threshold-dependent metrics on the hard predictions.
    y_hat = (scores >= thr).astype(int)
    cm = confusion_matrix(y_true, y_hat)
    cr = classification_report(y_true, y_hat, digits=4)

    print(f"{prefix}ROC-AUC: {roc:.6f} | PR-AUC(AP): {ap:.6f} | thr={thr}")
    print(f"{prefix}Confusion matrix:\n{cm}")
    print(f"{prefix}Classification report:\n{cr}")
    return roc, ap, cm


# CatBoost 학습

- loss_function: "Logloss"
- eval_metric: "AUC"
- scale_pos_weight로 불균형 대응 (Recall 최적화)
- early_stopping_rounds=50 적용
- thread_count=-1로 풀코어


In [None]:
# Positive-class weight = (# negatives) / (# positives) in the training labels
# (class-imbalance handling aimed at recall).
_neg_count = (y_train == 0).sum()
_pos_count = (y_train == 1).sum()
scale_pos_weight = _neg_count / _pos_count
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

# Fit the preprocessor on the training split only, then apply it to the
# validation and test splits.
X_train_prep = preprocess.fit_transform(X_train)
X_valid_prep, X_test_prep = (
    preprocess.transform(part) for part in (X_valid, X_test)
)

print(f"Preprocessed shapes: {X_train_prep.shape}, {X_valid_prep.shape}, {X_test_prep.shape}")


In [None]:
# Hyper-parameters collected in one mapping so they can be inspected or logged.
_cb_params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3.0,
    iterations=500,
    early_stopping_rounds=50,
    scale_pos_weight=float(scale_pos_weight),  # imbalance weight computed earlier
    random_seed=RANDOM_STATE,
    thread_count=-1,  # use all cores
    verbose=100,
)
cb_model = CatBoostClassifier(**_cb_params)

# The validation split is the eval set; the best iteration on it is kept.
cb_model.fit(
    X_train_prep,
    y_train,
    eval_set=(X_valid_prep, y_valid),
    use_best_model=True,
)

print(f"\nBest iteration: {cb_model.best_iteration_}")


In [None]:
# Predict positive-class probabilities for both held-out splits, then evaluate.
p_valid, p_test = (
    cb_model.predict_proba(matrix)[:, 1]
    for matrix in (X_valid_prep, X_test_prep)
)

_rule = "=" * 60
print(_rule, "Threshold = 0.5", _rule, sep="\n")
eval_binary(y_valid, p_valid, prefix="[CB valid] ")
eval_binary(y_test,  p_test,  prefix="[CB test ] ")


# Threshold 최적화 (Recall 우선)


In [None]:
from sklearn.metrics import precision_recall_curve, recall_score, precision_score, f1_score

def find_optimal_threshold_for_recall(y_true, y_prob, target_recall=0.95):
    """Return the threshold that reaches the target recall with the best precision.

    Among the precision-recall curve points whose recall is at least
    ``target_recall``, the one with the highest precision is selected. If no
    point reaches the target, the point with the highest recall is used.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_prob)

    # Curve points that satisfy the recall requirement.
    candidates = np.flatnonzero(recalls >= target_recall)
    if candidates.size:
        chosen = candidates[precisions[candidates].argmax()]
    else:
        chosen = recalls.argmax()

    # The chosen curve index may lie past the end of ``thresholds``; clamp it
    # to the last available threshold.
    chosen = min(chosen, len(thresholds) - 1)
    return thresholds[chosen]

# Target: recall of at least 95%; the threshold is picked on the validation
# split and then applied unchanged to the test split.
optimal_thr = find_optimal_threshold_for_recall(y_valid, p_valid, target_recall=0.95)
print(f"Optimal threshold for Recall >= 95%: {optimal_thr:.4f}")

print("\n" + "=" * 60)
# Four decimals, matching the line above: with two decimals a small threshold
# can be displayed as 0.00 and no longer identify the value actually applied.
print(f"Threshold = {optimal_thr:.4f} (Recall 최적화)")
print("=" * 60)
eval_binary(y_valid, p_valid, prefix="[CB valid] ", thr=optimal_thr)
eval_binary(y_test,  p_test,  prefix="[CB test ] ", thr=optimal_thr)


# Feature importance


In [None]:
# Output column names of the fitted preprocessor, paired with the model's
# feature importances.
feature_names = preprocess.get_feature_names_out()
importances = cb_model.get_feature_importance()

imp_df = pd.DataFrame({"feature": feature_names, "importance": importances})
# Most important features first.
imp_df = imp_df.sort_values("importance", ascending=False)

imp_df.head(30)


In [None]:
from pathlib import Path

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (same approach as the model-saving cell).
_imp_csv_path = Path("../data/model_df/cb_feature_importance_split_preprocessed.csv")
_imp_csv_path.parent.mkdir(parents=True, exist_ok=True)

imp_df.to_csv(
    _imp_csv_path,
    index=False
)


# Permutation importance


In [None]:
import os

# Set the folder joblib should use for temporary files (Windows path).
os.environ.update({"JOBLIB_TEMP_FOLDER": r"C:\joblib_tmp"})


In [None]:
from scipy import sparse
from sklearn.inspection import permutation_importance

# Permutation importance is slow, so score at most `sample_n` validation rows.
sample_n = 30000
if X_valid_prep.shape[0] > sample_n:
    # Dedicated seeded generator: the same rows are drawn every time this cell
    # runs, regardless of how much of NumPy's global random state was consumed.
    _rng = np.random.default_rng(RANDOM_STATE)
    idx = _rng.choice(X_valid_prep.shape[0], size=sample_n, replace=False)
    Xv = X_valid_prep[idx]
    yv = y_valid.iloc[idx]
else:
    Xv, yv = X_valid_prep, y_valid

# The preprocessor can return a SciPy sparse matrix (the one-hot encoder uses
# sparse_output=True), but permutation_importance requires dense input.
# Densify only the (sub-sampled) evaluation matrix.
if sparse.issparse(Xv):
    Xv = Xv.toarray()

# Importance = drop in ROC-AUC when a single column is shuffled, averaged
# over `n_repeats` shuffles.
perm = permutation_importance(
    cb_model,
    Xv, yv,
    n_repeats=5,
    random_state=RANDOM_STATE,
    scoring="roc_auc",
    n_jobs=1
)


In [None]:
# One row per preprocessed feature: mean and standard deviation of the
# permutation importance across repeats.
_perm_columns = {
    "feature": feature_names,
    "perm_importance_mean": perm.importances_mean,
    "perm_importance_std": perm.importances_std,
}
perm_df = pd.DataFrame(_perm_columns)
# Largest mean importance first.
perm_df = perm_df.sort_values("perm_importance_mean", ascending=False)

perm_df.head(30)


In [None]:
from pathlib import Path

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
_perm_csv_path = Path("../data/model_df/cb_perm_importance.csv")
_perm_csv_path.parent.mkdir(parents=True, exist_ok=True)

perm_df.to_csv(_perm_csv_path, index=False)


# 모델 저장


In [None]:
from pathlib import Path
import joblib

# Directory that receives the saved artifacts (created if missing).
out_dir = Path(r"C:\artifacts")
out_dir.mkdir(parents=True, exist_ok=True)

# Artifact locations inside the output directory.
cb_model_path = out_dir.joinpath("catboost_model.cbm")
prep_path = out_dir.joinpath("catboost_preprocessor.joblib")

# CatBoost model, stored in the .cbm format.
cb_model.save_model(str(cb_model_path))
print(f"CatBoost model saved: {cb_model_path}")

# Fitted preprocessing pipeline, stored as a compressed joblib file.
joblib.dump(preprocess, prep_path, compress=3)
print(f"Preprocessor saved: {prep_path}")


In [None]:
# 로드 예시
# from catboost import CatBoostClassifier
# import joblib
# 
# cb_model = CatBoostClassifier()
# cb_model.load_model(r"C:\artifacts\catboost_model.cbm")
# preprocess = joblib.load(r"C:\artifacts\catboost_preprocessor.joblib")
# 
# X_new_prep = preprocess.transform(X_new)
# p = cb_model.predict_proba(X_new_prep)[:, 1]
