# CatBoost Hyperparameter Tuning (Optuna)

이 노트북은 전처리된 데이터를 기반으로 CatBoost 모델의 하이퍼파라미터를 튜닝하고 최적의 모델을 저장합니다.

**목표**:
- Optuna를 사용한 하이퍼파라미터 최적화
- 최적화 지표: **F1-Score** (User Request)
- Best Model 저장 및 메타데이터 기록

In [10]:
import numpy as np
import pandas as pd
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import json
import os

pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', 100)

## 1. Load Data

In [11]:
# 데이터 로드
train_path = '../data/processed/kkbox_train_feature_v4.parquet'
df = pd.read_parquet(train_path)

print(df.shape)

(860966, 92)


In [12]:
RANDOM_STATE = 719
ID_COL = "msno"
TARGET_COL = "is_churn"

# 범주형 및 수치형 컬럼 정의 (이전 노트북 참조)
CATEGORICAL_COLS = [
    "city", "gender", "registered_via", "last_payment_method",
    "has_ever_paid", "has_ever_cancelled",
    "is_auto_renew_last",
    "is_free_user",
]

NUMERICAL_COLS = [
    "reg_days",
    # w7
    "num_days_active_w7", "total_secs_w7", "avg_secs_per_day_w7", "std_secs_w7",
    "num_songs_w7", "avg_songs_per_day_w7", "num_unq_w7", "num_25_w7", "num_100_w7",
    "short_play_w7", "skip_ratio_w7", "completion_ratio_w7", "short_play_ratio_w7", "variety_ratio_w7",
    # w14
    "num_days_active_w14", "total_secs_w14", "avg_secs_per_day_w14", "std_secs_w14",
    "num_songs_w14", "avg_songs_per_day_w14", "num_unq_w14", "num_25_w14", "num_100_w14",
    "short_play_w14", "skip_ratio_w14", "completion_ratio_w14", "short_play_ratio_w14", "variety_ratio_w14",
    # w21
    "num_days_active_w21", "total_secs_w21", "avg_secs_per_day_w21", "std_secs_w21",
    "num_songs_w21", "avg_songs_per_day_w21", "num_unq_w21", "num_25_w21", "num_100_w21",
    "short_play_w21", "skip_ratio_w21", "completion_ratio_w21", "short_play_ratio_w21", "variety_ratio_w21",
    # w30 (Excluded)
    "num_days_active_w30", "total_secs_w30", "avg_secs_per_day_w30", "std_secs_w30",
    "num_songs_w30", "avg_songs_per_day_w30", "num_unq_w30", "num_25_w30", "num_100_w30",
    "short_play_w30", "skip_ratio_w30", "completion_ratio_w30", "short_play_ratio_w30", "variety_ratio_w30",
    # trends
    "days_trend_w7_w14",
    "secs_trend_w7_w30", "secs_trend_w14_w30",
    "days_trend_w7_w30",
    "songs_trend_w7_w30", "songs_trend_w14_w30",
    "skip_trend_w7_w30", "completion_trend_w7_w30",
    # transactions
    "days_since_last_payment", "days_since_last_cancel", "last_plan_days",
    "total_payment_count", "total_amount_paid", "avg_amount_per_payment",
    "unique_plan_count", "subscription_months_est",
    "payment_count_last_30d", "payment_count_last_90d",
    # V4 New Derived Features
    "active_decay_rate",
    "listening_time_velocity",
    "discovery_index",
    "skip_passion_index",
    "last_active_gap"
]

# 실제 데이터프레임에 존재하는 컬럼만 필터링
cat_cols = [c for c in CATEGORICAL_COLS if c in df.columns]
num_cols = [c for c in NUMERICAL_COLS if c in df.columns]
FEATURE_COLS = cat_cols + num_cols

print(f"Selected Features: {len(FEATURE_COLS)}")
print(f"Categorical: {len(cat_cols)}")
print(f"Numerical: {len(num_cols)}")

Selected Features: 88
Categorical: 8
Numerical: 80


## 2. Preprocessing & Split

In [13]:
# CatBoost 처리를 위해 범주형 변수를 문자열로 변환
for col in cat_cols:
    df[col] = df[col].astype(str).astype("category")

X = df[FEATURE_COLS]
y = df[TARGET_COL].astype(int)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

print(f"Train shape: {X_train.shape}")
print(f"Valid shape: {X_valid.shape}")

Train shape: (688772, 88)
Valid shape: (172194, 88)


## 3. Optuna Hyperparameter Tuning

In [14]:
def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 1000, 3000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10, log=True),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "auto_class_weights": "Balanced",  # 불균형 데이터 처리
        "loss_function": "Logloss",
        "eval_metric": "F1",  # Optuna 내부 평가용 (CatBoost metric)
        "random_seed": RANDOM_STATE,
        "verbose": False,
        "early_stopping_rounds": 100,
    }

    model = CatBoostClassifier(**params)
    
    model.fit(
        X_train, y_train,
        cat_features=cat_cols,
        eval_set=(X_valid, y_valid),
        verbose=False
    )
    
    # F1 Score 계산 (Threshold 0.5 기준, 필요 시 튜닝 가능하지만 일단 0.5 고정)
    y_pred = model.predict(X_valid)
    score = f1_score(y_valid, y_pred)
    
    return score


In [15]:
# Optuna Study 실행
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)  # 시간 관계상 20회 시도 (필요 시 늘릴 것)

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-12-21 06:22:55,045] A new study created in memory with name: no-name-d16bd586-83ea-475a-8c1a-5c0cdd836d17
[I 2025-12-21 06:25:09,302] Trial 0 finished with value: 0.7983366895938488 and parameters: {'iterations': 2114, 'learning_rate': 0.013856872052279152, 'depth': 5, 'l2_leaf_reg': 2.6759351522412134, 'border_count': 192, 'random_strength': 3.191775654000403e-06, 'bagging_temperature': 0.9707390406500667}. Best is trial 0 with value: 0.7983366895938488.
[I 2025-12-21 06:27:00,992] Trial 1 finished with value: 0.7909036753088659 and parameters: {'iterations': 2357, 'learning_rate': 0.012709389225120807, 'depth': 4, 'l2_leaf_reg': 4.304304094310822, 'border_count': 160, 'random_strength': 2.814513498196936e-06, 'bagging_temperature': 0.03435881846082345}. Best is trial 0 with value: 0.7983366895938488.
[I 2025-12-21 06:27:49,929] Trial 2 finished with value: 0.8023332194750257 and parameters: {'iterations': 1627, 'learning_rate': 0.09317293864262122, 'depth': 5, 'l2_leaf_reg': 

Best trial:
  Value: 0.8024525670377095
  Params: 
    iterations: 2828
    learning_rate: 0.0050026515194664485
    depth: 8
    l2_leaf_reg: 1.0188436521315694
    border_count: 134
    random_strength: 0.0045044770810892835
    bagging_temperature: 0.45756810496988165


## 4. Train Best Model & Evaluation

In [16]:
best_params = study.best_params.copy()

# 고정 파라미터 추가
best_params.update({
    "loss_function": "Logloss",
    "eval_metric": "F1",
    "auto_class_weights": "Balanced",
    "random_seed": RANDOM_STATE,
    "verbose": 100,
    "early_stopping_rounds": 100,
})

final_model = CatBoostClassifier(**best_params)

final_model.fit(
    X_train, y_train,
    cat_features=cat_cols,
    eval_set=(X_valid, y_valid)
)

0:	learn: 0.9289076	test: 0.9271706	best: 0.9271706 (0)	total: 212ms	remaining: 9m 59s
100:	learn: 0.9317958	test: 0.9305034	best: 0.9305589 (36)	total: 17.2s	remaining: 7m 45s
200:	learn: 0.9337117	test: 0.9334254	best: 0.9334285 (199)	total: 34.5s	remaining: 7m 30s
300:	learn: 0.9347919	test: 0.9345586	best: 0.9346075 (282)	total: 50.7s	remaining: 7m 5s
400:	learn: 0.9359643	test: 0.9358166	best: 0.9358585 (398)	total: 1m 6s	remaining: 6m 45s
500:	learn: 0.9422506	test: 0.9424087	best: 0.9424378 (498)	total: 1m 22s	remaining: 6m 25s
600:	learn: 0.9448261	test: 0.9452615	best: 0.9452707 (598)	total: 1m 40s	remaining: 6m 13s
700:	learn: 0.9451794	test: 0.9457115	best: 0.9457238 (698)	total: 1m 57s	remaining: 5m 56s
800:	learn: 0.9455066	test: 0.9460654	best: 0.9461074 (757)	total: 2m 14s	remaining: 5m 40s
900:	learn: 0.9459672	test: 0.9465675	best: 0.9465737 (898)	total: 2m 31s	remaining: 5m 24s
1000:	learn: 0.9462652	test: 0.9464061	best: 0.9466544 (913)	total: 2m 48s	remaining: 5m 6s

<catboost.core.CatBoostClassifier at 0x148473da0>

In [17]:
y_pred = final_model.predict(X_valid)
y_prob = final_model.predict_proba(X_valid)[:, 1]

print("Final F1 Score:", f1_score(y_valid, y_pred))
print("\nClassification Report:")
print(classification_report(y_valid, y_pred))

cm = confusion_matrix(y_valid, y_pred)
print("Confusion Matrix:\n", cm)

Final F1 Score: 0.8024525670377095

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98    155904
           1       0.70      0.94      0.80     16290

    accuracy                           0.96    172194
   macro avg       0.85      0.95      0.89    172194
weighted avg       0.97      0.96      0.96    172194

Confusion Matrix:
 [[149440   6464]
 [  1043  15247]]


## 5. Save Model & Metadata

In [18]:
MODEL_DIR = "../03_trained_model"
os.makedirs(MODEL_DIR, exist_ok=True)

model_path = os.path.join(MODEL_DIR, "catboost_model.cbm")
final_model.save_model(model_path)
print(f"Model saved to {model_path}")

# 메타데이터 저장
metadata = {
    "best_params": best_params,
    "best_f1_score": float(study.best_value),
    "feature_names": FEATURE_COLS,
    "categorical_features": cat_cols,
    "numerical_features": num_cols
}

metadata_path = os.path.join(MODEL_DIR, "model_metadata.json")
with open(metadata_path, "w") as f:
    json.dump(metadata, f, indent=4)
print(f"Metadata saved to {metadata_path}")

Model saved to ../03_trained_model/catboost_model.cbm
Metadata saved to ../03_trained_model/model_metadata.json


In [19]:
# Feature Importance 저장
fi_df = pd.DataFrame({
    "feature": X_train.columns,
    "importance": final_model.get_feature_importance()
}).sort_values(by="importance", ascending=False)

fi_path = os.path.join(MODEL_DIR, "feature_importance.csv")
fi_df.to_csv(fi_path, index=False)
print(f"Feature importance saved to {fi_path}")

Feature importance saved to ../03_trained_model/feature_importance.csv
