## 1. 환경 설정 및 라이브러리 Import

In [1]:
!pip install optuna catboost

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 400.9/400.9 kB 21.1 MB/s eta 0:00:00
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.2/99.2 MB 23.8 MB/s eta 0:00:00
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna, catboost
Successfully installed catboost-1.2.8 colorlog-6.9.0 optuna-4.5.0


In [2]:
# Mount Google Drive (Colab only) so that the /content/drive/... paths used
# by the cells below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 데이터 불러오기
- 원본 데이터가 아닌 사전에 전처리해둔 데이터셋 로드

In [7]:
# -*- coding: utf-8 -*-
# =========================================================
# CTR 예측 — 통합 피처엔지니어링 + Optuna CatBoost K-Fold
# (심야=자정~05시, 누수방지 KFold Target Encoding 포함)
# =========================================================

# (선택) Colab Drive mount


# 기본
import os, gc, random, warnings, pickle, itertools
warnings.filterwarnings('ignore')
from datetime import datetime

# 데이터/수학
import numpy as np
import pandas as pd

# 모델/검증/메트릭
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import average_precision_score
from catboost import CatBoostClassifier
import optuna

# ----------------------------------
# 설정
# ----------------------------------
# Global experiment settings read by the cells below.
CFG = {
    'SEED': 42,                         # base random seed
    'N_FOLDS': 5,                       # number of stratified CV folds
    'CATBOOST_ITERATIONS': 3000,        # boosting-round cap
    'CATBOOST_EARLY_STOPPING': 100,     # early-stopping patience (rounds)
    'N_TRIALS': 40,                     # number of Optuna trials (raise if needed)
    'DOWNSAMPLE_RATIO': 2,              # negatives per clicked=1 row (2 => 1:2)
}

# Seed the hash seed env var, NumPy and the stdlib RNG from the same value.
os.environ['PYTHONHASHSEED'] = str(CFG['SEED'])
np.random.seed(CFG['SEED'])
random.seed(CFG['SEED'])

# Column names: the binary label and the sequence column.
target_col, seq_col = "clicked", "seq"

# Competition files on the mounted Drive.
TRAIN_PATH, TEST_PATH, SAMPLE_SUB = (
    "/content/drive/MyDrive/open/train.parquet",
    "/content/drive/MyDrive/open/test.parquet",
    "/content/drive/MyDrive/open/sample_submission.csv",
)

# Directory holding the prebuilt feature pickles; created when absent.
FEATURE_SAVE_DIR = "/content/drive/Othercomputers/내 Mac/Python/데이콘/현재 진행중/토스/outputs_test/features_postTE"
os.makedirs(FEATURE_SAVE_DIR, exist_ok=True)

# Full paths of the train / test feature pickles inside that directory.
train_features_pkl, test_features_pkl = (
    os.path.join(FEATURE_SAVE_DIR, pkl_name)
    for pkl_name in ("train_features_2.pkl", "test_features_2.pkl")
)

# # =========================================================
# # 1) 데이터 로드 & 다운샘플 (불균형 완화)
# # =========================================================
# all_train = pd.read_parquet(TRAIN_PATH, engine="pyarrow")
# test_base = pd.read_parquet(TEST_PATH, engine="pyarrow").drop(columns=['ID'])

# print("Base Train:", all_train.shape, " Test:", test_base.shape)

# clicked_1 = all_train[all_train[target_col] == 1]
# clicked_0 = all_train[all_train[target_col] == 0].sample(
#     n=len(clicked_1) * CFG['DOWNSAMPLE_RATIO'], random_state=CFG['SEED']
# )
# train_base = pd.concat([clicked_1, clicked_0], axis=0).sample(frac=1, random_state=CFG['SEED']).reset_index(drop=True)
# del clicked_1, clicked_0; gc.collect()

# print("Downsampled Train:", train_base.shape,
#       "| clicked=0:", (train_base[target_col]==0).sum(),
#       "| clicked=1:", (train_base[target_col]==1).sum())


# =========================================================
# 8) 평가지표: 0.5*AP + 0.5*(1 - WLL)
# =========================================================
def calculate_competition_score(y_true, y_pred_proba):
    """Compute the competition metric: 0.5 * AP + 0.5 * (1 - WLL).

    AP is scikit-learn's ``average_precision_score``. WLL is a weighted
    log loss in which the positive class is weighted by n / (2 * n_pos)
    and the negative class by n / (2 * n_neg), so each class contributes
    half of the total weight regardless of its frequency.

    Args:
        y_true: Binary labels (0/1). Any array-like: list, ndarray or Series.
        y_pred_proba: Predicted probability of the positive class, with the
            same number of elements as ``y_true``.

    Returns:
        dict with keys ``'ap'``, ``'wll'`` and ``'final_score'``.

    Raises:
        ValueError: If the inputs do not hold the same number of elements.

    Note:
        When one of the two classes is absent the class weight divides by
        zero and ``'wll'`` / ``'final_score'`` come out as NaN.
    """
    # Work on flat ndarrays: plain lists become usable (``1 - list`` raises),
    # pandas index alignment cannot silently pair the wrong rows, and an
    # (n, 1) probability column no longer broadcasts to an (n, n) matrix.
    labels = np.asarray(y_true).ravel()
    # float64 so the 1e-7 clipping bounds are representable exactly enough
    # even when the caller hands in float32 predictions.
    probs = np.asarray(y_pred_proba, dtype=np.float64).ravel()
    if labels.shape != probs.shape:
        raise ValueError(
            f"y_true and y_pred_proba must have the same length, "
            f"got {labels.size} and {probs.size}"
        )

    ap = average_precision_score(labels, probs)

    y_float = labels.astype(np.float64)
    n_total = y_float.size
    n_pos = y_float.sum()  # NumPy scalar: a zero count yields inf/NaN, not ZeroDivisionError
    pos_weight = n_total / (2.0 * n_pos)
    neg_weight = n_total / (2.0 * (n_total - n_pos))

    # Keep probabilities away from 0 and 1 so both logs stay finite.
    ypp = np.clip(probs, 1e-7, 1 - 1e-7)
    log_loss_pos = -np.mean(y_float * np.log(ypp)) * pos_weight
    log_loss_neg = -np.mean((1.0 - y_float) * np.log(1.0 - ypp)) * neg_weight
    wll = log_loss_pos + log_loss_neg

    final_score = 0.5 * ap + 0.5 * (1 - wll)
    return {'ap': ap, 'wll': wll, 'final_score': final_score}

# =========================================================
# 9) (Revised) 저장한 PKL 피처 로드만 해서 진행
# =========================================================
# Load the prebuilt train / test feature frames from their pickle files and
# derive the feature column list plus the categorical column positions.
print("\n=== Load prebuilt features (PKL) ===")
if not (os.path.exists(train_features_pkl) and os.path.exists(test_features_pkl)):
    raise FileNotFoundError(
        f"PKL이 없습니다.\n- {train_features_pkl}\n- {test_features_pkl}\n"
        "먼저 피처를 생성/저장한 뒤 다시 실행하세요."
    )

# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file.
# Only load pickles that were produced by your own feature pipeline.
with open(train_features_pkl, "rb") as f:
    train_features = pickle.load(f)
with open(test_features_pkl, "rb") as f:
    test_features = pickle.load(f)

print(f"Loaded train_features: {train_features.shape} | test_features: {test_features.shape}")

# Re-state the target column in case an earlier cell rebound it.
target_col = "clicked"
if target_col not in train_features.columns:
    raise KeyError(f"Target column '{target_col}' not found in train_features")

# Every non-target column of the train frame is used as a model feature.
feature_cols = [c for c in train_features.columns if c != target_col]

# Fail fast: test-time inference selects test_features[feature_cols], so a
# missing column would otherwise only surface after all training has finished.
_missing_in_test = [c for c in feature_cols if c not in test_features.columns]
if _missing_in_test:
    raise KeyError(f"test_features is missing feature columns: {_missing_in_test}")

# CatBoost: positions (within feature_cols) of pandas 'category' dtype columns.
cat_feature_indices = [
    i for i, col in enumerate(feature_cols)
    if isinstance(train_features[col].dtype, pd.CategoricalDtype)
]
print("Cat features (idx):", cat_feature_indices)




=== Load prebuilt features (PKL) ===
Loaded train_features: (612537, 180) | test_features: (1527298, 180)
Cat features (idx): [154, 155, 158, 159, 160, 161, 162, 163]


## Optuna 학습함수 정의

In [4]:
def catboost_kfold_optuna(
    train_df: pd.DataFrame,
    feature_cols: list,
    target_col: str,
    cat_feature_indices: list = None,
    n_folds: int = 5,
    n_trials: int = 40,
    use_gpu: bool = True
):
    """Tune CatBoost with Optuna under stratified K-fold CV, then refit.

    Each Optuna trial trains one model per fold and is scored by the mean
    fold ``final_score`` of ``calculate_competition_score``. After the search
    the best parameters are used to train one model per fold again (same
    folds, same seed) and the out-of-fold predictions are summarised.

    Reads the module-level ``CFG`` for ``SEED``, ``CATBOOST_ITERATIONS`` and
    ``CATBOOST_EARLY_STOPPING``.

    Args:
        train_df: Frame holding both the feature columns and the target.
        feature_cols: Names of the feature columns, in model input order.
        target_col: Name of the binary target column.
        cat_feature_indices: Positions within ``feature_cols`` of the
            categorical features; ``None`` or empty means none.
        n_folds: Number of stratified folds.
        n_trials: Number of Optuna trials.
        use_gpu: Train with ``task_type="GPU"`` when True, else ``"CPU"``.

    Returns:
        ``(fold_models, results)`` where ``fold_models`` is the list of
        fitted per-fold models from the final refit and ``results`` is a dict
        with ``'mean_final_score'``, ``'oof_score'`` and ``'best_params'``.
    """
    X = train_df[feature_cols]
    y = train_df[target_col].astype(int).values

    task_type = "GPU" if use_gpu else "CPU"

    # None and an empty sequence both mean "no categorical features".
    has_cats = cat_feature_indices is not None and len(cat_feature_indices) > 0
    cat_features = list(cat_feature_indices) if has_cats else None

    # Settings shared by every tuning trial and by the final refit
    # (bootstrap_type is deliberately fixed to Bernoulli).
    fixed_params = {
        "iterations": CFG['CATBOOST_ITERATIONS'],
        "task_type": task_type,
        "eval_metric": "Logloss",
        "early_stopping_rounds": CFG['CATBOOST_EARLY_STOPPING'],
        "auto_class_weights": "Balanced",
        "random_seed": CFG['SEED'],
        "bootstrap_type": "Bernoulli",
    }

    def new_splitter():
        # Same seed every time, so all trials and the refit see identical folds.
        return StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=CFG['SEED'])

    def fit_and_score(params, tr_idx, va_idx):
        """Train on one fold; return (model, validation predictions, metrics)."""
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        model = CatBoostClassifier(**params)
        # The validation fold doubles as the early-stopping eval set.
        model.fit(X_tr, y_tr, eval_set=(X_va, y_va), cat_features=cat_features)

        pred = model.predict_proba(X_va)[:, 1]
        return model, pred, calculate_competition_score(y_va, pred)

    # ----- Optuna objective -----
    def objective(trial):
        params = {
            **fixed_params,
            "verbose": 0,
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "depth": trial.suggest_int("depth", 6, 12),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0, log=True),
            "border_count": trial.suggest_int("border_count", 64, 255),
            # NOTE(review): CatBoost's docs describe min_data_in_leaf as applying
            # to the Depthwise / Lossguide grow policies; confirm it has any
            # effect with the default grow policy used here.
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 50),
            "random_strength": trial.suggest_float("random_strength", 0.5, 5.0),
            # subsample is only meaningful for the Bernoulli bootstrap fixed above.
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }

        scores = [
            fit_and_score(params, tr_idx, va_idx)[2]['final_score']
            for tr_idx, va_idx in new_splitter().split(X, y)
        ]
        return float(np.mean(scores))

    # ----- Run the search -----
    # Seed the sampler so the sequence of suggested parameters is repeatable;
    # an unseeded sampler proposes different trials on every run.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=CFG['SEED']),
    )
    study.optimize(objective, n_trials=n_trials)

    print("\n=== Optuna Best Params ===")
    print(study.best_trial.params)
    print(f"Best CV Final Score: {study.best_value:.6f}")

    # ----- Refit every fold with the best parameters -----
    best_params = study.best_trial.params.copy()
    best_params.update(fixed_params)
    best_params["verbose"] = 200

    fold_models, fold_scores = [], []
    oof_pred = np.zeros(len(X), dtype=np.float32)

    for fold, (tr_idx, va_idx) in enumerate(new_splitter().split(X, y), 1):
        print(f"\n{'='*18} FOLD {fold}/{n_folds} {'='*18}")
        model, pred, m = fit_and_score(best_params, tr_idx, va_idx)

        oof_pred[va_idx] = pred
        print(f"Fold {fold} Final: {m['final_score']:.6f} | AP: {m['ap']:.6f} | WLL: {m['wll']:.6f}")

        fold_scores.append(m['final_score'])
        fold_models.append(model)

    oof_metrics = calculate_competition_score(y, oof_pred)
    print("\n" + "="*70)
    print("K-Fold Final Summary")
    print("="*70)
    print(f"Mean Final: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}")
    print(f"OOF  Final: {oof_metrics['final_score']:.6f} | AP: {oof_metrics['ap']:.6f} | WLL: {oof_metrics['wll']:.6f}")

    return fold_models, {
        'mean_final_score': float(np.mean(fold_scores)),
        'oof_score': float(oof_metrics['final_score']),
        'best_params': best_params
    }

## Optuna로 찾은 최적의 하이퍼파라미터로 최종예측

In [5]:

# # =========================================================
# # 실행: Optuna 기반 K-Fold CatBoost 학습
# # =========================================================
# print("\n=== Optuna 기반 K-Fold CatBoost 학습 시작 ===")
# opt_models, opt_results = catboost_kfold_optuna(
#     train_features,
#     feature_cols,
#     target_col,
#     cat_feature_indices=cat_feature_indices,   # 카테고리 인덱스 반영
#     n_folds=CFG['N_FOLDS'],
#     n_trials=CFG['N_TRIALS'],
#     use_gpu=True                               # GPU 없으면 False
# )

# # =========================================================
# # 추론 & 제출 저장 (평균 앙상블)
# # =========================================================
# print("\n=== Test Inference (K-Fold Avg) ===")
# X_test = test_features[feature_cols]
# preds = []
# for i, m in enumerate(opt_models, 1):
#     p = m.predict_proba(X_test)[:, 1]
#     preds.append(p)
#     print(f"  fold {i} done.")
# pred_test = np.mean(preds, axis=0)

# # 제출 저장
# submit = pd.read_csv(SAMPLE_SUB)
# submit['clicked'] = np.clip(pred_test, 1e-7, 1-1e-7)
# ts = datetime.now().strftime("%Y%m%d_%H%M%S")
# save_path = f"/content/drive/MyDrive/open/submit_catboost_optuna_fromPKL_{ts}.csv"
# submit.to_csv(save_path, index=False)
# print(f"\n[Saved] Submission -> {save_path}")

# # 모델 저장
# MODEL_DIR = "/content/drive/MyDrive/open/models/catboost_optuna_fromPKL33"
# os.makedirs(MODEL_DIR, exist_ok=True)
# for i, m in enumerate(opt_models, 1):
#     m.save_model(os.path.join(MODEL_DIR, f"catboost_opt_fromPKL_fold{i}.cbm"))
# print(f"[Saved] {len(opt_models)} fold models -> {MODEL_DIR}")
# print("\nDone.")

## Optuna로 찾은 값으로 고정 후 5-Fold 학습 및 추론

[I 2025-09-11 06:33:56,943] Trial 35 finished with value: 0.5087410908919996 and parameters: {'learning_rate': 0.014178770856256436, 'depth': 10, 'l2_leaf_reg': 7.668375095292063, 'border_count': 233, 'min_data_in_leaf': 42, 'random_strength': 0.644833461937768, 'subsample': 0.7788487816688026}

In [8]:
# =========================================================
# K-Fold CatBoost (고정 하이퍼파라미터 + Seed Ensemble)
#  - 시드별 KFold 학습 → OOF 요약
#  - 테스트 예측: Seed × Fold 평균
#  - 모델 저장: catboost_fixed_seed{seed}_fold{fold}.cbm
# =========================================================
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np, pandas as pd
from datetime import datetime
import os

# Fixed hyperparameters: these are the values logged for Optuna Trial 35
# (value 0.50874...). "iterations" is set to 5000 here rather than
# CFG['CATBOOST_ITERATIONS'].
BASE_PARAMS = {
    "iterations": 5000,
    "learning_rate": 0.014178770856256436,
    "depth": 10,
    "l2_leaf_reg": 7.668375095292063,
    "border_count": 233,
    "min_data_in_leaf": 42,
    "random_strength": 0.644833461937768,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.7788487816688026,  # only used with the Bernoulli bootstrap
    "task_type": "GPU",               # switch to "CPU" when no GPU is available
    "eval_metric": "Logloss",
    "early_stopping_rounds": CFG['CATBOOST_EARLY_STOPPING'],
    "auto_class_weights": "Balanced",
    # "random_seed" is intentionally absent: every ensemble member injects
    # its own seed in the loop below.
    "verbose": 200
}

SEEDS = [0, 1, 2, 3, 4]

# Model output directory, created before training so that an unwritable
# path fails immediately instead of after every model has been trained.
MODEL_DIR = "/content/drive/MyDrive/open/models/catboost_fixed_seedEnsemble"
os.makedirs(MODEL_DIR, exist_ok=True)

print("\n=== K-Fold CatBoost (Fixed params + Seed Ensemble) 시작 ===")
X = train_features[feature_cols]
y = train_features[target_col].astype(int).values

all_models = []            # [[fold models for seed 0], [fold models for seed 1], ...]
oof_pred_total = np.zeros(len(X), dtype=np.float32)
per_seed_rows = []         # one OOF summary row per seed

for seed in SEEDS:
    print(f"\n##### Seed {seed} #####")
    params = BASE_PARAMS.copy()
    params["random_seed"] = seed

    # The seed also drives the fold split, so each seed sees different folds.
    skf = StratifiedKFold(n_splits=CFG['N_FOLDS'], shuffle=True, random_state=seed)
    fold_models, fold_scores = [], []
    oof_pred = np.zeros(len(X), dtype=np.float32)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        print(f"\n{'='*18} SEED {seed} | FOLD {fold}/{CFG['N_FOLDS']} {'='*18}")
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        model = CatBoostClassifier(**params)
        # An empty / None index list means "no categorical features".
        model.fit(X_tr, y_tr, eval_set=(X_va, y_va), cat_features=cat_feature_indices or None)

        pred = model.predict_proba(X_va)[:, 1]
        oof_pred[va_idx] = pred

        m = calculate_competition_score(y_va, pred)
        print(f"Fold {fold} Final: {m['final_score']:.6f} | AP: {m['ap']:.6f} | WLL: {m['wll']:.6f}")

        fold_scores.append(m['final_score'])
        fold_models.append(model)

        # Persist each model as soon as it is trained so that a runtime
        # disconnect part-way through does not lose the finished folds.
        model.save_model(os.path.join(MODEL_DIR, f"catboost_fixed_seed{seed}_fold{fold}.cbm"))

    # Per-seed OOF summary
    oof_metrics = calculate_competition_score(y, oof_pred)
    print("\n" + "="*70)
    print(f"Seed {seed} Summary (CatBoost Fixed params)")
    print("="*70)
    print(f"Mean Final: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}")
    print(f"OOF  Final: {oof_metrics['final_score']:.6f} | AP: {oof_metrics['ap']:.6f} | WLL: {oof_metrics['wll']:.6f}")

    per_seed_rows.append({
        "seed": seed,
        "mean_final": float(np.mean(fold_scores)),
        "std_final": float(np.std(fold_scores)),
        "oof_final": float(oof_metrics['final_score']),
        "oof_ap": float(oof_metrics['ap']),
        "oof_wll": float(oof_metrics['wll']),
    })

    all_models.append(fold_models)
    oof_pred_total += oof_pred / len(SEEDS)   # accumulate the across-seed mean

# Print and save the per-seed summary
seed_summary = pd.DataFrame(per_seed_rows).sort_values("seed")
print("\n===== Per-Seed OOF Summary (CatBoost Fixed) =====")
print(seed_summary.to_string(index=False))

summary_path = "/content/drive/MyDrive/open/catboost_fixed_seed_summary.csv"
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
seed_summary.to_csv(summary_path, index=False)
print(f"[Saved] Per-seed summary -> {summary_path}")

# Overall OOF score of the seed-averaged predictions
overall_oof = calculate_competition_score(y, oof_pred_total)
print("\n===== Overall OOF (Averaged over seeds) =====")
print(f"Final: {overall_oof['final_score']:.6f} | AP: {overall_oof['ap']:.6f} | WLL: {overall_oof['wll']:.6f}")

# =========================================================
# Inference & submission (mean over every seed x fold model)
# =========================================================
print("\n=== Test Inference (K-Fold Avg × Seed Avg) ===")
X_test = test_features[feature_cols]
preds = []
for s_idx, fold_models in enumerate(all_models):
    for f_idx, m in enumerate(fold_models, 1):
        p = m.predict_proba(X_test)[:, 1]
        preds.append(p)
        print(f"  seed {SEEDS[s_idx]} fold {f_idx} done.")

pred_test = np.mean(preds, axis=0)

# Write the submission file
submit = pd.read_csv(SAMPLE_SUB)
if len(submit) != len(pred_test):
    raise ValueError(
        f"Row count mismatch: sample submission has {len(submit)} rows, "
        f"predictions have {len(pred_test)}"
    )
# The 'clicked' column is created or overwritten here; predictions are
# clipped away from exactly 0 and 1.
submit['clicked'] = np.clip(pred_test, 1e-7, 1-1e-7)

ts = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = f"/content/drive/MyDrive/open/submit_catboost_FIXED_seedEnsemble_{ts}.csv"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
submit.to_csv(save_path, index=False)
print(f"\n[Saved] Submission -> {save_path}")

# Models were already written to MODEL_DIR inside the training loop.
print(f"[Saved] {len(SEEDS)*CFG['N_FOLDS']} models -> {MODEL_DIR}")
print("\nDone.")


=== K-Fold CatBoost (Fixed params + Seed Ensemble) 시작 ===

##### Seed 0 #####

0:	learn: 0.6910027	test: 0.6909722	best: 0.6909722 (0)	total: 103ms	remaining: 8m 36s
200:	learn: 0.6049323	test: 0.6060899	best: 0.6060899 (200)	total: 17.9s	remaining: 7m 7s
400:	learn: 0.5951315	test: 0.6000496	best: 0.6000496 (400)	total: 36.2s	remaining: 6m 54s
600:	learn: 0.5885560	test: 0.5977739	best: 0.5977739 (600)	total: 54.4s	remaining: 6m 38s
800:	learn: 0.5830455	test: 0.5964651	best: 0.5964640 (799)	total: 1m 12s	remaining: 6m 21s
1000:	learn: 0.5774409	test: 0.5955441	best: 0.5955441 (1000)	total: 1m 30s	remaining: 6m 2s
1200:	learn: 0.5716668	test: 0.5948391	best: 0.5948391 (1200)	total: 1m 48s	remaining: 5m 43s
1400:	learn: 0.5661964	test: 0.5943425	best: 0.5943425 (1400)	total: 2m 6s	remaining: 5m 25s
1600:	learn: 0.5608378	test: 0.5939791	best: 0.5939791 (1600)	total: 2m 24s	remaining: 5m 7s
1800:	learn: 0.5557175	test: 0.5936661	best: 0.5936661 (1800)	total: 2m 42s	remaining: 4m 48s
20