# Model 3 â€” CatBoost (raw features, native categorical)

Goal:
- Train CatBoost on the same 13 features
- 5-fold Stratified CV AUC
- Save OOF preds + test preds
- Write Kaggle submission


In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp313-cp313-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.5.2-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-2.16.0-py3-none-any.whl.metadata (14 kB)
Downloading catboost-1.2.8-cp313-cp313-win_amd64.whl (102.4 MB)
   ---------------------------------------- 0.0/102.4 MB ? eta -:--:--
    --------------------------------------- 1.6/102.4 MB 9.4 MB/s eta 0:00:11
   - -------------------------------------- 5.0/102.4 MB 13.8 MB/s eta 0:00:08
   --- ------------------------------------ 9.4/102.4 MB 16.7 MB/s eta 0:00:06
   ----- ---------------------------------- 14.9/102.4 MB 19.4 MB/s eta 0:00:05
   ------- -------------------------------- 20.2/102.4 MB 20.9 MB/s eta 0:00:04
   ---------- ----------------------------- 26.0/102.4 MB 22.1 MB/

imports + load + define columns

In [2]:
from __future__ import annotations

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier, Pool

ROOT = Path.cwd().resolve().parents[0]  # notebooks/ -> repo root
DATA_DIR = ROOT / "data" / "raw"
REPORTS_DIR = ROOT / "reports"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")
sub   = pd.read_csv(DATA_DIR / "sample_submission.csv")

id_col = sub.columns[0]
target_col = sub.columns[1]

y = (train[target_col] == "Presence").astype(int)

cat_cols = ['Sex', 'FBS over 120', 'Exercise angina', 'EKG results',
            'Slope of ST', 'Thallium', 'Number of vessels fluro', 'Chest pain type']
num_cols = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']

feature_cols = cat_cols + num_cols

X = train[feature_cols].copy()
X_test = test[feature_cols].copy()

# CatBoost needs categorical feature indices (column positions)
cat_idx = [X.columns.get_loc(c) for c in cat_cols]

print("X:", X.shape, "X_test:", X_test.shape, "pos rate:", y.mean())
print("cat_idx:", cat_idx)


X: (630000, 13) X_test: (270000, 13) pos rate: 0.44833968253968254
cat_idx: [0, 1, 2, 3, 4, 5, 6, 7]


CatBoost CV (AUC + early stopping)

In [3]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof = np.zeros(len(X), dtype=float)
scores = []
best_iters = []

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    iterations=10000,          # large; early stopping finds best
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=10,
    random_strength=1.0,
    subsample=0.8,
    bootstrap_type="Bernoulli",
    od_type="Iter",
    od_wait=200,               # early stop patience
    random_seed=42,
    verbose=False,
    allow_writing_files=False  # keeps repo clean
)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), start=1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    tr_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    va_pool = Pool(X_va, y_va, cat_features=cat_idx)

    model = CatBoostClassifier(**params)
    model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

    p_va = model.predict_proba(va_pool)[:, 1]
    oof[va_idx] = p_va

    auc = roc_auc_score(y_va, p_va)
    scores.append(auc)

    best_iter = model.get_best_iteration()
    best_iters.append(best_iter)

    print(f"Fold {fold} AUC: {auc:.5f} | best_iter: {best_iter}")

print("Mean CV AUC:", round(float(np.mean(scores)), 5))
print("OOF  AUC:", round(float(roc_auc_score(y, oof)), 5))
print("Avg best_iter:", int(np.mean(best_iters)))


Fold 1 AUC: 0.95577 | best_iter: 2507
Fold 2 AUC: 0.95475 | best_iter: 2450
Fold 3 AUC: 0.95553 | best_iter: 2391
Fold 4 AUC: 0.95512 | best_iter: 2721
Fold 5 AUC: 0.95596 | best_iter: 2776
Mean CV AUC: 0.95543
OOF  AUC: 0.95542
Avg best_iter: 2569


In [4]:
best_n = int(np.mean(best_iters))
print("Using iterations =", best_n)

final_params = dict(params)
final_params["iterations"] = best_n
final_params["verbose"] = False

final_model = CatBoostClassifier(**final_params)

full_pool = Pool(X, y, cat_features=cat_idx)
final_model.fit(full_pool)

test_pool = Pool(X_test, cat_features=cat_idx)
test_pred = final_model.predict_proba(test_pool)[:, 1]

# Save predictions for later blending
np.save(REPORTS_DIR / "oof_catboost.npy", oof)
np.save(REPORTS_DIR / "test_catboost.npy", test_pred)

submission = pd.DataFrame({
    id_col: test[id_col],
    target_col: test_pred
})

out_path = REPORTS_DIR / "sub_catboost.csv"
submission.to_csv(out_path, index=False)

print("Saved:", out_path)
submission.head()


Using iterations = 2569
Saved: C:\Dev\kaggle-ps-s6e2-heart\reports\sub_catboost.csv


Unnamed: 0,id,Heart Disease
0,630000,0.95418
1,630001,0.006816
2,630002,0.988986
3,630003,0.003792
4,630004,0.206647
