# Model 2 â€” LightGBM (native categorical handling)

Goal:
- Train a tree boosting model
- Use Stratified 5-fold CV with AUC
- Use early stopping
- Generate a Kaggle submission


In [7]:
!pip install lightgbm




imports + load

In [8]:
from __future__ import annotations

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

ROOT = Path.cwd().resolve().parents[0]
DATA_DIR = ROOT / "data" / "raw"
REPORTS_DIR = ROOT / "reports"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")
sub   = pd.read_csv(DATA_DIR / "sample_submission.csv")

id_col = sub.columns[0]
target_col = sub.columns[1]

y = (train[target_col] == "Presence").astype(int)

cat_cols = ['Sex', 'FBS over 120', 'Exercise angina', 'EKG results',
            'Slope of ST', 'Thallium', 'Number of vessels fluro', 'Chest pain type']
num_cols = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']

feature_cols = cat_cols + num_cols

X = train[feature_cols].copy()
X_test = test[feature_cols].copy()

print("X:", X.shape, "X_test:", X_test.shape, "pos rate:", y.mean())


X: (630000, 13) X_test: (270000, 13) pos rate: 0.44833968253968254


tell LightGBM which columns are categorical

In [9]:
for c in cat_cols:
    X[c] = X[c].astype("category")
    X_test[c] = X_test[c].astype("category")


CV with early stopping

In [10]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof = np.zeros(len(X), dtype=float)
scores = []
best_iters = []

params = dict(
    n_estimators=5000,       # big number; early stopping will pick the true best iteration
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), start=1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    model = lgb.LGBMClassifier(**params)

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    p_va = model.predict_proba(X_va)[:, 1]
    oof[va_idx] = p_va

    auc = roc_auc_score(y_va, p_va)
    scores.append(auc)
    best_iters.append(model.best_iteration_)

    print(f"Fold {fold} AUC: {auc:.5f} | best_iter: {model.best_iteration_}")

print("Mean CV AUC:", round(float(np.mean(scores)), 5))
print("OOF  AUC:", round(float(roc_auc_score(y, oof)), 5))
print("Avg best_iter:", int(np.mean(best_iters)))


[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 426
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
Fold 1 AUC: 0.95547 | best_iter: 755
[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 421
[LightGBM] [Info] Number of data points in the train set: 504000, numbe

Train final LightGBM model + generate submission

In [11]:
# Choose a good tree count for the final model.
# We use the average best iteration found by early stopping across folds.
best_n = int(np.mean(best_iters))
print("Using n_estimators =", best_n)

final_params = dict(params)
final_params["n_estimators"] = best_n

final_model = lgb.LGBMClassifier(**final_params)

# Fit on ALL training data
final_model.fit(X, y)

# Predict probabilities for test (positive class = Presence = 1)
test_pred = final_model.predict_proba(X_test)[:, 1]

# Build submission in Kaggle-required format
submission = pd.DataFrame({
    id_col: test[id_col],
    target_col: test_pred
})

out_path = REPORTS_DIR / "sub_lgbm.csv"
submission.to_csv(out_path, index=False)

print("Saved submission:", out_path)
submission.head()


Using n_estimators = 685
[LightGBM] [Info] Number of positive: 282454, number of negative: 347546
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 421
[LightGBM] [Info] Number of data points in the train set: 630000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448340 -> initscore=-0.207381
[LightGBM] [Info] Start training from score -0.207381
Saved submission: C:\Dev\kaggle-ps-s6e2-heart\reports\sub_lgbm.csv


Unnamed: 0,id,Heart Disease
0,630000,0.957345
1,630001,0.010071
2,630002,0.986503
3,630003,0.005318
4,630004,0.150325


In [12]:
np.save(REPORTS_DIR / "oof_lgbm.npy", oof)
np.save(REPORTS_DIR / "test_lgbm.npy", test_pred)

print("Saved:", REPORTS_DIR / "oof_lgbm.npy")
print("Saved:", REPORTS_DIR / "test_lgbm.npy")


Saved: C:\Dev\kaggle-ps-s6e2-heart\reports\oof_lgbm.npy
Saved: C:\Dev\kaggle-ps-s6e2-heart\reports\test_lgbm.npy
