# LightGBM Booking Probability Prediction with Optuna Tuning
This notebook demonstrates how to train a LightGBM model that predicts the probability a hotel will be booked. We use Optuna to tune hyperparameters against a stratified hold-out validation split, choose the number of boosting rounds with cross-validation, evaluate the tuned model, and optionally explain it with SHAP.

## 1. Imports

In [1]:
import os
import warnings

import joblib
import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
import shap
from optuna.integration import LightGBMPruningCallback
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    brier_score_loss,
    confusion_matrix,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, train_test_split

warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


## 2. Configuration
Adjust the paths and parameters below to match your environment and dataset.

In [2]:
# === File paths ===
# The data directory can be overridden with the DMT_DATA_DIR environment variable so the
# notebook is not tied to one machine; the default is the original local directory.
DATA_DIR = os.environ.get('DMT_DATA_DIR', '/Users/s.broos/Documents/DMT_data')
TRAIN_CSV = os.path.join(DATA_DIR, 'training_set_VU_DM.csv')          # raw training file
OUT_FEAT_CSV = os.path.join(DATA_DIR, 'features_with_cf_score.csv')   # engineered-features file

# === Column names ===
LABEL_CLICK = 'click_bool'
LABEL_BOOK = 'booking_bool'
LEAKS = ['position', 'gross_bookings_usd']     # columns that leak target information
REMOVE_COLS = []                               # extra columns you want dropped

# === Sampling ===
SAMPLE_FRAC = 1.0   # fraction of rows kept when preparing the data; set <1.0 for quick tests
RANDOM_STATE = 42   # seed shared by every sampling / splitting / training step


## 3. Helper functions

In [3]:
def engineer_light(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose numeric columns have NaNs replaced by the column median.

    Non-numeric columns are passed through unchanged and the input frame is not modified.
    """
    imputed = df.copy()
    numeric_cols = imputed.select_dtypes(include='number').columns
    # One median per numeric column; fillna aligns them by column label.
    column_medians = imputed[numeric_cols].median()
    imputed[numeric_cols] = imputed[numeric_cols].fillna(column_medians)
    return imputed


## 4. Load & prepare data

In [4]:
# 4.1 Load raw data and median-impute its numeric columns
df_full = engineer_light(pd.read_csv(TRAIN_CSV))

# 4.2 Labels
y = df_full[LABEL_BOOK]

# 4.3 Base feature matrix: drop both labels, the leaking columns and the raw timestamp
drop_cols = [LABEL_CLICK, LABEL_BOOK] + LEAKS + ['date_time']
X_base = df_full.drop(columns=drop_cols)
X_base = X_base.drop(columns=[c for c in REMOVE_COLS if c in X_base.columns])

# 4.4 Merge candidate engineered features
cand = pd.read_csv(OUT_FEAT_CSV)
# `validate` makes pandas raise if (srch_id, prop_id) is duplicated in `cand`; without it a
# duplicated key would multiply rows and silently misalign the features with `y`, because
# the labels are picked by row position below.
X_full = X_base.merge(cand, on=['srch_id', 'prop_id'], how='left', validate='many_to_one')
assert len(X_full) == len(y), 'merge changed the number of rows; features and labels are misaligned'

# 4.5 (Optional) sample for speed
X_full = X_full.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
# The merge result carries a fresh 0..n-1 index, so its labels still address rows of `y`.
y = y.loc[X_full.index].reset_index(drop=True)
X_full = X_full.reset_index(drop=True)

print(f'Full feature matrix shape: {X_full.shape}')


Full feature matrix shape: (4958347, 66)


### 4.6 Train / test split

In [6]:
# Stratified 70 / 30 split so both parts keep the same booking rate.
split_options = dict(test_size=0.3, stratify=y, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(X_full, y, **split_options)

print(f'Train size: {X_train.shape},  Test size: {X_test.shape}')


Train size: (3470842, 66),  Test size: (1487505, 66)


## 5. Hyperparameter optimisation with Optuna

In [14]:
# ------------------------------------------------------------------
# Fast LightGBM + Optuna tuning block
# ------------------------------------------------------------------

# ---------- fast-tune hyper-parameters ----------
# Named TUNE_FRAC rather than SAMPLE_FRAC so the config constant of that name is not overwritten.
TUNE_FRAC     = 0.60      # fraction of the training split used for tuning
NUM_BOOST_CAP = 1200      # upper bound on boosting rounds per trial
EARLY_STOP    = 40        # early-stopping patience (rounds)
N_TRIALS      = 30        # number of Optuna trials

# ---------- one-time sampling for speed ----------
X_tune, _, y_tune, _ = train_test_split(
    X_train, y_train,
    test_size=1 - TUNE_FRAC,
    stratify=y_train,
    random_state=RANDOM_STATE
)

# ---------- fixed 80 / 20 validation split ----------
# The split uses a fixed seed, so it is the same for every trial; compute it once here
# instead of repeating it inside the objective.
X_tune_fit, X_tune_val, y_tune_fit, y_tune_val = train_test_split(
    X_tune, y_tune,
    test_size=0.20,
    stratify=y_tune,
    random_state=RANDOM_STATE
)


# ---------- Optuna objective ----------
def objective(trial):
    """Train one LightGBM model with trial-suggested parameters and return its hold-out AUC.

    The model is fitted on ``X_tune_fit`` with early stopping (and Optuna pruning) monitored
    on ``X_tune_val``; the returned score is the ROC-AUC of the best iteration on that
    validation part.
    """
    # Datasets are built per trial so each one is constructed under that trial's parameters.
    dtr  = lgb.Dataset(X_tune_fit, label=y_tune_fit)
    dval = lgb.Dataset(X_tune_val, label=y_tune_val, reference=dtr)

    # --- compact search space ---
    params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "seed": RANDOM_STATE,
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 255, log=True),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        # Fixed, not suggested: it is therefore NOT part of study.best_params.
        "bagging_freq": 1,
        # "device_type": "gpu",   # ← uncomment if you have CUDA
    }

    booster = lgb.train(
        params,
        dtr,
        num_boost_round=NUM_BOOST_CAP,
        valid_sets=[dval],
        callbacks=[
            lgb.early_stopping(EARLY_STOP),
            LightGBMPruningCallback(trial, "auc"),
            lgb.log_evaluation(period=0),   # silent
        ],
    )

    preds = booster.predict(X_tune_val, num_iteration=booster.best_iteration)
    return roc_auc_score(y_tune_val, preds)


# ---------- run the study ----------
study = optuna.create_study(
    direction="maximize",
    # Seed the sampler so suggestions are repeatable. Trials still finish in a
    # non-deterministic order while n_jobs != 1; use n_jobs=1 for a fully repeatable study.
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(
    objective,
    n_trials=N_TRIALS,
    n_jobs=-1,                # use all available CPU cores
    show_progress_bar=True
)

# The score comes from a single hold-out split, not from cross-validation.
print("Best AUC (hold-out):", study.best_value)
print("Best params :", study.best_params)


[I 2025-05-14 17:40:58,313] A new study created in memory with name: no-name-b81805c3-2bbe-45b7-a37b-6f13fdc3989d
  0%|          | 0/30 [00:00<?, ?it/s]

Training until validation scores don't improve for 40 rounds
Training until validation scores don't improve for 40 rounds
Training until validation scores don't improve for 40 rounds
Training until validation scores don't improve for 40 rounds
Training until validation scores don't improve for 40 rounds
Training until validation scores don't improve for 40 rounds
Training until validation scores don't improve for 40 rounds
Training until validation scores don't improve for 40 rounds
Early stopping, best iteration is:
[27]	valid_0's auc: 0.913346


Best trial: 1. Best value: 0.913346:   3%|▎         | 1/30 [01:30<43:51, 90.74s/it]

[I 2025-05-14 17:42:29,046] Trial 1 finished with value: 0.913346239236184 and parameters: {'learning_rate': 0.16866861590324983, 'num_leaves': 147, 'min_child_samples': 28, 'feature_fraction': 0.7426796664344582, 'bagging_fraction': 0.6322446513624453}. Best is trial 1 with value: 0.913346239236184.
Early stopping, best iteration is:
[78]	valid_0's auc: 0.917013


Best trial: 5. Best value: 0.917013:   7%|▋         | 2/30 [01:45<21:21, 45.76s/it]

[I 2025-05-14 17:42:43,030] Trial 5 finished with value: 0.9170127134106214 and parameters: {'learning_rate': 0.13154554628525342, 'num_leaves': 43, 'min_child_samples': 36, 'feature_fraction': 0.7860362078243486, 'bagging_fraction': 0.9596818556624573}. Best is trial 5 with value: 0.9170127134106214.
Training until validation scores don't improve for 40 rounds
Early stopping, best iteration is:
[81]	valid_0's auc: 0.916606


Best trial: 5. Best value: 0.917013:  10%|█         | 3/30 [01:59<14:11, 31.55s/it]

[I 2025-05-14 17:42:57,973] Trial 0 finished with value: 0.9166062119961964 and parameters: {'learning_rate': 0.14469695775160363, 'num_leaves': 75, 'min_child_samples': 76, 'feature_fraction': 0.6670272887744738, 'bagging_fraction': 0.8013713764300585}. Best is trial 5 with value: 0.9170127134106214.
Training until validation scores don't improve for 40 rounds
Training until validation scores don't improve for 40 rounds
Early stopping, best iteration is:
[182]	valid_0's auc: 0.917838


Best trial: 6. Best value: 0.917838:  13%|█▎        | 4/30 [02:53<17:33, 40.54s/it]

[I 2025-05-14 17:43:52,276] Trial 6 finished with value: 0.9178376959062575 and parameters: {'learning_rate': 0.0981004702785791, 'num_leaves': 54, 'min_child_samples': 99, 'feature_fraction': 0.8578905406944405, 'bagging_fraction': 0.8497728121836986}. Best is trial 6 with value: 0.9178376959062575.
Training until validation scores don't improve for 40 rounds
Early stopping, best iteration is:
[155]	valid_0's auc: 0.917918


Best trial: 8. Best value: 0.917918:  17%|█▋        | 5/30 [04:20<23:45, 57.03s/it]

[I 2025-05-14 17:45:18,494] Trial 8 finished with value: 0.9179182192726577 and parameters: {'learning_rate': 0.07134114782088614, 'num_leaves': 74, 'min_child_samples': 71, 'feature_fraction': 0.836872087862134, 'bagging_fraction': 0.7775885186661563}. Best is trial 8 with value: 0.9179182192726577.


Best trial: 8. Best value: 0.917918:  23%|██▎       | 7/30 [04:20<09:44, 25.42s/it]

[I 2025-05-14 17:45:18,586] Trial 11 pruned. Trial was pruned at iteration 44.
[I 2025-05-14 17:45:18,752] Trial 10 pruned. Trial was pruned at iteration 142.


Best trial: 8. Best value: 0.917918:  27%|██▋       | 8/30 [04:43<08:59, 24.54s/it]

Training until validation scores don't improve for 40 rounds
[I 2025-05-14 17:45:41,529] Trial 12 pruned. Trial was pruned at iteration 0.
Training until validation scores don't improve for 40 rounds


Best trial: 8. Best value: 0.917918:  30%|███       | 9/30 [04:46<06:18, 18.04s/it]

[I 2025-05-14 17:45:45,266] Trial 13 pruned. Trial was pruned at iteration 3.
Training until validation scores don't improve for 40 rounds


Best trial: 8. Best value: 0.917918:  33%|███▎      | 10/30 [05:11<06:39, 19.98s/it]

[I 2025-05-14 17:46:09,604] Trial 14 pruned. Trial was pruned at iteration 28.


Best trial: 8. Best value: 0.917918:  37%|███▋      | 11/30 [05:13<04:38, 14.68s/it]

[I 2025-05-14 17:46:12,262] Trial 16 pruned. Trial was pruned at iteration 0.
Training until validation scores don't improve for 40 rounds


Best trial: 8. Best value: 0.917918:  40%|████      | 12/30 [05:35<05:03, 16.85s/it]

[I 2025-05-14 17:46:34,092] Trial 17 pruned. Trial was pruned at iteration 3.
Training until validation scores don't improve for 40 rounds


Best trial: 8. Best value: 0.917918:  43%|████▎     | 13/30 [05:42<03:52, 13.70s/it]

[I 2025-05-14 17:46:40,419] Trial 18 pruned. Trial was pruned at iteration 7.


Best trial: 8. Best value: 0.917918:  47%|████▋     | 14/30 [05:51<03:18, 12.41s/it]

[I 2025-05-14 17:46:49,849] Trial 15 pruned. Trial was pruned at iteration 65.
Training until validation scores don't improve for 40 rounds


Best trial: 8. Best value: 0.917918:  50%|█████     | 15/30 [05:58<02:42, 10.83s/it]

[I 2025-05-14 17:46:57,133] Trial 19 pruned. Trial was pruned at iteration 2.
Training until validation scores don't improve for 40 rounds


Best trial: 8. Best value: 0.917918:  53%|█████▎    | 16/30 [06:08<02:25, 10.40s/it]

[I 2025-05-14 17:47:06,421] Trial 20 pruned. Trial was pruned at iteration 3.


Best trial: 8. Best value: 0.917918:  57%|█████▋    | 17/30 [06:16<02:05,  9.64s/it]

[I 2025-05-14 17:47:14,386] Trial 21 pruned. Trial was pruned at iteration 0.
Early stopping, best iteration is:
[455]	valid_0's auc: 0.918726


Best trial: 8. Best value: 0.917918:  60%|██████    | 18/30 [06:25<01:54,  9.56s/it]

[I 2025-05-14 17:47:23,651] Trial 22 pruned. Trial was pruned at iteration 0.
Early stopping, best iteration is:
[452]	valid_0's auc: 0.918521


Best trial: 8. Best value: 0.917918:  63%|██████▎   | 19/30 [06:33<01:40,  9.12s/it]

[I 2025-05-14 17:47:31,853] Trial 23 pruned. Trial was pruned at iteration 0.
Early stopping, best iteration is:
[221]	valid_0's auc: 0.918347
Training until validation scores don't improve for 40 rounds


Best trial: 8. Best value: 0.917918:  67%|██████▋   | 20/30 [06:48<01:48, 10.90s/it]

[I 2025-05-14 17:47:46,917] Trial 24 pruned. Trial was pruned at iteration 3.


Best trial: 2. Best value: 0.918726:  70%|███████   | 21/30 [06:49<01:11,  7.99s/it]

[I 2025-05-14 17:47:48,123] Trial 2 finished with value: 0.9187261423038111 and parameters: {'learning_rate': 0.03273981313509022, 'num_leaves': 92, 'min_child_samples': 34, 'feature_fraction': 0.8346045239803968, 'bagging_fraction': 0.8405963158646577}. Best is trial 2 with value: 0.9187261423038111.
Training until validation scores don't improve for 40 rounds


Best trial: 2. Best value: 0.918726:  73%|███████▎  | 22/30 [06:55<00:58,  7.31s/it]

[I 2025-05-14 17:47:53,604] Trial 9 finished with value: 0.9183466615080449 and parameters: {'learning_rate': 0.049430321837007345, 'num_leaves': 162, 'min_child_samples': 97, 'feature_fraction': 0.7374864389952597, 'bagging_fraction': 0.769328188942748}. Best is trial 2 with value: 0.9187261423038111.


Best trial: 2. Best value: 0.918726:  77%|███████▋  | 23/30 [06:56<00:38,  5.53s/it]

[I 2025-05-14 17:47:55,209] Trial 3 finished with value: 0.9185207341857595 and parameters: {'learning_rate': 0.03724026891002677, 'num_leaves': 108, 'min_child_samples': 98, 'feature_fraction': 0.8685425797682383, 'bagging_fraction': 0.7295036626644583}. Best is trial 2 with value: 0.9187261423038111.
Training until validation scores don't improve for 40 rounds


Best trial: 2. Best value: 0.918726:  80%|████████  | 24/30 [07:03<00:35,  6.00s/it]

[I 2025-05-14 17:48:02,303] Trial 25 pruned. Trial was pruned at iteration 10.
Training until validation scores don't improve for 40 rounds


Best trial: 2. Best value: 0.918726:  83%|████████▎ | 25/30 [07:11<00:32,  6.42s/it]

Training until validation scores don't improve for 40 rounds
[I 2025-05-14 17:48:09,550] Trial 26 pruned. Trial was pruned at iteration 16.


Best trial: 2. Best value: 0.918726:  83%|████████▎ | 25/30 [07:12<00:32,  6.42s/it]

[I 2025-05-14 17:48:11,047] Trial 28 pruned. Trial was pruned at iteration 3.


Best trial: 2. Best value: 0.918726:  90%|█████████ | 27/30 [07:13<00:10,  3.64s/it]

[I 2025-05-14 17:48:11,858] Trial 27 pruned. Trial was pruned at iteration 3.
Training until validation scores don't improve for 40 rounds


Best trial: 2. Best value: 0.918726:  93%|█████████▎| 28/30 [07:15<00:05,  2.98s/it]

[I 2025-05-14 17:48:13,314] Trial 29 pruned. Trial was pruned at iteration 1.
Early stopping, best iteration is:
[714]	valid_0's auc: 0.91872


Best trial: 2. Best value: 0.918726:  97%|█████████▋| 29/30 [07:26<00:05,  5.48s/it]

[I 2025-05-14 17:48:24,642] Trial 7 finished with value: 0.9187197118735517 and parameters: {'learning_rate': 0.0265228313275646, 'num_leaves': 50, 'min_child_samples': 63, 'feature_fraction': 0.8695287429670948, 'bagging_fraction': 0.667308395865633}. Best is trial 2 with value: 0.9187261423038111.
Early stopping, best iteration is:
[785]	valid_0's auc: 0.919291


Best trial: 4. Best value: 0.919291: 100%|██████████| 30/30 [07:57<00:00, 15.91s/it]

[I 2025-05-14 17:48:55,472] Trial 4 finished with value: 0.9192906198175425 and parameters: {'learning_rate': 0.02340256277784342, 'num_leaves': 72, 'min_child_samples': 69, 'feature_fraction': 0.6719019183257495, 'bagging_fraction': 0.8929756582704587}. Best is trial 4 with value: 0.9192906198175425.
Best AUC (CV): 0.9192906198175425
Best params : {'learning_rate': 0.02340256277784342, 'num_leaves': 72, 'min_child_samples': 69, 'feature_fraction': 0.6719019183257495, 'bagging_fraction': 0.8929756582704587}





Best trial: 4. Best value: 0.919291: 100%|██████████| 30/30 [07:57<00:00, 15.91s/it]
[I 2025-05-14 17:48:55,472] Trial 4 finished with value: 0.9192906198175425 and parameters: {'learning_rate': 0.02340256277784342, 'num_leaves': 72, 'min_child_samples': 69, 'feature_fraction': 0.6719019183257495, 'bagging_fraction': 0.8929756582704587}. Best is trial 4 with value: 0.9192906198175425.
Best AUC (CV): 0.9192906198175425
Best params : {'learning_rate': 0.02340256277784342, 'num_leaves': 72, 'min_child_samples': 69, 'feature_fraction': 0.6719019183257495, 'bagging_fraction': 0.8929756582704587}


## 6. Train final model with tuned parameters

In [18]:
# ============ FINAL TRAINING (all data) ============

# 1. Merge every row you have (train and test parts together)
X_full = pd.concat([X_train, X_test]).reset_index(drop=True)
y_full = pd.concat([y_train, y_test]).reset_index(drop=True)
dall   = lgb.Dataset(X_full, label=y_full)

# 2. Params from Optuna
best_params = study.best_params.copy()
best_params.update({
    "objective": "binary",
    "metric": ["auc", "binary_logloss"],
    "verbosity": -1,
    "seed": RANDOM_STATE,
    # Tuning ran with bagging_freq=1 as a fixed (non-suggested) parameter, so it is missing
    # from study.best_params; without it LightGBM ignores the tuned bagging_fraction.
    "bagging_freq": 1,
})

# 3. Cross-validate to pick the number of boosting rounds
cv_results = lgb.cv(
    best_params,
    dall,
    num_boost_round=4000,          # hard cap
    nfold=3,
    stratified=True,
    shuffle=True,
    callbacks=[
        # Stop on AUC only (the first metric) - the quantity that was tuned - rather than
        # on whichever of the two metrics stalls first.
        lgb.early_stopping(100, first_metric_only=True),
        lgb.log_evaluation(period=200),
    ],
)
# The result key is "auc-mean" in LightGBM 3.x and "valid auc-mean" in 4.x.
auc_mean_key = next(key for key in cv_results if key.endswith("auc-mean"))
best_iter = len(cv_results[auc_mean_key])
print(f"\nBest_iteration from CV: {best_iter}")

# 4. Fit one model on 100 % of the data
# No validation set is passed: scoring the training data every round adds cost without
# information, and the `verbose_eval` argument no longer exists in LightGBM 4.
gbm_final = lgb.train(
    best_params,
    dall,
    num_boost_round=best_iter,
)

# 5. Save for production use
MODEL_PATH = "lightgbm_full_final.pkl"
joblib.dump(gbm_final, MODEL_PATH)
print(f"✅  Model saved to {os.path.abspath(MODEL_PATH)}")


KeyboardInterrupt: 

In [None]:
# ============ EVALUATION (OOF metrics) ============
# `lgb.cv` only returns its per-fold boosters when called with `return_cvbooster=True`, and
# the returned CVBooster does not expose the fold indices. Out-of-fold predictions are
# therefore produced here directly: one booster is fitted per stratified fold with the tuned
# parameters and `best_iter` rounds, and each row is scored by the booster that never saw it.
# NOTE: this refits the model N_OOF_FOLDS times, which is expensive on the full data.

N_OOF_FOLDS = 3

# 1. Collect OOF predictions
oof_pred = np.zeros(len(X_full))
oof_splitter = StratifiedKFold(n_splits=N_OOF_FOLDS, shuffle=True, random_state=RANDOM_STATE)
for fit_idx, val_idx in oof_splitter.split(X_full, y_full):
    fold_booster = lgb.train(
        best_params,
        lgb.Dataset(X_full.iloc[fit_idx], label=y_full.iloc[fit_idx]),
        num_boost_round=best_iter,
    )
    oof_pred[val_idx] = fold_booster.predict(X_full.iloc[val_idx])

# 2. Threshold → class labels (0.50 default)
y_pred = (oof_pred >= 0.50).astype(int)

# 3. Metrics table
print("──────────  CV Out-of-Fold Metrics  ──────────")
print(f"AUC-ROC           : {roc_auc_score(y_full, oof_pred):.4f}")
print(f"PR-AUC            : {average_precision_score(y_full, oof_pred):.4f}")
print(f"Log-loss          : {log_loss(y_full, oof_pred):.4f}")
print(f"Brier score       : {brier_score_loss(y_full, oof_pred):.4f}")
print("---------------------------------------------")
print(f"Accuracy (0.5)    : {accuracy_score(y_full, y_pred):.4f}")
print(f"Precision (0.5)   : {precision_score(y_full, y_pred, zero_division=0):.4f}")
print(f"Recall    (0.5)   : {recall_score(y_full, y_pred, zero_division=0):.4f}")
print(f"F1-score  (0.5)   : {f1_score(y_full, y_pred, zero_division=0):.4f}")
# labels=[0, 1] keeps the matrix 2x2 even if no row crosses the 0.5 threshold.
tn, fp, fn, tp = confusion_matrix(y_full, y_pred, labels=[0, 1]).ravel()
print(f"Confusion matrix  :  TN={tn}  FP={fp}  FN={fn}  TP={tp}")
print("──────────────────────────────────────────────")


## 7. Interpretability with SHAP
Uncomment the summary‑plot line if you wish to visualise SHAP values inside the notebook.

In [None]:
# Explain a seeded random sample of the test rows: computing SHAP values for every test row
# is very slow, and a few thousand rows are enough for a summary plot.
SHAP_SAMPLE_ROWS = 10_000
X_shap = X_test.sample(n=min(SHAP_SAMPLE_ROWS, len(X_test)), random_state=RANDOM_STATE)

explainer = shap.TreeExplainer(gbm_final)
# Rows are passed as the model saw them during training (missing values left as NaN);
# replacing NaN with 0 would explain predictions for inputs the model was never given.
shap_values = explainer.shap_values(X_shap)

# shap.summary_plot(shap_values, X_shap, max_display=20)


---
*End of notebook*