# XGBoost Ranker with Optuna Hyperparameter Tuning

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
import warnings
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# === File paths ===
train_imputed = '/Users/nikak/Desktop/vubaby/DMT/data_DMT/train_imputed_CSV.csv'            # TODO: replace with your training file
feats_train = '/Users/nikak/Desktop/vubaby/DMT/data_DMT/nika_train_feats_with_cf.csv'  # TODO: replace with your engineered‑features file


In [None]:
df = pd.read_pickle(TRAIN_CSV)  # full labeled data
features = pd.read_csv(OUT_FEAT_CSV)  # your features

df = df.merge(features, on=['srch_id', 'prop_id'], how='left')
# I needed to drop these because I cannot get them in test set --> data leakege
df = df.drop(columns=['prop_popularity', 'dest_popularity'], errors='ignore')

def label_func(row):
    if row['booking_bool'] == 1:
        return 5
    elif row['click_bool'] == 1:
        return 1
    else:
        return 0

df['label'] = df.apply(label_func, axis=1)

group_sizes = df.groupby('srch_id').size().to_list()

drop_cols = ['click_bool', 'booking_bool', 'date_time', 'gross_bookings_usd', 'position']
X = df.drop(columns=[*drop_cols, 'label'], errors='ignore')
y = df['label']


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
def objective(trial):
    params = {
        "objective": "rank:pairwise",
        "eval_metric": "ndcg",
        "tree_method": "hist",
        "booster": "gbtree",
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    model = xgb.train(params, dtrain, num_boost_round=300, evals=[(dtest, "test")], early_stopping_rounds=30, verbose_eval=False)

    srch_ids_test = X_test['srch_id']
    group_test = srch_ids_test.value_counts().sort_index().to_list()
    split_preds = np.split(model.predict(dtest), np.cumsum(group_test)[:-1])
    split_true = np.split(y_test.to_numpy(), np.cumsum(group_test)[:-1])
    ndcgs = [ndcg_score([yt], [yp], k=5) for yt, yp in zip(split_true, split_preds) if len(yt) >= 5]
    return np.mean(ndcgs)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)


[I 2025-05-15 17:36:55,673] A new study created in memory with name: no-name-9c6b6757-171a-4866-8b1f-27506f184502
[I 2025-05-15 17:48:41,855] Trial 0 finished with value: 0.1717834132391705 and parameters: {'eta': 0.24390668819801578, 'max_depth': 3, 'min_child_weight': 5, 'gamma': 0.03429179508075697, 'subsample': 0.9674139349160809, 'colsample_bytree': 0.5842353601899908}. Best is trial 0 with value: 0.1717834132391705.
[W 2025-05-15 17:54:20,165] Trial 1 failed with parameters: {'eta': 0.1967766244247504, 'max_depth': 6, 'min_child_weight': 7, 'gamma': 0.27220988536187596, 'subsample': 0.6194080595791538, 'colsample_bytree': 0.5611453436585989} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\nikak\Desktop\vubaby\DMT\DMT\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\nikak\AppData\Local\Temp\ipykernel_30040\37483034

KeyboardInterrupt: 

In [None]:
best_params = study.best_params
best_params.update({
    "objective": "rank:pairwise",
    "eval_metric": "ndcg",
    "tree_method": "hist",
    "booster": "gbtree"
})

dtrain_final = xgb.DMatrix(X_train, label=y_train)
dtest_final = xgb.DMatrix(X_test, label=y_test)

model = xgb.train(best_params, dtrain_final, num_boost_round=300, evals=[(dtest_final, "test")], early_stopping_rounds=30)

print("✅ Final model trained with best parameters from Optuna.")


In [None]:
model.save_model("assignment_2/models/xgboost_ranker_model.json")
print("✅ Model saved to 'xgboost_ranker_model.json'")
