# XGBoost Ranker with Optuna Hyperparameter Tuning

In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
import warnings
warnings.filterwarnings('ignore')


ModuleNotFoundError: No module named 'optuna'

In [5]:
!pip install optuna
!pip install xgboost




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\nikak\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\nikak\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [10]:
# === File paths ===
train_imputed = '/Users/nikak/Desktop/vubaby/DMT/data_DMT/train_imputed_CSV.csv'            # TODO: replace with your training file
feats_train = '/Users/nikak/Desktop/vubaby/DMT/data_DMT/nika_train_feats_with_cf.csv'  # TODO: replace with your engineered‑features file


In [None]:
df = pd.read_csv(train_imputed)  # full labeled data
features = pd.read_csv(feats_train)  # your features


# df = df.merge(features, on=['srch_id', 'prop_id', 'srch_destination_id'], how='left')

def label_func(row):
    if row['booking_bool'] == 1:
        return 5
    elif row['click_bool'] == 1:
        return 1
    else:
        return 0

df['label'] = df.apply(label_func, axis=1)

# group_sizes = df.groupby('srch_id').size().to_list()

drop_cols = ['click_bool', 'booking_bool', 'date_time', 'gross_bookings_usd', 'position']
X = df.drop(columns=[*drop_cols, 'label'], errors='ignore')
y = df['label']

df = df.merge(features, on=['srch_id', 'prop_id', 'srch_destination_id'], how='left') #MAYBE REMOVE scrh_destination_id or comment this one and uncomment the one above
group_sizes = df.groupby('srch_id').size().to_list()


MemoryError: Unable to allocate 1.26 GiB for an array with shape (34, 4958347) and data type float64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
def objective(trial):
    params = {
        "objective": "rank:pairwise",
        "eval_metric": "ndcg",
        "tree_method": "hist",
        "booster": "gbtree",
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    model = xgb.train(params, dtrain, num_boost_round=300, evals=[(dtest, "test")], early_stopping_rounds=50, verbose_eval=False)

    srch_ids_test = X_test['srch_id']
    group_test = srch_ids_test.value_counts().sort_index().to_list()
    split_preds = np.split(model.predict(dtest), np.cumsum(group_test)[:-1])
    split_true = np.split(y_test.to_numpy(), np.cumsum(group_test)[:-1])
    ndcgs = [ndcg_score([yt], [yp], k=5) for yt, yp in zip(split_true, split_preds) if len(yt) >= 5]
    return np.mean(ndcgs)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)


[I 2025-05-15 23:58:40,474] A new study created in memory with name: no-name-9be8ddf4-2883-4b03-b06c-183e7b5a3c4c
[I 2025-05-15 23:58:42,181] Trial 0 finished with value: 0.1239684385644996 and parameters: {'eta': 0.1628147837821737, 'max_depth': 8, 'min_child_weight': 6, 'gamma': 4.792870619247334, 'subsample': 0.9799037348113369, 'colsample_bytree': 0.9497803654880639}. Best is trial 0 with value: 0.1239684385644996.
[I 2025-05-15 23:58:47,453] Trial 1 finished with value: 0.17248657095396647 and parameters: {'eta': 0.10606831451991225, 'max_depth': 4, 'min_child_weight': 5, 'gamma': 2.5149498258314362, 'subsample': 0.8941212665403985, 'colsample_bytree': 0.6529411906198936}. Best is trial 1 with value: 0.17248657095396647.
[I 2025-05-15 23:58:50,491] Trial 2 finished with value: 0.16295271314997598 and parameters: {'eta': 0.041926498103437904, 'max_depth': 8, 'min_child_weight': 7, 'gamma': 1.4414204573058853, 'subsample': 0.8900724252324879, 'colsample_bytree': 0.6618341283529696}.

In [10]:
best_params = study.best_params
best_params.update({
    "objective": "rank:pairwise",
    "eval_metric": "ndcg",
    "tree_method": "hist",
    "booster": "gbtree"
})

dtrain_final = xgb.DMatrix(X_train, label=y_train)
dtest_final = xgb.DMatrix(X_test, label=y_test)

model = xgb.train(best_params, dtrain_final, num_boost_round=300, evals=[(dtest_final, "test")], early_stopping_rounds=30)

print("Final model trained with best parameters from Optuna.")


[0]	test-ndcg:0.11172
[1]	test-ndcg:0.22037
[2]	test-ndcg:0.02509
[3]	test-ndcg:0.02509
[4]	test-ndcg:0.02509
[5]	test-ndcg:0.02509
[6]	test-ndcg:0.02509
[7]	test-ndcg:0.02509
[8]	test-ndcg:0.02747
[9]	test-ndcg:0.05453
[10]	test-ndcg:0.14970
[11]	test-ndcg:0.06208
[12]	test-ndcg:0.10230
[13]	test-ndcg:0.10909
[14]	test-ndcg:0.19451
[15]	test-ndcg:0.19451
[16]	test-ndcg:0.19451
[17]	test-ndcg:0.19451
[18]	test-ndcg:0.19451
[19]	test-ndcg:0.19628
[20]	test-ndcg:0.19628
[21]	test-ndcg:0.19628
[22]	test-ndcg:0.19628
[23]	test-ndcg:0.19628
[24]	test-ndcg:0.19628
[25]	test-ndcg:0.19628
[26]	test-ndcg:0.19628
[27]	test-ndcg:0.19628
[28]	test-ndcg:0.19628
[29]	test-ndcg:0.19628
[30]	test-ndcg:0.19628
Final model trained with best parameters from Optuna.


In [11]:
model.save_model("xgboost_ranker_model_final.json")
print("Model saved to 'xgboost_ranker_model_final.json'")


Model saved to 'xgboost_ranker_model_final.json'
