In [2]:
# standard library
from collections import Counter
from math import sqrt
from pathlib import Path

# third-party
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#import ydata_profiling as yp
from sklearn import neighbors
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report,matthews_corrcoef,roc_auc_score
# data splitting
from sklearn.model_selection import train_test_split
# data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

%matplotlib inline

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [4]:
# Load the prepared hotel-reservation feature table from CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_csv(
    filepath_or_buffer="/home/cloud/Datasets/Hotel Reservations_data_features.csv"
)

## Split the data into train and test sets

In [7]:
def train_test_split_only(data, target="booking_status", test_size=0.20, random_state=0):
    """Split a table into stratified train and test partitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns together with the ``target`` column.
    target : str, default "booking_status"
        Name of the label column. It is removed from the feature matrix and
        used both as ``y`` and as the stratification key.
    test_size : float, default 0.20
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 0
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, features)`` where ``features`` is
        the list of feature column names in their original order.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    # Fail early with a descriptive message instead of a bare pandas KeyError.
    if target not in data.columns:
        raise KeyError(f"target column {target!r} not found in data")

    y = data[target]
    x = data.drop(columns=target)
    features = list(x.columns)

    # Stratify on the label so both partitions keep the same class proportions.
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    return x_train, x_test, y_train, y_test, features

In [9]:
# Build the train/test partitions and keep the feature column names for later use.
(
    x_train,
    x_test,
    y_train,
    y_test,
    features,
) = train_test_split_only(data=data)

In [11]:
def fit_and_evaluate_xgboost(
    x_train, x_test, y_train, y_test,
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    scale_pos_weight=None
):
    """Train an ``XGBClassifier`` and print its test-set metrics.

    Parameters
    ----------
    x_train, y_train
        Features and labels the classifier is fitted on.
    x_test, y_test
        Held-out features and labels used for every printed metric.
    n_estimators, max_depth, learning_rate, subsample, colsample_bytree, random_state
        Forwarded unchanged to ``XGBClassifier``.
    scale_pos_weight : float or None, default None
        Forwarded to ``XGBClassifier``. ``None`` leaves the library default in
        place, so existing callers are unaffected; pass the negative/positive
        label ratio to up-weight the positive class.

    Returns
    -------
    XGBClassifier
        The fitted classifier.

    Notes
    -----
    Prints the confusion matrix, accuracy, Matthews correlation coefficient,
    ROC AUC and the classification report. Accuracy, MCC and AUC are shown
    multiplied by 100.
    """
    classifier = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=random_state,
        n_jobs=-1,
        scale_pos_weight=scale_pos_weight
    )

    classifier.fit(x_train, y_train)

    predicted_labels = classifier.predict(x_test)
    # Second probability column is used as the score for the AUC computation.
    positive_scores = classifier.predict_proba(x_test)[:, 1]

    print("=== XGBOOST RESULTS (NO SCALING) ===")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predicted_labels))
    print(f"\nAccuracy: {accuracy_score(y_test, predicted_labels) * 100:.2f}%")
    print(f"MCC Score: {matthews_corrcoef(y_test, predicted_labels) * 100:.2f}%")
    print(f"AUC Score: {roc_auc_score(y_test, positive_scores) * 100:.2f}%")
    print(classification_report(y_test, predicted_labels))

    return classifier

In [13]:
# Train with the function's default hyperparameters and print the test metrics.
xgb_model = fit_and_evaluate_xgboost(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

=== XGBOOST RESULTS (NO SCALING) ===
Confusion Matrix:
[[4580  298]
 [ 535 1842]]

Accuracy: 88.52%
MCC Score: 73.47%
AUC Score: 94.80%
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      4878
           1       0.86      0.77      0.82      2377

    accuracy                           0.89      7255
   macro avg       0.88      0.86      0.87      7255
weighted avg       0.88      0.89      0.88      7255



## Handle class imbalance

In [16]:
# Ratio of negative (0) to positive (1) training labels; XGBoost uses it to
# up-weight the positive class.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("y_train has no positive (1) samples; scale_pos_weight is undefined")
scale_pos_weight = n_negative / n_positive

xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    objective='binary:logistic',
    eval_metric='logloss'
)

# Fit and score the weighted classifier so the effect of the class weighting
# can actually be inspected; configuring the estimator alone produces no result.
xgb.fit(x_train, y_train)
print(f"scale_pos_weight: {scale_pos_weight:.3f}")
print(classification_report(y_test, xgb.predict(x_test)))

In [18]:
# Label each importance score of the fitted model with its feature name,
# then rank the scores from highest to lowest.
xgb_importance = pd.Series(data=xgb_model.feature_importances_, index=features)
xgb_importance = xgb_importance.sort_values(ascending=False)

# Show the ten top-ranked features.
print(xgb_importance.head(10))

market_segment_type_Online       0.172337
no_of_special_requests           0.068300
lead_time                        0.067887
required_car_parking_space       0.055608
arrival_month_12                 0.052156
arrival_month_1                  0.051600
type_of_meal_plan_Meal Plan 2    0.050386
market_segment_type_Offline      0.043296
repeated_guest                   0.030361
no_of_adults                     0.030258
dtype: float32


In [20]:
# Persist the fitted model together with the feature column order it was
# trained on, so a consumer can rebuild inputs with matching columns.
# NOTE(review): absolute, machine-specific path — consider a configurable models directory.
MODEL_PATH = Path("/home/cloud/ML_Models/xgboost.pkl")

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(
    {
        "model": xgb_model,
        "features": features
    },
    str(MODEL_PATH)
)

['/home/cloud/ML_Models/xgboost.pkl']