In [1]:
import pandas as pd
#import ydata_profiling as yp
# data preprocessing
from sklearn.preprocessing import StandardScaler
# data splitting
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error 
from math import sqrt
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report,matthews_corrcoef,roc_auc_score
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data = pd.read_csv("/home/cloud/Datasets/Hotel Reservations_data_features.csv")

## Train test split the data

In [4]:
def train_test_split_and_scale(data):
    y = data["booking_status"]
    x = data.drop("booking_status", axis=1)
    features = list(x.columns)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=0
    )

    return x_train, x_test, y_train, y_test, features

In [5]:
x_train, x_test, y_train, y_test, features = train_test_split_and_scale(data)

## Fit and evaluate the model

In [7]:
def fit_and_evaluate_random_forest(
    x_train, x_test, y_train, y_test,
    n_estimators=200,
    max_depth=None,
    class_weight='balanced',
    random_state=42
):
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        class_weight=class_weight,
        random_state=random_state,
        n_jobs=-1
    )

    model = rf.fit(x_train, y_train)

    y_pred = rf.predict(x_test)
    y_prob = rf.predict_proba(x_test)[:, 1]

    rf_conf_matrix = confusion_matrix(y_test, y_pred)
    rf_acc_score = accuracy_score(y_test, y_pred)
    rf_mcc_score = matthews_corrcoef(y_test, y_pred)
    rf_auc_score = roc_auc_score(y_test, y_prob)

    print("=== RANDOM FOREST RESULTS ===")
    print("Confusion Matrix:")
    print(rf_conf_matrix)
    print(f"\nAccuracy: {rf_acc_score * 100:.2f}%")
    print(f"MCC Score: {rf_mcc_score * 100:.2f}%")
    print(f"AUC Score: {rf_auc_score * 100:.2f}%")
    print(classification_report(y_test, y_pred))

    return model

In [8]:
rf_model = fit_and_evaluate_random_forest(x_train, x_test, y_train, y_test)

=== RANDOM FOREST RESULTS ===
Confusion Matrix:
[[4575  279]
 [ 490 1911]]

Accuracy: 89.40%
MCC Score: 75.69%
AUC Score: 95.11%
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      4854
           1       0.87      0.80      0.83      2401

    accuracy                           0.89      7255
   macro avg       0.89      0.87      0.88      7255
weighted avg       0.89      0.89      0.89      7255



## Tune key hyperparameters

In [10]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=5,
    class_weight='balanced'
)

In [11]:
rf_model = fit_and_evaluate_random_forest(x_train, x_test, y_train, y_test)

=== RANDOM FOREST RESULTS ===
Confusion Matrix:
[[4575  279]
 [ 490 1911]]

Accuracy: 89.40%
MCC Score: 75.69%
AUC Score: 95.11%
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      4854
           1       0.87      0.80      0.83      2401

    accuracy                           0.89      7255
   macro avg       0.89      0.87      0.88      7255
weighted avg       0.89      0.89      0.89      7255



In [12]:
features_imp = pd.Series(rf_model.feature_importances_, index=features).sort_values(ascending=False)

print(features_imp.head(10))

lead_time                      0.345879
avg_price_per_room             0.173880
no_of_special_requests         0.119287
no_of_week_nights              0.064293
no_of_weekend_nights           0.044621
market_segment_type_Online     0.028530
no_of_adults                   0.025416
arrival_month_12               0.017985
market_segment_type_Offline    0.017136
arrival_month_10               0.010035
dtype: float64


In [25]:
import joblib
joblib.dump(
    {
        "model": rf_model,
        "features": features
    },
    "/home/cloud/ML_Models/random_forest.pkl",compress=3
)

['/home/cloud/ML_Models/random_forest.pkl']