# Hotel reservations cancellations prediction

In [1]:
# Standard library imports
import os
import zipfile

# Numerical and data manipulation libraries
import numpy as np
import pandas as pd

# Plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

# Metrics for model evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
)

# Imbalanced data handling
from imblearn.over_sampling import SMOTE, RandomOverSampler

import pickle


In [2]:
df = pd.read_csv('hotel_reservations.csv')


<!-- README_INCLUDE -->
The database used for the model consists in a set of bookings from a hotel with a unique id called ¨boooking_id¨ from 2017 and 2018.

## 1. Data Preparation

Convert all column values into lower case and replace spaces

In [3]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [4]:
df['total_people']  = df['no_of_adults'] + df['no_of_children']
df['total_people'] .value_counts()

total_people
2     23942
1      7552
3      3851
4       912
5        15
12        1
10        1
11        1
Name: count, dtype: int64

In [5]:
df['price_per_adult'] = df['avg_price_per_room'] // df['no_of_adults']

In [6]:
df['price_per_person']  = df['avg_price_per_room'] // df['total_people']

In [7]:
df['has_prev_cancellations'] = df['no_of_previous_cancellations'] > 0

In [8]:
df['has_prev_bookings_not_cancelled'] = df['no_of_previous_bookings_not_canceled'] > 0

In [9]:
df['total_nights'] = df['no_of_weekend_nights'] + df['no_of_week_nights']

In [10]:
df['have_children'] = df['no_of_children'] > 0

In [11]:
df['have_children'] = df['have_children'].astype('int')
df['have_children'].value_counts()

have_children
0    33577
1     2698
Name: count, dtype: int64

In [12]:
def is_leap_year(year):
    return (year % 4 == 0 and (year % 100 != 0 or year % 400 == 0))

def adjust_for_feb_29(year, month, day):
    if month == 2 and day == 29 and not is_leap_year(year):
        return (month, 28)  
    return (month, day)

df['arrival_month'] = df['arrival_month'].apply(lambda x: f'{int(x):02d}')
df['arrival_date'] = df['arrival_date'].apply(lambda x: f'{int(x):02d}')

df[['arrival_month', 'arrival_date']] = df.apply(
    lambda row: adjust_for_feb_29(row['arrival_year'], row['arrival_month'], row['arrival_date']), axis=1,
    result_type='expand'
)

df['arrival_date_complete'] = pd.to_datetime(
    df[['arrival_year', 'arrival_month', 'arrival_date']].astype(str).agg('-'.join, axis=1),
    format='%Y-%m-%d', errors='coerce'
)


df['arrival_date_complete'] = df['arrival_date_complete'].fillna(pd.to_datetime('2018-02-28'))

df['arrival_date_complete'] = pd.to_datetime(df['arrival_date_complete'])

df['wday'] = df['arrival_date_complete'].dt.day_name()

## 3. Feature selection and model preparation

In [13]:
# from df dropping 'booking_id','no_of_previous_cancellations','arrival_year','arrival_date','no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled','has_prev_cancellations', 'has_prev_bookings_not_cancelled','segment_days_week','arrival_date_complete', 'month_year', 'no_of_children', 'no_of_adults','price_per_adult', 'price_per_person'
df = df.drop(['booking_id','no_of_previous_cancellations','arrival_date','no_of_previous_cancellations', 'arrival_year',
              'no_of_previous_bookings_not_canceled','has_prev_cancellations', 'has_prev_bookings_not_cancelled',
               'price_per_adult','no_of_children' ,'total_people'], axis=1)

# change booking_status to 0 and 1
df['booking_cancelled_flag'] = df['booking_status'].replace({'canceled': 1, 'not_canceled': 0})

# drop booking_status
df = df.drop(['booking_status'], axis=1)

df.columns

  df['booking_cancelled_flag'] = df['booking_status'].replace({'canceled': 1, 'not_canceled': 0})


Index(['no_of_adults', 'no_of_weekend_nights', 'no_of_week_nights',
       'type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
       'lead_time', 'arrival_month', 'market_segment_type', 'repeated_guest',
       'avg_price_per_room', 'no_of_special_requests', 'price_per_person',
       'total_nights', 'have_children', 'arrival_date_complete', 'wday',
       'booking_cancelled_flag'],
      dtype='object')

have children change from bool to 1-0


In [14]:
# from arrival month in date to number
df['arrival_month'] = df['arrival_month'].astype('int')
df['arrival_month'].value_counts()

arrival_month
10    5317
9     4611
8     3813
6     3203
12    3021
11    2980
7     2920
4     2736
5     2598
3     2358
2     1704
1     1014
Name: count, dtype: int64

## 4. Train test split

In [15]:
categorical= ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type', 'wday']
numerical = ['no_of_weekend_nights', 'no_of_week_nights', 'required_car_parking_space', 'lead_time', 'repeated_guest',  'price_per_person',
        'avg_price_per_room', 'no_of_special_requests','total_nights', 'arrival_month', 
              'no_of_adults', 'have_children']

In [16]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1, stratify=df['booking_cancelled_flag'])
print(len(df_full_train), len(df_test))

df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1, stratify=df_full_train['booking_cancelled_flag'])
print(len(df_train), len(df_val), len(df_test))

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_full_train = df_full_train.reset_index(drop=True)

y_train = df_train.booking_cancelled_flag.values
y_val = df_val.booking_cancelled_flag.values
y_test = df_test.booking_cancelled_flag.values
y_full_train = df_full_train.booking_cancelled_flag.values

del df_train['booking_cancelled_flag']
del df_val['booking_cancelled_flag']
del df_test['booking_cancelled_flag']
del df_full_train['booking_cancelled_flag']

29020 7255
21765 7255 7255


## Model

In [18]:
from imblearn.over_sampling import SMOTE
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Training function with SMOTE integration
def train_with_smote(df_train, y_train, params, categorical, numerical):
    # Convert to dictionary and transform using DictVectorizer
    train_dict = df_train[categorical + numerical].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)
    
    # Apply SMOTE to balance the dataset
    smote = SMOTE(random_state=42, sampling_strategy=1)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Create a DMatrix for training
    features = list(dv.get_feature_names_out())
    dtrain = xgb.DMatrix(X_resampled, label=y_resampled, feature_names=features)
    
    # Train the model
    model = xgb.train(
        params, 
        dtrain, 
        num_boost_round=200, 
        evals=[(dtrain, 'train')], 
        verbose_eval=5, 
        early_stopping_rounds=5
    )
    
    return dv, model

# Prediction function
def predict(df, dv, model, categorical, numerical):
    # Transform input data using DictVectorizer
    data_dict = df[categorical + numerical].to_dict(orient='records')
    X = dv.transform(data_dict)
    features = list(dv.get_feature_names_out())  # Ensure it's a list
    dmatrix = xgb.DMatrix(X, feature_names=features)
    
    # Predict probabilities
    y_pred = model.predict(dmatrix)
    return y_pred

# XGBoost parameters
xgb_params = {
    'eta': 0.1,
    'max_depth': 12,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}


# Training the model with SMOTE
print('Training the final model with SMOTE...')
dv, model = train_with_smote(df_full_train, y_full_train, xgb_params, categorical, numerical)

# Validate the model
y_pred_val = predict(df_val, dv, model, categorical, numerical)
y_pred_binary_val = (y_pred_val >= 0.5).astype(int)

# Final training on the full training set with SMOTE
print('Training the final model on the full dataset with SMOTE...')
dv, model = train_with_smote(df_full_train, y_full_train, xgb_params, categorical, numerical)

# Test set evaluation
y_pred_test = predict(df_test, dv, model, categorical, numerical)
y_pred_binary_test = (y_pred_test >= 0.5).astype(int)

# Evaluate test performance
accuracy_test = accuracy_score(y_test, y_pred_binary_test)
precision_test = precision_score(y_test, y_pred_binary_test)
recall_test = recall_score(y_test, y_pred_binary_test)
f1_test = f1_score(y_test, y_pred_binary_test)
roc_auc_test = roc_auc_score(y_test, y_pred_test)

print(f'Test Accuracy: {accuracy_test * 100:.2f}%')
print(f'Test Precision: {precision_test * 100:.2f}%')
print(f'Test Recall: {recall_test * 100:.2f}%')
print(f'Test F1 Score: {f1_test * 100:.2f}%')
print(f'Test ROC AUC Score: {roc_auc_test * 100:.2f}%')

# Save the final model and DictVectorizer
output_file = 'xgboost_model_booking_cancellation_smote.bin'
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

print(f'The model and DictVectorizer are saved to {output_file}')


Training the final model with SMOTE...




[0]	train-logloss:0.62734
[5]	train-logloss:0.42616
[10]	train-logloss:0.32635
[15]	train-logloss:0.26848
[20]	train-logloss:0.23389
[25]	train-logloss:0.21203
[30]	train-logloss:0.19692
[35]	train-logloss:0.18708
[40]	train-logloss:0.17930
[45]	train-logloss:0.17189
[50]	train-logloss:0.16521
[55]	train-logloss:0.15994
[60]	train-logloss:0.15572
[65]	train-logloss:0.15128
[70]	train-logloss:0.14753
[75]	train-logloss:0.14397
[80]	train-logloss:0.13919
[85]	train-logloss:0.13655
[90]	train-logloss:0.13294
[95]	train-logloss:0.13014
[100]	train-logloss:0.12716
[105]	train-logloss:0.12375
[110]	train-logloss:0.12012
[115]	train-logloss:0.11846
[120]	train-logloss:0.11514
[125]	train-logloss:0.11209
[130]	train-logloss:0.10959
[135]	train-logloss:0.10851
[140]	train-logloss:0.10455
[145]	train-logloss:0.10203
[150]	train-logloss:0.10028
[155]	train-logloss:0.09845
[160]	train-logloss:0.09553
[165]	train-logloss:0.09260
[170]	train-logloss:0.09041
[175]	train-logloss:0.08846
[180]	train-lo



[10]	train-logloss:0.32635
[15]	train-logloss:0.26848
[20]	train-logloss:0.23389
[25]	train-logloss:0.21203
[30]	train-logloss:0.19692
[35]	train-logloss:0.18708
[40]	train-logloss:0.17930
[45]	train-logloss:0.17189
[50]	train-logloss:0.16521
[55]	train-logloss:0.15994
[60]	train-logloss:0.15572
[65]	train-logloss:0.15128
[70]	train-logloss:0.14753
[75]	train-logloss:0.14397
[80]	train-logloss:0.13919
[85]	train-logloss:0.13655
[90]	train-logloss:0.13294
[95]	train-logloss:0.13014
[100]	train-logloss:0.12716
[105]	train-logloss:0.12375
[110]	train-logloss:0.12012
[115]	train-logloss:0.11846
[120]	train-logloss:0.11514
[125]	train-logloss:0.11209
[130]	train-logloss:0.10959
[135]	train-logloss:0.10851
[140]	train-logloss:0.10455
[145]	train-logloss:0.10203
[150]	train-logloss:0.10028
[155]	train-logloss:0.09845
[160]	train-logloss:0.09553
[165]	train-logloss:0.09260
[170]	train-logloss:0.09041
[175]	train-logloss:0.08846
[180]	train-logloss:0.08688
[185]	train-logloss:0.08480
[190]	trai