# Hotel reservations cancellations prediction

In [20]:
# Standard library imports
import os
import zipfile

# Numerical and data manipulation libraries
import numpy as np
import pandas as pd

# Plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

# Metrics for model evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
)

# Imbalanced data handling
from imblearn.over_sampling import SMOTE, RandomOverSampler


In [21]:
# Load the raw reservations table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('hotel_reservations.csv')


<!-- README_INCLUDE -->
The dataset used for the model consists of a set of hotel bookings from 2017 and 2018, each identified by a unique id called `booking_id`.

## 1. Data Preparation

Convert all column names and all string values to lower case and replace spaces with underscores.

In [22]:
# Normalise the column names: lower case, spaces replaced by underscores.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
)

# Columns stored with the generic 'object' dtype are treated as text columns.
categorical_columns = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]

# Apply the same normalisation to the values of every text column.
for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [23]:
# Party size of each booking: adults plus children.
df['total_people'] = df['no_of_adults'].add(df['no_of_children'])
df['total_people'].value_counts()

total_people
2     23942
1      7552
3      3851
4       912
5        15
12        1
10        1
11        1
Name: count, dtype: int64

In [24]:
# Room price divided by the number of adults, rounded down by floor division.
df['price_per_adult'] = df['avg_price_per_room'].floordiv(df['no_of_adults'])

In [25]:
# Room price divided by the party size, rounded down by floor division.
df['price_per_person'] = df['avg_price_per_room'].floordiv(df['total_people'])

In [26]:
# True when the guest has at least one previous cancellation on record.
df['has_prev_cancellations'] = df['no_of_previous_cancellations'].gt(0)

In [27]:
# True when the guest has at least one previous booking that was not cancelled.
df['has_prev_bookings_not_cancelled'] = df['no_of_previous_bookings_not_canceled'].gt(0)

In [28]:
# Length of stay: weekend nights plus week nights.
df['total_nights'] = df['no_of_weekend_nights'].add(df['no_of_week_nights'])

In [29]:
# Boolean flag: does the booking include at least one child?
df['have_children'] = df['no_of_children'].gt(0)

In [30]:
# Store the flag as 0/1 integers instead of booleans, then inspect the counts.
df['have_children'] = df['have_children'].astype(int)
df['have_children'].value_counts()

have_children
0    33577
1     2698
Name: count, dtype: int64

In [31]:
def is_leap_year(year):
    """Return True if `year` is a leap year in the Gregorian calendar.

    A year is a leap year when it is divisible by 4, except that century
    years must also be divisible by 400.
    """
    return (year % 4 == 0 and (year % 100 != 0 or year % 400 == 0))


def adjust_for_feb_29(year, month, day):
    """Clamp 29 February to 28 February when `year` is not a leap year.

    Parameters
    ----------
    year, month, day : int or str
        Date components. Numeric strings such as ``'02'`` / ``'29'`` are
        accepted as well as integers: the caller zero-pads month and day into
        strings before calling, so comparing them to the integers 2 and 29
        directly would never match.

    Returns
    -------
    tuple
        ``(month, day)``. Both values are returned unchanged unless the date
        is 29 February of a non-leap year; in that case the day becomes 28,
        as ``'28'`` when `day` was given as a string and as ``28`` otherwise.
    """
    # Compare numerically so that zero-padded strings are recognised too.
    if int(month) == 2 and int(day) == 29 and not is_leap_year(int(year)):
        clamped_day = '28' if isinstance(day, str) else 28
        return (month, clamped_day)
    return (month, day)

# Zero-pad month and day to two characters so they match the %m / %d fields
# of the date format used below.
df['arrival_month'] = df['arrival_month'].map(lambda x: f'{int(x):02d}')
df['arrival_date'] = df['arrival_date'].map(lambda x: f'{int(x):02d}')

# Row by row, let adjust_for_feb_29 rewrite the (month, day) pair.
df[['arrival_month', 'arrival_date']] = df.apply(
    lambda booking: adjust_for_feb_29(
        booking['arrival_year'], booking['arrival_month'], booking['arrival_date']
    ),
    axis=1,
    result_type='expand',
)

# Build one 'year-month-day' string per row and parse it. Rows that do not
# form a valid calendar date are coerced to NaT and then set to 2018-02-28.
df['arrival_date_complete'] = pd.to_datetime(
    df['arrival_year'].astype(str)
    + '-' + df['arrival_month'].astype(str)
    + '-' + df['arrival_date'].astype(str),
    format='%Y-%m-%d',
    errors='coerce',
).fillna(pd.to_datetime('2018-02-28'))

# Name of the weekday on which the guest arrives.
df['wday'] = df['arrival_date_complete'].dt.day_name()

## 3. Feature selection and model preparation

In [32]:
# Drop the booking identifier, the year and day-of-month parts of the arrival
# date, and the helper columns that are not used as model features.
df = df.drop(columns=[
    'booking_id',
    'arrival_year',
    'arrival_date',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled',
    'has_prev_cancellations',
    'has_prev_bookings_not_cancelled',
    'price_per_adult',
    'no_of_children',
    'total_people',
])

# Encode the target as 1 (canceled) / 0 (not canceled).
# Series.map is used instead of Series.replace: replace on an object column
# relies on pandas' deprecated silent downcasting and emits a FutureWarning.
# astype(int) fails loudly if a status other than the two mapped ones appears.
df['booking_cancelled_flag'] = (
    df['booking_status']
    .map({'canceled': 1, 'not_canceled': 0})
    .astype(int)
)

# The text label is redundant once the numeric flag exists.
df = df.drop(columns=['booking_status'])

df.columns

  df['booking_cancelled_flag'] = df['booking_status'].replace({'canceled': 1, 'not_canceled': 0})


Index(['no_of_adults', 'no_of_weekend_nights', 'no_of_week_nights',
       'type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
       'lead_time', 'arrival_month', 'market_segment_type', 'repeated_guest',
       'avg_price_per_room', 'no_of_special_requests', 'price_per_person',
       'total_nights', 'have_children', 'arrival_date_complete', 'wday',
       'booking_cancelled_flag'],
      dtype='object')

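`have_children` has already been converted from boolean to 1/0 above; next, `arrival_month` is converted from a zero-padded string back to an integer.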


In [33]:
# arrival_month was turned into a zero-padded string for date parsing;
# convert it back to an integer and inspect the distribution.
df['arrival_month'] = df['arrival_month'].astype(int)
df['arrival_month'].value_counts()

arrival_month
10    5317
9     4611
8     3813
6     3203
12    3021
11    2980
7     2920
4     2736
5     2598
3     2358
2     1704
1     1014
Name: count, dtype: int64

## 4. Train test split

In [34]:
# Text features, one-hot encoded later by the DictVectorizer.
categorical = [
    'type_of_meal_plan',
    'room_type_reserved',
    'market_segment_type',
    'wday',
]

# Numeric features, passed through to the model as they are.
numerical = [
    'no_of_weekend_nights',
    'no_of_week_nights',
    'required_car_parking_space',
    'lead_time',
    'repeated_guest',
    'price_per_person',
    'avg_price_per_room',
    'no_of_special_requests',
    'total_nights',
    'arrival_month',
    'no_of_adults',
    'have_children',
]

In [35]:
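# Kept disabled: with inplace=True, reset_index returns None, so this
# assignment would replace df with None.
# df = df.reset_index(drop=True, inplace=True)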

In [36]:
# Hold out 20% of the rows as the test set, stratified on the target.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
    stratify=df['booking_cancelled_flag'],
)
print(len(df_full_train), len(df_test))

# A quarter of the remaining 80% becomes the validation set, which gives a
# 60/20/20 train/validation/test split overall.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
    stratify=df_full_train['booking_cancelled_flag'],
)
print(len(df_train), len(df_val), len(df_test))

# Renumber the rows of every subset from 0.
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() returns the target column and removes it from the frame in one step,
# so the label is no longer present among the feature columns.
y_train = df_train.pop('booking_cancelled_flag').values
y_val = df_val.pop('booking_cancelled_flag').values
y_test = df_test.pop('booking_cancelled_flag').values
y_full_train = df_full_train.pop('booking_cancelled_flag').values

29020 7255
21765 7255 7255


In [42]:
def _feature_records(frame):
    """Return the model feature columns of `frame` as a list of per-row dicts."""
    return frame[categorical + numerical].to_dict(orient='records')


train_dict = _feature_records(df_train)
val_dict = _feature_records(df_val)
test_dict = _feature_records(df_test)
full_train_dict = _feature_records(df_full_train)

dv = DictVectorizer(sparse=False)

# The encoding is learned from the training subset only (fit_transform);
# every other subset is encoded with that already-fitted vectorizer.
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)
X_full_train = dv.transform(full_train_dict)


## 6. Balancing the dataset

In [43]:
# Put the encoded training features and the target side by side in a single
# DataFrame, ready for resampling.
X_train_df = pd.DataFrame(data=X_train, columns=dv.get_feature_names_out())
train_data = X_train_df.assign(booking_cancelled_flag=y_train)

One way to balance the dataset is to undersample, i.e. reduce the number of not-cancelled bookings; the approach applied below instead oversamples the cancelled bookings with SMOTE.

### SMOTE (Synthetic Minority Oversampling Technique): oversampling the minority class to a 1:1 ratio

SMOTE creates synthetic samples by interpolating between existing minority-class samples.

- Efficiency: effective at generating diverse synthetic samples.
- Works well with numerical data.
- Downside: can introduce noise or outliers if the classes overlap.


In [44]:
# Oversample the training subset with SMOTE. sampling_strategy=1 requests a
# minority-to-majority ratio of 1 after resampling.
smote1 = SMOTE(sampling_strategy=1, random_state=42)
X_resampled_smote1, y_resampled_smote1 = smote1.fit_resample(
    train_data.drop(columns='booking_cancelled_flag'),
    train_data['booking_cancelled_flag'],
)




## 7. Model

In [45]:
# Final model: fit on the full training set (train + validation) and score
# on the held-out test set.
features = list(dv.get_feature_names_out())

# Same features-plus-target layout as before, now for the full training set.
# Note that this rebinds X_train_df and train_data.
X_train_df = pd.DataFrame(data=X_full_train, columns=dv.get_feature_names_out())
train_data = X_train_df.assign(booking_cancelled_flag=y_full_train)

# Balance the classes with SMOTE; only training rows are resampled.
smote1 = SMOTE(sampling_strategy=1, random_state=42)
X_resampled_smote1, y_resampled_smote1 = smote1.fit_resample(
    train_data.drop(columns='booking_cancelled_flag'),
    train_data['booking_cancelled_flag'],
)

# Wrap the resampled training data and the untouched test data for XGBoost.
dtrain = xgb.DMatrix(X_resampled_smote1, label=y_resampled_smote1, feature_names=features)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=features)

best_params = dict(
    eta=0.1,
    max_depth=12,
    min_child_weight=1,
    objective='binary:logistic',
    nthread=8,
    seed=1,
    verbosity=1,
)
model = xgb.train(best_params, dtrain, num_boost_round=200)

# Predicted probabilities, then hard labels at the 0.5 threshold.
y_pred_p_test = model.predict(dtest)
y_pred_test = (y_pred_p_test >= 0.5).astype(int)

# Threshold-based metrics use the hard labels; ROC AUC uses the probabilities.
accuracy_xgb_test = accuracy_score(y_test, y_pred_test)
precision_xgb_test = precision_score(y_test, y_pred_test)
recall_xgb_test = recall_score(y_test, y_pred_test)
f1_xgb_test = f1_score(y_test, y_pred_test)
roc_auc_xgb_test = roc_auc_score(y_test, y_pred_p_test)

print(f'Accuracy: {accuracy_xgb_test * 100:.2f}%')
print(f'Precision: {precision_xgb_test * 100:.2f}%')
print(f'Recall: {recall_xgb_test * 100:.2f}%')
print(f'F1 Score: {f1_xgb_test * 100:.2f}%')
print(f'ROC AUC Score: {roc_auc_xgb_test * 100:.2f}%')




Accuracy: 89.81%
Precision: 86.34%
Recall: 81.87%
F1 Score: 84.04%
ROC AUC Score: 95.58%
