# Hotel Booking Cancellations - EDA & Preprocessing

In [64]:
import pandas as pd
import seaborn as sns

# Set plot style
sns.set_theme(style='whitegrid')

# Load the dataset
DATA_PATH = '../data/hotel_bookings.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please ensure you have downloaded the dataset.")
    # Re-raise so execution stops at this cell: without `df`, every later cell
    # would fail with a NameError that hides the real cause.
    raise
else:
    # Only reached when the CSV was read successfully.
    print(f"Dataset loaded successfully. Shape: {df.shape}")
    display(df.head())

Dataset loaded successfully. Shape: (119390, 32)


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [65]:
# Overview of the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [66]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119386.0,119390.0,119390.0,119390.0,119390.0,119390.0,103050.0,6797.0,119390.0,119390.0,119390.0,119390.0
mean,0.370416,104.011416,2016.156554,27.165173,15.798241,0.927599,2.500302,1.856403,0.10389,0.007949,0.031912,0.087118,0.137097,0.221124,86.693382,189.266735,2.321149,101.831122,0.062518,0.571363
std,0.482918,106.863097,0.707476,13.605138,8.780829,0.998613,1.908286,0.579261,0.398561,0.097436,0.175767,0.844336,1.497437,0.652306,110.774548,131.655015,17.594721,50.53579,0.245291,0.792798
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0
25%,0.0,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.29,0.0,0.0
50%,0.0,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.575,0.0,0.0
75%,1.0,160.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


In [67]:
# Missing values: count them per column, then list only the affected columns

null_values = df.isna().sum()
has_missing = null_values.gt(0)
print("Columns with null values:")
print(null_values.loc[has_missing])

Columns with null values:
children         4
country        488
agent        16340
company     112593
dtype: int64


Meaning of columns with null values:
- children: Number of children. => Null values may indicate no children. => replace with 0?
- country: Country of origin (ISO format e.g., "POL", "ESP"). => ?
- agent: ID of the travel agent who made the booking. => Null values may indicate direct bookings. => replace with 0? or add new category 'direct'?
- company: ID of the company/entity that made the booking or responsible for paying the booking. => Null values may indicate individual bookings. => replace with 0? or add new category 'individual'?

Additionally, based on the dataset description on Kaggle:
- meal: Undefined and SC are the same category (no meal package) => worth mapping both to the same value

In [68]:
# Per-column replacements for missing values
fill_values = dict(children=0, country='Unknown', agent=0, company=0)
df = df.fillna(value=fill_values)

# Show the meal distribution first, then fold 'Undefined' into 'SC'
meal_counts = df['meal'].value_counts()
display(meal_counts)
df['meal'] = df['meal'].replace({'Undefined': 'SC'})

meal
BB           92310
HB           14463
SC           10650
Undefined     1169
FB             798
Name: count, dtype: int64

In [69]:
# Impossible values analysis
# Total number of guests (adults + children + babies) on each booking
guests_per_booking = df['adults'] + df['children'] + df['babies']

no_people_reservation = df.loc[guests_per_booking.eq(0)]
print(f"Number of reservations with no people: {len(no_people_reservation)}")

# Bookings without any guest are most likely data errors, so keep only rows with at least one guest
df = df.loc[guests_per_booking.gt(0)]
print(f"New dataset shape after removing no-people reservations: {df.shape}")

Number of reservations with no people: 180
New dataset shape after removing no-people reservations: (119210, 32)


In [70]:
# Count rows that exactly repeat an earlier row, and their share of the dataset
duplicates = df.duplicated().sum()
duplicate_share = duplicates / len(df) * 100
print(f"Number of duplicate rows: {duplicates}")
print(f"Percentage of duplicate rows: {duplicate_share:.2f}%")

Number of duplicate rows: 31980
Percentage of duplicate rows: 26.83%


In [71]:
# Drop duplicates? There is no booking_id, so identical rows may be distinct bookings that simply share the same features. Dropping them can change the predictions depending on the model used, so decide separately for each model's training.
# df = df.drop_duplicates()

In [72]:
# Data leakage analysis
df_cancelled = df[df['is_canceled'] == 1]

# Compare how often the assigned room differs from the reserved one among cancelled
# bookings versus all bookings. A large gap suggests 'assigned_room_type' may leak the
# target, because the room rarely changes for cancelled bookings.
room_changed = df['reserved_room_type'] != df['assigned_room_type']
leakage_cases = df_cancelled[df_cancelled['reserved_room_type'] != df_cancelled['assigned_room_type']]
print(f"Number of changed rooms out of cancelled bookings: {len(leakage_cases)}")
print(f"Percentage of changed rooms out of cancelled bookings: {len(leakage_cases) / len(df_cancelled) * 100:.2f}%")
print(f"Percentage of changed rooms out of all bookings: {room_changed.sum() / len(df) * 100:.2f}%")
print("-----")

# reservation_status and reservation_status_date data leakage analysis
# These columns directly indicate whether a booking was cancelled or not, so they should be removed from the dataset to prevent data leakage.
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a second run
# is a no-op instead of raising KeyError.
df = df.drop(columns=['reservation_status', 'reservation_status_date'], errors='ignore')

Number of changed rooms out of cancelled bookings: 801
Percentage of changed rooms out of cancelled bookings: 1.81%
Percentage of changed rooms out of all bookings: 12.41%
-----


In [73]:
# Sanity check: report the (rows, columns) shape of the frame after the steps above
print("Data shape after preprocessing:", df.shape)

Data shape after preprocessing: (119210, 30)


In [74]:
# TODO: turn the preprocessing steps into a proper pipeline. Maybe store the preprocessed data in a new CSV file?