In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dataset

df = pd.read_csv('hotel_bookings_cleaned.csv')

In [3]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,booking_changes,deposit_type,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,0,No Deposit,999.0,0,Transient,75.0,0,0,Check-Out,2/7/2015
1,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,0,No Deposit,304.0,0,Transient,75.0,0,0,Check-Out,2/7/2015
2,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,0,No Deposit,240.0,0,Transient,98.0,0,1,Check-Out,3/7/2015
3,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,0,No Deposit,240.0,0,Transient,98.0,0,1,Check-Out,3/7/2015
4,Resort Hotel,0,0,2015,July,27,1,0,2,2,...,0,No Deposit,999.0,0,Transient,107.0,0,0,Check-Out,3/7/2015


In [4]:
# Double check columns with missing values (already cleaned in Part 1 previously)

df.isna().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
agent                             0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces 

# Q4 - Developing a model to predict bookings cancellation

In [5]:
# Firstly, need to drop several attributes, for the following reason:
# 1 - Deemed insignificant/redundant (e.g., :year", "week", and "day" are dropped as "month" is already included), or 
# 2 - Could mislead the analysis (e.g., "agent" consists of numbers which might be interpreted to be in order/hierarchy when in actual, it is just a set agent code/id)

columns_to_drop = ['arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'assigned_room_type', 'agent', 'reservation_status', 'reservation_status_date']

In [6]:
# Create the dataframe of the targeted columns - y is a column of the attribute to be predicted (i.e., cancellation) while x consists of others (as inputs)

y = df['is_canceled']
x = df.drop(columns_to_drop, axis=1)

In [7]:
# Data partition - Use 80:20 as train:test proportion

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [8]:
x_train

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
97509,City Hotel,1,4,June,1,0,1,0.0,0,BB,...,0,4,A,0,No Deposit,0,Transient,65.00,0,0
92829,City Hotel,0,4,March,0,2,1,0.0,0,BB,...,0,1,A,0,No Deposit,0,Transient,80.00,0,2
79513,City Hotel,0,28,June,1,2,2,0.0,0,BB,...,0,0,A,0,No Deposit,0,Transient,85.50,0,0
80799,City Hotel,0,108,July,2,2,2,0.0,0,BB,...,0,0,A,0,No Deposit,0,Transient,72.25,0,0
19330,Resort Hotel,0,32,April,1,2,2,0.0,0,BB,...,0,0,A,0,No Deposit,0,Transient,65.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21243,Resort Hotel,0,223,July,2,5,2,0.0,0,BB,...,0,0,A,0,No Deposit,0,Transient,82.71,0,2
45891,City Hotel,1,168,July,2,4,2,0.0,0,BB,...,0,0,A,0,No Deposit,0,Transient,90.95,0,0
42613,City Hotel,1,68,May,2,4,2,1.0,0,BB,...,0,0,A,0,No Deposit,0,Transient,135.15,0,0
43567,City Hotel,1,32,May,0,2,3,0.0,0,BB,...,0,0,E,0,No Deposit,0,Transient,207.00,0,0


In [9]:
# Separate data into Categorical and Numerical group
categorical_att = ['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type']
numerical_att = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']

# Set up Pipeline to define the 2 transformations required - encoding and scaling 
categorical_pipeline = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])
numerical_pipeline = Pipeline([('scaler', RobustScaler())])

# Use ColumnTransformer so encoding is applied to categorical attributes and scaling is applied to numerical attributes 
preprocessor = ColumnTransformer([('categorical',categorical_pipeline,categorical_att),('numerical', numerical_pipeline,numerical_att)])

### Naive Bayes model

In [10]:
# Fit Naive Bayes model to the Pipeline set up

pipe_NB = Pipeline([("preprocessor", preprocessor), ("model", GaussianNB())])
pipe_NB.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['hotel',
                                                   'arrival_date_month', 'meal',
                                                   'country', 'market_segment',
                                                   'distribution_channel',
                                                   'reserved_room_type',
                                                   'deposit_type',
                                                   'customer_type']),
                                                 ('numerical',
                                                  

In [11]:
y_pred = pipe_NB.predict(x_test)
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [12]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 1983, 11100],
       [  163,  7567]], dtype=int64)

In [13]:
acc_NB = accuracy_score(y_test, y_pred)
acc_NB

0.45884783548743574

### K-NN model

In [14]:
# Fit K-NN model to the Pipeline set up

pipe_KNN = Pipeline([("preprocessor", preprocessor), ("model", KNeighborsClassifier())])
pipe_KNN.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['hotel',
                                                   'arrival_date_month', 'meal',
                                                   'country', 'market_segment',
                                                   'distribution_channel',
                                                   'reserved_room_type',
                                                   'deposit_type',
                                                   'customer_type']),
                                                 ('numerical',
                                                  

In [15]:
y_pred = pipe_KNN.predict(x_test)
y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [16]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[11561,  1522],
       [ 1911,  5819]], dtype=int64)

In [17]:
acc_KNN = accuracy_score(y_test, y_pred)
acc_KNN

0.8350550136933648

### Decision Tree model

In [18]:
# Fit Decision Tree model to the Pipeline set up

pipe_DT = Pipeline([("preprocessor", preprocessor), ("model", DecisionTreeClassifier())])
pipe_DT.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['hotel',
                                                   'arrival_date_month', 'meal',
                                                   'country', 'market_segment',
                                                   'distribution_channel',
                                                   'reserved_room_type',
                                                   'deposit_type',
                                                   'customer_type']),
                                                 ('numerical',
                                                  

In [19]:
y_pred = pipe_DT.predict(x_test)
y_pred

array([0, 1, 1, ..., 1, 0, 1], dtype=int64)

In [20]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[11399,  1684],
       [ 1574,  6156]], dtype=int64)

In [21]:
acc_DT = accuracy_score(y_test, y_pred)
acc_DT

0.8434632201028204

### Random Forest model

In [22]:
# Fit Random Forest model to the Pipeline set up

pipe_RF = Pipeline([("preprocessor", preprocessor), ("model", RandomForestClassifier())])
pipe_RF.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['hotel',
                                                   'arrival_date_month', 'meal',
                                                   'country', 'market_segment',
                                                   'distribution_channel',
                                                   'reserved_room_type',
                                                   'deposit_type',
                                                   'customer_type']),
                                                 ('numerical',
                                                  

In [23]:
y_pred = pipe_RF.predict(x_test)
y_pred

array([0, 1, 1, ..., 1, 0, 1], dtype=int64)

In [24]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[12222,   861],
       [ 1603,  6127]], dtype=int64)

In [25]:
acc_RF = accuracy_score(y_test, y_pred)
acc_RF

0.8816124537548647

### XGBoost model

In [26]:
# Fit XGBoost model to the Pipeline set up

pipe_XGB = Pipeline([("preprocessor", preprocessor), ("model", XGBClassifier())])
pipe_XGB.fit(x_train, y_train)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['hotel',
                                                   'arrival_date_month', 'meal',
                                                   'country', 'market_segment',
                                                   'distribution_channel',
                                                   'reserved_room_type',
                                                   'deposit_type',
                                                   'customer_type']),
                                                 ('numerical',
                                                  

In [27]:
y_pred = pipe_XGB.predict(x_test)
y_pred

array([0, 0, 1, ..., 1, 0, 1], dtype=int64)

In [28]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[11965,  1118],
       [ 1717,  6013]], dtype=int64)

In [29]:
acc_XGB = accuracy_score(y_test, y_pred)
acc_XGB

0.8637870561668188

In [30]:
# Compile and compare the accuracy scores for the 5 models developed

models = ['Naive Bayes', 'K-NN', 'Decision Tree', 'Random Forest', 'XGBoost']
accuracy = [acc_NB, acc_KNN, acc_DT, acc_RF, acc_XGB]

j = 0
for model in models:
    print(f'Model used: {model}, accuracy = {accuracy[j]}')
    j = j+1

Model used: Naive Bayes, accuracy = 0.45884783548743574
Model used: K-NN, accuracy = 0.8350550136933648
Model used: Decision Tree, accuracy = 0.8434632201028204
Model used: Random Forest, accuracy = 0.8816124537548647
Model used: XGBoost, accuracy = 0.8637870561668188


### Discussion section for Q4

# Q5 - Developing a model to predict the number of booking nights for any new booking

In [31]:
# Set x and y variables - y is a column of the attribute to be predicted while x consists of others (minus the ones to drop for similar reasons as Q4) as the inputs

columns_to_drop = ['is_canceled', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'assigned_room_type', 'agent', 'reservation_status', 'reservation_status_date']
y = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
x = df.drop(columns_to_drop, axis=1)

In [32]:
# Data partition 80:20 for train:test

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [33]:
# Separate data into Categorical and Numerical group
categorical_att = ['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type']
numerical_att = ['lead_time', 'adults', 'children', 'babies', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']

# Set up Pipeline to define the 2 transformations required - encoding and scaling 
categorical_pipeline = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])
numerical_pipeline = Pipeline([('scaler', RobustScaler())])

# Use ColumnTransformer so encoding is applied to categorical attributes and scaling is applied to numerical attributes 
preprocessor = ColumnTransformer([('categorical',categorical_pipeline,categorical_att),('numerical', numerical_pipeline,numerical_att)])

### Random Forest model

In [34]:
# Fit Random Forest model to the Pipeline set up

pipe_RF = Pipeline([("preprocessor", preprocessor), ("model", RandomForestClassifier())])
pipe_RF.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['hotel',
                                                   'arrival_date_month', 'meal',
                                                   'country', 'market_segment',
                                                   'distribution_channel',
                                                   'reserved_room_type',
                                                   'deposit_type',
                                                   'customer_type']),
                                                 ('numerical',
                                                  

In [35]:
y_pred = pipe_RF.predict(x_test)
y_pred

array([2, 3, 4, ..., 1, 1, 2], dtype=int64)

In [36]:
acc_RF = accuracy_score(y_test, y_pred)
acc_RF

0.6005381252102051

### XGBoost model

In [37]:
# Fit XGBoost model to the Pipeline set up

pipe_XGB = Pipeline([("preprocessor", preprocessor), ("model", XGBClassifier())])
pipe_XGB.fit(x_train, y_train)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['hotel',
                                                   'arrival_date_month', 'meal',
                                                   'country', 'market_segment',
                                                   'distribution_channel',
                                                   'reserved_room_type',
                                                   'deposit_type',
                                                   'customer_type']),
                                                 ('numerical',
                                                  

In [38]:
y_pred = pipe_XGB.predict(x_test)
y_pred

array([2, 3, 2, ..., 1, 1, 2], dtype=int64)

In [39]:
acc_XGB = accuracy_score(y_test, y_pred)
acc_XGB

0.5487916206217268

### Discussion section for Q5