In [1]:
# Libraries used by the notebook.
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Load the bookings CSV (path is relative to the working directory) and
# preview the first row.
hotel=pd.read_csv(r'../Datasets/hotel_bookings.csv')
hotel.head(1)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01


### Exploratory Data Analysis and Data Cleaning

In [3]:
# Work on a copy so the loaded `hotel` frame is not modified by the cleaning steps.
chotel = hotel.copy()

In [4]:
# (rows, columns) of the working frame.
chotel.shape

(119390, 32)

In [5]:
# List the column labels.
chotel.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')

In [6]:
# Per-column non-null counts and dtypes.
chotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [7]:
# Missing values per column, largest count first.
(
    chotel
    .isna()
    .sum()
    .sort_values(ascending=False)
)

company                           112593
agent                              16340
country                              488
children                               4
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
hotel                                  0
previous_cancellations                 0
days_in_waiting_list                   0
customer_type                          0
adr                                    0
required_car_parking_spaces            0
total_of_special_requests              0
reservation_status                     0
previous_bookings_not_canceled         0
is_repeated_guest                      0
is_canceled                            0
distribution_channel                   0
market_segment                         0
meal                                   0
babies                                 0
adults                                 0
stays_in_week_ni

The `company` and `agent` columns have a large number of missing values, and they are not relevant to our analysis, so we drop them.

In [8]:
# Remove the two sparsely populated columns, then preview the first row.
chotel = chotel.drop(columns=['company', 'agent'])
chotel.head(1)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01


The `country` and `children` columns have only a small number of missing values. Because they can be important factors in the analysis, we keep the columns and remove only the rows where either value is null.

In [9]:
# Number of rows with a missing 'country'.
chotel['country'].isnull().sum()

488

In [10]:
# Number of rows with a missing 'children' value.
chotel['children'].isnull().sum()

4

In [11]:
# Drop only the rows that lack a country or a children count. Naming the
# columns keeps rows from being discarded if any other column ever contains
# nulls. The index is renumbered afterwards.
chotel = chotel.dropna(subset=['country', 'children']).reset_index(drop=True)

In [12]:
# Preview the first row after removing rows with missing values.
chotel.head(1)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01


In [13]:
# Re-check missing values per column after the cleaning steps.
chotel.isna().sum().sort_values(ascending=False)

hotel                             0
is_canceled                       0
reservation_status                0
total_of_special_requests         0
required_car_parking_spaces       0
adr                               0
customer_type                     0
days_in_waiting_list              0
deposit_type                      0
booking_changes                   0
assigned_room_type                0
reserved_room_type                0
previous_bookings_not_canceled    0
previous_cancellations            0
is_repeated_guest                 0
distribution_channel              0
market_segment                    0
country                           0
meal                              0
babies                            0
children                          0
adults                            0
stays_in_week_nights              0
stays_in_weekend_nights           0
arrival_date_day_of_month         0
arrival_date_week_number          0
arrival_date_month                0
arrival_date_year           

In [14]:
# Shape of the subset whose adults, babies and children add up to zero.
chotel.loc[
    (chotel['adults'] + chotel['babies'] + chotel['children']).eq(0)
].shape

(170, 30)

These rows do not make sense (a booking with no guests), so we delete them.

In [15]:
# Keep only the bookings whose guest total is non-zero and renumber the rows.
chotel = chotel.loc[
    ~(chotel['adults'] + chotel['babies'] + chotel['children']).eq(0)
].reset_index(drop=True)

In [16]:
# Preview the first row after removing zero-guest bookings.
chotel.head(1)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01


In [17]:
# Parse reservation_status_date into pandas datetime values.

chotel['reservation_status_date'] = pd.to_datetime(chotel['reservation_status_date'])

In [18]:
# Summary statistics, transposed so that each column becomes a row.
chotel.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
is_canceled,118728.0,0.371757,0.483276,0.0,0.0,0.0,1.0,1.0
lead_time,118728.0,104.401312,106.915284,0.0,18.0,70.0,161.0,737.0
arrival_date_year,118728.0,2016.157596,0.707456,2015.0,2016.0,2016.0,2017.0,2017.0
arrival_date_week_number,118728.0,27.1651,13.586362,1.0,16.0,28.0,38.0,53.0
arrival_date_day_of_month,118728.0,15.800965,8.780412,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,118728.0,0.928307,0.992725,0.0,0.0,1.0,2.0,16.0
stays_in_week_nights,118728.0,2.500918,1.889089,0.0,1.0,2.0,3.0,40.0
adults,118728.0,1.861052,0.574697,0.0,2.0,2.0,2.0,55.0
children,118728.0,0.104356,0.399439,0.0,0.0,0.0,0.0,10.0
babies,118728.0,0.007959,0.097449,0.0,0.0,0.0,0.0,10.0


In [19]:
# Pairwise correlations between the numeric columns. Numeric dtypes are
# selected explicitly because corr() on pandas >= 2.0 no longer skips
# non-numeric columns and fails on the string columns in this frame.
# A correlation matrix is symmetric, so no transpose is applied.
chotel.select_dtypes(include='number').corr()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
is_canceled,1.0,0.291602,0.016408,0.007664,-0.006002,-0.002223,0.024803,0.05613,0.004535,-0.032605,-0.084141,0.109909,-0.055502,-0.145136,0.054113,0.044885,-0.195012,-0.235923
lead_time,0.291602,1.0,0.040085,0.126875,0.002402,0.084327,0.165913,0.11502,-0.03858,-0.021231,-0.123867,0.085949,-0.071164,0.002126,0.169935,-0.068174,-0.115707,-0.09671
arrival_date_year,0.016408,0.040085,1.0,-0.540297,-0.000511,0.021832,0.03203,0.029663,0.054556,-0.013186,0.010064,-0.120006,0.029813,0.031669,-0.056676,0.198612,-0.012795,0.10898
arrival_date_week_number,0.007664,0.126875,-0.540297,1.0,0.066577,0.018086,0.015531,0.025956,0.005527,0.010058,-0.031413,0.035354,-0.020876,0.005978,0.022732,0.075699,0.00177,0.02581
arrival_date_day_of_month,-0.006002,0.002402,-0.000511,0.066577,1.0,-0.015739,-0.027723,-0.001884,0.01457,-0.000541,-0.00666,-0.027024,0.000116,0.011442,0.022546,0.030053,0.008158,0.003078
stays_in_weekend_nights,-0.002223,0.084327,0.021832,0.018086,-0.015739,1.0,0.4901,0.093397,0.04578,0.018523,-0.086587,-0.012998,-0.040727,0.049222,-0.05481,0.048701,-0.018099,0.072146
stays_in_week_nights,0.024803,0.165913,0.03203,0.015531,-0.027723,0.4901,1.0,0.095399,0.044723,0.020343,-0.09602,-0.014252,-0.047478,0.079335,-0.002159,0.065344,-0.024431,0.06736
adults,0.05613,0.11502,0.029663,0.025956,-0.001884,0.093397,0.095399,1.0,0.028603,0.017642,-0.141921,-0.007278,-0.105795,-0.04207,-0.008814,0.221413,0.016107,0.122418
children,0.004535,-0.03858,0.054556,0.005527,0.01457,0.04578,0.044723,0.028603,1.0,0.024101,-0.032602,-0.024775,-0.020368,0.050717,-0.033415,0.325228,0.057059,0.081811
babies,-0.032605,-0.021231,-0.013186,0.010058,-0.000541,0.018523,0.020343,0.017642,0.024101,1.0,-0.008823,-0.007496,-0.006306,0.085389,-0.010654,0.028455,0.036979,0.097654


# What percentage of bookings were cancelled ?

In [20]:
# Count cancelled / not-cancelled bookings within each hotel type.
booking_info = (
    chotel
    .groupby('hotel')['is_canceled']
    .value_counts()
    .to_frame()
)
booking_info

Unnamed: 0_level_0,Unnamed: 1_level_0,is_canceled
hotel,is_canceled,Unnamed: 2_level_1
City Hotel,0,46082
City Hotel,1,33061
Resort Hotel,0,28508
Resort Hotel,1,11077


In [21]:
# Give the single count column a readable label.
booking_info.columns=['no of bookings']

In [22]:
# Show the table with the relabelled count column.
booking_info

Unnamed: 0_level_0,Unnamed: 1_level_0,no of bookings
hotel,is_canceled,Unnamed: 2_level_1
City Hotel,0,46082
City Hotel,1,33061
Resort Hotel,0,28508
Resort Hotel,1,11077


In [23]:
# Share of each cancellation status within its hotel type, as a percentage.
booking_info['percentage'] = (
    chotel
    .groupby('hotel')['is_canceled']
    .value_counts(normalize=True)
    .mul(100)
)

In [24]:
# Show counts together with the percentage column.
booking_info

Unnamed: 0_level_0,Unnamed: 1_level_0,no of bookings,percentage
hotel,is_canceled,Unnamed: 2_level_1,Unnamed: 3_level_1
City Hotel,0,46082,58.226249
City Hotel,1,33061,41.773751
Resort Hotel,0,28508,72.017178
Resort Hotel,1,11077,27.982822


In [25]:
# NOTE(review): booking_info was already built with pd.DataFrame above, so this
# wrapper looks redundant and the displayed table matches the previous cell.
pd.DataFrame(booking_info)

Unnamed: 0_level_0,Unnamed: 1_level_0,no of bookings,percentage
hotel,is_canceled,Unnamed: 2_level_1,Unnamed: 3_level_1
City Hotel,0,46082,58.226249
City Hotel,1,33061,41.773751
Resort Hotel,0,28508,72.017178
Resort Hotel,1,11077,27.982822


From the table we can say that 27.98% bookings are cancelled in Resort Hotel and 41.77% bookings are cancelled in City Hotel.

# Which type of hotel has more booking ?

In [26]:
# Number of bookings per hotel type.
preference = chotel['hotel'].value_counts().to_frame()
preference

Unnamed: 0,hotel
City Hotel,79143
Resort Hotel,39585


In [27]:
# Each hotel type's share of all bookings, as a percentage.
preference['percentage'] = chotel['hotel'].value_counts(normalize=True).mul(100)
preference['percentage']

City Hotel      66.659086
Resort Hotel    33.340914
Name: percentage, dtype: float64

In [28]:
# Label the index so the table shows which field the rows represent.
preference.index.name='hotel'

In [29]:
# Show the table with the named index.
preference

Unnamed: 0_level_0,hotel,percentage
hotel,Unnamed: 1_level_1,Unnamed: 2_level_1
City Hotel,79143,66.659086
Resort Hotel,39585,33.340914


In [30]:
# Label the count column. It is addressed by position because its name depends
# on the pandas version ('hotel' before 2.0, 'count' from 2.0 on), where a
# rename keyed on 'hotel' would silently do nothing. The frame is reassigned
# instead of being mutated with inplace=True.
preference = preference.rename(columns={preference.columns[0]: 'no of bookings'})
preference

Unnamed: 0_level_0,no of bookings,percentage
hotel,Unnamed: 1_level_1,Unnamed: 2_level_1
City Hotel,79143,66.659086
Resort Hotel,39585,33.340914


As we can see, the booking percentage of City Hotel is higher than that of Resort Hotel.

# Which month has the highest number of arrivals ?

In [31]:
# Bookings that were not cancelled (is_canceled equals 0).
confirmed_booking = chotel.loc[chotel['is_canceled'].eq(0)]
confirmed_booking.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,C,4,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,C,0,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,A,0,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [32]:
# Non-cancelled bookings per arrival month, most frequent first.
people_arrival_count_df = (
    confirmed_booking['arrival_date_month']
    .value_counts()
    .to_frame()
)
people_arrival_count_df.head()

Unnamed: 0,arrival_date_month
August,8604
July,7880
May,7091
October,6854
March,6566


In [33]:
# Give the count column a readable label.
people_arrival_count_df.columns=['No of bookings']

In [34]:
# Preview the table with the relabelled column.
people_arrival_count_df.head()

Unnamed: 0,No of bookings
August,8604
July,7880
May,7091
October,6854
March,6566


In [35]:
# Label the index with the field it represents.
people_arrival_count_df.index.name='arrival_date_month'

In [36]:
# Preview the table with the named index.
people_arrival_count_df.head()

Unnamed: 0_level_0,No of bookings
arrival_date_month,Unnamed: 1_level_1
August,8604
July,7880
May,7091
October,6854
March,6566


In [38]:
# Each month's share of the non-cancelled bookings, as a percentage.
people_arrival_count_df['booking_percentage'] = (
    confirmed_booking['arrival_date_month']
    .value_counts(normalize=True)
    .mul(100)
)

In [39]:
# Show monthly counts together with their percentage share.
people_arrival_count_df

Unnamed: 0_level_0,No of bookings,booking_percentage
arrival_date_month,Unnamed: 1_level_1,Unnamed: 2_level_1
August,8604,11.535058
July,7880,10.564419
May,7091,9.506636
October,6854,9.188899
March,6566,8.802789
April,6528,8.751843
June,6384,8.558788
September,6360,8.526612
February,5304,7.110873
November,4611,6.181794


From the above we see that, August has the highest number of arrivals and January has the lowest number of arrivals.

# Which year has highest number of arrivals ?

In [41]:
# Non-cancelled bookings per arrival year, split by hotel type.
yearly_bookings = (
    confirmed_booking
    .groupby('arrival_date_year')['hotel']
    .value_counts()
    .to_frame()
)
yearly_bookings

Unnamed: 0_level_0,Unnamed: 1_level_0,hotel
arrival_date_year,hotel,Unnamed: 2_level_1
2015,City Hotel,7654
2015,Resort Hotel,6071
2016,City Hotel,22662
2016,Resort Hotel,13389
2017,City Hotel,15766
2017,Resort Hotel,9048


In [42]:
# Give the count column a readable label.
yearly_bookings.columns=['No of bookings']

In [43]:
# Show the table with the relabelled column.
yearly_bookings

Unnamed: 0_level_0,Unnamed: 1_level_0,No of bookings
arrival_date_year,hotel,Unnamed: 2_level_1
2015,City Hotel,7654
2015,Resort Hotel,6071
2016,City Hotel,22662
2016,Resort Hotel,13389
2017,City Hotel,15766
2017,Resort Hotel,9048


From the above we see that the highest number of arrivals was in 2016.

# Top 10 countries from which people are coming the most ? 

In [44]:
# Non-cancelled bookings per country of origin, most frequent first.
country = confirmed_booking['country'].value_counts().to_frame()
country

Unnamed: 0,country
PRT,20977
GBR,9668
FRA,8468
ESP,6383
DEU,6067
...,...
BHR,1
DJI,1
MLI,1
NPL,1


In [45]:
# Give the count column a readable label.
country.columns=['no of bookings']

In [46]:
# The ten countries with the most non-cancelled bookings.
country.head(10)

Unnamed: 0,no of bookings
PRT,20977
GBR,9668
FRA,8468
ESP,6383
DEU,6067
IRL,2542
ITA,2428
BEL,1868
NLD,1716
USA,1592


From the above we see that, Portugal has the highest number of arrivals.

# Market Segment wise bookings

In [47]:
# Non-cancelled bookings per market segment, most frequent first.
market_df = confirmed_booking['market_segment'].value_counts().to_frame()
market_df

Unnamed: 0,market_segment
Online TA,35599
Offline TA/TO,15854
Direct,10504
Groups,7692
Corporate,4121
Complementary,637
Aviation,183


In [48]:
# Label the index with the field it represents.
market_df.index.name='market_segment'

In [49]:
# Show the table with the named index.
market_df

Unnamed: 0_level_0,market_segment
market_segment,Unnamed: 1_level_1
Online TA,35599
Offline TA/TO,15854
Direct,10504
Groups,7692
Corporate,4121
Complementary,637
Aviation,183


In [50]:
# Give the count column a readable label.
market_df.columns=['no of bookings']

In [51]:
# Show the table with the relabelled column.
market_df

Unnamed: 0_level_0,no of bookings
market_segment,Unnamed: 1_level_1
Online TA,35599
Offline TA/TO,15854
Direct,10504
Groups,7692
Corporate,4121
Complementary,637
Aviation,183


From the above we see that Online TA is the most frequent market segment.

# How many nights mostly the guests stay in the hotel ?

In [68]:
# Total nights per booking (weekend + week nights), tallied by frequency.
# NOTE(review): this uses chotel, so cancelled bookings are counted too.
stay = (
    chotel['stays_in_weekend_nights']
    .add(chotel['stays_in_week_nights'])
    .value_counts()
)
stay.head(10)

3.0    26745
2.0    25024
4.0    18832
1.0    16111
5.0    12768
6.0     6091
7.0     5299
0.0     3147
8.0     1450
9.0     1164
dtype: int64

Stays of 3 and 2 nights are the most common, followed by 4 nights and 1 night. Stays of 1 to 3 nights together account for roughly 57% of bookings (67,880 of 118,728).

# Which is the most reserved room type ?

In [67]:
# Five most frequently reserved room types.
chotel.reserved_room_type.value_counts().head()

A    85480
D    19151
E     6481
F     2887
G     2081
Name: reserved_room_type, dtype: int64

# Which is the most common customer type ?

In [66]:
# Bookings per customer type, most frequent first.
chotel.customer_type.value_counts()


Transient          89046
Transient-Party    25042
Contract            4072
Group                568
Name: customer_type, dtype: int64

# Which is the most popular meal package ?

In [65]:
# Five most frequent meal packages.
chotel.meal.value_counts().head()


BB           91789
HB           14429
SC           10547
Undefined     1165
FB             798
Name: meal, dtype: int64

# CONCLUSION

* The majority of bookings are for the City Hotel, so we should spend more money on this type of hotel.

* Most bookings occurred in the summer months, so we should target these months to increase the hotel business.

* Most of the guests are from western nations, so we should spend more time and money on those nations so that the hotel business grows further there.

* Most of the guests are Transient, so we can target these types of guests to increase the number of booking.