In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plot

In [2]:
df = pd.read_csv('Hotel_Reservations.csv')

In [3]:
df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


---
### Exploratory Data Analysis 

In [4]:
df.shape

(36275, 19)

##### Observation: We have too many features and few samples, taking into account the number of features.

In [5]:
df.columns

Index(['Booking_ID', 'no_of_adults', 'no_of_children', 'no_of_weekend_nights',
       'no_of_week_nights', 'type_of_meal_plan', 'required_car_parking_space',
       'room_type_reserved', 'lead_time', 'arrival_year', 'arrival_month',
       'arrival_date', 'market_segment_type', 'repeated_guest',
       'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
       'avg_price_per_room', 'no_of_special_requests', 'booking_status'],
      dtype='object')

In [6]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
no_of_adults,36275.0,1.844962,0.518715,0.0,2.0,2.0,2.0,4.0
no_of_children,36275.0,0.105279,0.402648,0.0,0.0,0.0,0.0,10.0
no_of_weekend_nights,36275.0,0.810724,0.870644,0.0,0.0,1.0,2.0,7.0
no_of_week_nights,36275.0,2.2043,1.410905,0.0,1.0,2.0,3.0,17.0
required_car_parking_space,36275.0,0.030986,0.173281,0.0,0.0,0.0,0.0,1.0
lead_time,36275.0,85.232557,85.930817,0.0,17.0,57.0,126.0,443.0
arrival_year,36275.0,2017.820427,0.383836,2017.0,2018.0,2018.0,2018.0,2018.0
arrival_month,36275.0,7.423653,3.069894,1.0,5.0,8.0,10.0,12.0
arrival_date,36275.0,15.596995,8.740447,1.0,8.0,16.0,23.0,31.0
repeated_guest,36275.0,0.025637,0.158053,0.0,0.0,0.0,0.0,1.0


In [7]:
df.isnull().sum()

Booking_ID                              0
no_of_adults                            0
no_of_children                          0
no_of_weekend_nights                    0
no_of_week_nights                       0
type_of_meal_plan                       0
required_car_parking_space              0
room_type_reserved                      0
lead_time                               0
arrival_year                            0
arrival_month                           0
arrival_date                            0
market_segment_type                     0
repeated_guest                          0
no_of_previous_cancellations            0
no_of_previous_bookings_not_canceled    0
avg_price_per_room                      0
no_of_special_requests                  0
booking_status                          0
dtype: int64

In [8]:
room_type = pd.get_dummies(df['room_type_reserved'])
room_type.sum()

Room_Type 1    28130
Room_Type 2      692
Room_Type 3        7
Room_Type 4     6057
Room_Type 5      265
Room_Type 6      966
Room_Type 7      158
dtype: int64

In [9]:
booking_id = pd.get_dummies(df['Booking_ID'])
booking_id.sum().head()

INN00001    1
INN00002    1
INN00003    1
INN00004    1
INN00005    1
dtype: int64

In [10]:
type_of_meal = pd.get_dummies(df['type_of_meal_plan'])
type_of_meal.sum()

Meal Plan 1     27835
Meal Plan 2      3305
Meal Plan 3         5
Not Selected     5130
dtype: int64

In [12]:
segment_type = pd.get_dummies(df['market_segment_type'])
segment_type.sum()

Aviation           125
Complementary      391
Corporate         2017
Offline          10528
Online           23214
dtype: int64

In [13]:
df.drop(['Booking_ID', 'type_of_meal_plan', 'room_type_reserved', 'market_segment_type'], axis=1, inplace=True)

---
### Training the model 

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X = df.drop('booking_status', axis=1)
y = df['booking_status']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [17]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [18]:
SGD_model = SGDClassifier()

---
### Fitting the model

In [19]:
SGD_model.fit(X_train, y_train)

##### When fitting the model, after traning it without the followings parameters: max_iter=1000, tol=1e-3. It displayed a ValueError, and the root cause is:  
    704     warnings.warn(
    705         "Maximum number of iteration reached before "
    706         "convergence. Consider increasing max_iter to "
    707         "improve the fit.",
    708         ConvergenceWarning,
    709     )

In [20]:
SGD_predictions = SGD_model.predict(X_test)

In [21]:
confusion_matrix(y_test, SGD_predictions)

array([[  21, 3496],
       [   0, 7366]])

In [22]:
accuracy_score(y_test, SGD_predictions)

0.6787650464026463

In [23]:
accuracy_score(y_test, SGD_predictions, normalize=False)

7387

##### Testing with different parameters 

In [24]:
SGD_model2 = SGDClassifier(loss='modified_huber', alpha=0.001)

In [25]:
SGD_model2.fit(X_train, y_train)

In [26]:
SGD_predictions2 = SGD_model2.predict(X_test)

In [27]:
confusion_matrix(y_test, SGD_predictions2)

array([[1898, 1619],
       [ 926, 6440]])

In [28]:
accuracy_score(y_test, SGD_predictions2)

0.7661490397868235

In [29]:
accuracy_score(y_test, SGD_predictions2, normalize=False)

8338