# Welcome to our Analysis of Hotel Reservations

In [1]:
# Let's start by importing any necessary packages

import pandas as pd

# import packages for cleaning and organizing data
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# import model packages
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
# import need metrics
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, f1_score
from sklearn import metrics
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Now lets load the data 
hotelDF = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv')

# here is the url to look at the data explanation
# https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-02-11

In [4]:
print(hotelDF.columns)
hotelDF.head()

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [None]:
# What are some possible opportunities for feature engineering with our data

In [None]:
# is the reserved room type the same as the assigned room type?


In [None]:
# Now that we have our data, lets go ahead and do some exploratory analysis

In [31]:
# mean and standard deviation for all numeric variables
myColumn = 'required_car_parking_spaces'
print(round(hotelDF[myColumn].mean(), 2))
round(hotelDF[myColumn].std(), 2)

0.06


0.25

In [47]:
myColumn = 'reservation_status'
hotelDF[myColumn].value_counts().head(15)

Check-Out    75166
Canceled     43017
No-Show       1207
Name: reservation_status, dtype: int64

In [13]:
hotelDF['deposit_type'].unique()

array(['No Deposit', 'Refundable', 'Non Refund'], dtype=object)

In [14]:
hotelDF['customer_type'].unique()

array(['Transient', 'Contract', 'Transient-Party', 'Group'], dtype=object)

In [27]:
hotelDF['reservation_status'].unique()

array(['Check-Out', 'Canceled', 'No-Show'], dtype=object)

In [24]:
# choose columns that I want to use
myDF= hotelDF[['hotel', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type','days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status']]

In [25]:
myDF.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status
0,Resort Hotel,342,2015,July,27,1,0,0,2,0.0,...,C,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out
1,Resort Hotel,737,2015,July,27,1,0,0,2,0.0,...,C,C,4,No Deposit,0,Transient,0.0,0,0,Check-Out
2,Resort Hotel,7,2015,July,27,1,0,1,1,0.0,...,A,C,0,No Deposit,0,Transient,75.0,0,0,Check-Out
3,Resort Hotel,13,2015,July,27,1,0,1,1,0.0,...,A,A,0,No Deposit,0,Transient,75.0,0,0,Check-Out
4,Resort Hotel,14,2015,July,27,1,0,2,2,0.0,...,A,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out


In [26]:
# remove NA values
print(len(myDF.index))
myDF.dropna(axis=0, inplace = True)
print(len(myDF.index))

119390
118898


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
# turn target column into a integer
# myY = (myDF['reservation_status']  == 'Check-Out').astype(int)
myY = myDF['reservation_status']
myY.head()

0    Check-Out
1    Check-Out
2    Check-Out
3    Check-Out
4    Check-Out
Name: reservation_status, dtype: object

In [33]:
# hot-shot columns -- get dummy variables for:
# hotel
# arrival_date_month
# reserved_room_type 
# assigned_room_type
# deposit_type
# customer_type

myX = myDF.drop(['reservation_status'], axis = 1)
myX = pd.get_dummies(myX, drop_first=True)
myX.head()



Unnamed: 0,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,...,assigned_room_type_H,assigned_room_type_I,assigned_room_type_K,assigned_room_type_L,assigned_room_type_P,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,342,2015,27,1,0,0,2,0.0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,737,2015,27,1,0,0,2,0.0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,7,2015,27,1,0,1,1,0.0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,13,2015,27,1,0,1,1,0.0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,14,2015,27,1,0,2,2,0.0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [29]:
myY.unique()

array([1, 0])

In [None]:
# my explanatory variable is reservation status
myY = (hotelDF[''])

In [34]:
# split into train test split
X_train, X_test, y_train, y_test = train_test_split(myX, myY, stratify=myY, test_size = 0.25, random_state=801)

In [35]:
myLR2 = LogisticRegression(random_state=10)
myLR2.fit(X_test, y_test)
myLR2.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7398149705634988

In [36]:
# get predictions
LR2_yhat = myLR2.predict(X_test)
# confusion matrix
confusion_matrix(y_test, LR2_yhat)

array([[ 4962,  5750,    26],
       [ 1468, 17022,   196],
       [   13,   281,     7]], dtype=int64)