# Data Preparation for Machine Learning

The purpose of this workbook is to take the cleaned dataframe and prepare the data for machine learning applications. 

In [44]:
# Import libraries

import numpy as np
import pandas as pd
import hvplot.pandas
from collections import Counter
from pathlib import Path
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 

import warnings
# Suppress every warning category from this point on.
# NOTE(review): this also hides deprecation and dtype warnings from pandas/sklearn -- confirm that is intended.
warnings.filterwarnings('ignore')

In [45]:
# Pin the NumPy and TensorFlow seeds so repeated runs are reproducible
from numpy.random import seed
seed(seed=1)
from tensorflow import random
random.set_seed(seed=2)

In [46]:
# Read the cleaned hotel-booking CSV into a pandas DataFrame and preview the first rows
csv_path = Path('../Resources/cleaned_hotel_bookings.csv')
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,hotel,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,meal,market_segment,reserved_room_type,deposit_type,customer_type,adr,total_of_special_requests,arival_date
0,2,Resort Hotel,7,27,0,1,1,BB,Direct,A,No Deposit,Transient,75.0,0,2015-07-01
1,3,Resort Hotel,13,27,0,1,1,BB,Corporate,A,No Deposit,Transient,75.0,0,2015-07-01
2,4,Resort Hotel,14,27,0,2,2,BB,Online TA,A,No Deposit,Transient,98.0,1,2015-07-01
3,5,Resort Hotel,14,27,0,2,2,BB,Online TA,A,No Deposit,Transient,98.0,1,2015-07-01
4,6,Resort Hotel,0,27,0,2,2,BB,Direct,C,No Deposit,Transient,107.0,0,2015-07-01


In [30]:
# Remove the leftover 'Unnamed: 0' column (a column, not a row) that came in with the CSV
df = df.drop(labels=['Unnamed: 0'], axis='columns')


In [31]:
# True if any cell in the frame is missing; False means the dataset has no nulls
df.isna().to_numpy().any()

False

In [32]:
# Show the dtype of each column to see which ones are still text (object)
df.dtypes

hotel                         object
lead_time                      int64
arrival_date_week_number       int64
stays_in_weekend_nights        int64
stays_in_week_nights           int64
adults                         int64
meal                          object
market_segment                object
reserved_room_type            object
deposit_type                  object
customer_type                 object
adr                          float64
total_of_special_requests      int64
arival_date                   object
dtype: object

In [33]:
# One-hot encode the categorical booking attributes.
# This step has to execute: the column listings and the 37-column shapes displayed
# later in the notebook only exist once these dummy columns have been created.
# dtype='uint8' pins the 0/1 integer encoding (newer pandas would otherwise emit bool columns).
dummy_columns = ['meal', 'reserved_room_type', 'market_segment', 'deposit_type', 'customer_type']
df = pd.get_dummies(df, columns=dummy_columns, dtype='uint8')
df.head()

Unnamed: 0,hotel,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,adr,total_of_special_requests,arival_date,meal_BB,...,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,Resort Hotel,7,27,0,1,1,75.0,0,2015-07-01,1,...,0,0,0,1,0,0,0,0,1,0
1,Resort Hotel,13,27,0,1,1,75.0,0,2015-07-01,1,...,0,0,0,1,0,0,0,0,1,0
2,Resort Hotel,14,27,0,2,2,98.0,1,2015-07-01,1,...,0,0,1,1,0,0,0,0,1,0
3,Resort Hotel,14,27,0,2,2,98.0,1,2015-07-01,1,...,0,0,1,1,0,0,0,0,1,0
4,Resort Hotel,0,27,0,2,2,107.0,0,2015-07-01,1,...,0,0,0,1,0,0,0,0,1,0


In [42]:
# Plot the data (currently disabled -- uncomment the next line to draw the histograms)
# df.hist(figsize = (12,10))

In [43]:
# Pairwise correlation between the numeric (and 0/1 indicator) columns.
# Text columns such as 'hotel' and 'arival_date' are excluded explicitly, because
# pandas >= 2.0 raises on non-numeric data instead of silently dropping it.
df.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,adr,total_of_special_requests,meal_BB,meal_FB,meal_HB,...,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
lead_time,1.0,0.129116,0.078502,0.159661,0.122594,-0.093692,-0.095516,-0.037711,0.010186,0.136768,...,0.346359,0.146375,-0.195362,-0.380256,0.380227,0.016193,0.06771,-0.037709,-0.171192,0.157719
arrival_date_week_number,0.129116,1.0,0.019509,0.016869,0.027654,0.090237,0.027259,-0.000506,0.022462,0.037761,...,0.002352,0.068357,-0.039152,-0.006811,0.008736,-0.016915,0.091698,0.0101,-0.079281,0.041351
stays_in_weekend_nights,0.078502,0.019509,1.0,0.487456,0.105512,0.037442,0.076119,-0.064984,0.018503,0.107985,...,-0.068002,0.064465,0.057508,0.118477,-0.119143,0.000994,0.102751,-0.009557,0.024146,-0.070204
stays_in_week_nights,0.159661,0.016869,0.487456,1.0,0.106954,0.049894,0.071444,-0.059529,0.015829,0.124435,...,-0.076207,0.094485,0.037424,0.084539,-0.085603,0.005964,0.135117,-0.01897,0.012347,-0.070638
adults,0.122594,0.027654,0.105512,0.106954,1.0,0.284164,0.152696,-0.04939,0.01744,0.056503,...,-0.051406,-0.041599,0.19404,0.038583,-0.039148,0.003435,0.023931,-0.019474,0.116715,-0.131483
adr,-0.093692,0.090237,0.037442,0.049894,0.284164,1.0,0.191345,-0.097887,0.013752,0.147178,...,-0.219647,-0.166107,0.295171,0.116737,-0.114897,-0.021332,-0.063804,-0.021543,0.193197,-0.172977
total_of_special_requests,-0.095516,0.027259,0.076119,0.071444,0.152696,0.191345,1.0,-0.016876,-0.036407,-0.016918,...,-0.273049,-0.193629,0.397077,0.272386,-0.271426,-0.019996,0.037293,0.007849,0.129543,-0.155661
meal_BB,-0.037711,-0.000506,-0.064984,-0.059529,-0.04939,-0.097887,-0.016876,1.0,-0.151062,-0.686807,...,0.004386,-0.045867,-0.056314,-0.072908,0.071438,0.016189,0.012479,0.01553,0.062842,-0.074944
meal_FB,0.010186,0.022462,0.018503,0.015829,0.01744,0.013752,-0.036407,-0.151062,1.0,-0.030472,...,0.098726,0.002197,-0.069422,-0.061432,0.062059,-0.003029,-0.01262,-0.004024,-0.012485,0.0196
meal_HB,0.136768,0.037761,0.107985,0.124435,0.056503,0.147178,-0.016918,-0.686807,-0.030472,1.0,...,0.049888,0.174744,-0.133198,0.00659,-0.005317,-0.011651,0.01696,-0.012616,-0.168483,0.173454


In [36]:
# Split the data into two DataFrames, one for 'City Hotel' and one for 'Resort Hotel'

def hotel_df(df, hotel):
    """Return the rows of ``df`` whose 'hotel' value is one of ``hotel``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'hotel' column.
    hotel : str or list-like of str
        Hotel name(s) to keep. A bare string is treated as a single name.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows; the original index labels
        are preserved.
    """
    # Series.isin rejects a bare string, so wrap a single name in a list
    wanted = [hotel] if isinstance(hotel, str) else hotel
    # .copy() makes the subset safe to modify without touching (or warning about) df
    return df.loc[df['hotel'].isin(wanted)].copy()

# Build one subset per hotel type in a single pass over the two category names
City_hotel, Resort_hotel = (
    hotel_df(df, [hotel_name]) for hotel_name in ('City Hotel', 'Resort Hotel')
)


In [37]:
# Preview the first rows of the City Hotel subset
City_hotel.head()

Unnamed: 0,hotel,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,adr,total_of_special_requests,arival_date,meal_BB,...,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
39116,City Hotel,88,27,0,4,2,76.5,1,2015-07-01,1,...,0,0,1,1,0,0,0,0,1,0
39117,City Hotel,65,27,0,4,1,68.0,1,2015-07-01,1,...,0,0,1,1,0,0,0,0,1,0
39118,City Hotel,92,27,2,4,2,76.5,2,2015-07-01,1,...,0,0,1,1,0,0,0,0,1,0
39119,City Hotel,100,27,0,2,2,76.5,1,2015-07-02,1,...,0,0,1,1,0,0,0,0,1,0
39120,City Hotel,79,27,0,3,2,76.5,1,2015-07-02,1,...,0,0,1,1,0,0,0,0,1,0


In [38]:
# Preview the first rows of the Resort Hotel subset
Resort_hotel.head()

Unnamed: 0,hotel,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,adr,total_of_special_requests,arival_date,meal_BB,...,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,Resort Hotel,7,27,0,1,1,75.0,0,2015-07-01,1,...,0,0,0,1,0,0,0,0,1,0
1,Resort Hotel,13,27,0,1,1,75.0,0,2015-07-01,1,...,0,0,0,1,0,0,0,0,1,0
2,Resort Hotel,14,27,0,2,2,98.0,1,2015-07-01,1,...,0,0,1,1,0,0,0,0,1,0
3,Resort Hotel,14,27,0,2,2,98.0,1,2015-07-01,1,...,0,0,1,1,0,0,0,0,1,0
4,Resort Hotel,0,27,0,2,2,107.0,0,2015-07-01,1,...,0,0,0,1,0,0,0,0,1,0


In [39]:
# (rows, columns) of each subset, Resort first, one per line
print(Resort_hotel.shape, City_hotel.shape, sep='\n')

(39116, 37)
(77893, 37)


In [40]:
# Column dtypes of each subset, Resort first, printed one after the other
print(Resort_hotel.dtypes, City_hotel.dtypes, sep='\n')

hotel                             object
lead_time                          int64
arrival_date_week_number           int64
stays_in_weekend_nights            int64
stays_in_week_nights               int64
adults                             int64
adr                              float64
total_of_special_requests          int64
arival_date                       object
meal_BB                            uint8
meal_FB                            uint8
meal_HB                            uint8
meal_SC                            uint8
meal_Undefined                     uint8
reserved_room_type_A               uint8
reserved_room_type_B               uint8
reserved_room_type_C               uint8
reserved_room_type_D               uint8
reserved_room_type_E               uint8
reserved_room_type_F               uint8
reserved_room_type_G               uint8
reserved_room_type_H               uint8
reserved_room_type_L               uint8
market_segment_Aviation            uint8
market_segment_C

In [41]:
# Persist each prepared subset to the Resources folder.
# NOTE(review): the index is written as an unnamed first column, so readers of these
# files will see it as 'Unnamed: 0' unless they load with index_col=0.
Resort_hotel.to_csv(path_or_buf='../Resources/Resort_Hotel_Prep.csv')
City_hotel.to_csv(path_or_buf='../Resources/City_hotel_Prep.csv')
