# Data Preparation for Machine Learning

The purpose of this notebook is to load the cleaned hotel booking dataframe, inspect it, split it by hotel type (City Hotel and Resort Hotel), and save each subset so it is ready for machine learning applications.

In [31]:
# Import libraries

import numpy as np
import pandas as pd
import hvplot.pandas
from collections import Counter
from pathlib import Path
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 

import warnings
warnings.filterwarnings('ignore')

In [32]:
# Set the random seed for reproducibility
# NumPy's global RNG is seeded with 1 and TensorFlow's global seed is set to 2.
# Gotcha: after this cell the name `random` refers to `tensorflow.random`,
# not Python's standard-library `random` module.
# NOTE(review): no cell visible in this notebook draws random numbers or
# otherwise uses TensorFlow — confirm these seeds are still needed here.
from numpy.random import seed
seed(1)
from tensorflow import random
random.set_seed(2)

In [33]:
# Load the cleaned hotel booking data into the workbook and convert to pandas Dataframe
# The path is relative, so it resolves against the notebook's working directory.
csv_path = Path('../Resources/cleaned_hotel_bookings.csv')

df = pd.read_csv(csv_path)
# Preview the first rows. The frame still carries an 'Unnamed: 0' column at
# this point; the following cell drops it.
df.head()

Unnamed: 0.1,Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,meal,country,market_segment,reserved_room_type,customer_type,adr,total_of_special_requests,date
0,2,Resort Hotel,7,2015,7,27,1,0,1,1,BB,GBR,Direct,A,Transient,75.0,0,2015-07-01
1,4,Resort Hotel,14,2015,7,27,1,0,2,2,BB,GBR,Online TA,A,Transient,98.0,1,2015-07-01
2,5,Resort Hotel,14,2015,7,27,1,0,2,2,BB,GBR,Online TA,A,Transient,98.0,1,2015-07-01
3,6,Resort Hotel,0,2015,7,27,1,0,2,2,BB,PRT,Direct,C,Transient,107.0,0,2015-07-01
4,7,Resort Hotel,9,2015,7,27,1,0,2,2,FB,PRT,Direct,C,Transient,103.0,1,2015-07-01


In [34]:
# Drop the leftover 'Unnamed: 0' column (presumably a row index that an
# earlier export wrote into the CSV — it carries no booking information).
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and drop() would otherwise raise KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


In [35]:
# Scan every cell of the frame for missing values; False means none were found
df.isna().to_numpy().any()

False

In [36]:
# List each column's dtype to see which columns are numeric and which are
# object-typed (non-numeric) and would need encoding before modelling.
df.dtypes

hotel                         object
lead_time                      int64
arrival_date_year              int64
arrival_date_month             int64
arrival_date_week_number       int64
arrival_date_day_of_month      int64
stays_in_weekend_nights        int64
stays_in_week_nights           int64
adults                         int64
meal                          object
country                       object
market_segment                object
reserved_room_type            object
customer_type                 object
adr                          float64
total_of_special_requests      int64
date                          object
dtype: object

In [37]:
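# # get dummies
# # NOTE: 'deposit_type' is not a column of the loaded dataframe (see the
# # df.dtypes output above); remove it from this list before re-enabling
# # the cell, otherwise pd.get_dummies raises KeyError.
# dummy_columns = ['meal', 'reserved_room_type', 'market_segment', 'deposit_type', 'customer_type']
# df = pd.get_dummies(df, columns = dummy_columns)
# df.head()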

In [38]:
# # Plot the data
# df.hist(figsize = (12,10))

In [39]:
# Correlation between numerical values
# Select the numeric columns explicitly before correlating. Older pandas
# dropped non-numeric columns silently, but pandas >= 2.0 raises on the
# string columns (hotel, meal, country, ...). This form gives the same
# matrix on every pandas version.
df.select_dtypes(include='number').corr()

Unnamed: 0,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,adr,total_of_special_requests
lead_time,1.0,0.041679,0.130384,0.125707,0.000493,0.051026,0.137724,0.059307,-0.181161,-0.1217
arrival_date_year,0.041679,1.0,-0.510824,-0.524981,-0.002051,0.018572,0.033185,0.044925,0.213256,0.106124
arrival_date_month,0.130384,-0.510824,1.0,0.994504,-0.028305,0.015045,0.016438,0.014899,0.08792,0.027529
arrival_date_week_number,0.125707,-0.524981,0.994504,1.0,0.070055,0.015529,0.013436,0.012347,0.083494,0.025492
arrival_date_day_of_month,0.000493,-0.002051,-0.028305,0.070055,1.0,-0.010536,-0.02295,-0.001362,0.027724,0.000918
stays_in_weekend_nights,0.051026,0.018572,0.015045,0.015529,-0.010536,1.0,0.443477,0.082976,0.03164,0.071508
stays_in_week_nights,0.137724,0.033185,0.016438,0.013436,-0.02295,0.443477,1.0,0.090195,0.052733,0.070758
adults,0.059307,0.044925,0.014899,0.012347,-0.001362,0.082976,0.090195,1.0,0.2322,0.133184
adr,-0.181161,0.213256,0.08792,0.083494,0.027724,0.03164,0.052733,0.2322,1.0,0.185971
total_of_special_requests,-0.1217,0.106124,0.027529,0.025492,0.000918,0.071508,0.070758,0.133184,0.185971,1.0


In [40]:
# Split the data into two dataframes, one for 'City Hotel' and one for 'Resort Hotel'

def hotel_df(df: pd.DataFrame, hotel) -> pd.DataFrame:
    """Return the rows of ``df`` whose 'hotel' value matches ``hotel``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'hotel' column.
    hotel : str or list-like of str
        One hotel name, or a collection of hotel names, to keep.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, with the original index
        labels preserved.
    """
    # Series.isin rejects a bare string (TypeError), so wrap a single name.
    wanted = [hotel] if isinstance(hotel, str) else list(hotel)
    # Copy explicitly so later edits to the subset never touch (or warn about) df.
    return df.loc[df['hotel'].isin(wanted)].copy()

# Build one frame per hotel type (city first, then resort) with hotel_df
City_hotel, Resort_hotel = (
    hotel_df(df, [hotel_name]) for hotel_name in ('City Hotel', 'Resort Hotel')
)


In [41]:
# Preview the City Hotel subset; rows keep their index labels from df
City_hotel.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,meal,country,market_segment,reserved_room_type,customer_type,adr,total_of_special_requests,date
28119,City Hotel,88,2015,7,27,1,0,4,2,BB,PRT,Online TA,A,Transient,76.5,1,2015-07-01
28120,City Hotel,65,2015,7,27,1,0,4,1,BB,PRT,Online TA,A,Transient,68.0,1,2015-07-01
28121,City Hotel,92,2015,7,27,1,2,4,2,BB,PRT,Online TA,A,Transient,76.5,2,2015-07-01
28122,City Hotel,100,2015,7,27,2,0,2,2,BB,PRT,Online TA,A,Transient,76.5,1,2015-07-02
28123,City Hotel,79,2015,7,27,2,0,3,2,BB,PRT,Online TA,A,Transient,76.5,1,2015-07-02


In [42]:
# Preview the Resort Hotel subset; rows keep their index labels from df
Resort_hotel.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,meal,country,market_segment,reserved_room_type,customer_type,adr,total_of_special_requests,date
0,Resort Hotel,7,2015,7,27,1,0,1,1,BB,GBR,Direct,A,Transient,75.0,0,2015-07-01
1,Resort Hotel,14,2015,7,27,1,0,2,2,BB,GBR,Online TA,A,Transient,98.0,1,2015-07-01
2,Resort Hotel,14,2015,7,27,1,0,2,2,BB,GBR,Online TA,A,Transient,98.0,1,2015-07-01
3,Resort Hotel,0,2015,7,27,1,0,2,2,BB,PRT,Direct,C,Transient,107.0,0,2015-07-01
4,Resort Hotel,9,2015,7,27,1,0,2,2,FB,PRT,Direct,C,Transient,103.0,1,2015-07-01


In [43]:
# Report (rows, columns) for each subset, resort first, one per line
print(Resort_hotel.shape, City_hotel.shape, sep='\n')

(28119, 17)
(74242, 17)


In [44]:
# Show the column dtypes of both subsets, resort first, to confirm they match df
print(Resort_hotel.dtypes, City_hotel.dtypes, sep='\n')

hotel                         object
lead_time                      int64
arrival_date_year              int64
arrival_date_month             int64
arrival_date_week_number       int64
arrival_date_day_of_month      int64
stays_in_weekend_nights        int64
stays_in_week_nights           int64
adults                         int64
meal                          object
country                       object
market_segment                object
reserved_room_type            object
customer_type                 object
adr                          float64
total_of_special_requests      int64
date                          object
dtype: object
hotel                         object
lead_time                      int64
arrival_date_year              int64
arrival_date_month             int64
arrival_date_week_number       int64
arrival_date_day_of_month      int64
stays_in_weekend_nights        int64
stays_in_week_nights           int64
adults                         int64
meal                    

In [None]:
# Save each hotel subset to its own CSV under ../Resources (relative to the
# notebook's working directory).
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column and comes back as 'Unnamed: 0'
# when the file is read with pd.read_csv. Consider index=False once any
# downstream reader that expects that column has been checked.
Resort_hotel.to_csv('../Resources/Resort_Hotel_Prep.csv')
City_hotel.to_csv('../Resources/City_hotel_Prep.csv')
