In [1]:
# Import Pandas for data structure manipulations.
import pandas as pd

# Import NumPy for numerical analysis.
import numpy as np

# Import a timer.
import time

from category_encoders import TargetEncoder

# Import the train_test_split, cross-validation, and grid search modules from Scikit-Learn.
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Import the One-Hot Encoder, Target Encoder, StandardScaler for later categorical to numeric encoding.
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Import the column transformer that can be used for scaling purposes in later steps.
from sklearn.compose import make_column_transformer

# Import the pipeline module for step-wise transformations in later steps.
from sklearn.pipeline import make_pipeline

# Import the mean_squared_error module from Scikit-Learn as a quick error metric.
from sklearn.metrics import accuracy_score, make_scorer, mean_squared_error

# Import DecisionTreeClassifer and tree visualizatoin methods from Scikit-Learn.
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Use the experimental method in Scikit-Learn to enable HistGradientBoostingClassifier.
from sklearn.experimental import enable_hist_gradient_boosting

# Import the RandomForestClassifier, GradientBoostingClassifier, and HistGradientBoostingClassifier from Scikit-Learn.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

# Import the XGBoost module from Scikit-Learn. Be sure to 'brew install cmake' in CLI.
# Next, 'pip install xgboost' in CLI or do a '!pip install xgboost' in iPython. This makes XGBoost accessible.
!pip install xgboost
import xgboost as xgb

# Import MatPlotLib for data exploration and visualizations.
import matplotlib.pyplot as plt

import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# Ignore unnecessary warnings. This will clean up output appearances.
import warnings
warnings.filterwarnings('ignore')

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE





  from pandas import MultiIndex, Int64Index


In [2]:
# Specify the name of the local 'data' directory data files will be stored.
data_folder = 'data/'

# Specify the data file extension .csv that training and test sets will be pulled from.
csv_extension = '.csv'

In [3]:
# Grab the training data available from the data directory.
train = pd.read_csv(data_folder + 'train' + csv_extension, parse_dates = [0,11,12])

# Observe the first 5 items of the training set.
train.head(5)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21


In [4]:
# View the training set dimensions.
# There are 37,670,293 samples (i.e. user instances) and 24 features in the training set. 
print(f'\n\033[1mTraining Set Row Count:\033[0m {train.shape[0]}')
print(f'\n\033[1mTraining Set Feature Count:\033[0m {train.shape[1]}')


[1mTraining Set Row Count:[0m 37670293

[1mTraining Set Feature Count:[0m 24


In [5]:
print('shape of training set: ', train.shape)
train_cleaned = train.drop(columns=['orig_destination_distance'], axis = 1)
####################################################################################
# add transform of the column 'orig_destination_distance' here
####################################################################################
train_cleaned = train_cleaned.dropna()
print('shape of cleaned set: ', train_cleaned.shape)

shape of training set:  (37670293, 24)
shape of cleaned set:  (37623205, 23)


In [6]:
train_cleaned['srch_ci'] = pd.to_datetime(train_cleaned['srch_ci'], errors='coerce')
train_cleaned['srch_co'] = pd.to_datetime(train_cleaned['srch_co'], errors='coerce')
train_cleaned['srch_before_ci'] = pd.to_numeric((train_cleaned['srch_ci'] - train_cleaned['date_time']).dt.days, errors='coerce', downcast='integer')
train_cleaned['srch_trip_duration'] = pd.to_numeric((train_cleaned['srch_co'] - train_cleaned['srch_ci']).dt.days, errors='coerce', downcast='integer')
train_cleaned['day_of_year_srch'] = train_cleaned['date_time'].dt.dayofyear
train_cleaned = train_cleaned.dropna()
train_cleaned.reset_index(drop=True, inplace = True)
print('shape of cleaned set: ', train_cleaned.shape)

shape of cleaned set:  (37623199, 26)


In [8]:
train_cleaned.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,user_id,is_mobile,is_package,channel,...,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,srch_before_ci,srch_trip_duration,day_of_year_srch
0,2014-08-11 07:46:59,2,3,66,348,48862,12,0,1,9,...,1,0,3,2,50,628,1,15.0,4.0,223
1,2014-08-11 08:22:12,2,3,66,348,48862,12,0,1,9,...,1,1,1,2,50,628,1,17.0,4.0,223
2,2014-08-11 08:24:33,2,3,66,348,48862,12,0,0,9,...,1,0,1,2,50,628,1,17.0,4.0,223
3,2014-08-09 18:05:16,2,3,66,442,35390,93,0,0,3,...,1,0,1,2,50,1457,80,105.0,5.0,221
4,2014-08-09 18:08:18,2,3,66,442,35390,93,0,0,3,...,1,0,1,2,50,1457,21,105.0,5.0,221


In [9]:
train_cleaned.groupby('hotel_continent').count()

Unnamed: 0_level_0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,user_id,is_mobile,is_package,channel,...,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_country,hotel_market,hotel_cluster,srch_before_ci,srch_trip_duration,day_of_year_srch
hotel_continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,711080,711080,711080,711080,711080,711080,711080,711080,711080,711080,...,711080,711080,711080,711080,711080,711080,711080,711080,711080,711080
1,214,214,214,214,214,214,214,214,214,214,...,214,214,214,214,214,214,214,214,214,214
2,19755872,19755872,19755872,19755872,19755872,19755872,19755872,19755872,19755872,19755872,...,19755872,19755872,19755872,19755872,19755872,19755872,19755872,19755872,19755872,19755872
3,4886547,4886547,4886547,4886547,4886547,4886547,4886547,4886547,4886547,4886547,...,4886547,4886547,4886547,4886547,4886547,4886547,4886547,4886547,4886547,4886547
4,4333502,4333502,4333502,4333502,4333502,4333502,4333502,4333502,4333502,4333502,...,4333502,4333502,4333502,4333502,4333502,4333502,4333502,4333502,4333502,4333502
5,373236,373236,373236,373236,373236,373236,373236,373236,373236,373236,...,373236,373236,373236,373236,373236,373236,373236,373236,373236,373236
6,7562748,7562748,7562748,7562748,7562748,7562748,7562748,7562748,7562748,7562748,...,7562748,7562748,7562748,7562748,7562748,7562748,7562748,7562748,7562748,7562748


In [10]:
col_names = pd.DataFrame([
u'\u2713',
'T',
u'\u2717',
'T',
'T',

'T',
u'\u2717',
u'\u2717',
u'\u2713',
'T',
    
u'\u2717',
u'\u2717',
u'\u2713',
u'\u2713',
u'\u2713',

u'\u2713'+'delete after join',
'O',
u'\u2713',
'T',
'T',
    
'T',
'T',
'depends on model',
u'\u2713',
u'\u2713',
u'\u2713'+' + T'], index=train_cleaned.columns)
col_names

Unnamed: 0,0
date_time,✓
site_name,T
posa_continent,✗
user_location_country,T
user_location_region,T
user_location_city,T
user_id,✗
is_mobile,✗
is_package,✓
channel,T


* note here: 'orig_destination_distance' has been dropped \
✓: use directly \
✗: drop \
T: target encoding \
O: one-hot encoding\

* would use the probability of book or not (mean of is_booking) for target encoding

In [11]:
# add all columns that need target encoding to list 'TE_col'
train_cleaned['encoded_day_of_year_srch'] = train_cleaned['day_of_year_srch']
TE_col = [
    'site_name',
    'user_location_country',
    'user_location_region',
    'user_location_city',
    'channel',
    'cnt',
    'hotel_continent',
    'hotel_country',
    'hotel_market',
    'encoded_day_of_year_srch'
]
train_cleaned[TE_col] = TargetEncoder(
    cols = TE_col, 
    smoothing=0
).fit_transform(train_cleaned[TE_col].astype('category'), train_cleaned['is_booking'])

In [12]:
# add all columns that need one-hot encoding to list 'OHE_col'
OHE = OneHotEncoder()
OHE_col = [
    'srch_destination_type_id',
]
train_cleaned = pd.concat(
    [
        train_cleaned,
        pd.DataFrame(
            OHE.fit_transform(train_cleaned[OHE_col].astype('category')).toarray(), 
            columns=OHE.get_feature_names_out(OHE_col)
        )
    ],
    axis=1
)
train_cleaned = train_cleaned.drop(columns=OHE_col, axis=1)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,user_id,is_mobile,is_package,channel,...,srch_destination_type_id_0,srch_destination_type_id_1,srch_destination_type_id_2,srch_destination_type_id_3,srch_destination_type_id_4,srch_destination_type_id_5,srch_destination_type_id_6,srch_destination_type_id_7,srch_destination_type_id_8,srch_destination_type_id_9
0,2014-08-11 07:46:59,0.082752,3,0.082413,0.069950,0.080693,12,0,1,0.084775,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2014-08-11 08:22:12,0.082752,3,0.082413,0.069950,0.080693,12,0,1,0.084775,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2014-08-11 08:24:33,0.082752,3,0.082413,0.069950,0.080693,12,0,0,0.084775,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2014-08-09 18:05:16,0.082752,3,0.082413,0.088094,0.086438,93,0,0,0.056664,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2014-08-09 18:08:18,0.082752,3,0.082413,0.088094,0.086438,93,0,0,0.056664,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37623194,2014-09-02 08:08:28,0.082752,3,0.082413,0.081872,0.086816,1198182,0,1,0.060460,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37623195,2014-09-08 14:52:52,0.082752,3,0.082413,0.081872,0.086816,1198182,0,0,0.073141,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37623196,2014-09-15 06:56:51,0.082752,3,0.082413,0.081872,0.086816,1198182,0,0,0.073141,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
37623197,2014-09-18 08:49:33,0.082752,3,0.082413,0.080938,0.099561,1198182,0,0,0.073141,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# drop unnecessary columns
train_cleaned = train_cleaned.drop(columns=['posa_continent','user_id','is_mobile','srch_ci','srch_co'], axis = 1)
train_cleaned

Unnamed: 0,date_time,site_name,user_location_country,user_location_region,user_location_city,is_package,channel,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,...,srch_destination_type_id_0,srch_destination_type_id_1,srch_destination_type_id_2,srch_destination_type_id_3,srch_destination_type_id_4,srch_destination_type_id_5,srch_destination_type_id_6,srch_destination_type_id_7,srch_destination_type_id_8,srch_destination_type_id_9
0,2014-08-11 07:46:59,0.082752,0.082413,0.069950,0.080693,1,0.084775,2,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2014-08-11 08:22:12,0.082752,0.082413,0.069950,0.080693,1,0.084775,2,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2014-08-11 08:24:33,0.082752,0.082413,0.069950,0.080693,0,0.084775,2,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2014-08-09 18:05:16,0.082752,0.082413,0.088094,0.086438,0,0.056664,2,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2014-08-09 18:08:18,0.082752,0.082413,0.088094,0.086438,0,0.056664,2,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37623194,2014-09-02 08:08:28,0.082752,0.082413,0.081872,0.086816,1,0.060460,2,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37623195,2014-09-08 14:52:52,0.082752,0.082413,0.081872,0.086816,0,0.073141,1,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37623196,2014-09-15 06:56:51,0.082752,0.082413,0.081872,0.086816,0,0.073141,1,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
37623197,2014-09-18 08:49:33,0.082752,0.082413,0.080938,0.099561,0,0.073141,1,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
train_cleaned.to_csv('train_cleaned.csv')

In [17]:
train_cleaned.columns

Index(['date_time', 'site_name', 'user_location_country',
       'user_location_region', 'user_location_city', 'is_package', 'channel',
       'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
       'srch_destination_id', 'srch_destination_type_id', 'is_booking', 'cnt',
       'hotel_continent', 'hotel_country', 'hotel_market', 'hotel_cluster',
       'srch_before_ci', 'srch_trip_duration', 'day_of_year_srch',
       'encoded_day_of_year_srch', 'srch_destination_type_id_0',
       'srch_destination_type_id_1', 'srch_destination_type_id_2',
       'srch_destination_type_id_3', 'srch_destination_type_id_4',
       'srch_destination_type_id_5', 'srch_destination_type_id_6',
       'srch_destination_type_id_7', 'srch_destination_type_id_8',
       'srch_destination_type_id_9'],
      dtype='object')