In [1]:
from lazypredict import LazyClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


In [2]:
# reading data files
data_h1 = pd.read_csv('./data/H1.csv')
data_h2 = pd.read_csv('./data/H2.csv')

In [3]:
# combing the datasets together
data_h12 = pd.concat([data_h1, data_h2], axis=0)

In [4]:
# Cleaning Column company
print(data_h12['company'].unique()[:20])

['       NULL' '110' '113' '270' '178' '240' '154' '144' '307' '268' '59'
 '204' '312' '318' '94' '174' '274' '195' '223' '317']


In [5]:
def clean_company(x):
    try:
        return 'company_'+x.split()[0]
    except:
        return 'company_unknown'

In [6]:
# converting into categorical values
data_h12['company'] = data_h12['company'].apply(clean_company)

data_h12['company'].unique()[:20]

array(['company_NULL', 'company_110', 'company_113', 'company_270',
       'company_178', 'company_240', 'company_154', 'company_144',
       'company_307', 'company_268', 'company_59', 'company_204',
       'company_312', 'company_318', 'company_94', 'company_174',
       'company_274', 'company_195', 'company_223', 'company_317'],
      dtype=object)

In [7]:
data_h12 = pd.concat([data_h12,
                      pd.get_dummies(data_h12['company'])], axis=1)

In [8]:
data_h12.drop('company', axis=1, inplace=True)

In [9]:
"""
Cleaning reservation_status_date
    1. Extracting Year, Month and Date from reservation_status_date and concatinating it to the dataframe data_h12 and dropping the reservation_status_date
    2. This datetime extraction gives us better features.
"""
RSD = pd.DataFrame(pd.to_datetime(data_h12['reservation_status_date']).astype('str'))

In [10]:
def clean_RSD(x, dt='d'):
    try:
        if dt=='d':
            dt_no = 2
        elif dt=='m':
            dt_no = 1
        elif dt=='y':
            dt_no = 0
        return list(x)[0].split('-')[dt_no]
    except:
        return x.split('-')[dt_no]

In [11]:
RSD['ReservationStatusDate_year'] = RSD['reservation_status_date'].apply(clean_RSD, args=('y')).astype(int)
RSD['ReservationStatusDate_month'] = RSD['reservation_status_date'].apply(clean_RSD, args=('m')).astype(int)
RSD['ReservationStatusDate_day'] = RSD['reservation_status_date'].apply(clean_RSD, args=('d')).astype(int)

In [12]:
data_h12 = pd.concat([data_h12, RSD.iloc[:, 1:]], axis=1)

In [13]:
data_h12.drop('reservation_status_date', axis=1, inplace=True)

In [14]:
# Cleaning arrival_date_month
# Mapping the Month Name into Month number
print(data_h12['arrival_date_month'].unique())

['July' 'August' 'September' 'October' 'November' 'December' 'January'
 'February' 'March' 'April' 'May' 'June']


In [15]:
month_to_num = {
    'January': '01',
    'February': '02',
    'March': '03',
    'April': '04',
    'May': '05',
    'June': '06',
    'July': '07',
    'August': '08',
    'September':'09',
    'October': '10',
    'November': '11',
    'December': '12'
}

In [16]:
data_h12['arrival_date_month'] = data_h12['arrival_date_month'].map(month_to_num).astype('int')

In [17]:
# Cleaning meal
#    1. Label Encoding and Binarizing
#    2. concatenating to data_h12
#    3. dropping original Meal
def meal_parser(x):
    if x!='Undefined':
        return x.split(' ')[0]
    else:
        return 'Undefined'

In [18]:
meal_data = data_h12['meal'].apply(meal_parser)
meal_data = pd.get_dummies(meal_data)

In [19]:
# adding `meal_data`
data_h12 = pd.concat([data_h12, meal_data], axis=1)
# dropping `meal`
data_h12.drop('meal', axis=1, inplace=True)

In [20]:
# Cleaning reservation_status
#    1. Label Encoding
#    2. Dropping reservation_status
data_h12 = pd.concat([data_h12, pd.get_dummies(data_h12['reservation_status'])], axis=1)

In [21]:
data_h12.drop('reservation_status', axis=1, inplace=True)

In [22]:
# Cleaning country
# one hot encoding `Country`
country_col = pd.get_dummies(data_h12['country'])

# creating column names
country_col_names = ['country_'+str(con) for con in list(country_col.columns)]
# adding column names
country_col.columns = country_col_names

# concatenating to the main DataFrame
#data_h12 = pd.concat([data_h12, country_col], axis=1)

In [23]:
print(data_h12.columns)

Index(['is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_month',
       'arrival_date_week_number', 'arrival_date_day_of_month',
       'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children',
       ...
       'ReservationStatusDate_month', 'ReservationStatusDate_day', 'BB', 'FB',
       'HB', 'SC', 'Undefined', 'Canceled', 'Check-Out', 'No-Show'],
      dtype='object', length=282)


In [24]:
# Cleaning customer_type
#data_h12 = pd.concat([data_h12, pd.get_dummies(data_h12['customer_type'])],
#                    axis=1)
data_h12.drop('customer_type', axis=1, inplace=True)

In [25]:
# Cleaning distribution_channel
#data_h12 = pd.concat([data_h12, pd.get_dummies(data_h12['distribution_channel'])],
#                    axis=1)
data_h12.drop('distribution_channel', axis=1, inplace=True)

In [26]:
# Cleaning market_segment
#data_h12 = pd.concat([data_h12, pd.get_dummies(data_h12['market_segment'])],
#                    axis=1)
data_h12.drop('market_segment', axis=1, inplace=True)

In [27]:
drop_object_cols = ['country',
                    'reserved_room_type',
                    'assigned_room_type',
                    'deposit_type','agent', 'children']
data_h12.drop(drop_object_cols, axis=1, inplace=True)

In [28]:
# Label
y = data_h12['Canceled']
# Features
X = data_h12.drop(['Canceled', 'is_canceled', 'Check-Out'], axis=1)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.7, random_state=123)

In [30]:
clf = LazyClassifier(verbose=0,ignore_warnings=False, custom_metric=None)

In [31]:
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models

100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [04:11<00:00,  5.52s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.98,0.97,0.97,0.98,1.82
LGBMClassifier,0.97,0.94,0.94,0.96,0.72
BaggingClassifier,0.94,0.9,0.9,0.94,0.81
DecisionTreeClassifier,0.91,0.89,0.89,0.91,0.22
ExtraTreesClassifier,0.93,0.88,0.88,0.93,0.38
RandomForestClassifier,0.93,0.88,0.88,0.92,0.52
LabelPropagation,0.89,0.85,0.85,0.89,70.51
LabelSpreading,0.89,0.85,0.85,0.89,94.44
ExtraTreeClassifier,0.88,0.84,0.84,0.88,0.14
KNeighborsClassifier,0.89,0.82,0.82,0.88,16.38
