In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error,classification_report

from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from xgboost import plot_importance

from numpy import sort
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import sklearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from collections import Counter
from matplotlib import pyplot
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import RandomizedSearchCV

## Import datasets

In [2]:
train = pd.read_csv('Hotel-A-train.csv')
validation = pd.read_csv('Hotel-A-validation.csv')
test = pd.read_csv('Hotel-A-test.csv')
train.head()

Unnamed: 0,Reservation-id,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Expected_checkin,Expected_checkout,...,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Booking_channel,Required_Car_Parking,Reservation_Status,Use_Promotion,Discount_Rate,Room_Rate
0,39428300,F,40,Latino,Grad,<25K,North,City Hotel,7/1/2015,7/2/2015,...,BB,No,No,No Deposit,Online,Yes,Check-In,Yes,10,218
1,77491756,F,49,Latino,Mid-School,50K -- 100K,East,City Hotel,7/1/2015,7/2/2015,...,BB,No,No,Refundable,Online,Yes,Check-In,No,0,185
2,73747291,F,42,caucasian,Grad,<25K,East,City Hotel,7/2/2015,7/6/2015,...,BB,No,No,No Deposit,Online,Yes,Check-In,No,0,119
3,67301739,M,25,African American,College,>100K,South,Airport Hotels,7/2/2015,7/3/2015,...,BB,No,No,Refundable,Agent,Yes,Check-In,Yes,5,144
4,77222321,F,62,Latino,High-School,25K --50K,East,Resort,7/3/2015,7/4/2015,...,BB,No,No,No Deposit,Direct,No,Check-In,Yes,10,242


## Prepare training and validation data

In [3]:
def extract_data(train_data):
    Reservation_ID = train_data.pop("Reservation-id")
    y_train = train_data.pop("Reservation_Status")

    #change y_train to classes for turning this as a classification problem
    y_train = y_train.map({'Check-In':1, 'Canceled':2, 'No-Show':3})

    #calculate no-of check in days and try the possiblities it is weekend or not
    train_data["Expected_checkin_days"] = pd.to_datetime(train_data["Expected_checkout"]) - pd.to_datetime(train_data["Expected_checkin"])
    train_data["Expected_checkin_days"] = [int(str(x).split(" ")[0]) for x in train_data["Expected_checkin_days"]]
    
    train_data['CheckIn_Weekday'] = pd.to_datetime(train_data['Expected_checkin']).dt.dayofweek  # monday = 0, sunday = 6

    train_data['CheckInweekend_indi'] = 0          # Initialize the column with default value of 0
    train_data.loc[train_data['CheckIn_Weekday'].isin([5, 6]), 'CheckInweekend_indi'] = 1  # 5 and 6 correspond to Sat and Sun 
    
    train_data["AdvanceBook_Period"] = pd.to_datetime(train_data["Expected_checkin"]) - pd.to_datetime(train_data["Booking_date"])
    train_data["AdvanceBook_Period"] = [int(str(x).split(" ")[0]) for x in train_data["AdvanceBook_Period"]]
    
    train_data.pop("Expected_checkout")
    train_data.pop("Expected_checkin")

    train_data.pop("Booking_date")

    ## difference between checkin-day and booked day
    # for age use intervals

    train_data["Gender"] = train_data["Gender"].map({'F':0, 'M':1})
    train_data["Required_Car_Parking"] = train_data["Required_Car_Parking"].map({'No':0, 'Yes':1})
    train_data["Use_Promotion"] = train_data["Use_Promotion"].map({'No':0, 'Yes':1})
    train_data["Booking_channel"] = train_data["Booking_channel"].map({'Online':0, 'Direct':1, 'Agent':2})
    train_data["Deposit_type"] = train_data["Deposit_type"].map({'No Deposit':0, 'Refundable':1, 'Non-Refundable':2})
    train_data["Previous_Cancellations"] = train_data["Previous_Cancellations"].map({'No':0, 'Yes':1})
    train_data["Visted_Previously"] = train_data["Visted_Previously"].map({'No':0, 'Yes':1})
    train_data["Meal_Type"] = train_data["Meal_Type"].map({'BB':0, 'HB':1, 'FB':2})

    train_data["Ethnicity"] = train_data["Ethnicity"].map({'African American':0, 'caucasian':1, 'Asian American':2, "Latino":3})
    train_data["Educational_Level"] = train_data["Educational_Level"].map({'College':0, 'Mid-School':1, 'High-School':2, "Grad":3})
    train_data["Income"] = train_data["Income"].map({'<25K':0, '25K --50K':1, '50K -- 100K':2, '>100K':3})

    changes=["Country_region","Hotel_Type"]
    dummies=pd.concat([pd.get_dummies(train_data[col]) for col in changes], axis=1)
    train_data=pd.concat([train_data,dummies],axis=1)

    [train_data.pop(col) for col in changes]

    train_data["Age"] = train_data["Age"]//20
    train_data["Discount_Rate"] = train_data["Discount_Rate"]//5
    train_data["Room_Rate"] = train_data["Room_Rate"]//40
    
    return Reservation_ID, y_train, train_data

In [4]:
train_id, y_train, train_data = extract_data(train)
val_id, y_val, val_data = extract_data(validation)

## Downsampling

In [5]:
from imblearn.under_sampling import NearMiss
undersample = NearMiss({1:4000,2:2900,3:2125},version=3, n_neighbors=3)

X_tl, y_tl = undersample.fit_resample(train_data,y_train)
pd.Series(y_tl).value_counts()



1    4000
2    2900
3    2125
Name: Reservation_Status, dtype: int64

## Model parameter tuning

In [6]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 50, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
#print(random_grid)

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

model = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=33, n_jobs = -1)
model.fit(X_tl, y_tl)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 14, 18, 22, 26, 30,
                                                      34, 38, 42, 46, 50,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 288, 377, 466,
                                                         555, 644, 733, 822,
                                                         911, 1000]},
                   random_state=33, verbose=2)

In [8]:
print(model.best_params_)

{'n_estimators': 466, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': True}


## Test data preparation

In [9]:
def extract_test_data(train_data):
    Reservation_ID = train_data.pop("Reservation-id")
    #y_train = train_data.pop("Reservation_Status")

    #change y_train to classes for turning this as a classification problem
    #y_train = y_train.map({'Check-In':1, 'Canceled':2, 'No-Show':3})

    #calculate no-of check in days and try the possiblities it is weekend or not
    train_data["Expected_checkin_days"] = pd.to_datetime(train_data["Expected_checkout"]) - pd.to_datetime(train_data["Expected_checkin"])
    train_data["Expected_checkin_days"] = [int(str(x).split(" ")[0]) for x in train_data["Expected_checkin_days"]]
    
    train_data['CheckIn_Weekday'] = pd.to_datetime(train_data['Expected_checkin']).dt.dayofweek  # monday = 0, sunday = 6

    train_data['CheckInweekend_indi'] = 0          # Initialize the column with default value of 0
    train_data.loc[train_data['CheckIn_Weekday'].isin([5, 6]), 'CheckInweekend_indi'] = 1  # 5 and 6 correspond to Sat and Sun 
    
    train_data["AdvanceBook_Period"] = pd.to_datetime(train_data["Expected_checkin"]) - pd.to_datetime(train_data["Booking_date"])
    train_data["AdvanceBook_Period"] = [int(str(x).split(" ")[0]) for x in train_data["AdvanceBook_Period"]]
    
    train_data.pop("Expected_checkout")
    train_data.pop("Expected_checkin")

    train_data.pop("Booking_date")

    ## difference between checkin-day and booked day
    # for age use intervals

    train_data["Gender"] = train_data["Gender"].map({'F':0, 'M':1})
    train_data["Required_Car_Parking"] = train_data["Required_Car_Parking"].map({'No':0, 'Yes':1})
    train_data["Use_Promotion"] = train_data["Use_Promotion"].map({'No':0, 'Yes':1})
    train_data["Booking_channel"] = train_data["Booking_channel"].map({'Online':0, 'Direct':1, 'Agent':2})
    train_data["Deposit_type"] = train_data["Deposit_type"].map({'No Deposit':0, 'Refundable':1, 'Non-Refundable':2})
    train_data["Previous_Cancellations"] = train_data["Previous_Cancellations"].map({'No':0, 'Yes':1})
    train_data["Visted_Previously"] = train_data["Visted_Previously"].map({'No':0, 'Yes':1})
    train_data["Meal_Type"] = train_data["Meal_Type"].map({'BB':0, 'HB':1, 'FB':2})

    train_data["Ethnicity"] = train_data["Ethnicity"].map({'African American':0, 'caucasian':1, 'Asian American':2, "Latino":3})
    train_data["Educational_Level"] = train_data["Educational_Level"].map({'College':0, 'Mid-School':1, 'High-School':2, "Grad":3})
    train_data["Income"] = train_data["Income"].map({'<25K':0, '25K --50K':1, '50K -- 100K':2, '>100K':3})

    changes=["Country_region","Hotel_Type"]
    dummies=pd.concat([pd.get_dummies(train_data[col]) for col in changes], axis=1)
    train_data=pd.concat([train_data,dummies],axis=1)

    [train_data.pop(col) for col in changes]

    train_data["Age"] = train_data["Age"]//20
    train_data["Discount_Rate"] = train_data["Discount_Rate"]//5
    train_data["Room_Rate"] = train_data["Room_Rate"]//40
    
    return Reservation_ID, train_data

## F1 score on validation data

In [10]:
y_pred_val = model.predict(val_data)
accuracy = accuracy_score(y_val, y_pred_val)
print("Accuracy: %.2f%%" % (accuracy*100.0))

print(classification_report(y_val,y_pred_val))

Accuracy: 50.56%
              precision    recall  f1-score   support

           1       0.60      0.72      0.66      1610
           2       0.29      0.29      0.29       741
           3       0.19      0.03      0.04       398

    accuracy                           0.51      2749
   macro avg       0.36      0.35      0.33      2749
weighted avg       0.46      0.51      0.47      2749



In [11]:
op = {'Reservation-id':val_id,'Reservation_status':y_pred_val}
df = pd.DataFrame(op,columns=['Reservation-id','Reservation_status'])
vexport_csv=df.to_csv('prediction_val1.csv',index=None,header=True)

## Prediction

In [12]:
test_id,test_data = extract_test_data(test)

In [13]:
y_pred_test = model.predict(test_data)

In [14]:
# op = {'Reservation-id':test_id,'Reservation_status':y_pred_test}
# df = pd.DataFrame(op,columns=['Reservation-id','Reservation_status'])
# df.head(20)
# vexport_csv=df.to_csv('prediction_day3_3.csv',index=None,header=True)