**In this file I present creating models with pipelines comparing results of cross-validated hyperparameters to achive best model's fitting on binned dataset**

Imports:

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import (StandardScaler, 
                                   OrdinalEncoder, 
                                   MinMaxScaler)

from sklearn.model_selection import (train_test_split, 
                                     GridSearchCV, 
                                     StratifiedKFold, 
                                     RandomizedSearchCV)

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import (classification_report, 
                             roc_auc_score, 
                             make_scorer, 
                             recall_score, 
                             confusion_matrix, 
                             accuracy_score,
                            get_scorer_names)
from sklearn.decomposition import PCA

Loading dataset:

In [2]:
data_clean = pd.read_pickle("data/data_bins.pkl")

In [13]:
data_clean.sample(5)

Unnamed: 0,hotel,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date
47019,City Hotel,1,31,zero,one_night,1,No_child,No_babies,BB,Hi-Freq,...,No Deposit,PRT,174.0,0,Transient-Party,71.0,0,0,2016-02-04,2016-02-04 00:00:00
25495,Resort Hotel,0,157,one_night,more,2,No_child,No_babies,BB,Freq,...,No Deposit,243.0,empty,0,Contract,64.9,0,1,2016-06-30,2016-06-23 00:00:00
50279,City Hotel,1,295,zero,one_night,2,No_child,No_babies,BB,Hi-Freq,...,Non Refund,1.0,empty,0,Transient,62.0,0,0,2015-10-21,2016-04-28 00:00:00
6081,Resort Hotel,0,223,one_night,more,2,No_child,No_babies,BB,Freq,...,Refundable,PRT,223.0,0,Transient-Party,70.29,1,0,2016-05-31,2016-05-24 00:00:00
112945,City Hotel,0,174,one_night,more,2,No_child,No_babies,HB,Lo_freq,...,No Deposit,9.0,empty,0,Transient,162.0,0,2,2017-05-31,2017-05-26 00:00:00


Dividing into predictor variables X and target y ("is_canceled"):

In [3]:
X = data_clean.drop("is_canceled", axis=1)
y = data_clean.is_canceled

Splitting dataset into train and test subsets with test size 30% and train 70%:

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=42
                                                   )

Shape after division

In [5]:
X_train.shape

(83573, 27)

In [6]:
X_test.shape

(35817, 27)

Outlier value of column adr found in a file "Reservation_Cancelation_Prediction" now is to be replaced with mean of adr column.

In [7]:
(X_train["adr"]==5400).sum()

1

In [8]:
(X_test["adr"]==5400).sum()

0

In [9]:
if (X_train["adr"]==5400).sum() > 0:
    X_train.replace({5400.0:np.round(X_train.adr.mean(), 2)}, inplace=True) #filling inordinary adr value with mean of training set adr column
    print("Outlier observations in train subset = ", (X_train["adr"]==5400).sum())
elif (X_test["adr"]==5400).sum() > 0:
    X_test.replace({5400.0:np.round(X_train.adr.mean(), 2)}, inplace=True)
    print("Outlier observations in test subset = ", (X_test["adr"]==5400).sum())

Abnormal observations in train subset =  0


Encoding categorial columns with OrdinalEncoder:

In [11]:
data_cat = data_clean.select_dtypes(["object"]).columns

In [14]:
data_label_train = X_train[data_cat]
data_label_test = X_test[data_cat]

In [15]:
ode = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ode.fit(data_label_train)
data_label_train_ode = pd.DataFrame(ode.transform(data_label_train),
                                    columns=data_cat)
data_label_test_ode = pd.DataFrame(ode.transform(data_label_test), 
                                   columns=data_cat)

In [16]:
data_label_train_ode

Unnamed: 0,hotel,stays_in_weekend_nights,stays_in_week_nights,children,babies,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,agent,company,customer_type,reservation_status_date,arrival_date
0,0.0,2.0,1.0,0.0,0.0,0.0,1.0,4.0,2.0,0.0,0.0,0.0,288.0,323.0,2.0,400.0,562.0
1,1.0,2.0,1.0,0.0,0.0,0.0,1.0,4.0,2.0,0.0,3.0,0.0,98.0,323.0,2.0,375.0,258.0
2,1.0,2.0,2.0,0.0,0.0,0.0,1.0,3.0,2.0,3.0,4.0,0.0,316.0,92.0,3.0,886.0,770.0
3,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,316.0,76.0,3.0,449.0,330.0
4,1.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,4.0,4.0,0.0,316.0,323.0,2.0,714.0,597.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83568,0.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,323.0,3.0,339.0,220.0
83569,0.0,2.0,1.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,1.0,143.0,323.0,2.0,640.0,660.0
83570,1.0,2.0,0.0,0.0,0.0,0.0,2.0,4.0,2.0,4.0,4.0,0.0,99.0,323.0,2.0,817.0,699.0
83571,0.0,2.0,1.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,1.0,193.0,323.0,2.0,304.0,231.0


Updating encoded columns:

In [20]:
X_train.drop(data_cat, axis=1, inplace=True)
X_test.drop(data_cat, axis=1, inplace=True)

KeyError: "['hotel', 'stays_in_weekend_nights', 'stays_in_week_nights', 'children', 'babies', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'agent', 'company', 'customer_type', 'reservation_status_date', 'arrival_date'] not found in axis"

Concatenating encoded features with the rest:

In [21]:
X_train = pd.concat([X_train.reset_index(drop=True), data_label_train_ode.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), data_label_test_ode.reset_index(drop=True)], axis=1)

In [22]:
X_train.shape

(83573, 27)

Encoding with get_dummies:

In [23]:
X_train = pd.get_dummies(X_train, drop_first=True)

In [24]:
X_test = pd.get_dummies(X_test, drop_first=True)
X_test = X_test.reindex(columns = X_train.columns, fill_value=0)

In [25]:
X_train.shape

(83573, 27)

Initiating StandardScaler for further data scaling:

In [26]:
scaler = StandardScaler()

Initiating Principal Components with ten components reducing dimentions to ten components :

In [27]:
pca = PCA(n_components=10)

Initiating algorithm to ballance unballanced data- SMOTEENN:

In [28]:
SMOTEEN = SMOTEENN()

RandomForestClassifier algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [None]:
stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=11)
#imbpipeline
pipeline_rf = imbpipeline(steps=[
    ['scaler', scaler],
    ['pca', pca],
    ['smote', SMOTEEN],
    ['rf', RandomForestClassifier()]])
    
param_distributions_rf = {
    'rf__n_estimators': [20, 100, 300],
    'rf__max_depth': [10, 20],
    'rf__min_samples_split': [5, 10],
    'pca__n_components': [5, 10, 20]
}

search_rf = RandomizedSearchCV(pipeline_rf, 
                               param_distributions_rf, 
                               n_iter=10, 
                               cv=stratified_kfold, 
                               scoring='roc_auc',
                               verbose=3
                              )

search_rf.fit(X_train, y_train)
y_pred_rf = search_rf.best_estimator_.predict(X_test)
print("Random Forest:")
print(search_rf.best_params_)
print(f'Results on test: {search_rf.best_estimator_.score(X_test, y_test)}')
print(f'Results on train: {search_rf.best_estimator_.score(X_train, y_train)}')

Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [None]:
#print(get_scorer_names())

In [None]:
y_pred_rf

In [None]:
print(classification_report(y_test, y_pred_rf))

In [None]:
A_report_rf = pd.DataFrame(classification_report(y_test, y_pred_rf, output_dict=True))

DecisionTreeClassifier algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [None]:
stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=13)

pipeline = imbpipeline(steps = [['scaler', scaler],
                                ['pca', pca],
                                ['smote', SMOTEEN],
                                ['dtc', DecisionTreeClassifier()]])

    
param_grid = {'dtc__max_leaf_nodes' : [2, 5, 10, 30], 
             'dtc__max_depth': [4, 10, 20, 40],
             'dtc__random_state' : [23],
             'pca__n_components': [5, 10, 20]
             }

search_dtc = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,                           
                          verbose=3,
                           #n_jobs=3
                         )

search_dtc.fit(X_train, y_train)
y_pred_dtc = search_dtc.best_estimator_.predict(X_test)
cv_score = search_dtc.best_score_
test_score = search_dtc.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print("Decision Tree:")
print(search_rf.best_params_)
print(f'Results on test: {search_rf.best_estimator_.score(X_test, y_test)}')
print(f'Results on train: {search_rf.best_estimator_.score(X_train, y_train)}')

Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [None]:
y_pred_dtc

In [None]:
print(classification_report(y_test, y_pred_dtc))
A_report_dtc = pd.DataFrame(classification_report(y_test, y_pred_dtc, output_dict=True))

Support Vector Classifier algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [19]:
stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=23)

pipeline_SVC = imbpipeline([('scaler', scaler),
                            ('pca', pca),
                            ('SMOTE', SMOTEEN),
                            ('SVC', SVC())])
    
params_SVC = {
              'SVC__gamma': ['auto'],# [10, 20, 50]
              'SVC__max_iter': [150, 300, 500],
              'SVC__decision_function_shape': ['ovo'],
              'SVC__degree': [1], #, 3, 5],
              'SVC__kernel': ['rbf'],
              'SVC__random_state': [11],
              'pca__n_components': [5, 10, 20]
             }

search_SVC = GridSearchCV(pipeline_SVC,
                             params_SVC,
                             scoring='roc_auc',
                             cv=stratified_kfold,
                            verbose=3,
                            #n_jobs=3
                         )

search_SVC.fit(X_train, y_train)

cv_score = search_SVC.best_score_
test_score = search_SVC.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print("Support Vector:")
print(search_SVC.best_params_)
print(f'Results on test: {search_SVC.best_estimator_.score(X_test, y_test)}')
print(f'Results on train: {search_SVC.best_estimator_.score(X_train, y_train)}')

NameError: name 'scaler' is not defined

Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [None]:
y_pred_SVC_train = search_SVC.best_estimator_.predict(X_train)

In [None]:
y_pred_svc_test = search_SVC.best_estimator_.predict(X_test)

In [None]:
y_pred_SVC = search_SVC.predict(X_test)

In [None]:
search_SVC.best_params_

In [None]:
print(classification_report(y_test, y_pred_SVC))
A_report_svc = pd.DataFrame(classification_report(y_test, y_pred_SVC, output_dict=True))

In [None]:
A_report_svc

In [None]:
type(classification_report(y_test, y_pred_SVC))

Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

XGBClassifier algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [None]:
stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=77)

pipeline = imbpipeline(steps=[('scaler', scaler),
                              ('pca', pca),
                              ('smote', SMOTEEN),
                              ('XGB', XGBClassifier())])

params = {
    'XGB__n_estimators': [100, 300, 500, 800],
    'XGB__max_depth': [3, 5, 7, 10],
    'XGB__learning_rate': [0.01, 0.1, 0.5, 1],
    'pca__n_components': [5, 10, 20]
    }

search_XGB = GridSearchCV(pipeline, 
                          params, 
                          scoring='roc_auc', 
                          cv=stratified_kfold, 
                          verbose=3,
                        #n_jobs=3
                         ) 

search_XGB.fit(X_train, y_train)
accuracy_score(y_test, search_XGB.predict(X_test))

Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [None]:
#XGBClassifier().get_params().keys()

In [None]:
search_XGB.cv_results_["mean_test_score"]

In [None]:
y_pred_XGB = search_XGB.best_estimator_.predict(X_test)
test_score = search_XGB.score(X_test, y_test)
cv_score = search_XGB.best_score_

In [None]:
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print("XGBClassifier:")
print(search_XGB.best_params_)
print(f'Results on test: {search_XGB.best_estimator_.score(X_test, y_test)}')
print(f'Results on train: {search_XGB.best_estimator_.score(X_train, y_train)}')

In [None]:
print(classification_report(y_test, y_pred_XGB))
A_report_xgb = pd.DataFrame(classification_report(y_test, y_pred_XGB, output_dict=True))

LogisticRegression algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [None]:
pipeline = imbpipeline(steps = [['scaler', scaler],
                                ['pca', pca],
                                ['smote', SMOTEEN],
                                ['LR', LogisticRegression()]])

stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=13)
    
param_grid = {'LR__C':[20, 50, 70, 80],
             'LR__random_state': [11],
             'LR__multi_class': ['auto'],
             'LR__max_iter': [12, 25, 50, 100],
             'LR__solver': ['saga'],
             'LR__penalty': ['l2', 'l1'],
             'pca__n_components': [5, 10, 20]
             }
                                                                 
search_LR = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           verbose=3,
                           #n_jobs=3
                        )

search_LR.fit(X_train, y_train)
cv_score = search_LR.best_score_
test_score = search_LR.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

In [None]:
search_LR.best_params_

In [None]:
y_pred_lr = search_LR.best_estimator_.predict(X_test)

In [None]:
test_score = search_LR.score(X_test, y_test)

In [None]:
print(classification_report(y_test, y_pred_lr))
A_report_rl = pd.DataFrame(classification_report(y_test, y_pred_lr, output_dict=True))

Utilizing Multi Layer Perceptron algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [None]:
pipeline = imbpipeline(steps = [['scaler', scaler],
                                ['pca', pca],
                                ['smote', SMOTEEN],
                                ['MLP', MLPClassifier()]])

stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=13)
    
param_grid = {'MLP__hidden_layer_sizes':[8, 4, 16],
             'MLP__activation': ['relu'],
              'MLP__solver': ['adam'],
              'MLP__random_state': [42],
              'MLP__max_iter': [1000],
              'MLP__batch_size': [32],
              'pca__n_components': [5, 10, 20]
             }
                                                                 
search_MLP = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           verbose=3,
                           #n_jobs=3
                        )

search_MLP.fit(X_train, y_train)
cv_score = search_MLP.best_score_
test_score = search_MLP.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [None]:
y_pred_mlp = MLP.predict(X_test)
print(classification_report(y_test, y_pred_mlp))
A_report_mlp = pd.DataFrame(classification_report(y_test, y_pred_mlp, output_dict=True))