**In this file I present creating models with pipelines comparing results of cross-validated hyperparameters to achive best model's fitting on binned dataset. Summary.**

Imports:

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import (StandardScaler, 
                                   OrdinalEncoder, 
                                   MinMaxScaler)

from sklearn.model_selection import (train_test_split, 
                                     GridSearchCV, 
                                     StratifiedKFold, 
                                     RandomizedSearchCV)

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import (classification_report, 
                             roc_auc_score, 
                             make_scorer, 
                             recall_score, 
                             confusion_matrix, 
                             accuracy_score,
                            get_scorer_names)
from sklearn.decomposition import PCA

Loading dataset:

In [2]:
data_clean = pd.read_pickle("data/data_bins.pkl")

In [3]:
data_clean.sample(5)

Unnamed: 0,hotel,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date
119260,City Hotel,0,292,more,more,2,No_child,No_babies,BB,Freq,...,No Deposit,9.0,empty,0,Transient,128.1,0,2,2017-09-01,2017-08-20 00:00:00
1744,Resort Hotel,1,172,one_night,one_night,2,No_child,No_babies,BB,Hi-Freq,...,No Deposit,15.0,empty,0,Transient-Party,36.0,0,0,2015-08-06,2015-09-12 00:00:00
6571,Resort Hotel,1,250,one_night,more,2,No_child,No_babies,BB,Hi-Freq,...,No Deposit,240.0,empty,0,Transient,99.0,0,2,2016-01-18,2016-06-18 00:00:00
114102,City Hotel,0,423,zero,more,2,No_child,No_babies,BB,Freq,...,No Deposit,229.0,empty,0,Transient-Party,112.67,0,1,2017-06-17,2017-06-14 00:00:00
104941,City Hotel,0,21,one_night,one_night,1,No_child,No_babies,BB,Freq,...,No Deposit,9.0,empty,0,Transient,67.58,0,1,2017-01-26,2017-01-22 00:00:00


Dividing into predictor variables X and target y ("is_canceled"):

In [4]:
X = data_clean.drop("is_canceled", axis=1)
y = data_clean.is_canceled

Splitting dataset into train and test subsets with test size 30% and train 70%:

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=42
                                                   )

Shape after division

In [6]:
X_train.shape

(83573, 27)

In [7]:
X_test.shape

(35817, 27)

Inputting NaNs in country column with the most frequent value ()max of train subset into train and test:

In [8]:
country_input = X_train["country"][X_train.country.value_counts().max()]

In [9]:
X_train.country.fillna(country_input, inplace=True)

In [10]:
X_test.country.fillna(country_input, inplace=True)

Inputting NaNs in agent column with the most frequent value ()max of train subset into train and test:

In [11]:
agent_input = X_train["agent"][X_train.agent.value_counts().max()]

In [12]:
X_train.agent.fillna(agent_input, inplace=True)

In [13]:
X_test.agent.fillna(agent_input, inplace=True)

Outlier value of column adr found in a file "Reservation_Cancelation_Prediction" now is to be replaced with mean of adr column.

In [14]:
(X_train["adr"]==5400).sum()

1

In [15]:
(X_test["adr"]==5400).sum()

0

In [16]:
if (X_train["adr"]==5400).sum() > 0:
    X_train.replace({5400.0:np.round(X_train.adr.mean(), 2)}, inplace=True) #filling inordinary adr value with mean of training set adr column
    print("Outlier observations in train subset = ", (X_train["adr"]==5400).sum())
elif (X_test["adr"]==5400).sum() > 0:
    X_test.replace({5400.0:np.round(X_train.adr.mean(), 2)}, inplace=True)
    print("Outlier observations in test subset = ", (X_test["adr"]==5400).sum())

Outlier observations in train subset =  0


Encoding categorial columns with OrdinalEncoder:

In [17]:
data_cat = data_clean.select_dtypes(["object"]).columns

In [18]:
data_label_train = X_train[data_cat]
data_label_test = X_test[data_cat]

In [19]:
ode = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ode.fit(data_label_train)
data_label_train_ode = pd.DataFrame(ode.transform(data_label_train),
                                    columns=data_cat)
data_label_test_ode = pd.DataFrame(ode.transform(data_label_test), 
                                   columns=data_cat)

In [20]:
data_label_train_ode

Unnamed: 0,hotel,stays_in_weekend_nights,stays_in_week_nights,children,babies,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,agent,company,customer_type,reservation_status_date,arrival_date
0,0.0,2.0,1.0,0.0,0.0,0.0,1.0,4.0,2.0,0.0,0.0,0.0,288.0,323.0,2.0,400.0,562.0
1,1.0,2.0,1.0,0.0,0.0,0.0,1.0,4.0,2.0,0.0,3.0,0.0,98.0,323.0,2.0,375.0,258.0
2,1.0,2.0,2.0,0.0,0.0,0.0,1.0,3.0,2.0,3.0,4.0,0.0,316.0,92.0,3.0,886.0,770.0
3,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,316.0,76.0,3.0,449.0,330.0
4,1.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,4.0,4.0,0.0,316.0,323.0,2.0,714.0,597.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83568,0.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,323.0,3.0,339.0,220.0
83569,0.0,2.0,1.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,1.0,143.0,323.0,2.0,640.0,660.0
83570,1.0,2.0,0.0,0.0,0.0,0.0,2.0,4.0,2.0,4.0,4.0,0.0,99.0,323.0,2.0,817.0,699.0
83571,0.0,2.0,1.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,1.0,193.0,323.0,2.0,304.0,231.0


Updating encoded columns:

In [21]:
X_train.drop(data_cat, axis=1, inplace=True)
X_test.drop(data_cat, axis=1, inplace=True)

Concatenating encoded features with the rest:

In [22]:
X_train = pd.concat([X_train.reset_index(drop=True), data_label_train_ode.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), data_label_test_ode.reset_index(drop=True)], axis=1)

In [23]:
X_train.shape

(83573, 27)

Encoding with get_dummies:

In [24]:
X_train = pd.get_dummies(X_train, drop_first=True)

In [25]:
X_test = pd.get_dummies(X_test, drop_first=True)
X_test = X_test.reindex(columns = X_train.columns, fill_value=0)

In [26]:
X_train.shape

(83573, 27)

Initiating StandardScaler for further data scaling:

In [27]:
scaler = StandardScaler()

Initiating Principal Components with ten components reducing dimentions to ten components :

In [28]:
pca = PCA(n_components=10)

Initiating algorithm to ballance unballanced data- SMOTEENN:

In [29]:
SMOTEEN = SMOTEENN()

RandomForestClassifier algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [30]:
stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=11)
#imbpipeline
pipeline_rf = imbpipeline(steps=[
    ['scaler', scaler],
    ['pca', pca],
    ['smote', SMOTEEN],
    ['rf', RandomForestClassifier()]])
    
param_distributions_rf = {
    'rf__n_estimators': [20, 100],
    'rf__max_depth': [10, 20],
    'rf__min_samples_split': [5, 10],
    'pca__n_components': [5, 10, 20]
}

search_rf = RandomizedSearchCV(pipeline_rf, 
                               param_distributions_rf, 
                               n_iter=10, 
                               cv=stratified_kfold, 
                               scoring='roc_auc',
                               verbose=3
                              )

search_rf.fit(X_train, y_train)
y_pred_rf = search_rf.best_estimator_.predict(X_test)
print("Random Forest:")
print(search_rf.best_params_)
print(f'Results on test: {search_rf.best_estimator_.score(X_test, y_test)}')
print(f'Results on train: {search_rf.best_estimator_.score(X_train, y_train)}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END pca__n_components=20, rf__max_depth=10, rf__min_samples_split=10, rf__n_estimators=100;, score=0.890 total time=  44.2s
[CV 2/5] END pca__n_components=20, rf__max_depth=10, rf__min_samples_split=10, rf__n_estimators=100;, score=0.890 total time=  42.7s
[CV 3/5] END pca__n_components=20, rf__max_depth=10, rf__min_samples_split=10, rf__n_estimators=100;, score=0.892 total time=  42.7s
[CV 4/5] END pca__n_components=20, rf__max_depth=10, rf__min_samples_split=10, rf__n_estimators=100;, score=0.891 total time=  43.0s
[CV 5/5] END pca__n_components=20, rf__max_depth=10, rf__min_samples_split=10, rf__n_estimators=100;, score=0.893 total time=  42.7s
[CV 1/5] END pca__n_components=10, rf__max_depth=10, rf__min_samples_split=5, rf__n_estimators=100;, score=0.869 total time=  30.5s
[CV 2/5] END pca__n_components=10, rf__max_depth=10, rf__min_samples_split=5, rf__n_estimators=100;, score=0.870 total time=  30.4s
[CV 3/5] E

Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [31]:
#print(get_scorer_names())

In [32]:
y_pred_rf

array([0, 1, 1, ..., 0, 0, 0])

In [33]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86     22550
           1       0.80      0.67      0.73     13267

    accuracy                           0.82     35817
   macro avg       0.81      0.79      0.80     35817
weighted avg       0.81      0.82      0.81     35817



In [34]:
B_report_rf = pd.DataFrame(classification_report(y_test, y_pred_rf, output_dict=True))

In [35]:
for i, name in enumerate(B_report_rf.columns):
  B_report_rf = B_report_rf.rename(columns={(B_report_rf.iloc[:,i].name): ('RF_'+B_report_rf.iloc[:,i].name)})


In [36]:
B_report_rf

Unnamed: 0,RF_0,RF_1,RF_accuracy,RF_macro avg,RF_weighted avg
precision,0.823551,0.798728,0.815814,0.811139,0.814356
recall,0.900355,0.672119,0.815814,0.786237,0.815814
f1-score,0.860242,0.729974,0.815814,0.795108,0.811989
support,22550.0,13267.0,0.815814,35817.0,35817.0


DecisionTreeClassifier algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [37]:
stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=13)

pipeline = imbpipeline(steps = [['scaler', scaler],
                                ['pca', pca],
                                ['smote', SMOTEEN],
                                ['dtc', DecisionTreeClassifier()]])

    
param_grid = {'dtc__max_leaf_nodes' : [5, 30], 
             'dtc__max_depth': [10, 40],
             'dtc__random_state' : [23],
             'pca__n_components': [5, 10, 20]
             }

search_dtc = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,                           
                          verbose=3,
                           #n_jobs=3
                         )

search_dtc.fit(X_train, y_train)
y_pred_dtc = search_dtc.best_estimator_.predict(X_test)
cv_score = search_dtc.best_score_
test_score = search_dtc.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print("Decision Tree:")
print(search_rf.best_params_)
print(f'Results on test: {search_rf.best_estimator_.score(X_test, y_test)}')
print(f'Results on train: {search_rf.best_estimator_.score(X_train, y_train)}')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END dtc__max_depth=10, dtc__max_leaf_nodes=5, dtc__random_state=23, pca__n_components=5;, score=0.725 total time=   2.6s
[CV 2/5] END dtc__max_depth=10, dtc__max_leaf_nodes=5, dtc__random_state=23, pca__n_components=5;, score=0.726 total time=   2.5s
[CV 3/5] END dtc__max_depth=10, dtc__max_leaf_nodes=5, dtc__random_state=23, pca__n_components=5;, score=0.730 total time=   2.5s
[CV 4/5] END dtc__max_depth=10, dtc__max_leaf_nodes=5, dtc__random_state=23, pca__n_components=5;, score=0.730 total time=   2.5s
[CV 5/5] END dtc__max_depth=10, dtc__max_leaf_nodes=5, dtc__random_state=23, pca__n_components=5;, score=0.729 total time=   2.5s
[CV 1/5] END dtc__max_depth=10, dtc__max_leaf_nodes=5, dtc__random_state=23, pca__n_components=10;, score=0.728 total time=  12.4s
[CV 2/5] END dtc__max_depth=10, dtc__max_leaf_nodes=5, dtc__random_state=23, pca__n_components=10;, score=0.725 total time=  13.4s
[CV 3/5] END dtc__max_depth

Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [38]:
y_pred_dtc

array([1, 1, 0, ..., 0, 1, 0])

In [39]:
print(classification_report(y_test, y_pred_dtc))
B_report_dtc = pd.DataFrame(classification_report(y_test, y_pred_dtc, output_dict=True))

              precision    recall  f1-score   support

           0       0.79      0.76      0.77     22550
           1       0.62      0.67      0.64     13267

    accuracy                           0.72     35817
   macro avg       0.71      0.71      0.71     35817
weighted avg       0.73      0.72      0.73     35817



In [40]:
for i, name in enumerate(B_report_dtc.columns):
  B_report_dtc = B_report_dtc.rename(columns={(B_report_dtc.iloc[:,i].name): ('DTC_'+B_report_dtc.iloc[:,i].name)})


In [41]:
B_report_dtc

Unnamed: 0,DTC_0,DTC_1,DTC_accuracy,DTC_macro avg,DTC_weighted avg
precision,0.793867,0.61685,0.723064,0.705359,0.728298
recall,0.756585,0.666089,0.723064,0.711337,0.723064
f1-score,0.774778,0.640525,0.723064,0.707651,0.725049
support,22550.0,13267.0,0.723064,35817.0,35817.0


Support Vector Classifier algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [42]:
stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=23)

pipeline_SVC = imbpipeline([('scaler', scaler),
                            ('pca', pca),
                            ('SMOTE', SMOTEEN),
                            ('SVC', SVC())])
    
params_SVC = {
              'SVC__gamma': ['auto'],# [10, 20, 50]
              'SVC__max_iter': [150, 300],
              'SVC__decision_function_shape': ['ovo'],
              'SVC__degree': [1], #, 3, 5],
              'SVC__kernel': ['rbf'],
              'SVC__random_state': [11],
              'pca__n_components': [5, 10, 20]
             }

search_SVC = GridSearchCV(pipeline_SVC,
                             params_SVC,
                             scoring='roc_auc',
                             cv=stratified_kfold,
                            verbose=3,
                            #n_jobs=3
                         )

search_SVC.fit(X_train, y_train)

cv_score = search_SVC.best_score_
test_score = search_SVC.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print("Support Vector:")
print(search_SVC.best_params_)
print(f'Results on test: {search_SVC.best_estimator_.score(X_test, y_test)}')
print(f'Results on train: {search_SVC.best_estimator_.score(X_train, y_train)}')

Fitting 5 folds for each of 6 candidates, totalling 30 fits




[CV 1/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=5;, score=0.610 total time=   3.5s




[CV 2/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=5;, score=0.577 total time=   3.4s




[CV 3/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=5;, score=0.616 total time=   3.4s




[CV 4/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=5;, score=0.416 total time=   3.5s




[CV 5/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=5;, score=0.544 total time=   3.4s




[CV 1/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=10;, score=0.618 total time=  12.7s




[CV 2/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=10;, score=0.576 total time=  12.9s




[CV 3/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=10;, score=0.614 total time=  13.3s




[CV 4/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=10;, score=0.575 total time=  12.9s




[CV 5/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=10;, score=0.591 total time=  12.4s




[CV 1/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=20;, score=0.631 total time=  20.4s




[CV 2/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=20;, score=0.554 total time=  20.6s




[CV 3/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=20;, score=0.661 total time=  20.6s




[CV 4/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=20;, score=0.598 total time=  20.6s




[CV 5/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=150, SVC__random_state=11, pca__n_components=20;, score=0.675 total time=  20.8s




[CV 1/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=5;, score=0.667 total time=   4.6s




[CV 2/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=5;, score=0.447 total time=   4.6s




[CV 3/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=5;, score=0.574 total time=   4.6s




[CV 4/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=5;, score=0.489 total time=   4.5s




[CV 5/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=5;, score=0.455 total time=   4.6s




[CV 1/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=10;, score=0.595 total time=  14.4s




[CV 2/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=10;, score=0.452 total time=  14.0s




[CV 3/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=10;, score=0.649 total time=  14.5s




[CV 4/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=10;, score=0.618 total time=  13.9s




[CV 5/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=10;, score=0.606 total time=  14.0s




[CV 1/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=20;, score=0.598 total time=  22.2s




[CV 2/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=20;, score=0.647 total time=  22.3s




[CV 3/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=20;, score=0.670 total time=  22.4s




[CV 4/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=20;, score=0.672 total time=  22.2s




[CV 5/5] END SVC__decision_function_shape=ovo, SVC__degree=1, SVC__gamma=auto, SVC__kernel=rbf, SVC__max_iter=300, SVC__random_state=11, pca__n_components=20;, score=0.635 total time=  22.2s




Cross-validation score: 0.6444381925783309
Test score: 0.5821769483891897
Support Vector:
{'SVC__decision_function_shape': 'ovo', 'SVC__degree': 1, 'SVC__gamma': 'auto', 'SVC__kernel': 'rbf', 'SVC__max_iter': 300, 'SVC__random_state': 11, 'pca__n_components': 20}
Results on test: 0.5926236144847419
Results on train: 0.5914350328455362


Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [43]:
y_pred_SVC_train = search_SVC.best_estimator_.predict(X_train)

In [44]:
y_pred_svc_test = search_SVC.best_estimator_.predict(X_test)

In [45]:
y_pred_SVC = search_SVC.predict(X_test)

In [46]:
search_SVC.best_params_

{'SVC__decision_function_shape': 'ovo',
 'SVC__degree': 1,
 'SVC__gamma': 'auto',
 'SVC__kernel': 'rbf',
 'SVC__max_iter': 300,
 'SVC__random_state': 11,
 'pca__n_components': 20}

In [47]:
print(classification_report(y_test, y_pred_SVC))
B_report_svc = pd.DataFrame(classification_report(y_test, y_pred_SVC, output_dict=True))

              precision    recall  f1-score   support

           0       0.65      0.76      0.70     22550
           1       0.43      0.31      0.36     13267

    accuracy                           0.59     35817
   macro avg       0.54      0.54      0.53     35817
weighted avg       0.57      0.59      0.58     35817



In [48]:
for i, name in enumerate(B_report_svc.columns):
  B_report_svc = B_report_svc.rename(columns={(B_report_svc.iloc[:,i].name): ('SVC_'+B_report_svc.iloc[:,i].name)})


In [49]:
B_report_svc

Unnamed: 0,SVC_0,SVC_1,SVC_accuracy,SVC_macro avg,SVC_weighted avg
precision,0.65193,0.431214,0.592624,0.541572,0.570174
recall,0.757251,0.312806,0.592624,0.535028,0.592624
f1-score,0.700654,0.362588,0.592624,0.531621,0.575431
support,22550.0,13267.0,0.592624,35817.0,35817.0


XGBClassifier algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [50]:
stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=77)

pipeline = imbpipeline(steps=[('scaler', scaler),
                              ('pca', pca),
                              ('smote', SMOTEEN),
                              ('XGB', XGBClassifier())])

params = {
    'XGB__n_estimators': [100, 500],
    'XGB__max_depth': [5, 10],
    'XGB__learning_rate': [0.1, 0.5],
    'pca__n_components': [5, 10, 20]
    }

search_XGB = GridSearchCV(pipeline, 
                          params, 
                          scoring='roc_auc', 
                          cv=stratified_kfold, 
                          verbose=3,
                        #n_jobs=3
                         ) 

search_XGB.fit(X_train, y_train)
accuracy_score(y_test, search_XGB.predict(X_test))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END XGB__learning_rate=0.1, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=5;, score=0.826 total time=   5.4s
[CV 2/5] END XGB__learning_rate=0.1, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=5;, score=0.831 total time=   5.4s
[CV 3/5] END XGB__learning_rate=0.1, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=5;, score=0.838 total time=   5.3s
[CV 4/5] END XGB__learning_rate=0.1, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=5;, score=0.832 total time=   5.3s
[CV 5/5] END XGB__learning_rate=0.1, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=5;, score=0.829 total time=   5.2s
[CV 1/5] END XGB__learning_rate=0.1, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=10;, score=0.862 total time=  16.9s
[CV 2/5] END XGB__learning_rate=0.1, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=10;, score=0.876 total time=  16.5s
[CV 3/5] END XGB__l

[CV 3/5] END XGB__learning_rate=0.5, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=5;, score=0.846 total time=   5.3s
[CV 4/5] END XGB__learning_rate=0.5, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=5;, score=0.844 total time=   5.2s
[CV 5/5] END XGB__learning_rate=0.5, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=5;, score=0.845 total time=   5.3s
[CV 1/5] END XGB__learning_rate=0.5, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=10;, score=0.887 total time=  16.1s
[CV 2/5] END XGB__learning_rate=0.5, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=10;, score=0.890 total time=  17.9s
[CV 3/5] END XGB__learning_rate=0.5, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=10;, score=0.891 total time=  16.8s
[CV 4/5] END XGB__learning_rate=0.5, XGB__max_depth=5, XGB__n_estimators=100, pca__n_components=10;, score=0.890 total time=  16.6s
[CV 5/5] END XGB__learning_rate=0.5, XGB__max_depth=5, XGB__n_estimators=100, p

0.831532512494067

Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [51]:
#XGBClassifier().get_params().keys()

In [52]:
search_XGB.cv_results_["mean_test_score"]

array([0.83112947, 0.87007317, 0.8940661 , 0.84810114, 0.89376156,
       0.91357181, 0.85882674, 0.89929776, 0.9157148 , 0.86545448,
       0.90782276, 0.92126688, 0.843452  , 0.88967043, 0.90824629,
       0.85276862, 0.90091971, 0.91769372, 0.86226631, 0.90400104,
       0.9178607 , 0.86227291, 0.90589789, 0.91939579])

In [53]:
y_pred_XGB = search_XGB.best_estimator_.predict(X_test)
test_score = search_XGB.score(X_test, y_test)
cv_score = search_XGB.best_score_

In [54]:
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print("XGBClassifier:")
print(search_XGB.best_params_)
print(f'Results on test: {search_XGB.best_estimator_.score(X_test, y_test)}')
print(f'Results on train: {search_XGB.best_estimator_.score(X_train, y_train)}')

Cross-validation score: 0.9212668799026942
Test score: 0.8945331505392322
XGBClassifier:
{'XGB__learning_rate': 0.1, 'XGB__max_depth': 10, 'XGB__n_estimators': 500, 'pca__n_components': 20}
Results on test: 0.831532512494067
Results on train: 0.8844842233735776


In [55]:
print(classification_report(y_test, y_pred_XGB))
B_report_xgb = pd.DataFrame(classification_report(y_test, y_pred_XGB, output_dict=True))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87     22550
           1       0.81      0.71      0.76     13267

    accuracy                           0.83     35817
   macro avg       0.83      0.81      0.81     35817
weighted avg       0.83      0.83      0.83     35817



In [56]:
for i, name in enumerate(B_report_xgb.columns):
  B_report_xgb = B_report_xgb.rename(columns={(B_report_xgb.iloc[:,i].name): ('XGB_'+B_report_xgb.iloc[:,i].name)})


In [57]:
B_report_xgb

Unnamed: 0,XGB_0,XGB_1,XGB_accuracy,XGB_macro avg,XGB_weighted avg
precision,0.84124,0.811311,0.831533,0.826275,0.830154
recall,0.902794,0.710409,0.831533,0.806602,0.831533
f1-score,0.87093,0.757515,0.831533,0.814223,0.82892
support,22550.0,13267.0,0.831533,35817.0,35817.0


LogisticRegression algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [58]:
pipeline = imbpipeline(steps = [['scaler', scaler],
                                ['pca', pca],
                                ['smote', SMOTEEN],
                                ['LR', LogisticRegression()]])

stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=13)
    
param_grid = {'LR__C':[20, 70],
             'LR__random_state': [11],
             'LR__multi_class': ['auto'],
             'LR__max_iter': [50, 100],
             'LR__solver': ['saga'],
             'LR__penalty': ['l2', 'l1'],
             'pca__n_components': [5, 10, 20]
             }
                                                                 
search_LR = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           verbose=3,
                           #n_jobs=3
                        )

search_LR.fit(X_train, y_train)
cv_score = search_LR.best_score_
test_score = search_LR.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.773 total time=   2.4s
[CV 2/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.772 total time=   2.4s
[CV 3/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.772 total time=   2.4s
[CV 4/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.772 total time=   2.4s
[CV 5/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.774 total time=   2.4s
[CV 1/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_stat



[CV 1/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.826 total time=  20.7s




[CV 2/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.821 total time=  20.7s




[CV 3/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.824 total time=  20.7s




[CV 4/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.825 total time=  20.7s




[CV 5/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.825 total time=  20.8s
[CV 1/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.771 total time=   2.4s
[CV 2/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.773 total time=   2.4s
[CV 3/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.775 total time=   2.3s
[CV 4/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.771 total time=   2.4s
[CV 5/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.773 tota



[CV 1/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.826 total time=  21.0s




[CV 2/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.821 total time=  20.9s




[CV 3/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.824 total time=  21.0s




[CV 4/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.825 total time=  21.0s




[CV 5/5] END LR__C=20, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.825 total time=  21.0s
[CV 1/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.773 total time=   2.4s
[CV 2/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.770 total time=   2.4s
[CV 3/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.776 total time=   2.4s
[CV 4/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.771 total time=   2.4s
[CV 5/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.775



[CV 1/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.827 total time=  22.3s




[CV 2/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.821 total time=  22.0s




[CV 3/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.824 total time=  22.4s




[CV 4/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.826 total time=  22.1s




[CV 5/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.826 total time=  22.2s
[CV 1/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.772 total time=   2.4s
[CV 2/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.774 total time=   2.4s
[CV 3/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.775 total time=   2.4s
[CV 4/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.775 total time=   2.3s
[CV 5/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.77



[CV 1/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.827 total time=  23.0s




[CV 2/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.822 total time=  23.0s




[CV 3/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.825 total time=  22.9s




[CV 4/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.824 total time=  23.0s




[CV 5/5] END LR__C=20, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.826 total time=  22.9s
[CV 1/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.774 total time=   2.4s
[CV 2/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.774 total time=   2.4s
[CV 3/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.774 total time=   2.3s
[CV 4/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.771 total time=   2.4s
[CV 5/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.774 tot



[CV 1/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.826 total time=  20.7s




[CV 2/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.821 total time=  20.6s




[CV 3/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.824 total time=  20.8s




[CV 4/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.824 total time=  20.7s




[CV 5/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.824 total time=  20.7s
[CV 1/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.773 total time=   2.4s
[CV 2/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.771 total time=   2.3s
[CV 3/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.776 total time=   2.4s
[CV 4/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.773 total time=   2.5s
[CV 5/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.773 tota



[CV 1/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.826 total time=  21.0s




[CV 2/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.821 total time=  20.9s




[CV 3/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.824 total time=  21.1s




[CV 4/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.824 total time=  21.0s




[CV 5/5] END LR__C=70, LR__max_iter=50, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.825 total time=  21.0s
[CV 1/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.773 total time=   2.4s
[CV 2/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.775 total time=   2.4s
[CV 3/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.776 total time=   2.4s
[CV 4/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.773 total time=   2.4s
[CV 5/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.776



[CV 1/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.827 total time=  22.3s




[CV 2/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.822 total time=  22.1s




[CV 3/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.825 total time=  22.3s




[CV 4/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.825 total time=  22.2s




[CV 5/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l2, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.825 total time=  22.3s
[CV 1/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.773 total time=   2.4s
[CV 2/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.772 total time=   2.5s
[CV 3/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.775 total time=   2.4s
[CV 4/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.772 total time=   2.4s
[CV 5/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=5;, score=0.76



[CV 1/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.826 total time=  22.8s




[CV 2/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.822 total time=  22.9s




[CV 3/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.824 total time=  22.9s




[CV 4/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.825 total time=  23.0s




[CV 5/5] END LR__C=70, LR__max_iter=100, LR__multi_class=auto, LR__penalty=l1, LR__random_state=11, LR__solver=saga, pca__n_components=20;, score=0.826 total time=  23.1s
Cross-validation score: 0.8247309624523392
Test score: 0.818079692590371




Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [59]:
search_LR.best_params_

{'LR__C': 20,
 'LR__max_iter': 100,
 'LR__multi_class': 'auto',
 'LR__penalty': 'l1',
 'LR__random_state': 11,
 'LR__solver': 'saga',
 'pca__n_components': 20}

In [60]:
y_pred_lr = search_LR.best_estimator_.predict(X_test)

In [61]:
test_score = search_LR.score(X_test, y_test)

In [62]:
print(classification_report(y_test, y_pred_lr))
B_report_lr = pd.DataFrame(classification_report(y_test, y_pred_lr, output_dict=True))

              precision    recall  f1-score   support

           0       0.80      0.83      0.81     22550
           1       0.69      0.65      0.67     13267

    accuracy                           0.76     35817
   macro avg       0.74      0.74      0.74     35817
weighted avg       0.76      0.76      0.76     35817



In [63]:
for i, name in enumerate(B_report_lr.columns):
  B_report_lr = B_report_lr.rename(columns={(B_report_lr.iloc[:,i].name): ('LR_'+B_report_lr.iloc[:,i].name)})


In [64]:
B_report_lr

Unnamed: 0,LR_0,LR_1,LR_accuracy,LR_macro avg,LR_weighted avg
precision,0.80006,0.687425,0.760672,0.743743,0.758339
recall,0.826386,0.648979,0.760672,0.737682,0.760672
f1-score,0.81301,0.667649,0.760672,0.740329,0.759167
support,22550.0,13267.0,0.760672,35817.0,35817.0


Utilizing Multi Layer Perceptron algorythm with RandomizedGridSearch in pipeline, scaling reducing, ballancing:

In [65]:
pipeline = imbpipeline(steps = [['scaler', scaler],
                                ['pca', pca],
                                ['smote', SMOTEEN],
                                ['MLP', MLPClassifier()]])

stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=13)
    
param_grid = {'MLP__hidden_layer_sizes':[8, 16],
             'MLP__activation': ['relu'],
              'MLP__solver': ['adam'],
              'MLP__random_state': [42],
              'MLP__max_iter': [1000],
              'MLP__batch_size': [32],
              'pca__n_components': [5, 10, 20]
             }
                                                                 
search_MLP = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           verbose=3,
                           #n_jobs=3
                        )

search_MLP.fit(X_train, y_train)
cv_score = search_MLP.best_score_
test_score = search_MLP.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END MLP__activation=relu, MLP__batch_size=32, MLP__hidden_layer_sizes=8, MLP__max_iter=1000, MLP__random_state=42, MLP__solver=adam, pca__n_components=5;, score=0.801 total time=  28.5s
[CV 2/5] END MLP__activation=relu, MLP__batch_size=32, MLP__hidden_layer_sizes=8, MLP__max_iter=1000, MLP__random_state=42, MLP__solver=adam, pca__n_components=5;, score=0.803 total time=  28.0s
[CV 3/5] END MLP__activation=relu, MLP__batch_size=32, MLP__hidden_layer_sizes=8, MLP__max_iter=1000, MLP__random_state=42, MLP__solver=adam, pca__n_components=5;, score=0.804 total time=  19.7s
[CV 4/5] END MLP__activation=relu, MLP__batch_size=32, MLP__hidden_layer_sizes=8, MLP__max_iter=1000, MLP__random_state=42, MLP__solver=adam, pca__n_components=5;, score=0.810 total time=  31.6s
[CV 5/5] END MLP__activation=relu, MLP__batch_size=32, MLP__hidden_layer_sizes=8, MLP__max_iter=1000, MLP__random_state=42, MLP__solver=adam, pca__n_components=

Achieving scores of classification, saving accuracy, recall and F1 score in data frame:

In [66]:
y_pred_mlp = search_MLP.predict(X_test)
print(classification_report(y_test, y_pred_mlp))
B_report_mlp = pd.DataFrame(classification_report(y_test, y_pred_mlp, output_dict=True))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86     22550
           1       0.79      0.71      0.75     13267

    accuracy                           0.82     35817
   macro avg       0.82      0.80      0.81     35817
weighted avg       0.82      0.82      0.82     35817



In [67]:
for i, name in enumerate(B_report_mlp.columns):
  B_report_mlp = B_report_mlp.rename(columns={(B_report_mlp.iloc[:,i].name): ('MLP_'+B_report_mlp.iloc[:,i].name)})


In [68]:
B_report_mlp

Unnamed: 0,MLP_0,MLP_1,MLP_accuracy,MLP_macro avg,MLP_weighted avg
precision,0.837864,0.792663,0.822933,0.815264,0.821121
recall,0.89122,0.706867,0.822933,0.799043,0.822933
f1-score,0.863718,0.747311,0.822933,0.805514,0.8206
support,22550.0,13267.0,0.822933,35817.0,35817.0


Creating Data Frame containing all six classifiers results:

In [70]:
B_results = pd.concat([B_report_rf, 
                       B_report_dtc, 
                       B_report_svc, 
                       B_report_xgb, 
                       B_report_lr, 
                       B_report_mlp], 
                      axis=1)

In [73]:
B_results

Unnamed: 0,RF_0,RF_1,RF_accuracy,RF_macro avg,RF_weighted avg,DTC_0,DTC_1,DTC_accuracy,DTC_macro avg,DTC_weighted avg,...,LR_0,LR_1,LR_accuracy,LR_macro avg,LR_weighted avg,MLP_0,MLP_1,MLP_accuracy,MLP_macro avg,MLP_weighted avg
precision,0.823551,0.798728,0.815814,0.811139,0.814356,0.793867,0.61685,0.723064,0.705359,0.728298,...,0.80006,0.687425,0.760672,0.743743,0.758339,0.837864,0.792663,0.822933,0.815264,0.821121
recall,0.900355,0.672119,0.815814,0.786237,0.815814,0.756585,0.666089,0.723064,0.711337,0.723064,...,0.826386,0.648979,0.760672,0.737682,0.760672,0.89122,0.706867,0.822933,0.799043,0.822933
f1-score,0.860242,0.729974,0.815814,0.795108,0.811989,0.774778,0.640525,0.723064,0.707651,0.725049,...,0.81301,0.667649,0.760672,0.740329,0.759167,0.863718,0.747311,0.822933,0.805514,0.8206
support,22550.0,13267.0,0.815814,35817.0,35817.0,22550.0,13267.0,0.723064,35817.0,35817.0,...,22550.0,13267.0,0.760672,35817.0,35817.0,22550.0,13267.0,0.822933,35817.0,35817.0


Saving results in a file:

In [74]:
B_results.to_pickle("data/B_dataset_results.pkl")

Loading and presenting saved Data Frame:

In [75]:
B_results = pd.read_pickle("data/B_dataset_results.pkl")

NameError: name 'pickle' is not defined

# Summary

In [76]:
c_list = ["RF", "DTC", "SVC", "XGB", "LR", "MLP"]

Loading results from not binned dataset from pickle file:

In [77]:
A_results = pd.read_pickle("data/A_dataset_results.pkl")

In [78]:
A_results

Unnamed: 0,RF_0,RF_1,RF_accuracy,RF_macro avg,RF_weighted avg,DTC_0,DTC_1,DTC_accuracy,DTC_macro avg,DTC_weighted avg,...,LR_0,LR_1,LR_accuracy,LR_macro avg,LR_weighted avg,MLP_0,MLP_1,MLP_accuracy,MLP_macro avg,MLP_weighted avg
precision,0.817547,0.792963,0.809979,0.805255,0.808441,0.797431,0.648731,0.741491,0.723081,0.742351,...,0.832342,0.6582,0.760365,0.745271,0.767838,0.843845,0.75592,0.812435,0.799883,0.811277
recall,0.898758,0.659079,0.809979,0.778919,0.809979,0.790111,0.658853,0.741491,0.724482,0.741491,...,0.77561,0.734454,0.760365,0.755032,0.760365,0.861508,0.729027,0.812435,0.795267,0.812435
f1-score,0.856232,0.719849,0.809979,0.78804,0.805714,0.793754,0.653753,0.741491,0.723753,0.741896,...,0.802975,0.69424,0.760365,0.748607,0.762698,0.852585,0.74223,0.812435,0.797407,0.811708
support,22550.0,13267.0,0.809979,35817.0,35817.0,22550.0,13267.0,0.741491,35817.0,35817.0,...,22550.0,13267.0,0.760365,35817.0,35817.0,22550.0,13267.0,0.812435,35817.0,35817.0


**Best model before binning data**

Best f1-score:

In [91]:
predi_0s = A_results.filter(like='0')
max_f1 = predi_0s[predi_0s.values==(predi_0s.loc["f1-score",:]).max()]
max_0s = predi_0s[max_f1.idxmax(axis=1)]

predi_ones = A_results.filter(like='1')
max_f1 = predi_ones[predi_ones.values==(predi_ones.loc["f1-score",:]).max()]
max_ones_A = predi_ones[max_f1.idxmax(axis=1)]
print(max_0s)
print(max_ones_A)

                  XGB_0
precision      0.826087
recall         0.899024
f1-score       0.861014
support    22550.000000
                  MLP_1
precision      0.755920
recall         0.729027
f1-score       0.742230
support    13267.000000


Predicting hotel guests who cancel their reservation I discovered that from all six models with given hyperparameters one achieved highest f1-score witch is a harmonic mean between precision and recall. Shows how precise model managed to fit into given dataset. Precision tells how acurate model was predicting cancelation(1). And recall metric tells how well model made it on test data. In scoretable above is shown that model predicted only 73% true cancelations right, when its precision score was on level of 76%. To increase effectiveness I should consider do more experiments with hyperparameters of winning model and models with close scores to it, in this case Multi Layer Perceptron is the winner.
In situation when consider 0's(not canceled) there is overfitting, precision number is smaller than recall what tells me that model done it better on test data than whole set. In future to avoid overfitting I'll use regularization methods, in the case of XGBClassifier there are 3 hyperparameters to tune: alpha: l1 regularization, gamma: minimum loss reduction, lambda: l2 regularization.

Best precision score:

In [80]:
predi_0s = A_results.filter(like='0')
max_prec = predi_0s[predi_0s.values==(predi_0s.loc["precision",:]).max()]
max_0s = predi_0s[max_prec.idxmax(axis=1)]

predi_ones = A_results.filter(like='1')
max_prec = predi_ones[predi_ones.values==(predi_ones.loc["precision",:]).max()]
max_ones = predi_ones[max_prec.idxmax(axis=1)]
print(max_0s)
print(max_ones)

                  MLP_0
precision      0.843845
recall         0.861508
f1-score       0.852585
support    22550.000000
                  XGB_1
precision      0.798067
recall         0.678300
f1-score       0.733325
support    13267.000000


In [81]:
B_results

Unnamed: 0,RF_0,RF_1,RF_accuracy,RF_macro avg,RF_weighted avg,DTC_0,DTC_1,DTC_accuracy,DTC_macro avg,DTC_weighted avg,...,LR_0,LR_1,LR_accuracy,LR_macro avg,LR_weighted avg,MLP_0,MLP_1,MLP_accuracy,MLP_macro avg,MLP_weighted avg
precision,0.823551,0.798728,0.815814,0.811139,0.814356,0.793867,0.61685,0.723064,0.705359,0.728298,...,0.80006,0.687425,0.760672,0.743743,0.758339,0.837864,0.792663,0.822933,0.815264,0.821121
recall,0.900355,0.672119,0.815814,0.786237,0.815814,0.756585,0.666089,0.723064,0.711337,0.723064,...,0.826386,0.648979,0.760672,0.737682,0.760672,0.89122,0.706867,0.822933,0.799043,0.822933
f1-score,0.860242,0.729974,0.815814,0.795108,0.811989,0.774778,0.640525,0.723064,0.707651,0.725049,...,0.81301,0.667649,0.760672,0.740329,0.759167,0.863718,0.747311,0.822933,0.805514,0.8206
support,22550.0,13267.0,0.815814,35817.0,35817.0,22550.0,13267.0,0.723064,35817.0,35817.0,...,22550.0,13267.0,0.760672,35817.0,35817.0,22550.0,13267.0,0.822933,35817.0,35817.0


**Best model after binning data**

Model with highest f1-score:

In [92]:
predi_0s = B_results.filter(like='0')
max_f1 = predi_0s[predi_0s.values==(predi_0s.loc["f1-score",:]).max()]
max_0s = predi_0s[max_f1.idxmax(axis=1)]

predi_ones = B_results.filter(like='1')
max_f1 = predi_ones[predi_ones.values==(predi_ones.loc["f1-score",:]).max()]
max_ones_B = predi_ones[max_f1.idxmax(axis=1)]
print(max_0s)
print(max_ones_B)

                  XGB_0
precision      0.841240
recall         0.902794
f1-score       0.870930
support    22550.000000
                  XGB_1
precision      0.811311
recall         0.710409
f1-score       0.757515
support    13267.000000


Recall shows that model achieved 71% efficiency in predicting real cancelations, when on training data it's score was 81% right predicted cancelations.
After binning data, XGBClassifier has the best f1-score again.

On 0's prediction model is overfitted, need to add gamma, alpha or lambda punishment hyperparameters to improve tuning of the algorithm.

Model with highest precision:

In [83]:
predi_0s = B_results.filter(like='0')
max_prec = predi_0s[predi_0s.values==(predi_0s.loc["precision",:]).max()]
max_0s = predi_0s[max_prec.idxmax(axis=1)]

predi_ones = B_results.filter(like='1')
max_prec = predi_ones[predi_ones.values==(predi_ones.loc["precision",:]).max()]
max_ones = predi_ones[max_prec.idxmax(axis=1)]
print(max_0s)
print(max_ones)

                  XGB_0
precision      0.841240
recall         0.902794
f1-score       0.870930
support    22550.000000
                  XGB_1
precision      0.811311
recall         0.710409
f1-score       0.757515
support    13267.000000


**Final comparison**

In [97]:
f1_ones = pd.concat([max_ones_A, max_ones_B], axis=1)

In [109]:
f1_names = {f1_ones.columns[0]: "Not_binned_"+f1_ones.columns[0], 
            f1_ones.columns[1]: "Binned_"+f1_ones.columns[1]}

In [110]:
f1_ones.rename(columns=f1_names)

Unnamed: 0,Not_binned_MLP_1,Binned_XGB_1
precision,0.75592,0.811311
recall,0.729027,0.710409
f1-score,0.74223,0.757515
support,13267.0,13267.0


In [111]:
recall_dif = f1_ones.iloc[1, 1] - f1_ones.iloc[1, 0]

In [112]:
f1_dif = f1_ones.iloc[2, 1] - f1_ones.iloc[2, 0] 

In [119]:
if recall_dif > 0:
    diff_r = "higher"
else:
    diff_r = "lower"
if f1_dif > 0:
    diff_f = "higher"
else:
    diff_f = "lower"    

In [124]:
print("Binning let me attain recall score slightly", diff_r," (by", round(np.abs(recall_dif),4)*100,"% ) and f1", diff_f," by ",round(np.abs(f1_dif),4)*100,"%")

Binning let me attain recall score slightly higher  (by 1.53 % ) and f1 higher  by  1.53 %
