In [None]:
# basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# model
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, recall_score, precision_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import shap

import sys
sys.path.append('../src/')
import model_pipeline

In [34]:
ohe_mode_allcat  = pd.read_csv("../data/ohe_mode_allcat.csv")

In [35]:
ohe_mode_allcat.drop(columns=['Unnamed: 0'], inplace=True)

In [37]:
ohe_mode_allcat  = ohe_mode_allcat.rename(columns={'coupon_Restaurant(<20)':'coupon_Restaurant(less_than_20)'})

Unnamed: 0,destination_Home,destination_No Urgent Place,destination_Work,passanger_Alone,passanger_Friend(s),passanger_Kid(s),passanger_Partner,weather_Rainy,weather_Snowy,weather_Sunny,...,toCoupon_GEQ5min_1,toCoupon_GEQ15min_0,toCoupon_GEQ15min_1,toCoupon_GEQ25min_0,toCoupon_GEQ25min_1,direction_same_0,direction_same_1,direction_opp_0,direction_opp_1,Y
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0


In [13]:
mode_allcat = pd.read_csv("../data/mode_allcat.csv")
mode_ordinal = pd.read_csv("../data/mode_ordinal.csv")
nan_allcat = pd.read_csv("../data/nan_allcat.csv")

In [14]:
# change column name for xgboost
mode_allcat = mode_allcat.rename(columns={'coupon_Restaurant(<20)':'coupon_Restaurant(less_than_20)'})
mode_ordinal = mode_ordinal.rename(columns={'coupon_Restaurant(<20)':'coupon_Restaurant(less_than_20)'})
nan_allcat = nan_allcat.rename(columns={'coupon_Restaurant(<20)':'coupon_Restaurant(less_than_20)'})

In [15]:
mode_ordinal.shape

(12684, 89)

In [16]:
datasets = [("mode_allcat",mode_allcat),("mode_ordinal",mode_ordinal),('nan_allcat',nan_allcat)]

In [20]:
dt = DecisionTreeClassifier()
dt_params = {"max_depth":[10], "min_samples_leaf":[12], "min_samples_split":[80], "random_state":[7], "splitter":['random']}

rf = RandomForestClassifier()
rf_params= {"max_depth":[8], "min_samples_leaf":[7], "min_samples_split":[8], "n_estimators":[500], "random_state":[7]}

xgb = XGBClassifier()
xgb_params = {"eta":[.3],"gamma":[1],"max_depth":[7],"lambda":[0], "alpha":[1]}

nn = MLPClassifier()
nn_params = {"hidden_layer_sizes":[(400,)], "learning_rate_init":[0.01], "max_iter":[500],
              "random_state":[7], "shuffle":[False]}

lr = LogisticRegression()
lr_params = {"C":[1,5], "max_iter":[1000], "penalty":['l1'], "random_state":[7],"solver":['liblinear']}

catb = CatBoostClassifier()
cat_params = {'depth': [1,7], 'iterations': [200], 'l2_leaf_reg': [5], 'learning_rate': [0.1]}

In [41]:
res = []
key_modified_params = {f"model__{key}": val for key, val in xgb_params.items()}
output = model_pipeline.imb_pipe_fit(xgb, key_modified_params, 'ohe_mode_allcat', ohe_mode_allcat.drop(columns='Y'), ohe_mode_allcat["Y"], score='roc_auc', scaler=True)
res.append(output)

fitting model...


In [21]:
# refit model for decision tree
res = []
for data in datasets:
    key_modified_params = {f"model__{key}": val for key, val in dt_params.items()}
    output = model_pipeline.imb_pipe_fit(dt, key_modified_params, data[0], data[1].drop(columns='Y'), data[1]["Y"], score='roc_auc', scaler=True)
    res.append(output)

fitting model...
fitting model...
fitting model...


In [24]:
res[0]['test_score'],res[1]['test_score'],res[2]['test_score']

(0.7307617527660086, 0.7372665438033172, 0.7382605336322586)

In [26]:
# refit model for random forest
res = []
for data in datasets:
    key_modified_params = {f"model__{key}": val for key, val in rf_params.items()}
    output = model_pipeline.imb_pipe_fit(rf, key_modified_params, data[0], data[1].drop(columns='Y'), data[1]["Y"], score='roc_auc', scaler=True)
    res.append(output)

fitting model...
fitting model...
fitting model...


In [27]:
res[0]['test_score'],res[1]['test_score'],res[2]['test_score']

(0.7645342909074788, 0.7617755653930678, 0.7625203452840108)

In [29]:
# refit model for xgboost
res = []
for data in datasets:
    key_modified_params = {f"model__{key}": val for key, val in xgb_params.items()}
    output = model_pipeline.imb_pipe_fit(xgb, key_modified_params, data[0], data[1].drop(columns='Y'), data[1]["Y"], score='roc_auc', scaler=True)
    res.append(output)

fitting model...
fitting model...
fitting model...


In [30]:
res[0]['test_score'],res[1]['test_score'],res[2]['test_score']

(0.8337506887314042, 0.8320825337715881, 0.8334371971956758)

In [36]:
# refit model for neural network
res = []
for data in datasets:
    key_modified_params = {f"model__{key}": val for key, val in nn_params.items()}
    output = model_pipeline.imb_pipe_fit(nn, key_modified_params, data[0], data[1].drop(columns='Y'), data[1]["Y"], score='roc_auc', scaler=True)
    res.append(output)

fitting model...
fitting model...
fitting model...


In [37]:
res[0]['test_score'],res[1]['test_score'],res[2]['test_score']

(0.799376183509712, 0.7732740549338503, 0.7765558996573759)

In [28]:
# refit model for logistic regression
res = []
for data in datasets:
    key_modified_params = {f"model__{key}": val for key, val in lr_params.items()}
    output = model_pipeline.imb_pipe_fit(lr, key_modified_params, data[0], data[1].drop(columns='Y'), data[1]["Y"], score='roc_auc', scaler=True)
    res.append(output)

fitting model...
fitting model...
fitting model...


In [29]:
type(res[0]['model'].best_estimator_.named_steps.model).__name__,res[0]['test_score'],res[1]['test_score'],res[2]['test_score']

('LogisticRegression',
 0.7296334998955027,
 0.7291205137461289,
 0.7294580712987416)

In [22]:
# refit model for cat_boost
res = []
for data in datasets:
    key_modified_params = {f"model__{key}": val for key, val in cat_params.items()}
    output = model_pipeline.imb_pipe_fit(catb, key_modified_params, data[0], data[1].drop(columns='Y'), data[1]["Y"], score='roc_auc', scaler=True)
    res.append(output)

fitting model...
0:	learn: 0.6886443	total: 60.4ms	remaining: 12s
0:	learn: 0.6882619	total: 61.2ms	remaining: 12.2s
0:	learn: 0.6884935	total: 60.4ms	remaining: 12s
0:	learn: 0.6885026	total: 60.5ms	remaining: 12s
0:	learn: 0.6884893	total: 59.6ms	remaining: 11.9s
1:	learn: 0.6847935	total: 62ms	remaining: 6.14s
1:	learn: 0.6847905	total: 61.2ms	remaining: 6.06s
1:	learn: 0.6852249	total: 62.2ms	remaining: 6.15s
1:	learn: 0.6852177	total: 62.5ms	remaining: 6.19s
0:	learn: 0.6886359	total: 60.4ms	remaining: 12s
1:	learn: 0.6849908	total: 63.7ms	remaining: 6.31s
0:	learn: 0.6884611	total: 64.6ms	remaining: 12.9s
2:	learn: 0.6818104	total: 62.7ms	remaining: 4.11s
2:	learn: 0.6814054	total: 65.2ms	remaining: 4.28s
2:	learn: 0.6816690	total: 64.8ms	remaining: 4.25s
2:	learn: 0.6816656	total: 64.9ms	remaining: 4.26s
2:	learn: 0.6812282	total: 65.2ms	remaining: 4.28s
3:	learn: 0.6791968	total: 66.7ms	remaining: 3.27s
1:	learn: 0.6853675	total: 63.6ms	remaining: 6.29s
3:	learn: 0.6792656	tota

In [27]:
type(res[0]['model'].best_estimator_.named_steps.model).__name__,res[0]['test_score'],res[1]['test_score'],res[2]['test_score']

('CatBoostClassifier',
 0.8332795014534607,
 0.8290362826870341,
 0.8331553714716369)

In [30]:
d = {'dataset':["mode_allcat","mode_ordinal","nan_allcat"],
     "Logistic_Regression":[0.7296334998955027,0.7291205137461289,0.7294580712987416],
     'Decision_tree':[0.7307617527660086, 0.7372665438033172, 0.7382605336322586],
     'Random_Forest':[0.7645342909074788, 0.7617755653930678, 0.7625203452840108],
     "Xgboost":[0.8337506887314042, 0.8320825337715881, 0.8334371971956758],
     "MLP_classifier":[0.799376183509712, 0.7732740549338503, 0.7765558996573759],
     "Catboost_classifier":[0.8332795014534607,0.8290362826870341,0.8331553714716369],
     }
results = pd.DataFrame(d)


In [31]:
results

Unnamed: 0,dataset,Logistic_Regression,Decision_tree,Random_Forest,Xgboost,MLP_classifier,Catboost_classifier
0,mode_allcat,0.729633,0.730762,0.764534,0.833751,0.799376,0.83328
1,mode_ordinal,0.729121,0.737267,0.761776,0.832083,0.773274,0.829036
2,nan_allcat,0.729458,0.738261,0.76252,0.833437,0.776556,0.833155


In [37]:
import glob
import joblib

In [51]:
fit_time = pd.DataFrame(columns=['LogisticRegression','RandomForestClassifier','MLPClassifier','DecisionTreeClassifier','XGBClassifier','CatBoostClassifier'],
                  index=['mode_allcat','mode_ordinal','nan_allcat'])
fit_time

Unnamed: 0,LogisticRegression,RandomForestClassifier,MLPClassifier,DecisionTreeClassifier,XGBClassifier,CatBoostClassifier
mode_allcat,,,,,,
mode_ordinal,,,,,,
nan_allcat,,,,,,


In [53]:
for file in glob.glob('models/*.pkl'):
    model = joblib.load(file)
    model_name, dset = file.split("/")[1].split("_",1)
    dset = dset.split(".")[0]
    t = model.cv_results_['mean_fit_time'].mean()
    print(model_name, dset, t)
    fit_time.loc[dset,model_name] = t

LogisticRegression nan_allcat 5.456310554345449
RandomForestClassifier mode_ordinal 3.0760793924331664
MLPClassifier nan_allcat 8.612499968210857
DecisionTreeClassifier mode_ordinal 0.09679619471232097
XGBClassifier mode_allcat 6.983638230959574
RandomForestClassifier mode_allcat 3.337615219751994
CatBoostClassifier mode_ordinal 1.2886183619499207
RandomForestClassifier nan_allcat 3.5415223677953085
MLPClassifier mode_allcat 6.03398056825002
XGBClassifier mode_ordinal 6.200658384958903
XGBClassifier nan_allcat 8.89494001865387
LogisticRegression mode_ordinal 2.2386078675587973
MLPClassifier mode_ordinal 6.412512048085531
LogisticRegression mode_allcat 5.4254768729209895
DecisionTreeClassifier mode_allcat 0.13196926911671955
XGBClassifier ohe_mode_allcat 6.753404490152994
CatBoostClassifier mode_allcat 1.4491345365842183
DecisionTreeClassifier nan_allcat 0.11217442353566488
CatBoostClassifier nan_allcat 1.3068349639574688


In [54]:
fit_time

Unnamed: 0,LogisticRegression,RandomForestClassifier,MLPClassifier,DecisionTreeClassifier,XGBClassifier,CatBoostClassifier
mode_allcat,5.425477,3.337615,6.033981,0.131969,6.983638,1.449135
mode_ordinal,2.238608,3.076079,6.412512,0.096796,6.200658,1.288618
nan_allcat,5.456311,3.541522,8.6125,0.112174,8.89494,1.306835
ohe_mode_allcat,,,,,6.753404,
