In [1]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the session
# (deprecations included); consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Route bokeh figures to the notebook's output cells so show(...) renders inline.
from bokeh.plotting import output_notebook, show
output_notebook()

In [2]:
# Dataset source: https://www.kaggle.com/liujiaqi/hr-comma-sepcsv/version/1
# The path is relative, so the CSV must be in the notebook's working directory.
base = pd.read_csv("HR_comma_sep.csv")

In [3]:
# Sanity check: (rows, columns) of the raw frame.
base.shape

(14999, 10)

###### We'll use the same file to simulate the following datasets
train: the data we use to train and validate the models<br>
holdout_data: the final test set used to judge the model's real-world performance <br>
new_data: new data the model will be applied on in production.

In [4]:
# Fixed seed so the three-way split (and every metric derived from it) is
# reproducible under Restart Kernel -> Run All.
SEED = 42

# 10,000 rows used for training and cross-validation.
train = base.sample(10000, random_state=SEED)

# Rows that were not drawn for training feed the two remaining sets.
not_train = base[~base.index.isin(train.index)]

# 2,000 rows standing in for unlabeled production data; the target is removed
# because it would not be available in real use cases.
new_data = not_train.sample(2000, random_state=SEED).drop("left", axis=1)

# Whatever remains is the labeled holdout set used for the final evaluation.
holdout_data = not_train[~not_train.index.isin(new_data.index)]

# The three sets must partition `base` without overlap, and the simulated
# production data must not carry the target.
assert len(train) + len(holdout_data) + len(new_data) == len(base)
assert "left" not in new_data.columns

In [5]:
# Confirm the split sizes. new_data has one column fewer because the target
# column "left" was dropped from it.
train.shape
holdout_data.shape
new_data.shape

(10000, 10)

(2999, 10)

(2000, 9)

In [6]:
# Standard library first, then Model_selection, which supplies the
# model_train and model_apply entry points used in the cells below.
import importlib
import Model_selection

###### The following are the parameters used during the class instantiation
modelBase: the training data<br>

target: the target class to predict - must be an integer<br>

scoring: any one of scikit learn's scorers - http://scikit-learn.org/stable/modules/model_evaluation.html<br>

n_jobs: number of CPU cores to use<br>

learning_curve: if true, scikit learn's learning curve will be used to determine the ideal training set size for cross validation<br>

automated: if false, we will have to call functions apply_models and evaluate_models to get the performance characteristics<br>

models: the models we want to test. We can use any or all of gbm, svm, random_forest, logistic_regression and naive_bayes<br>

categorical_columns: a list of columns to be treated as categorical. If a column holds integers but needs to be used as a category, Python may at times change its type, so the values will be padded with an underscore 

In [7]:
# Fit and evaluate every candidate model in a single call (automated=True).
# The recorded run below took roughly 35 minutes in total, most of it in svm.
ms = Model_selection.model_train(
    modelBase=train,
    target="left",
    scoring="neg_log_loss",
    n_jobs=-1,
    learning_curve=True,
    automated=True,
    models=["gbm", "random_forest", "svm", "logistic_regression"],
    categorical_columns=["Work_accident", "promotion_last_5years"],
)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.4min finished


gbm best CV score: -0.09654800561851402
gbm fitting complete

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.8min finished


random_forest best CV score: -0.18162603617050307
random_forest fitting complete

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.1min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 21.6min finished


svm best CV score: -0.11417308320612672
svm fitting complete

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   27.0s finished


logistic_regression best CV score: -0.4366707680398868
logistic_regression fitting complete

gbm Training accuracy:98.85%
gbm Test accuracy:97.7%
gbm Training f1_score:97.6%
gbm Test f1_score:95.35%

random_forest Training accuracy:93.96%
random_forest Test accuracy:93.6%
random_forest Training f1_score:87.73%
random_forest Test f1_score:87.52%

svm Training accuracy:99.84%
svm Test accuracy:97.35%
svm Training f1_score:99.66%
svm Test f1_score:94.67%

logistic_regression Training accuracy:79.36%
logistic_regression Test accuracy:78.9%
logistic_regression Training f1_score:46.66%
logistic_regression Test f1_score:47.9%



###### Selecting and retraining the model
One or more models can be selected; they will be trained on the full train/validation split used above and then tested on the final holdout set<br>

Multiple holdout sets can be passed to the function test_on_holdout_data, and the performance plots and tables will be updated accordingly<br>

The plots will be a part of the dictionary ms.plots<br>
The tables used to calculate the KS and Gini coefficients will be a part of the dictionary ms.Gini_table

In [8]:
# Choose the model(s) to carry forward; the argument is a list of model names.
ms.select_retrain_model(["random_forest"])

In [9]:
# Features are every column except the target; labels are the target column alone.
ms.test_on_holdout_data(
    X=holdout_data.loc[:, holdout_data.columns != ms.target],
    y=holdout_data.loc[:, ms.target],
)

Re-fitting on the full data...


###### Saving the final model objects 
The following will be saved as pickle objects. They can be loaded and used on new data using the model_apply method<br>

DictVectorizer.pkl: the dictvectorizer object<br>
feature_levels.pkl: the levels of all object variables<br>
Categorical_columns.pkl: all columns specified as categorical <br>
model.pkl: the final model to be applied to the test set (name will vary based on the model)

In [12]:
# Write the fitted objects listed above to disk so that model_apply can load them later.
ms.save_model_objects()

###### Classifying new data
Make sure that the objects are in the relevant folder or that the paths are provided correctly. The model can predict the class probabilities as well as the labels

In [13]:
# Score the unlabeled data twice with the saved model: first for class
# probabilities, then for hard labels.
# NOTE(review): model_apply presumably unpickles "random_forest.pkl" -- only
# load pickle files that come from a trusted source.
predictions_probabilities = Model_selection.model_apply(
    new_data,
    model="random_forest.pkl",
    predict_proba=True,
)
predictions = Model_selection.model_apply(
    new_data,
    model="random_forest.pkl",
    predict_proba=False,
)

In [14]:
# Both values are displayed because ast_node_interactivity is set to "all"
# in the first cell; numpy abbreviates the long arrays with "...".
predictions_probabilities
predictions

array([[0.92343562, 0.07656438],
       [0.99076668, 0.00923332],
       [0.9606574 , 0.0393426 ],
       ...,
       [0.27138184, 0.72861816],
       [0.92343562, 0.07656438],
       [0.99076668, 0.00923332]])

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

###### Performance characteristics saved inside the class

In [15]:
# Per-probability-band table behind the KS and Gini figures for the validation split.
ms.Gini_table["random_forest_test"] # the validation base

Unnamed: 0,Event,Total,Probability_of_Event,Non_event,Cumulative_Non_event,Cumulative_Event,Population_%,Cumulative_Non_event_%,Cumulative_Event_%,Difference,Event_rate,Gini
8,171,172,"(0.891, 1.0]",1,1,171,0.086,0.000666,0.343373,0.342708,0.994186,0.000114
7,160,183,"(0.728, 0.891]",23,24,331,0.0915,0.015979,0.664659,0.64868,0.874317,0.007718
6,140,273,"(0.133, 0.728]",133,157,471,0.1365,0.104527,0.945783,0.841256,0.512821,0.071301
5,15,231,"(0.0816, 0.133]",216,373,486,0.1155,0.248336,0.975904,0.727568,0.064935,0.138177
4,7,154,"(0.032, 0.0816]",147,520,493,0.077,0.346205,0.98996,0.643755,0.045455,0.096199
3,1,199,"(0.0092, 0.032]",198,718,494,0.0995,0.478029,0.991968,0.513939,0.005025,0.130633
2,1,267,"(0.00899, 0.0092]",266,984,495,0.1335,0.655126,0.993976,0.338849,0.003745,0.175853
1,0,0,"(0.00895, 0.00899]",0,984,495,0.0,0.655126,0.993976,0.338849,,0.0
0,3,521,"(-0.001, 0.00895]",518,1502,498,0.2605,1.0,1.0,0.0,0.005758,0.343835
9,498,2000,All,1502,1502,498,1.0,1.0,1.0,0.0,0.249,


In [16]:
# Same table for the holdout set passed to test_on_holdout_data; the "test1"
# suffix identifies the first holdout set.
ms.Gini_table["random_forest_test1"] # the first holdout test base

Unnamed: 0,Event,Total,Probability_of_Event,Non_event,Cumulative_Non_event,Cumulative_Event,Population_%,Cumulative_Non_event_%,Cumulative_Event_%,Difference,Event_rate,Gini
8,346,365,"(0.891, 1.0]",19,19,346,0.121707,0.008243,0.498559,0.490316,0.947945,0.002055
7,131,158,"(0.728, 0.891]",27,46,477,0.052684,0.019957,0.68732,0.667363,0.829114,0.006945
6,168,375,"(0.133, 0.728]",207,253,645,0.125042,0.109761,0.929395,0.819633,0.448,0.072594
5,3,13,"(0.0816, 0.133]",10,263,648,0.004335,0.1141,0.933718,0.819618,0.230769,0.004041
4,26,538,"(0.032, 0.0816]",512,775,674,0.179393,0.336226,0.971182,0.634956,0.048327,0.211564
3,4,170,"(0.0092, 0.032]",166,941,678,0.056686,0.408243,0.976945,0.568702,0.023529,0.070149
2,0,0,"(0.00899, 0.0092]",0,941,678,0.0,0.408243,0.976945,0.568702,,0.0
1,0,0,"(0.00895, 0.00899]",0,941,678,0.0,0.408243,0.976945,0.568702,,0.0
0,16,1380,"(-0.001, 0.00895]",1364,2305,694,0.460153,1.0,1.0,0.0,0.011594,0.584936
9,694,2999,All,2305,2305,694,1.0,1.0,1.0,0.0,0.23141,


##### Lorenz curves

In [17]:
# Render the bokeh figure stored for the selected model in ms.plots.
show(ms.plots["random_forest"])

###### KS/Gini Coefficients

In [18]:
# Gini coefficient of the selected model on each data split.
# Format spec "4.4": minimum width 4, at most 4 significant digits.
for split_label, split_key in (
    ("training", "random_forest_train"),
    ("test", "random_forest_test"),
    ("holdout sample", "random_forest_test1"),
):
    print(f"Random Forest {split_label} Gini:{ms.Gini[split_key]:4.4}%")

Random Forest training Gini:92.4%
Random Forest test Gini:92.77%
Random Forest holdout sample Gini:90.46%


In [19]:
# KS statistic of the selected model on each data split.
# Format spec "4.4": minimum width 4, at most 4 significant digits.
for split_label, split_key in (
    ("training", "random_forest_train"),
    ("test", "random_forest_test"),
    ("holdout sample", "random_forest_test1"),
):
    print(f"Random Forest {split_label} KS:{ms.KS[split_key]:4.4}%")

Random Forest training KS:83.97%
Random Forest test KS:84.13%
Random Forest holdout sample KS:81.96%
