In [1]:
import sys
# Put the parent directory on the import path; presumably this is what makes
# the project-local `models` package importable from the notebook's folder.
sys.path.append('../')

import pandas as pd
import numpy as np

# NOTE(review): the wildcard import hides where create_stats, make_monotonic,
# monotone_optimal_binning, create_bins_df, cut_off_iv, replace_by_woe_naive,
# delete_correlated_features and start_pipeline come from; consider importing
# them by name once the full list of used helpers is confirmed.
from models.monotonic import *

In [2]:
# Keep rendered frames short: show at most 6 rows before truncating with "...".
pd.set_option('display.max_rows', 6)

### 1. Read data from file

In [3]:
# Load the sample workbook (path is relative to the notebook's directory)
# and preview the first five rows.
data = pd.read_excel(io='../data/bank.xlsx')
data.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


### 2. Collect statistics for a 'numeric' and a 'categorical' feature

In [4]:
# Binary target as a NumPy array: 1 where column 'y' equals 'yes', otherwise 0.
Y = data['y'].eq('yes').astype(int).values

# One numeric ('balance') and one categorical ('job') column as raw arrays.
X_num, X_cat = data['balance'].values, data['job'].values

# Per-bucket statistics for each example feature; the categorical call passes
# the feature type explicitly, the numeric call omits the third argument.
example_num = create_stats(X_num, Y)
example_cat = create_stats(X_cat, Y, 'categorical')

In [5]:
# Display the per-bucket statistics built from the numeric 'balance' column.
# NOTE(review): the saved output shows DIST_EVENT values above 1 (e.g. 1.40)
# and an IV of about 61.7, which look implausible for a distribution share —
# verify how create_stats computes DIST_EVENT for numeric input.
example_num

Unnamed: 0,VAR_NAME,BUCKET,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
0,VAR,"(-3313.001, -162.0]",-3313.0,-162.0,227,25,0.110132,202,0.889868,1.00,0.05050,-2.985763,61.714854
1,VAR,"(-162.0, 0.0]",-154.0,0.0,496,35,0.070565,461,0.929435,1.40,0.11525,-2.497116,61.714854
2,VAR,"(0.0, 23.0]",1.0,23.0,190,20,0.105263,170,0.894737,0.80,0.04250,-2.935085,61.714854
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,VAR,"(3913.0, 6102.0]",3921.0,6102.0,226,38,0.168142,188,0.831858,1.52,0.04700,-3.476297,61.714854
18,VAR,"(6102.0, 71188.0]",6145.0,71188.0,226,20,0.088496,206,0.911504,0.80,0.05150,-2.743012,61.714854
19,VAR,,,,0,0,0.000000,0,0.000000,0.00,0.00000,0.000000,61.714854


In [6]:
# Display the per-category statistics built from the categorical 'job' column.
example_cat

Unnamed: 0,VAR_NAME,BUCKET,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
0,VAR,admin.,admin.,admin.,478,58,0.121339,420,0.878661,0.111324,0.10500,-0.058487,0.132516
1,VAR,blue-collar,blue-collar,blue-collar,946,69,0.072939,877,0.927061,0.132438,0.21925,0.504098,0.132516
2,VAR,entrepreneur,entrepreneur,entrepreneur,168,15,0.089286,153,0.910714,0.028791,0.03825,0.284080,0.132516
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,VAR,unemployed,unemployed,unemployed,128,13,0.101562,115,0.898437,0.024952,0.02875,0.141678,0.132516
11,VAR,unknown,unknown,unknown,38,7,0.184211,31,0.815789,0.013436,0.00775,-0.550168,0.132516
12,VAR,,,,0,0,0.000000,0,0.000000,0.000000,0.00000,0.000000,0.132516


### 3. Make 'numeric' data monotonic by WOE

In [7]:
# Post-process the numeric stats table with make_monotonic. In the saved run
# the original buckets are merged down to two (plus the empty trailing row).
monotonic_example_num = make_monotonic(example_num)
monotonic_example_num

Unnamed: 0,VAR_NAME,BUCKET,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
0,VAR,"(-3313.001, -162.0]",-3313.0,-162.0,227.0,25.0,0.110132,202.0,0.889868,1.0,0.0505,-2.985763,2.834989
1,VAR,"(-162.0, 71188.0]",-154.0,71188.0,4294.0,496.0,0.11551,3798.0,0.88449,0.952015,0.9495,-0.002646,2.834989
2,VAR,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.834989


### 4. Use the monotone optimal binning algorithm for a 'numeric' feature

In [8]:
# Bin the raw numeric feature against the target in one step; this call takes
# the X_num / Y arrays directly rather than a precomputed stats table.
optimal_example_num = monotone_optimal_binning(X_num, Y)
optimal_example_num

Unnamed: 0,VAR_NAME,BUCKET,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
0,VAR,"(-1000000000.0, 6102.0]",-3313.0,6102.0,4295.0,501.0,0.116647,3794.0,0.883353,0.961612,0.9485,-0.01373,0.004033
1,VAR,"(6102.0, 1000000000.0]",6145.0,71188.0,226.0,20.0,0.088496,206.0,0.911504,0.038388,0.0515,0.293838,0.004033
2,VAR,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004033


### 5. Apply previous steps for each feature

In [9]:
# Build bucket statistics for every feature of the frame in one call. Returns
# the stacked per-bucket table and a per-feature Information Value summary.
# presumably the target column ('y') is located inside create_bins_df — confirm.
full_stats, iv_values = create_bins_df(data)
full_stats

Unnamed: 0,VAR_NAME,BUCKET,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
0,age,"(-1000000000.0, 29.0]",19,29,482.0,74.0,0.153527,408.0,0.846473,0.142035,0.1020,-0.331095,0.015081
1,age,"(29.0, 1000000000.0]",30,87,4039.0,447.0,0.110671,3592.0,0.889329,0.857965,0.8980,0.045606,0.015081
2,age,,,,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0000,0.000000,0.015081
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,poutcome,success,success,success,129.0,83.0,0.643411,46.0,0.356589,0.159309,0.0115,-2.628418,0.461878
68,poutcome,unknown,unknown,unknown,3705.0,337.0,0.090958,3368.0,0.909042,0.646833,0.8420,0.263691,0.461878
69,poutcome,,,,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0000,0.000000,0.461878


In [10]:
# Display the per-feature Information Value table (columns VAR_NAME, IV).
# NOTE(review): in the saved output some features (e.g. pdays, previous) have
# IV of about 2.8e-18, i.e. effectively zero — check whether the binning
# collapsed them into a single bucket.
iv_values

Unnamed: 0,VAR_NAME,IV
0,age,1.508105e-02
1,balance,4.032908e-03
2,campaign,4.456053e-02
...,...,...
13,pdays,2.786846e-18
14,poutcome,4.618775e-01
15,previous,2.786846e-18


### 6. Cut off by Information Value

In [11]:
# Filter features by Information Value. Returns the filtered stats table and
# the names of the features that are kept.
# NOTE(review): in the saved run all 16 features are kept, including those
# with IV of about 1e-18 — confirm the threshold cut_off_iv applies by default.
full_stats_cut, use_name_iv = cut_off_iv(full_stats, iv_values)
full_stats_cut

Unnamed: 0,VAR_NAME,BUCKET,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
0,age,"(-1000000000.0, 29.0]",19,29,482.0,74.0,0.153527,408.0,0.846473,0.142035,0.1020,-0.331095,0.015081
1,age,"(29.0, 1000000000.0]",30,87,4039.0,447.0,0.110671,3592.0,0.889329,0.857965,0.8980,0.045606,0.015081
2,age,,,,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0000,0.000000,0.015081
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,poutcome,success,success,success,129.0,83.0,0.643411,46.0,0.356589,0.159309,0.0115,-2.628418,0.461878
68,poutcome,unknown,unknown,unknown,3705.0,337.0,0.090958,3368.0,0.909042,0.646833,0.8420,0.263691,0.461878
69,poutcome,,,,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0000,0.000000,0.461878


In [12]:
# Display the array of feature names returned by the IV cut-off step.
use_name_iv

array(['age', 'balance', 'campaign', 'contact', 'day', 'default',
       'duration', 'education', 'housing', 'job', 'loan', 'marital',
       'month', 'pdays', 'poutcome', 'previous'], dtype=object)

### 7. Replace all features by WOE

In [13]:
# Replace each kept feature by the WOE of the bucket its value falls into.
# The resulting frame has one WOE_<feature> column per feature plus 'target'.
data_woe = replace_by_woe_naive(data, full_stats_cut, use_name_iv)
data_woe

Unnamed: 0,WOE_age,WOE_balance,WOE_campaign,WOE_contact,WOE_day,WOE_default,WOE_duration,WOE_education,WOE_housing,WOE_job,WOE_loan,WOE_marital,WOE_month,WOE_pdays,WOE_poutcome,WOE_previous,target
0,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.222811,-0.330234,0.141678,-0.090598,0.169697,-1.887938,1.669384e-09,0.263691,1.669384e-09,0
1,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.091389,0.325551,0.261647,0.674385,0.169697,0.603057,1.669384e-09,-0.124649,1.669384e-09,0
2,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,-0.247403,0.325551,-0.182478,-0.090598,-0.219950,-0.595584,1.669384e-09,-0.124649,1.669384e-09,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4518,0.045606,-0.01373,0.592412,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.091389,-0.330234,0.072278,-0.090598,0.169697,-0.090582,1.669384e-09,0.263691,1.669384e-09,0
4519,-0.331095,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.091389,-0.330234,0.504098,-0.090598,0.169697,-0.460942,1.669384e-09,-0.606970,1.669384e-09,0
4520,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,-0.247403,0.325551,0.284080,0.674385,-0.219950,-0.595584,1.669384e-09,-0.606970,1.669384e-09,0


### 8. Delete correlated features

#### All Correlation

In [14]:
# Run the correlation filter with inplace=False. Returns the resulting WOE
# frame, the (upper-triangular) correlation matrix and the list of features
# selected for removal.
# NOTE(review): what `inplace` controls is not visible in this notebook —
# confirm against models.monotonic.
df_woe_uncorr, corr_matrix, to_drop = delete_correlated_features(data_woe, iv_values, inplace=False)
corr_matrix

Unnamed: 0,WOE_poutcome,WOE_month,WOE_contact,WOE_job,WOE_housing,WOE_loan,WOE_campaign,WOE_marital,WOE_education,WOE_age,WOE_balance,WOE_default,WOE_day,WOE_duration,WOE_pdays,WOE_previous
WOE_poutcome,,0.1959,0.166931,0.0548382,0.0957985,0.0402594,0.0672918,0.0226195,0.027375,0.0279067,0.0276973,0.0333562,0,0,0,0
WOE_month,,,0.420767,0.118976,0.346633,0.131354,0.148541,0.0624621,0.10649,0.12235,0.10517,0.0438847,0,0,0,0
WOE_contact,,,,0.129234,0.154812,0.00597174,0.0115889,0.0466537,0.104116,0.014422,0.0246698,0.0167352,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WOE_duration,,,,,,,,,,,,,,,0,0
WOE_pdays,,,,,,,,,,,,,,,,0
WOE_previous,,,,,,,,,,,,,,,,


#### Inplace deletion

In [15]:
# Same filter with inplace=True; only the correlation matrix is kept here,
# the returned frame and drop list are discarded.
# NOTE(review): if inplace=True mutates data_woe, this cell is not idempotent
# and earlier displays of data_woe could become stale. In the saved run no
# feature was dropped, so both matrices match — TODO confirm the semantics.
_, corr_matrix_inplace, _ = delete_correlated_features(data_woe, iv_values, inplace=True)
corr_matrix_inplace

Unnamed: 0,WOE_poutcome,WOE_month,WOE_contact,WOE_job,WOE_housing,WOE_loan,WOE_campaign,WOE_marital,WOE_education,WOE_age,WOE_balance,WOE_default,WOE_day,WOE_duration,WOE_pdays,WOE_previous
WOE_poutcome,,0.1959,0.166931,0.0548382,0.0957985,0.0402594,0.0672918,0.0226195,0.027375,0.0279067,0.0276973,0.0333562,0,0,0,0
WOE_month,,,0.420767,0.118976,0.346633,0.131354,0.148541,0.0624621,0.10649,0.12235,0.10517,0.0438847,0,0,0,0
WOE_contact,,,,0.129234,0.154812,0.00597174,0.0115889,0.0466537,0.104116,0.014422,0.0246698,0.0167352,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WOE_duration,,,,,,,,,,,,,,,0,0
WOE_pdays,,,,,,,,,,,,,,,,0
WOE_previous,,,,,,,,,,,,,,,,


#### Columns to drop

In [16]:
# Emit the header, then every dropped feature on its own line.
print('\n'.join(str(item) for item in ('Features to drop', *to_drop)))

Features to drop


#### Final dataframe

In [17]:
# Display the WOE frame returned by the inplace=False correlation filter
# (no columns were dropped in the saved run).
df_woe_uncorr

Unnamed: 0,WOE_age,WOE_balance,WOE_campaign,WOE_contact,WOE_day,WOE_default,WOE_duration,WOE_education,WOE_housing,WOE_job,WOE_loan,WOE_marital,WOE_month,WOE_pdays,WOE_poutcome,WOE_previous,target
0,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.222811,-0.330234,0.141678,-0.090598,0.169697,-1.887938,1.669384e-09,0.263691,1.669384e-09,0
1,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.091389,0.325551,0.261647,0.674385,0.169697,0.603057,1.669384e-09,-0.124649,1.669384e-09,0
2,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,-0.247403,0.325551,-0.182478,-0.090598,-0.219950,-0.595584,1.669384e-09,-0.124649,1.669384e-09,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4518,0.045606,-0.01373,0.592412,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.091389,-0.330234,0.072278,-0.090598,0.169697,-0.090582,1.669384e-09,0.263691,1.669384e-09,0
4519,-0.331095,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.091389,-0.330234,0.504098,-0.090598,0.169697,-0.460942,1.669384e-09,-0.606970,1.669384e-09,0
4520,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,-0.247403,0.325551,0.284080,0.674385,-0.219950,-0.595584,1.669384e-09,-0.606970,1.669384e-09,0


# Full Pipeline

In [18]:
%%time
# Run every stage above through the single start_pipeline entry point. It
# prints per-stage timings and returns the WOE frame, correlation matrix,
# drop list, IV table, full bucket statistics and kept feature names.
df_woe_uncorr, corr_matrix, to_drop, iv_values, full_stats, use_name_iv = start_pipeline(data)

Replace on WOE optimal = 0.028 seconds
Creating full stats = 2.153 seconds
Cutting of by IV = 0.002 seconds
Delete correlations = 0.965 seconds
Wall time: 3.15 s


In [19]:
# Display the WOE frame produced by start_pipeline.
df_woe_uncorr

Unnamed: 0,WOE_age,WOE_balance,WOE_campaign,WOE_contact,WOE_day,WOE_default,WOE_duration,WOE_education,WOE_housing,WOE_job,WOE_loan,WOE_marital,WOE_month,WOE_pdays,WOE_poutcome,WOE_previous,target
0,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.222811,-0.330234,0.141678,-0.090598,0.169697,-1.887938,1.669384e-09,0.263691,1.669384e-09,0
1,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.091389,0.325551,0.261647,0.674385,0.169697,0.603057,1.669384e-09,-0.124649,1.669384e-09,0
2,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,-0.247403,0.325551,-0.182478,-0.090598,-0.219950,-0.595584,1.669384e-09,-0.124649,1.669384e-09,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4518,0.045606,-0.01373,0.592412,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.091389,-0.330234,0.072278,-0.090598,0.169697,-0.090582,1.669384e-09,0.263691,1.669384e-09,0
4519,-0.331095,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,0.091389,-0.330234,0.504098,-0.090598,0.169697,-0.460942,1.669384e-09,-0.606970,1.669384e-09,0
4520,0.045606,-0.01373,-0.075498,-0.252971,1.669384e-09,0.000534,1.669384e-09,-0.247403,0.325551,0.284080,0.674385,-0.219950,-0.595584,1.669384e-09,-0.606970,1.669384e-09,0


In [20]:
# Display the correlation matrix produced by start_pipeline.
corr_matrix

Unnamed: 0,WOE_poutcome,WOE_month,WOE_contact,WOE_job,WOE_housing,WOE_loan,WOE_campaign,WOE_marital,WOE_education,WOE_age,WOE_balance,WOE_default,WOE_day,WOE_duration,WOE_pdays,WOE_previous
WOE_poutcome,,0.1959,0.166931,0.0548382,0.0957985,0.0402594,0.0672918,0.0226195,0.027375,0.0279067,0.0276973,0.0333562,0,0,0,0
WOE_month,,,0.420767,0.118976,0.346633,0.131354,0.148541,0.0624621,0.10649,0.12235,0.10517,0.0438847,0,0,0,0
WOE_contact,,,,0.129234,0.154812,0.00597174,0.0115889,0.0466537,0.104116,0.014422,0.0246698,0.0167352,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WOE_duration,,,,,,,,,,,,,,,0,0
WOE_pdays,,,,,,,,,,,,,,,,0
WOE_previous,,,,,,,,,,,,,,,,


In [21]:
# Display the features selected for removal by the pipeline's correlation
# step (an empty list in the saved run).
to_drop

[]

In [22]:
# Rank features from most to least informative by their Information Value.
iv_values.sort_values(by='IV', ascending=False)

Unnamed: 0,VAR_NAME,IV
14,poutcome,4.618775e-01
12,month,3.795165e-01
3,contact,2.477613e-01
...,...,...
6,duration,2.786846e-18
13,pdays,2.786846e-18
15,previous,2.786846e-18


In [23]:
# Display the full per-bucket statistics table produced by start_pipeline.
full_stats

Unnamed: 0,VAR_NAME,BUCKET,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
0,age,"(-1000000000.0, 29.0]",19,29,482.0,74.0,0.153527,408.0,0.846473,0.142035,0.1020,-0.331095,0.015081
1,age,"(29.0, 1000000000.0]",30,87,4039.0,447.0,0.110671,3592.0,0.889329,0.857965,0.8980,0.045606,0.015081
2,age,,,,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0000,0.000000,0.015081
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,poutcome,success,success,success,129.0,83.0,0.643411,46.0,0.356589,0.159309,0.0115,-2.628418,0.461878
68,poutcome,unknown,unknown,unknown,3705.0,337.0,0.090958,3368.0,0.909042,0.646833,0.8420,0.263691,0.461878
69,poutcome,,,,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0000,0.000000,0.461878


### Check on an example with NULL values

In [24]:
%%time
data = pd.read_excel('../data/bank_null.xlsx')
df_woe_uncorr, corr_matrix, to_drop, iv_values, full_stats, use_names = start_pipeline(data)

Replace on WOE optimal = 0.027 seconds
Creating full stats = 2.146 seconds
Cutting of by IV = 0.002 seconds
Delete correlations = 1.066 seconds
Wall time: 3.86 s
