In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
# Make the local woe-encoder checkout importable.
# The location can be overridden with the WOE_ENCODER_DIR environment variable;
# the default is the path this notebook originally hard-coded, so existing
# setups keep working unchanged.
WOE_ENCODER_DIR = os.environ.get(
    'WOE_ENCODER_DIR', '/Users/libing/codes/woe-encoder/')
if WOE_ENCODER_DIR not in sys.path:    # avoid duplicate entries when the cell is re-run
    sys.path.append(WOE_ENCODER_DIR)

In [3]:
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn (or another data source) to run.
from sklearn.datasets import load_boston

# Load the Boston housing data into a DataFrame, one column per feature.
bunch = load_boston()
data = pd.DataFrame(bunch.data, columns=bunch.feature_names)
# Binary target: True where the original regression target exceeds 22.5.
y = bunch.target > 22.5
data['y'] = y

# Column encoded throughout the notebook. The identifier keeps its original
# spelling because later cells reference it by this name.
continous_feature = 'CRIM'

In [4]:
from woe_encoder import ContinuousWOEEncoder

In [5]:
# Preview the first rows: the feature columns plus the boolean target `y`.
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,True
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,False
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,True
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,True
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,True


**不管使用哪一种分箱方法，`col_name` 和 `target_col_name` 都是必须指定的参数。**

In [6]:
# Minimal configuration: only the feature column and the target column are set.
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y')
# The value displayed by this cell is the fitted encoder's repr.
encoder.fit(data)

ContinuousWOEEncoder(col_name='CRIM', target_col_name='y')

# 1. 基于阈值的卡方分箱法（默认）

In [7]:
# Threshold-based chi-square binning, with the default settings spelled out.
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y',
    bin_pct_threshold=0.05,    # default
    woe_method='chi2',         # default
    min_chi2_flag=True,        # default
)
# The transformed frame keeps the input columns and gains a `CRIM_woe` column.
data_transformed = encoder.fit_transform(data)
data_transformed.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,CRIM_woe
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,False,1.113558
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,False,1.113558
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,True,1.113558
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,False,0.120051
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,False,1.113558


In [8]:
# Per-bin summary: bin bounds, good/bad counts, bad rate, WOE and IV.
encoder.bin_result_

Unnamed: 0,left_exclusive,right_inclusive,good_num,bad_num,bad_rate,woe,iv
0,-inf,0.09266,46.0,100.0,0.684932,1.113558,0.360333
1,0.09266,0.62356,92.0,73.0,0.442424,0.120051,0.004744
2,0.62356,5.82401,71.0,30.0,0.29703,-0.494093,0.047194
3,5.82401,inf,88.0,6.0,0.06383,-2.194141,0.587126


In [9]:
# Overall IV of the feature; it matches the sum of the `iv` column of `bin_result_`.
encoder.iv_

0.9993971114646631

# 2. 基于最大分箱数的卡方分箱法

**参数：**

- `max_bins=10`


- `woe_method='chi2'`


- `min_chi2_flag=False`

In [10]:
# Chi-square binning bounded by a maximum number of bins.
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y',
    max_bins=3,                # for illustration
    bin_pct_threshold=0.05,    # default
    woe_method='chi2',
    min_chi2_flag=False,
)
data_transformed = encoder.fit_transform(data)
data_transformed.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,CRIM_woe
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,False,1.113558
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,False,1.113558
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,True,1.113558
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,False,-0.10689
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,False,1.113558


In [11]:
# Bin table for the max_bins=3 chi-square fit (three bins in the recorded output).
encoder.bin_result_

Unnamed: 0,left_exclusive,right_inclusive,good_num,bad_num,bad_rate,woe,iv
0,-inf,0.09266,46.0,100.0,0.684932,1.113558,0.360333
1,0.09266,5.82401,163.0,103.0,0.387218,-0.10689,0.005986
2,5.82401,inf,88.0,6.0,0.06383,-2.194141,0.587126


In [12]:
# Overall IV for the max_bins=3 chi-square fit.
encoder.iv_

0.953444515916242

# 3. 基于最大分箱数的坏样本率差异最大化分箱方法

**参数：**

- `max_bins=10`


- `woe_method='bad_rate'`


- `min_chi2_flag=False`

In [13]:
# Bad-rate-difference-maximization binning bounded by a maximum number of bins.
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y',
    max_bins=5,                # just for illustration
    bin_pct_threshold=0.05,    # default
    woe_method='bad_rate',
)
data_transformed = encoder.fit_transform(data)
data_transformed.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,CRIM_woe
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,False,0.473289
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,False,0.473289
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,True,0.473289
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,False,0.473289
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,False,0.473289


In [14]:
# Bin table for the bad-rate fit with max_bins=5 (two bins in the recorded output).
encoder.bin_result_

Unnamed: 0,left_exclusive,right_inclusive,good_num,bad_num,bad_rate,woe,iv
0,-inf,0.32982,127.0,144.0,0.531365,0.473289,0.123711
1,0.32982,inf,170.0,65.0,0.276596,-0.603423,0.157726


In [15]:
# Overall IV for the bad-rate fit with max_bins=5.
encoder.iv_

0.2814372908278145

In [16]:
# Bad-rate-difference-maximization binning again, now with a smaller max_bins.
# NOTE(review): in the recorded output this configuration ends up with a single
# (-inf, inf] bin, whereas max_bins=5 produced two bins — confirm this is intended.
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y',
    max_bins=3,                # just for illustration
    bin_pct_threshold=0.05,    # default
    woe_method='bad_rate',
)
data_transformed = encoder.fit_transform(data)
data_transformed.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,CRIM_woe
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,False,-0.001401
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,False,-0.001401
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,True,-0.001401
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,False,-0.001401
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,False,-0.001401


In [17]:
# Bin table for the bad-rate fit with max_bins=3.
encoder.bin_result_

Unnamed: 0,left_exclusive,right_inclusive,good_num,bad_num,bad_rate,woe,iv
0,-inf,inf,297.0,209.0,0.413043,-0.001401,-0.0


# 4. 特殊值与缺失值

- <font color='crimson'>如果特征中含有需要特殊对待的一个/多个值，可以单独作为一个/多个 bin 处理。</font>


- <font color='crimson'>如果特征中含有缺失值，可以将缺失值单独作为一个 bin 来处理，你只需要指定一个不会引起混淆的值来 fill 缺失值。</font>

<br>

**参数：**

- `special_value_list=[value_1]` or `special_value_list=[value_1, value_2]`



- `imputation_value=xxxx`

<br>

以下使用【基于最大分箱数的卡方分箱法】来介绍这里的功能。

In [18]:
# One special value: 0.01501 is listed in `special_value_list`
# (this cell demonstrates special values, not missing values).
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y',
    max_bins=6,                # for illustration
    bin_pct_threshold=0.05,    # default
    woe_method='chi2',
    min_chi2_flag=False,
    special_value_list=[0.01501]
)
data_transformed = encoder.fit_transform(data)
data_transformed.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,CRIM_woe
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,False,1.093558
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,False,1.093558
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,True,1.093558
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,False,0.120051
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,False,1.093558


In [19]:
# Bin table: the special value 0.01501 is reported as its own row
# (left_exclusive == right_inclusive == 0.01501).
encoder.bin_result_

Unnamed: 0,left_exclusive,right_inclusive,good_num,bad_num,bad_rate,woe,iv
0,-inf,0.09266,46,98,0.680556,1.093558,0.343396
1,0.09266,0.62356,92,73,0.442424,0.120051,0.004744
2,0.62356,5.82401,71,30,0.29703,-0.494093,0.047194
3,5.82401,inf,88,6,0.0638298,-2.194141,0.587126
4,0.01501,0.01501,0,2,1.0,1.447198,0.013849


In [20]:
# Multiple special values: both 0.01501 and 14.33370 are listed in
# `special_value_list` (special values, not missing values).
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y',
    max_bins=6,                # for illustration
    bin_pct_threshold=0.05,    # default
    woe_method='chi2',
    min_chi2_flag=False,
    special_value_list=[0.01501, 14.33370]
)
data_transformed = encoder.fit_transform(data)
data_transformed.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,CRIM_woe
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,False,1.093558
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,False,1.093558
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,True,1.093558
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,False,0.120051
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,False,1.093558


In [21]:
# Bin table: each special value (0.01501 and 14.3337) is reported as its own row.
encoder.bin_result_

Unnamed: 0,left_exclusive,right_inclusive,good_num,bad_num,bad_rate,woe,iv
0,-inf,0.09266,46,98,0.680556,1.093558,0.343396
1,0.09266,0.62356,92,73,0.442424,0.120051,0.004744
2,0.62356,5.82401,71,30,0.29703,-0.494093,0.047194
3,5.82401,inf,86,6,0.0652174,-2.171413,0.566422
4,0.01501,0.01501,0,2,1.0,1.447198,0.013849
5,14.3337,14.3337,2,0,0.0,-0.750027,0.005051


In [22]:
# Show the feature and its WOE encoding for the rows that hold a special value.
special_value_mask = data_transformed[continous_feature].isin(
    (0.01501, 14.3337))
woe_column = continous_feature + '_woe'
data_transformed.loc[special_value_mask, [continous_feature, woe_column]]

Unnamed: 0,CRIM,CRIM_woe
283,0.01501,1.447198
348,0.01501,1.447198
388,14.3337,-0.750027
479,14.3337,-0.750027


In [23]:
# Generate missing values: keep feature values below 5.8 and replace all others
# with NaN. This overwrites the column in `data`; the monotonicity section later
# reloads the dataset to get the original values back.
data[continous_feature] = data[continous_feature].where(
    data[continous_feature] < 5.8, np.nan)
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated in
# 1.2 and removed in 2.0.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     410 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  y        506 non-null    bool   
dtypes: bool(1), float64(13)
memory usage: 52.0 KB


In [24]:
# Missing-value handling: `imputation_value` is the stand-in value for NaNs.
# Per the recorded bin table, the NaN rows end up in a bin of their own.
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y',
    max_bins=6,                # for illustration
    bin_pct_threshold=0.05,    # default
    woe_method='chi2',
    min_chi2_flag=False,
    imputation_value=10000.,
)
data_transformed = encoder.fit_transform(data)
data_transformed.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,CRIM_woe
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,False,1.113558
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,False,1.113558
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,True,1.113558
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,False,0.120051
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,False,1.113558


In [25]:
# Bin table: the missing values appear as a separate row labelled with the
# imputation value (10000.0).
encoder.bin_result_

Unnamed: 0,left_exclusive,right_inclusive,good_num,bad_num,bad_rate,woe,iv
0,-inf,0.09266,46.0,100.0,0.684932,1.113558,0.360333
1,0.09266,0.62356,92.0,73.0,0.442424,0.120051,0.004744
2,0.62356,inf,70.0,29.0,0.292929,-0.512897,0.049717
3,10000.0,10000.0,89.0,7.0,0.072917,-2.071783,0.551447


In [26]:
# Show the feature and its WOE encoding for the first rows where the feature is missing.
missing_mask = data_transformed[continous_feature].isnull()
woe_column = continous_feature + '_woe'
data_transformed.loc[missing_mask, [continous_feature, woe_column]].head()

Unnamed: 0,CRIM,CRIM_woe
356,,-2.071783
367,,-2.071783
370,,-2.071783
371,,-2.071783
372,,-2.071783


In [27]:
# Missing values and a special value combined in one encoder.
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y',
    max_bins=6,                # for illustration
    bin_pct_threshold=0.05,    # default
    woe_method='chi2',
    min_chi2_flag=False,
    imputation_value=10000.,
    special_value_list=[0.01501]
)
data_transformed = encoder.fit_transform(data)
data_transformed.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,CRIM_woe
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,False,1.093558
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,False,1.093558
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,True,1.093558
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,False,0.120051
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,False,1.093558


In [28]:
# Bin table: the special value (0.01501) and the missing values (labelled
# 10000.0) are each reported as a separate row.
encoder.bin_result_

Unnamed: 0,left_exclusive,right_inclusive,good_num,bad_num,bad_rate,woe,iv
0,-inf,0.09266,46,98,0.680556,1.093558,0.343396
1,0.09266,0.62356,92,73,0.442424,0.120051,0.004744
2,0.62356,inf,70,29,0.292929,-0.512897,0.049717
3,0.01501,0.01501,0,2,1.0,1.447198,0.013849
4,10000.0,10000.0,89,7,0.0729167,-2.071783,0.551447


In [29]:
# Show the feature and its WOE encoding for the first rows that are either
# missing or equal to the special value.
feature_values = data_transformed[continous_feature]
missing_or_special_mask = feature_values.isnull() | feature_values.isin([0.01501])
woe_column = continous_feature + '_woe'
data_transformed.loc[
    missing_or_special_mask, [continous_feature, woe_column]
].head()

Unnamed: 0,CRIM,CRIM_woe
283,0.01501,1.447198
348,0.01501,1.447198
356,,-2.071783
367,,-2.071783
370,,-2.071783


# 5. 单调性

因为业务需要，有的特征转换之后需要满足单调性（或者符合 U 型）。


**参数：**

- `need_monotonic=True`



- `u=True`（需与 `need_monotonic=True` 同时设置）

In [30]:
# Reload the dataset: the feature column in `data` was overwritten with NaNs
# in the missing-value section, so start again from the original values.
bunch = load_boston()
data = pd.DataFrame(bunch.data, columns=bunch.feature_names)
# Binary target: True where the original regression target exceeds 22.5.
y = bunch.target > 22.5
data['y'] = y

continous_feature = 'CRIM'

In [31]:
# Monotonicity: request a monotonic result via `need_monotonic=True`.
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y',
    max_bins=3,                # for illustration
    bin_pct_threshold=0.05,    # default
    woe_method='chi2',
    min_chi2_flag=False,
    need_monotonic=True,
)
data_transformed = encoder.fit_transform(data)
data_transformed.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,CRIM_woe
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,False,1.113558
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,False,1.113558
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,True,1.113558
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,False,-0.10689
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,False,1.113558


In [32]:
# Bin table for the monotonic fit; in the recorded output, bad_rate and woe
# decrease from the first bin to the last.
encoder.bin_result_

Unnamed: 0,left_exclusive,right_inclusive,good_num,bad_num,bad_rate,woe,iv
0,-inf,0.09266,46.0,100.0,0.684932,1.113558,0.360333
1,0.09266,5.82401,163.0,103.0,0.387218,-0.10689,0.005986
2,5.82401,inf,88.0,6.0,0.06383,-2.194141,0.587126


In [33]:
# U-shape: both `need_monotonic=True` and `u=True` must be passed.
# NOTE(review): the recorded output is identical to the monotonic-only run above,
# so this example does not visibly show the effect of `u=True`.
encoder = ContinuousWOEEncoder(
    col_name=continous_feature,
    target_col_name='y',
    max_bins=3,                # for illustration
    bin_pct_threshold=0.05,    # default
    woe_method='chi2',
    min_chi2_flag=False,
    need_monotonic=True,       # must be set
    u=True                     # must be set
)
data_transformed = encoder.fit_transform(data)
data_transformed.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,CRIM_woe
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,False,1.113558
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,False,1.113558
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,True,1.113558
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,False,-0.10689
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,False,1.113558


In [34]:
# Bin table for the U-shape configuration.
encoder.bin_result_

Unnamed: 0,left_exclusive,right_inclusive,good_num,bad_num,bad_rate,woe,iv
0,-inf,0.09266,46.0,100.0,0.684932,1.113558,0.360333
1,0.09266,5.82401,163.0,103.0,0.387218,-0.10689,0.005986
2,5.82401,inf,88.0,6.0,0.06383,-2.194141,0.587126
