In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
# Make the local woe-encoder checkout importable.
# Set the WOE_ENCODER_PATH environment variable to point at your own checkout;
# the original author's location is kept as the fallback so existing setups
# keep working unchanged.
woe_encoder_path = os.environ.get('WOE_ENCODER_PATH', '/Users/bingli/codes/woe-encoder/')
if woe_encoder_path not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(woe_encoder_path)

In [3]:
# Report the library versions this notebook was executed with.
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

Pandas version: 1.0.4
NumPy version: 1.18.1


# 1. 加载数据

In [5]:
from sklearn.datasets import load_boston

# NOTE(review): `load_boston` was removed in scikit-learn 1.2, so this cell
# only runs on older releases — confirm the pinned environment.
bunch = load_boston()

# Binary target: True where the regression target exceeds 22.5.
y = bunch.target > 22.5

# One column per feature, plus the binary target as column 'y'.
data = pd.DataFrame(bunch.data, columns=bunch.feature_names).assign(y=y)

# Columns used as the example features in this notebook.
continous_feature = 'CRIM'
category_feature = 'RAD'

> 测试数据集来源于 [sklearn-contrib](https://github.com/scikit-learn-contrib/category_encoders/blob/master/category_encoders/woe.py)。

In [6]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,True
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,False
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,True
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,True
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,True


In [7]:
data.shape

(506, 14)

# 2. `CategoryWOEEncoder`

In [8]:
from woe_encoder import CategoryWOEEncoder  # My
from category_encoders import WOEEncoder    # sklearn contrib

## 2.1 `CategoryWOEEncoder` V.S. sklearn-contrib `category_encoders.WOEEncoder`

[sklearn-contrib](https://github.com/scikit-learn-contrib/category_encoders/blob/master/category_encoders/woe.py) 只实现了离散型特征的 WOE 转换，且没有任何与分箱相关的可设置条件。它将每个离散值单独当作一个 bin，没有 bin 的合并操作，没有任何 attributes，只有结果。

In [9]:
# sklearn-contrib WOE encoder, restricted to the categorical feature via `cols`.
sklearn_contrib_encoder = WOEEncoder(cols=[category_feature])
# The target is passed separately as `y`.
# NOTE(review): `data` also still contains the 'y' column added when the data
# was loaded; only `cols` is requested for encoding — confirm the extra column
# is harmless here.
sklearn_contrib_encoder.fit(data, y)

WOEEncoder(cols=['RAD'], drop_invariant=False, handle_missing='value',
           handle_unknown='value', random_state=None, randomized=False,
           regularization=1.0, return_df=True, sigma=0.05, verbose=0)

In [10]:
# Encode with the fitted sklearn-contrib encoder and show the encoded column.
data_sklearn = sklearn_contrib_encoder.transform(data)
data_sklearn[category_feature]

0      0.166264
1      0.658740
2      0.658740
3      0.967625
4      0.967625
         ...   
501    0.166264
502    0.166264
503    0.166264
504    0.166264
505    0.166264
Name: RAD, Length: 506, dtype: float64

In [11]:
# My WOE encoder, with settings chosen to be comparable to the
# sklearn-contrib run above.
my_woe_encoder = CategoryWOEEncoder(
    target_col_name='y',        # required
    col_name=category_feature,  # required
    woe_method='chi2',
    min_chi2_flag=False,
    bin_pct_threshold=0.,
    max_bins=1000)
data_my = my_woe_encoder.fit_transform(data)

# The encoded values are stored in a new '<feature>_woe' column.
data_my[f'{category_feature}_woe']

0      0.166264
1      0.658740
2      0.658740
3      0.967625
4      0.967625
         ...   
501    0.166264
502    0.166264
503    0.166264
504    0.166264
505    0.166264
Name: RAD_woe, Length: 506, dtype: float64

**<font color='blue'>相同的条件下，我们的结果和 sklearn-contrib 的结果一模一样。但是我们有很多可定制化的功能，一起看看吧。</font>**

In [12]:
my_woe_encoder.bin_result_

Unnamed: 0,RAD,bin_num,bad_num,good_num,bad_rate,bin_pct,woe,iv
0,[24.0],132,18.0,114.0,0.136364,0.26087,-1.451908,0.432253
1,[6.0],26,8.0,18.0,0.307692,0.051383,-0.398629,0.008901
2,[4.0],110,41.0,69.0,0.372727,0.217391,-0.16224,0.005865
3,[1.0],20,9.0,11.0,0.45,0.039526,0.166264,0.001002
4,[5.0],115,62.0,53.0,0.53913,0.227273,0.502736,0.059423
5,[2.0],24,14.0,10.0,0.583333,0.047431,0.65874,0.021946
6,[3.0],38,25.0,13.0,0.657895,0.075099,0.967625,0.073391
7,[7.0],17,13.0,4.0,0.764706,0.033597,1.378205,0.067164
8,[8.0],24,19.0,5.0,0.791667,0.047431,1.552558,0.115004


## 2.2 `CategoryWOEEncoder` 介绍

在 `CategoryWOEEncoder` 中实现的 WOE 转换方法有：

- **基于卡方值的分箱方法**

  - 按照卡方阈值停止（默认选项）
  
  - 按照最大箱数停止
 

- **基于坏样本率差异最大化的分箱**

  - 按照最大箱数停止


<br>

**<font color='blue'>不管使用哪种方法，你都需要指定这 2 个参数——`col_name` 和 `target_col_name`。</font>**

### 2.2.1 基于阈值的卡方分箱法（默认）

<font color='crimson'>这是默认配置（选项）。</font>

**参数：**

- `woe_method='chi2'`


- `min_chi2_flag=True`

In [13]:
# Stop according to the chi-square threshold (default 3.841).
encoder = CategoryWOEEncoder(
    target_col_name='y',
    col_name=category_feature,
    woe_method='chi2',       # default, chi-square based binning
    min_chi2_flag=True,      # default, stop on the chi-square threshold
    confidence=3.841,        # default, the chi-square threshold
    bin_pct_threshold=0.05,  # default, minimum share of samples per bin
)
encoder.fit(data)

CategoryWOEEncoder(bin_pct_threshold=0.05, col_name='RAD', confidence=3.841,
                   imputation_value=None, max_bins=10, min_chi2_flag=True,
                   need_monotonic=False, regularization=1.0,
                   special_value_list=None, target_col_name='y', u=False,
                   value_order_dict=None, woe_method='chi2')

In [14]:
# Apply the fitted encoder and preview the first rows of the result.
data_transformed = encoder.transform(data)
data_transformed.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,RAD_woe
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,True,-0.168997
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,False,0.533508
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,True,0.533508
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,True,1.273534
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,True,1.273534


In [15]:
encoder.bin_result_   # show the binning result

Unnamed: 0,RAD,bin_num,bad_num,good_num,bad_rate,bin_pct,woe,iv
0,[24.0],132,18.0,114.0,0.136364,0.26087,-1.451908,0.432253
1,"[6.0, 4.0, 1.0]",156,58.0,98.0,0.371795,0.3083,-0.168997,0.008865
2,"[5.0, 2.0]",139,76.0,63.0,0.546763,0.274704,0.533508,0.080835
3,"[3.0, 7.0, 8.0]",79,57.0,22.0,0.721519,0.156126,1.273534,0.252992


In [16]:
encoder.iv_   # evaluation metric of the binning: the IV value

0.7749440157846403

In [17]:
encoder.bin_woe_mapping_  # mapping from each feature value to its WOE value

{24.0: -1.4519077092821897,
 6.0: -0.16899696631425057,
 4.0: -0.16899696631425057,
 1.0: -0.16899696631425057,
 5.0: 0.533507778408632,
 2.0: 0.533507778408632,
 3.0: 1.2735342345318896,
 7.0: 1.2735342345318896,
 8.0: 1.2735342345318896}

### 2.2.2 基于最大分箱数的卡方分箱法

**参数：**

- `max_bins=10`


- `woe_method='chi2'`


- `min_chi2_flag=False`

In [18]:
# Stop according to the maximum number of bins.
encoder = CategoryWOEEncoder(
    target_col_name='y',
    col_name=category_feature,
    woe_method='chi2',       # default, chi-square based binning
    min_chi2_flag=False,     # stop on the maximum number of bins
    max_bins=10,             # default, maximum number of bins
    bin_pct_threshold=0.05,  # default, minimum share of samples per bin
)
data_transformed = encoder.fit_transform(data)

In [19]:
encoder.bin_result_   # show the binning result

Unnamed: 0,RAD,bin_num,bad_num,good_num,bad_rate,bin_pct,woe,iv
0,[24.0],132,18.0,114.0,0.136364,0.26087,-1.451908,0.432253
1,[6.0],26,8.0,18.0,0.307692,0.051383,-0.398629,0.008901
2,"[4.0, 1.0]",130,50.0,80.0,0.384615,0.256917,-0.114038,0.003435
3,"[5.0, 2.0]",139,76.0,63.0,0.546763,0.274704,0.533508,0.080835
4,[3.0],38,25.0,13.0,0.657895,0.075099,0.967625,0.073391
5,"[7.0, 8.0]",41,32.0,9.0,0.780488,0.081028,1.542508,0.189431


In [20]:
encoder.iv_  # evaluation metric of the binning: the IV value

0.7882454643406109

In [21]:
encoder.bin_woe_mapping_  # mapping from each feature value to its WOE value

{24.0: -1.4519077092821897,
 6.0: -0.398628961915601,
 4.0: -0.11403808203349304,
 1.0: -0.11403808203349304,
 5.0: 0.533507778408632,
 2.0: 0.533507778408632,
 3.0: 0.9676246483208434,
 7.0: 1.5425079083870545,
 8.0: 1.5425079083870545}

### 2.2.3 基于最大分箱数的坏样本率差异最大化分箱方法

**参数：**

- `max_bins=10`


- `woe_method='bad_rate'`

In [22]:
# Stop according to the maximum number of bins.
encoder = CategoryWOEEncoder(
    target_col_name='y',
    col_name=category_feature,
    woe_method='bad_rate',   # binning that maximises the bad-rate difference
    max_bins=10,             # default, maximum number of bins
    bin_pct_threshold=0.05,  # default, minimum share of samples per bin
)
data_transformed = encoder.fit_transform(data)

In [23]:
encoder.bin_result_

Unnamed: 0,RAD,bin_num,bad_num,good_num,bad_rate,bin_pct,woe,iv
0,[24.0],132,18.0,114.0,0.136364,0.26087,-1.451908,0.432253
1,[6.0],26,8.0,18.0,0.307692,0.051383,-0.398629,0.008901
2,"[4.0, 1.0]",130,50.0,80.0,0.384615,0.256917,-0.114038,0.003435
3,"[5.0, 2.0]",139,76.0,63.0,0.546763,0.274704,0.533508,0.080835
4,[3.0],38,25.0,13.0,0.657895,0.075099,0.967625,0.073391
5,"[7.0, 8.0]",41,32.0,9.0,0.780488,0.081028,1.542508,0.189431


In [24]:
encoder.iv_

0.7882454643406109

### 2.2.4 处理特殊值和缺失值

- <font color='crimson'>如果特征中含有需要特殊对待的一个/多个值，可以单独作为一个/多个 bin 处理。</font>


- <font color='crimson'>如果特征中含有缺失值，可以将缺失值单独作为一个 bin 来处理，你只需要指定一个不会引起混淆的值来 fill 缺失值。</font>

以下使用【基于最大分箱数的卡方分箱法】来介绍这里的功能。

In [25]:
# The feature has one value that needs special treatment.
encoder = CategoryWOEEncoder(
    target_col_name='y',
    col_name=category_feature,
    woe_method='chi2',
    min_chi2_flag=False,
    max_bins=10,              # default, maximum number of bins
    bin_pct_threshold=0.05,   # default, minimum share of samples per bin
    special_value_list=[2.],  # must be a list
)
data_transformed = encoder.fit_transform(data)
data_transformed.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,RAD_woe
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,True,-0.114038
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,False,0.65874
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,True,0.65874
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,True,0.967625
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,True,0.967625


In [26]:
encoder.bin_result_

Unnamed: 0,RAD,bin_num,bad_num,good_num,bad_rate,bin_pct,woe,iv
0,[24.0],132,18.0,114.0,0.136364,0.273859,-1.451908,0.432253
1,[6.0],26,8.0,18.0,0.307692,0.053942,-0.398629,0.008901
2,"[4.0, 1.0]",130,50.0,80.0,0.384615,0.26971,-0.114038,0.003435
3,[5.0],115,62.0,53.0,0.53913,0.238589,0.502736,0.059423
4,[3.0],38,25.0,13.0,0.657895,0.078838,0.967625,0.073391
5,"[7.0, 8.0]",41,32.0,9.0,0.780488,0.085062,1.542508,0.189431
6,[2.0],24,14.0,10.0,0.583333,0.047431,0.65874,0.021946


**特殊值单独作为一个 bin，放在最后面。**

In [27]:
encoder.iv_

0.7887804688689688

In [28]:
# The feature has several values that need special treatment.
encoder = CategoryWOEEncoder(
    target_col_name='y',
    col_name=category_feature,
    woe_method='chi2',
    min_chi2_flag=False,
    max_bins=10,                  # default, maximum number of bins
    bin_pct_threshold=0.05,       # default, minimum share of samples per bin
    special_value_list=[2., 3.],  # must be a list
)
data_transformed = encoder.fit_transform(data)
data_transformed.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,RAD_woe
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,True,-0.114038
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,False,0.65874
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,True,0.65874
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,True,0.967625
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,True,0.967625


In [29]:
encoder.bin_result_

Unnamed: 0,RAD,bin_num,bad_num,good_num,bad_rate,bin_pct,woe,iv
0,[24.0],132,18.0,114.0,0.136364,0.297297,-1.451908,0.432253
1,[6.0],26,8.0,18.0,0.307692,0.058559,-0.398629,0.008901
2,"[4.0, 1.0]",130,50.0,80.0,0.384615,0.292793,-0.114038,0.003435
3,[5.0],115,62.0,53.0,0.53913,0.259009,0.502736,0.059423
4,"[7.0, 8.0]",41,32.0,9.0,0.780488,0.092342,1.542508,0.189431
5,[2.0],24,14.0,10.0,0.583333,0.047431,0.65874,0.021946
6,[3.0],38,25.0,13.0,0.657895,0.075099,0.967625,0.073391


In [30]:
encoder.iv_

0.7887804688689688

In [32]:
# The feature contains missing values: replace every value equal to 1.0 in the
# categorical column with NaN. Re-running this cell yields the same frame.
data[category_feature] = data[category_feature].where(data[category_feature] != 1., np.nan)

# Plain `info()` already prints the non-null counts for a frame this small.
# The `null_counts` keyword is avoided on purpose: it was deprecated in
# pandas 1.2 and removed in pandas 2.0.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      486 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  y        506 non-null    bool   
dtypes: bool(1), float64(13)
memory usage: 52.0 KB


In [33]:
# Treat the feature's missing values as a bin of their own.
encoder = CategoryWOEEncoder(
    target_col_name='y',
    col_name=category_feature,
    woe_method='chi2',
    min_chi2_flag=False,
    max_bins=10,             # default, maximum number of bins
    bin_pct_threshold=0.05,  # default, minimum share of samples per bin
    imputation_value=100.,   # fill value used for the missing entries
)
data_transformed = encoder.fit_transform(data)
data_transformed.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,RAD_woe
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,,296.0,15.3,396.9,4.98,True,0.166264
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,False,0.533508
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,True,0.533508
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,True,0.967625
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,True,0.967625


In [34]:
encoder.bin_result_

Unnamed: 0,RAD,bin_num,bad_num,good_num,bad_rate,bin_pct,woe,iv
0,[24.0],132,18.0,114.0,0.136364,0.271605,-1.451908,0.432253
1,[6.0],26,8.0,18.0,0.307692,0.053498,-0.398629,0.008901
2,[4.0],110,41.0,69.0,0.372727,0.226337,-0.16224,0.005865
3,"[5.0, 2.0]",139,76.0,63.0,0.546763,0.286008,0.533508,0.080835
4,[3.0],38,25.0,13.0,0.657895,0.078189,0.967625,0.073391
5,"[7.0, 8.0]",41,32.0,9.0,0.780488,0.084362,1.542508,0.189431
6,[100.0],20,9.0,11.0,0.45,0.039526,0.166264,0.001002


缺失值单独作为一个 bin 放在最后面，上面结果中 `[100.0]` 表示缺失值。

In [35]:
encoder.iv_

0.7916768830161539

In [37]:
# The feature has both special values and missing values.
encoder = CategoryWOEEncoder(
    target_col_name='y',
    col_name=category_feature,
    woe_method='chi2',
    min_chi2_flag=False,
    max_bins=10,             # default, maximum number of bins
    bin_pct_threshold=0.05,  # default, minimum share of samples per bin
    imputation_value=100.,   # fill value used for the missing entries
    special_value_list=[2., 3.],
)
data_transformed = encoder.fit_transform(data)
data_transformed.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y,RAD_woe
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,,296.0,15.3,396.9,4.98,True,0.166264
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,False,0.65874
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,True,0.65874
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,True,0.967625
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,True,0.967625


In [38]:
encoder.bin_result_

Unnamed: 0,RAD,bin_num,bad_num,good_num,bad_rate,bin_pct,woe,iv
0,[24.0],132,18.0,114.0,0.136364,0.311321,-1.451908,0.432253
1,[6.0],26,8.0,18.0,0.307692,0.061321,-0.398629,0.008901
2,[4.0],110,41.0,69.0,0.372727,0.259434,-0.16224,0.005865
3,[5.0],115,62.0,53.0,0.53913,0.271226,0.502736,0.059423
4,"[7.0, 8.0]",41,32.0,9.0,0.780488,0.096698,1.542508,0.189431
5,[2.0],24,14.0,10.0,0.583333,0.047431,0.65874,0.021946
6,[3.0],38,25.0,13.0,0.657895,0.075099,0.967625,0.073391
7,[100.0],20,9.0,11.0,0.45,0.039526,0.166264,0.001002


In [39]:
encoder.iv_

0.7922118875445117

### 2.2.5 有序离散特征

**参数：**

- `value_order_dict={value: order}`

In [41]:
# The feature values are ordered, so a mapping from value to position must be
# passed in.
# NOTE(review): as recorded in this notebook, the cell currently fails with
# `TypeError: unhashable type: 'list'`; the cause is not visible from here.
encoder = CategoryWOEEncoder(
    col_name=category_feature,
    target_col_name='y',
    max_bins=10,             # default, maximum number of bins
    bin_pct_threshold=0.05,  # default, minimum share of samples per bin
    woe_method='chi2',
    min_chi2_flag=False,
    imputation_value=100.,
    # Value -> position mapping.
    # NOTE(review): positions jump from 4 to 7, the key 1.0 is still listed
    # although an earlier cell replaced that value with NaN, and there is no
    # entry for the imputation value 100.0 — confirm whether any of these is
    # related to the TypeError above.
    value_order_dict={1.: 0, 2.0: 1, 3.0: 2, 4.0: 3, 5.0: 4, 6.0: 7, 7.0: 8, 8.0: 9, 24.0: 10}
)
data_transformed = encoder.fit_transform(data)
data_transformed.head()

TypeError: unhashable type: 'list'