In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler

In [2]:
from varMergeMethod import varMergeByPct, varMergeByChiSquare , varMergeByIVSplit, categoryToWOE

In [3]:
# 本例数据来源：https://www.kaggle.com/c/walmart-recruiting-trip-type-classification/data
dat = pd.read_csv('./data/train.csv', dtype={'TripType':'category'
                                             ,'VisitNumber':object
                                             , 'Upc':object
                                             , 'FinelineNumber':object})
dat2 = dat.dropna(axis=0, how='any') # 去除空值

### 1. 数值变量且可取值较少：
a) 不做处理<br>
b) 做一些数值变换，例如函数变换（e.g., np.log),标准化(e.g., StandardScaler),归一化(e.g., MinMaxScaler, MaxAbsScaler)<br>
c) 做OneHot， 可以参考种类变量直接OneHot的方式

### 2. 数值变量且可取值较多
a)不做处理<br>
b) 做一些数值变换，例如函数变换（e.g., np.log),标准化(e.g., StandardScaler),归一化(e.g., MinMaxScaler, MaxAbsScaler)<br>
c) 分箱后做OneHot: 等宽分箱，等深分箱等；分箱数量较多，可以再合并后再做OneHot

In [4]:
# 先把问题转换为一个2分类的问题， 将TripType分为40和其他类型
dat3 = dat2.copy()
dat3.loc[dat3.TripType!='40', 'Label'] = 0
dat3.loc[dat3.TripType=='40', 'Label'] = 1
dat4 = dat3[['ScanCount', 'Label']].copy()

In [5]:
dat4.ScanCount.describe()

count    642925.000000
mean          1.110203
std           0.701240
min         -12.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          71.000000
Name: ScanCount, dtype: float64

In [6]:
dat4.ScanCount.unique()

array([ -1,   1,   2,   3,   5,   6,   4,  14,  -2,   9,   7,  10,   8,
        -3,  -5,  11,  16,  -4,  13,  15,  30,  12,  20,  -6, -12,  19,
        46,  23,  -7,  22,  25,  24,  31,  -9,  51,  17,  18,  71, -10])

In [7]:
# 函数变换（np.log处理，当数值较大时可以采用此方法）
dat4.loc[:,'sc_log'] = np.log(abs(dat4['ScanCount'])) * np.sign(dat4['ScanCount'])

In [8]:
# 函数变换 (去中心化，减去均值)
c = np.mean(dat3['ScanCount'])
dat4.loc[:, 'sc_center'] = dat4['ScanCount'] - c

In [9]:
# 函数变换（标准化，减去均值，再除以标准差，可以直接用StandardScaler
standard_scaler = StandardScaler()
dat4.loc[:,'sc_standard'] = standard_scaler.fit_transform(dat4[['ScanCount']].astype(float)) 
# dat3.ScanCount_center/np.std(dat3.ScanCount)

In [10]:
# 函数变换（归一化，把数据变为0~1范围内，可以直接用MinMaxScaler
minmax_scaler = MinMaxScaler()
dat4.loc[:,'sc_minmax'] = minmax_scaler.fit_transform(dat4[['ScanCount']].astype(float)) 

In [11]:
# 函数变换（归一化，把数据变为-1~1范围内，可以直接用MaxAbsScaler
maxabs_scaler = MaxAbsScaler()
dat4.loc[:,'sc_absmax'] = maxabs_scaler.fit_transform(dat4[['ScanCount']].astype(float)) 

In [12]:
# 函数变换（如果数据中有异常值，可以用RobustScaler，它是减去中位数再除以四分位距离，避免了异常值的影响
robust_scaler = RobustScaler()
dat4.loc[:,'sc_robust'] = robust_scaler.fit_transform(dat4[['ScanCount']].astype(float)) 
# median = np.median(dat4['ScanCount'])
# q25, q75 = np.quantile(dat4['ScanCount'], [0.25, 0.75], axis=0)
# interval = q75 - q25
# print(median, q25, q75, interval)

In [13]:
dat4.describe()

Unnamed: 0,ScanCount,Label,sc_log,sc_center,sc_standard,sc_minmax,sc_absmax,sc_robust
count,642925.0,642925.0,642925.0,642925.0,642925.0,642925.0,642925.0,642925.0
mean,1.110203,0.270593,0.090475,-1.311416e-14,5.194841e-14,0.157954,0.015637,0.110203
std,0.70124,0.444267,0.28365,0.7012401,1.000001,0.008449,0.009877,0.70124
min,-12.0,0.0,-2.484907,-13.1102,-18.69576,0.0,-0.169014,-13.0
25%,1.0,0.0,0.0,-0.1102026,-0.157154,0.156627,0.014085,0.0
50%,1.0,0.0,0.0,-0.1102026,-0.157154,0.156627,0.014085,0.0
75%,1.0,1.0,0.0,-0.1102026,-0.157154,0.156627,0.014085,0.0
max,71.0,1.0,4.26268,69.8898,99.66608,1.0,1.0,70.0


In [14]:
# 等宽分箱：把变量的取值范围分为k个等宽的区间
cut_by_width, bin_bywidth = pd.cut(dat4['ScanCount'], bins=5, retbins=True)
bin_bywidth

array([-12.083,   4.6  ,  21.2  ,  37.8  ,  54.4  ,  71.   ])

In [15]:
# 等深分箱：把变量的取值个数等分为k个部分
cut_by_depth, bin_bydepth = pd.qcut(dat4['ScanCount'], q=5, retbins=True, duplicates='drop')
# 这个例子里ScanCount中1取值个数有557712个，占实际总个数的87%（总个数642925），在使用等深分箱时需要设置duplicates参数，去除重复的1
# 这里虽然设置了分箱数量是5，但由于1占比有87%，无法实现等深分箱，实际分完只有2个箱子
bin_bydepth 

array([-12.,   1.,  71.])

In [16]:
dat4.loc[:, 'sc_bin_width'] = cut_by_width
dat4.loc[:, 'sc_bin_depth'] = cut_by_depth
dat4.head()

Unnamed: 0,ScanCount,Label,sc_log,sc_center,sc_standard,sc_minmax,sc_absmax,sc_robust,sc_bin_width,sc_bin_depth
0,-1,0.0,-0.0,-2.110203,-3.009247,0.13253,-0.014085,-2.0,"(-12.083, 4.6]","(-12.001, 1.0]"
1,1,0.0,0.0,-0.110203,-0.157154,0.156627,0.014085,0.0,"(-12.083, 4.6]","(-12.001, 1.0]"
2,1,0.0,0.0,-0.110203,-0.157154,0.156627,0.014085,0.0,"(-12.083, 4.6]","(-12.001, 1.0]"
3,2,0.0,0.693147,0.889797,1.268892,0.168675,0.028169,1.0,"(-12.083, 4.6]","(1.0, 71.0]"
4,2,0.0,0.693147,0.889797,1.268892,0.168675,0.028169,1.0,"(-12.083, 4.6]","(1.0, 71.0]"


In [17]:
dat4.sc_bin_width.value_counts() # 等宽分箱

(-12.083, 4.6]    639803
(4.6, 21.2]         3105
(21.2, 37.8]          14
(37.8, 54.4]           2
(54.4, 71.0]           1
Name: sc_bin_width, dtype: int64

In [18]:
dat4.sc_bin_depth.value_counts() # 没有实现等深分箱

(-12.001, 1.0]    572920
(1.0, 71.0]        70005
Name: sc_bin_depth, dtype: int64

In [19]:
# 这里ScanCount数量不多，可以不分箱，而是选择合并后再做OneHot

In [20]:
# 采用卡方合并
varMergeByChiSquare(dat4, 'ScanCount', 'Label', bins=7)

Unnamed: 0,variable,interval,flag_0,flag_1
0,ScanCount,<=-1.0,14097.0,1111.0
1,ScanCount,"(-1.0,1.0]",407629.0,150083.0
2,ScanCount,"(1.0,4.0]",45119.0,21764.0
3,ScanCount,"(4.0,5.0]",969.0,410.0
4,ScanCount,>5.0,1140.0,603.0


In [21]:
# 采用正样本率合并
varMergeByPct(dat4, 'ScanCount', 'Label', bins=7)

Unnamed: 0,variable,interval,flag_0,flag_1
0,ScanCount,<=18.0,468933.0,173964.0
1,ScanCount,"(18.0,19.0]",1.0,3.0
2,ScanCount,"(19.0,23.0]",11.0,1.0
3,ScanCount,"(23.0,24.0]",2.0,2.0
4,ScanCount,"(24.0,46.0]",6.0,0.0
5,ScanCount,"(46.0,51.0]",0.0,1.0
6,ScanCount,>51.0,1.0,0.0


In [22]:
# 采用IV分割
varMergeByIVSplit(dat4, 'ScanCount', 'Label', bins=7)

Unnamed: 0,variable,interval,flag_0,flag_1
0,ScanCount,<=-1.0,1111.0,14097.0
1,ScanCount,"(-1.0,1.0]",150083.0,407629.0
2,ScanCount,"(1.0,4.0]",21764.0,45119.0
3,ScanCount,"(4.0,5.0]",410.0,969.0
4,ScanCount,"(5.0,18.0]",596.0,1119.0
5,ScanCount,"(18.0,19.0]",3.0,1.0
6,ScanCount,>19.0,4.0,20.0


In [23]:
# 上面ScanCount不是一个很好的例子，因为1的占比过大，分箱后箱子数量很少
# 这里换一个变量FinelinNumber进行分箱-合并-OneHot的说明（不过也不是一个好例子， 因为FinelineNumber不是一个数值型，强硬转换为float了）
dat5 = dat3[['FinelineNumber', 'Label']].copy()
dat5.loc[:, 'FinelineNumber'] = dat3.FinelineNumber.astype(float)

In [24]:
# 等宽分箱：把变量的取值范围分为k个等宽的区间
cut_by_width2, bin_bywidth2 = pd.cut(dat5['FinelineNumber'], bins=10, retbins=True, right=False) 
# 默认区间为左开有闭，这里设置right=False, 则分割区间为左币又开
bin_bywidth2

array([    0.   ,   999.8  ,  1999.6  ,  2999.4  ,  3999.2  ,  4999.   ,
        5998.8  ,  6998.6  ,  7998.4  ,  8998.2  , 10007.998])

In [25]:
# 等深分箱：把变量的取值个数等分为k个部分
cut_by_depth2, bin_bydepth2 = pd.qcut(dat5['FinelineNumber'], q=10, retbins=True)
bin_bydepth2

array([   0.,  276., 1025., 1703., 2602., 3352., 4005., 4900., 6268.,
       8101., 9998.])

In [26]:
dat5.loc[:, 'sc_bin_width'] = cut_by_width2
dat5.loc[:, 'sc_bin_depth'] = cut_by_depth2
dat5.head()

Unnamed: 0,FinelineNumber,Label,sc_bin_width,sc_bin_depth
0,1000.0,0.0,"[999.8, 1999.6)","(276.0, 1025.0]"
1,8931.0,0.0,"[7998.4, 8998.2)","(8101.0, 9998.0]"
2,4504.0,0.0,"[3999.2, 4999.0)","(4005.0, 4900.0]"
3,3565.0,0.0,"[2999.4, 3999.2)","(3352.0, 4005.0]"
4,1017.0,0.0,"[999.8, 1999.6)","(276.0, 1025.0]"


In [27]:
dat5.sc_bin_width.value_counts() # 等宽

[0.0, 999.8)           119685
[2999.4, 3999.2)       104785
[999.8, 1999.6)         92718
[3999.2, 4999.0)        68963
[1999.6, 2999.4)        66969
[4999.0, 5998.8)        44784
[8998.2, 10007.998)     41829
[6998.6, 7998.4)        37957
[7998.4, 8998.2)        32649
[5998.8, 6998.6)        32586
Name: sc_bin_width, dtype: int64

In [28]:
dat5.sc_bin_depth.value_counts() # 等深

(6268.0, 8101.0]    65668
(3352.0, 4005.0]    64735
(2602.0, 3352.0]    64535
(-0.001, 276.0]     64375
(1025.0, 1703.0]    64337
(1703.0, 2602.0]    64324
(276.0, 1025.0]     64253
(4900.0, 6268.0]    64093
(4005.0, 4900.0]    63699
(8101.0, 9998.0]    62906
Name: sc_bin_depth, dtype: int64

In [29]:
# 在分箱的基础上再进行合并，已减少箱子的数量
# 例如在等深分箱的基础上合并，这时候sc_bin_depth实际上已经是类别变量，但是是有序的，要进行有序合并
dat5.groupby('sc_bin_depth').Label.count() # 等深

sc_bin_depth
(-0.001, 276.0]     64375
(276.0, 1025.0]     64253
(1025.0, 1703.0]    64337
(1703.0, 2602.0]    64324
(2602.0, 3352.0]    64535
(3352.0, 4005.0]    64735
(4005.0, 4900.0]    63699
(4900.0, 6268.0]    64093
(6268.0, 8101.0]    65668
(8101.0, 9998.0]    62906
Name: Label, dtype: int64

In [30]:
#varMergeByChiSquare(dat5, 'sc_bin_depth', 'Label', bins=5)
varMergeByChiSquare(dat5, 'sc_bin_depth', 'Label', bins=5, varInterval=True)
# 设置varInterval会，返回结果会对区间进行合并

Unnamed: 0,variable,interval,flag_0,flag_1
0,sc_bin_depth,<=1025.0,105440,23188
1,sc_bin_depth,"(1025.0,2602.0]",90956,37705
2,sc_bin_depth,"(2602.0,4900.0]",128681,64288
3,sc_bin_depth,"(4900.0,6268.0]",49901,14192
4,sc_bin_depth,>6268.0,93976,34598


In [31]:
varMergeByPct(dat5, 'sc_bin_depth', 'Label', bins=5, varInterval=True)

Unnamed: 0,variable,interval,flag_0,flag_1
0,sc_bin_depth,<=1025.0,105440,23188
1,sc_bin_depth,"(1025.0,4900.0]",219637,101993
2,sc_bin_depth,"(4900.0,6268.0]",49901,14192
3,sc_bin_depth,"(6268.0,8101.0]",46584,19084
4,sc_bin_depth,>8101.0,47392,15514


In [32]:
varMergeByIVSplit(dat5, 'sc_bin_depth', 'Label', bins=5, varInterval=True)

Unnamed: 0,variable,interval,flag_0,flag_1
0,sc_bin_depth,<=1025.0,23188.0,105440.0
1,sc_bin_depth,"(1025.0,2602.0]",37705.0,90956.0
2,sc_bin_depth,"(2602.0,4900.0]",64288.0,128681.0
3,sc_bin_depth,"(4900.0,6268.0]",14192.0,49901.0
4,sc_bin_depth,>6268.0,34598.0,93976.0


In [33]:
intervals = [1025, 2602, 4900, 6268] # 获取合并的位置， 这里使用了IV分割选取的位置

for i in range(len(intervals)+1):
    if i == 0:
        dat5.loc[dat5['FinelineNumber']<=intervals[i], 'merge_after_bin'] = i
    elif i == len(intervals):
        dat5.loc[dat5['FinelineNumber']>intervals[i-1], 'merge_after_bin'] = i
    else:
        dat5.loc[(dat5['FinelineNumber']>intervals[i-1])&(dat5['FinelineNumber']<=intervals[i]), 'merge_after_bin'] = i

In [34]:
dat5.head()

Unnamed: 0,FinelineNumber,Label,sc_bin_width,sc_bin_depth,merge_after_bin
0,1000.0,0.0,"[999.8, 1999.6)","(276.0, 1025.0]",0.0
1,8931.0,0.0,"[7998.4, 8998.2)","(8101.0, 9998.0]",4.0
2,4504.0,0.0,"[3999.2, 4999.0)","(4005.0, 4900.0]",2.0
3,3565.0,0.0,"[2999.4, 3999.2)","(3352.0, 4005.0]",2.0
4,1017.0,0.0,"[999.8, 1999.6)","(276.0, 1025.0]",0.0


In [35]:
len(dat5.sc_bin_depth.unique())

10

In [36]:
len(dat5.merge_after_bin.unique()) # 合并减少了分箱后的数量

5

In [37]:
# 分箱->[合并]->OneHot
onehot = pd.get_dummies(dat5['merge_after_bin'], drop_first=True)
onehot.head()

Unnamed: 0,1.0,2.0,3.0,4.0
0,0,0,0,0
1,0,0,0,1
2,0,1,0,0
3,0,1,0,0
4,0,0,0,0


In [38]:
# 分箱->[合并]->WOE编码
woe = categoryToWOE(dat5, 'merge_after_bin', 'Label', varOrder=True)

In [39]:
woe

Unnamed: 0,merge_after_bin,WOE
0,0.0,-0.522891
1,1.0,0.111033
2,2.0,0.297653
3,3.0,-0.265747
4,4.0,-0.007627


In [40]:
categoryToWOE(dat5, 'sc_bin_width', 'Label', varOrder=True)

Unnamed: 0,sc_bin_width,WOE
0,"[0.0, 999.8)",-0.538108
1,"[1999.6, 2999.4)",0.309603
2,"[2999.4, 3999.2)",0.267284
3,"[3999.2, 4999.0)",0.184809
4,"[4999.0, 5998.8)",-0.276587
5,"[5998.8, 6998.6)",-0.195808
6,"[6998.6, 7998.4)",0.394169
7,"[7998.4, 8998.2)",-0.453289
8,"[8998.2, 10007.998)",-0.027627
9,"[999.8, 1999.6)",0.072953
