In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler

In [2]:
# 本例数据来源：https://www.kaggle.com/c/walmart-recruiting-trip-type-classification/data
dat = pd.read_csv('./data/train.csv', dtype={'TripType':'category'
                                             ,'VisitNumber':object
                                             , 'Upc':object
                                             , 'FinelineNumber':object})
dat2 = dat.dropna(axis=0, how='any') # 去除空值

### 1. 数值变量且可取值较少：
a) 不做处理<br>
b) 做一些数值变换，例如函数变换（e.g., np.log),标准化(e.g., StandardScaler),归一化(e.g., MinMaxScaler, MaxAbsScaler)<br>
c) 做OneHot， 可以参考种类变量直接OneHot的方式

### 2. 数值变量且可取值较多
a)不做处理<br>
b) 做一些数值变换，例如函数变换（e.g., np.log),标准化(e.g., StandardScaler),归一化(e.g., MinMaxScaler, MaxAbsScaler)<br>
c) 分箱后做OneHot: 等宽分箱，等深分箱等；分箱数量较多，可以再合并后再做OneHot

In [3]:
# 先把问题转换为一个2分类的问题， 将TripType分为40和其他类型
dat3 = dat2.copy()
dat3.loc[dat3.TripType!='40', 'Label'] = 0
dat3.loc[dat3.TripType=='40', 'Label'] = 1
dat4 = dat3[['ScanCount', 'Label']].copy()

In [4]:
dat4.head()

Unnamed: 0,ScanCount,Label
0,-1,0.0
1,1,0.0
2,1,0.0
3,2,0.0
4,2,0.0


In [5]:
dat4.ScanCount.describe()

count    642925.000000
mean          1.110203
std           0.701240
min         -12.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          71.000000
Name: ScanCount, dtype: float64

In [6]:
dat4.ScanCount.unique()

array([ -1,   1,   2,   3,   5,   6,   4,  14,  -2,   9,   7,  10,   8,
        -3,  -5,  11,  16,  -4,  13,  15,  30,  12,  20,  -6, -12,  19,
        46,  23,  -7,  22,  25,  24,  31,  -9,  51,  17,  18,  71, -10])

In [7]:
# 函数变换（np.log处理，当数值较大时可以采用此方法）
dat4.loc[:,'sc_log'] = np.log(abs(dat4['ScanCount'])) * np.sign(dat4['ScanCount'])

In [8]:
# 函数变换 (去中心化，减去均值)
c = np.mean(dat3['ScanCount'])
dat4.loc[:, 'sc_center'] = dat4['ScanCount'] - c

In [9]:
# 函数变换（标准化，减去均值，再除以标准差，可以直接用StandardScaler
standard_scaler = StandardScaler()
dat4.loc[:,'sc_standard'] = standard_scaler.fit_transform(dat4[['ScanCount']].astype(float)) 
# dat3.ScanCount_center/np.std(dat3.ScanCount)

In [10]:
# 函数变换（归一化，把数据变为0~1范围内，可以直接用MinMaxScaler
minmax_scaler = MinMaxScaler()
dat4.loc[:,'sc_minmax'] = minmax_scaler.fit_transform(dat4[['ScanCount']].astype(float)) 

In [11]:
# 函数变换（归一化，把数据变为-1~1范围内，可以直接用MaxAbsScaler
maxabs_scaler = MaxAbsScaler()
dat4.loc[:,'sc_absmax'] = maxabs_scaler.fit_transform(dat4[['ScanCount']].astype(float)) 

In [12]:
# 函数变换（如果数据中有异常值，可以用RobustScaler，它是减去中位数再除以四分位距离，避免了异常值的影响
robust_scaler = RobustScaler()
dat4.loc[:,'sc_robust'] = robust_scaler.fit_transform(dat4[['ScanCount']].astype(float)) 
# median = np.median(dat4['ScanCount'])
# q25, q75 = np.quantile(dat4['ScanCount'], [0.25, 0.75], axis=0)
# interval = q75 - q25
# print(median, q25, q75, interval)

In [13]:
dat4.describe()

Unnamed: 0,ScanCount,Label,sc_log,sc_center,sc_standard,sc_minmax,sc_absmax,sc_robust
count,642925.0,642925.0,642925.0,642925.0,642925.0,642925.0,642925.0,642925.0
mean,1.110203,0.270593,0.090475,-1.311416e-14,5.194841e-14,0.157954,0.015637,0.110203
std,0.70124,0.444267,0.28365,0.7012401,1.000001,0.008449,0.009877,0.70124
min,-12.0,0.0,-2.484907,-13.1102,-18.69576,0.0,-0.169014,-13.0
25%,1.0,0.0,0.0,-0.1102026,-0.157154,0.156627,0.014085,0.0
50%,1.0,0.0,0.0,-0.1102026,-0.157154,0.156627,0.014085,0.0
75%,1.0,1.0,0.0,-0.1102026,-0.157154,0.156627,0.014085,0.0
max,71.0,1.0,4.26268,69.8898,99.66608,1.0,1.0,70.0


In [14]:
# 等宽分箱：把变量的取值范围分为k个等宽的区间
cut_by_width, bin_bywidth = pd.cut(dat4['ScanCount'], bins=5, retbins=True)
bin_bywidth

array([-12.083,   4.6  ,  21.2  ,  37.8  ,  54.4  ,  71.   ])

In [15]:
# 等深分箱：把变量的取值个数等分为k个部分
cut_by_depth, bin_bydepth = pd.qcut(dat4['ScanCount'], q=5, retbins=True, duplicates='drop')
# 这个例子里ScanCount中1取值个数有557712个，占实际总个数的87%（总个数642925），在使用等深分箱时需要设置duplicates参数，去除重复的1
# 这里虽然设置了分箱数量是5，但由于1占比有87%，无法实现等深分箱，实际分完只有2个箱子
bin_bydepth 

array([-12.,   1.,  71.])

In [16]:
dat4.loc[:, 'sc_bin_width'] = cut_by_width
dat4.loc[:, 'sc_bin_depth'] = cut_by_depth
dat4.head()

Unnamed: 0,ScanCount,Label,sc_log,sc_center,sc_standard,sc_minmax,sc_absmax,sc_robust,sc_bin_width,sc_bin_depth
0,-1,0.0,-0.0,-2.110203,-3.009247,0.13253,-0.014085,-2.0,"(-12.083, 4.6]","(-12.001, 1.0]"
1,1,0.0,0.0,-0.110203,-0.157154,0.156627,0.014085,0.0,"(-12.083, 4.6]","(-12.001, 1.0]"
2,1,0.0,0.0,-0.110203,-0.157154,0.156627,0.014085,0.0,"(-12.083, 4.6]","(-12.001, 1.0]"
3,2,0.0,0.693147,0.889797,1.268892,0.168675,0.028169,1.0,"(-12.083, 4.6]","(1.0, 71.0]"
4,2,0.0,0.693147,0.889797,1.268892,0.168675,0.028169,1.0,"(-12.083, 4.6]","(1.0, 71.0]"


In [17]:
dat4.sc_bin_width.value_counts() # 等宽分箱

(-12.083, 4.6]    639803
(4.6, 21.2]         3105
(21.2, 37.8]          14
(37.8, 54.4]           2
(54.4, 71.0]           1
Name: sc_bin_width, dtype: int64

In [18]:
dat4.sc_bin_depth.value_counts() # 没有实现等深分箱

(-12.001, 1.0]    572920
(1.0, 71.0]        70005
Name: sc_bin_depth, dtype: int64

In [19]:
# 这里ScanCount数量不多，可以不分箱，而是选择合并后再做OneHot

In [20]:
# 例如按卡方合并（有的地方也叫卡方分箱，这里为了区别分箱和合并，把卡方合并，和其他合并放在一起讨论)
def ChiNumericalVarMerge(df, variable, flag, confidenceVal=3.841, bins=10, sample=None, varInterval=False):
    """
    param df:            DataFrame|数据集
    param varialbe:      str|数据集中需要分箱的列名（本函数适用对数值类型的列进行分箱）
    param flag:          str|正负样本标识的列名
    param confidenceVal: float|卡方临界值（3.841是自由度1置信度95%对应的临界值，小于此值说明组还可以合并)
    param bins:          int|分箱数量（要求分箱后数量小于等于此值）
    param sample:        int|抽样数目，默认不进行抽样
    """
    
    # 判断是否需要抽样操作
    if sample is not None:
        df = df.sample(n=sample)
        
    # 对数据进行预处理
    total_num = df.groupby([variable])[flag].count() # 分箱变量每个值的数目
    total_num = pd.DataFrame({'total_num':total_num})
    positive_class = df.groupby([variable])[flag].sum() # 分箱变量每个值的正样本数
    positive_class = pd.DataFrame({'positive_class':positive_class})
    regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True, how='inner')
    regroup.reset_index(inplace=True) # groupby处理之后正好是按数值型变量大小排序的
    regroup['negative_class'] = regroup['total_num'] - regroup['positive_class'] # 分箱变量每个值的负样本数
    regroup = regroup.drop('total_num', axis=1) # 去除total_num,方便后面计算卡方值(用到不同组的正样本数和负样本数)
    np_regroup = np.array(regroup) # 把DataFrame转换为numpy，提高运行效率
    
    # 处理连续没有正样本或负样本的区间，进行区间合并(如果连续区间的正样本或负样本数为0，则卡方计算的分母为0，所以要合并)
    i = 0 
    while(i <= np_regroup.shape[0] - 2):
        if((np_regroup[i,1] == 0 and np_regroup[i+1, 1]==0) 
           or (np_regroup[i, 2]==0 and np_regroup[i+1, 2] == 0)):
            np_regroup[i, 1] = np_regroup[i, 1] + np_regroup[i+1, 1] # 正样本合并
            np_regroup[i, 2] = np_regroup[i, 2] + np_regroup[i+1, 2] # 负样本合并
            np_regroup[i, 0] = np_regroup[i+1, 0] # 更新分箱变量范围
            np_regroup = np.delete(np_regroup, i+1, axis=0) # 删除整行
            i = i - 1
        i = i + 1
        
    # 对相邻两个区间进行卡方值计算(因为数值型变量大小已排序，算相邻即可)
    chi_table = np.array([]) # 保存项相邻两个区间的卡方值
    for i in np.arange(np_regroup.shape[0] - 1):
        chi = (np_regroup[i, 1] * np_regroup[i+1, 2] - np_regroup[i, 2] * np_regroup[i+1, 1])**2 * \
              (np_regroup[i, 1] + np_regroup[i, 2] + np_regroup[i+1, 1] + np_regroup[i+1, 2]) / \
              ((np_regroup[i, 1] + np_regroup[i, 2]) * (np_regroup[i+1, 1] + np_regroup[i+1, 2]) * \
               (np_regroup[i, 1] + np_regroup[i+1, 1]) * (np_regroup[i, 2] + np_regroup[i+1, 2]))
        chi_table = np.append(chi_table, chi)
    
    # 把卡方值最小的两个区间合并, 直到分箱数量小于等于设定的分箱数量bins且相邻区间的卡方值没有大于设定的临界值confidenceVal的组了
    while(1):
        if(len(chi_table) <= (bins-1) and min(chi_table) >= confidenceVal):
            break
        chi_min_index = np.argwhere(chi_table == min(chi_table))[0] # 找出卡方值最小的索引位置
        np_regroup[chi_min_index, 1] = np_regroup[chi_min_index, 1] + np_regroup[chi_min_index+1, 1] # 正样本合并
        np_regroup[chi_min_index, 2] = np_regroup[chi_min_index, 2] + np_regroup[chi_min_index+1, 2] # 负样本合并
        np_regroup[chi_min_index, 0] = np_regroup[chi_min_index+1, 0] # 更新分箱变量的范围
        
        np_regroup = np.delete(np_regroup, chi_min_index + 1, axis=0)
        
        # 更新卡方值表 chi_table
        if(chi_min_index == np_regroup.shape[0] - 1): # 如果卡方最小值是最后两个区间的时候
            chi_table[chi_min_index - 1] = (np_regroup[chi_min_index-1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index-1, 2] * np_regroup[chi_min_index, 1])**2 * \
                  (np_regroup[chi_min_index-1, 1] + np_regroup[chi_min_index-1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
                  ((np_regroup[chi_min_index-1, 1] + np_regroup[chi_min_index-1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * \
                   (np_regroup[chi_min_index-1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index-1, 2] + np_regroup[chi_min_index, 2]))      
            chi_table = np.delete(chi_table, chi_min_index, axis=0)
        elif(chi_min_index == 0): # 如果卡方最小值是最前面两个区间的时候
            chi_table[chi_min_index] = (np_regroup[chi_min_index, 1] * np_regroup[chi_min_index+1, 2] - np_regroup[chi_min_index, 2] * np_regroup[chi_min_index+1, 1])**2 * \
                  (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2] + np_regroup[chi_min_index+1, 1] + np_regroup[chi_min_index+1, 2]) / \
                  ((np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index+1, 1] + np_regroup[chi_min_index+1, 2]) * \
                   (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index+1, 1]) * (np_regroup[chi_min_index, 2] + np_regroup[chi_min_index+1, 2]))      
            chi_table = np.delete(chi_table, chi_min_index+1, axis=0)
        else:
            chi_table[chi_min_index - 1] = (np_regroup[chi_min_index-1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index-1, 2] * np_regroup[chi_min_index, 1])**2 * \
                  (np_regroup[chi_min_index-1, 1] + np_regroup[chi_min_index-1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
                  ((np_regroup[chi_min_index-1, 1] + np_regroup[chi_min_index-1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * \
                   (np_regroup[chi_min_index-1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index-1, 2] + np_regroup[chi_min_index, 2]))  
            chi_table[chi_min_index] = (np_regroup[chi_min_index, 1] * np_regroup[chi_min_index+1, 2] - np_regroup[chi_min_index, 2] * np_regroup[chi_min_index+1, 1])**2 * \
                  (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2] + np_regroup[chi_min_index+1, 1] + np_regroup[chi_min_index+1, 2]) / \
                  ((np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index+1, 1] + np_regroup[chi_min_index+1, 2]) * \
                   (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index+1, 1]) * (np_regroup[chi_min_index, 2] + np_regroup[chi_min_index+1, 2]))      
            chi_table = np.delete(chi_table, chi_min_index+1, axis=0)
        
    # 保存结果
    result_data = pd.DataFrame()
    result_data['variable'] = [variable] * np_regroup.shape[0]
    
    list_temp = []
    for i in np.arange(np_regroup.shape[0]):
        if i==0:
            if varInterval:
                x = '<=' + str(np_regroup[i, 0].right)
            else:
                x = '<=' + str(np_regroup[i, 0])
        elif i==np_regroup.shape[0] - 1:
            if varInterval:
                x = '>' + str(np_regroup[i-1, 0].right)
            else:
                x = '>' + str(np_regroup[i-1, 0])
        else:
            if varInterval:
                x = '(' + str(np_regroup[i-1, 0].right) + ',' + str(np_regroup[i, 0].right)  + ']'
            else:
                x = '(' + str(np_regroup[i-1, 0]) + ',' + str(np_regroup[i, 0])  + ']'
        list_temp.append(x)
        
    result_data['interval'] = list_temp
    result_data['flag_0'] = np_regroup[:, 2]
    result_data['flag_1'] = np_regroup[:, 1]
    
    return result_data

In [21]:
chi_merge = ChiNumericalVarMerge(dat4, 'ScanCount', 'Label', bins=7)

In [22]:
chi_merge

Unnamed: 0,variable,interval,flag_0,flag_1
0,ScanCount,<=-1.0,14097.0,1111.0
1,ScanCount,"(-1.0,1.0]",407629.0,150083.0
2,ScanCount,"(1.0,4.0]",45119.0,21764.0
3,ScanCount,"(4.0,5.0]",969.0,410.0
4,ScanCount,>5.0,1140.0,603.0


In [23]:
def PctNumericalVarMerge(df, variable, flag, bins=10, sample=None, varInterval=False):
    """
    param df:            DataFrame|数据集
    param varialbe:      str|数据集中需要分箱的列名（本函数适用对数值类型的列进行分箱）
    param flag:          str|正负样本标识的列名
    param bins:          int|分箱数量（要求分箱后数量小于等于此值）
    param sample:        int|抽样数目，默认不进行抽样
    """
    
    # 判断是否需要抽样操作
    if sample is not None:
        df = df.sample(n=sample)
        
    # 对数据进行预处理
    total_num = df.groupby([variable])[flag].count() # 分箱变量每个值的数目
    total_num = pd.DataFrame({'total_num':total_num})
    positive_class = df.groupby([variable])[flag].sum() # 分箱变量每个值的正样本数
    positive_class = pd.DataFrame({'positive_class':positive_class})
    regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True, how='inner')
    regroup.reset_index(inplace=True) # groupby处理之后正好是按数值型变量大小排序的
    regroup['negative_class'] = regroup['total_num'] - regroup['positive_class'] # 分箱变量每个值的负样本数
    regroup = regroup.drop('total_num', axis=1) # 去除total_num,方便后面计算卡方值(用到不同组的正样本数和负样本数)
    np_regroup = np.array(regroup) # 把DataFrame转换为numpy，提高运行效率
        
    # 对相邻两个区间的正样本率组内正样本数/组内总样本数) 之差的绝对值进行计算（
    pctdiff_table = np.array([]) # 
    for i in np.arange(np_regroup.shape[0] - 1):
        pctdiff = abs(np_regroup[i+1, 1]/(np_regroup[i+1, 1] + np_regroup[i+1, 2]) 
                   - np_regroup[i, 1]/(np_regroup[i, 1] + np_regroup[i, 2]))
        pctdiff_table = np.append(pctdiff_table, pctdiff)
              
    # 把正样本率之差的绝对值最小的两个区间合并, 直到分箱数量小于设定的分箱数量bins
    while(1):
        if len(pctdiff_table) <= (bins-1):
            break
        pctdiff_min_index = np.argwhere(pctdiff_table == min(pctdiff_table))[0] # 找出正样本率之差的绝对值最小的索引位置
        np_regroup[pctdiff_min_index, 1] = np_regroup[pctdiff_min_index, 1] + np_regroup[pctdiff_min_index+1, 1] # 正样本合并
        np_regroup[pctdiff_min_index, 2] = np_regroup[pctdiff_min_index, 2] + np_regroup[pctdiff_min_index+1, 2] # 负样本合并
        np_regroup[pctdiff_min_index, 0] = np_regroup[pctdiff_min_index+1, 0] # 更新分箱变量的范围
        
        np_regroup = np.delete(np_regroup, pctdiff_min_index + 1, axis=0)
        
        # 更新正样本率之差 pctdiff_table
        if(pctdiff_min_index == np_regroup.shape[0] - 1): # 如果正样本率之差绝对值的最小值是最后两个区间的时候
            pctdiff_table[pctdiff_min_index - 1] = abs(np_regroup[pctdiff_min_index, 1]/(np_regroup[pctdiff_min_index, 1] + np_regroup[pctdiff_min_index, 2]) 
                       - np_regroup[pctdiff_min_index-1, 1]/(np_regroup[pctdiff_min_index-1, 1] + np_regroup[pctdiff_min_index-1, 2]))
            pctdiff_table = np.delete(pctdiff_table, pctdiff_min_index, axis=0)

        elif(pctdiff_min_index == 0): # 如果正样本率之差绝对值的最小值是最前面两个区间的时候
            pctdiff_table[pctdiff_min_index] = abs(np_regroup[pctdiff_min_index+1, 1]/(np_regroup[pctdiff_min_index+1, 1] + np_regroup[pctdiff_min_index+1, 2]) 
                       - np_regroup[pctdiff_min_index, 1]/(np_regroup[pctdiff_min_index, 1] + np_regroup[pctdiff_min_index, 2]))
            pctdiff_table = np.delete(pctdiff_table, pctdiff_min_index+1, axis=0)
        else:
            pctdiff_table[pctdiff_min_index - 1] = abs(np_regroup[pctdiff_min_index, 1]/(np_regroup[pctdiff_min_index, 1] + np_regroup[pctdiff_min_index, 2]) 
                       - np_regroup[pctdiff_min_index-1, 1]/(np_regroup[pctdiff_min_index-1, 1] + np_regroup[pctdiff_min_index-1, 2]))
            pctdiff_table[pctdiff_min_index] = abs(np_regroup[pctdiff_min_index+1, 1]/(np_regroup[pctdiff_min_index+1, 1] + np_regroup[pctdiff_min_index+1, 2]) 
                       - np_regroup[pctdiff_min_index, 1]/(np_regroup[pctdiff_min_index, 1] + np_regroup[pctdiff_min_index, 2]))
            pctdiff_table = np.delete(pctdiff_table, pctdiff_min_index+1, axis=0)
          
    # 保存结果
    result_data = pd.DataFrame()
    result_data['variable'] = [variable] * np_regroup.shape[0]
    
    list_temp = []
    for i in np.arange(np_regroup.shape[0]):
        if i==0:
            if varInterval:
                x = '<=' + str(np_regroup[i, 0].right)
            else:
                x = '<=' + str(np_regroup[i, 0])
        elif i==np_regroup.shape[0] - 1:
            if varInterval:
                x = '>' + str(np_regroup[i-1, 0].right)
            else:
                x = '>' + str(np_regroup[i-1, 0])
        else:
            if varInterval:
                x = '(' + str(np_regroup[i-1, 0].right) + ',' + str(np_regroup[i, 0].right)  + ']'
            else:
                x = '(' + str(np_regroup[i-1, 0]) + ',' + str(np_regroup[i, 0])  + ']'
        list_temp.append(x)

        
    result_data['interval'] = list_temp
    result_data['flag_0'] = np_regroup[:, 2]
    result_data['flag_1'] = np_regroup[:, 1]
    
    return result_data

In [24]:
pct_merge = PctNumericalVarMerge(dat4, 'ScanCount', 'Label', bins=7)

In [25]:
pct_merge # 这个显然不合适

Unnamed: 0,variable,interval,flag_0,flag_1
0,ScanCount,<=18.0,468933.0,173964.0
1,ScanCount,"(18.0,19.0]",1.0,3.0
2,ScanCount,"(19.0,23.0]",11.0,1.0
3,ScanCount,"(23.0,24.0]",2.0,2.0
4,ScanCount,"(24.0,46.0]",6.0,0.0
5,ScanCount,"(46.0,51.0]",0.0,1.0
6,ScanCount,>51.0,1.0,0.0


In [26]:
def IVNumericalVarMerge(df, variable, flag, bins=10, sample=None, varInterval=False):
    """
    param df:            DataFrame|数据集
    param varialbe:      str|数据集中需要分箱的列名（本函数适用对数值类型的列进行分箱）
    param flag:          str|正负样本标识的列名
    param bins:          int|分箱数量（要求分箱后数量小于等于此值）
    param sample:        int|抽样数目，默认不进行抽样
    """
    
    # 判断是否需要抽样操作
    if sample is not None:
        df = df.sample(n=sample)
        
    # 对数据进行预处理
    total_num = df.groupby([variable])[flag].count() # 分箱变量每个值的数目
    total_num = pd.DataFrame({'total_num':total_num})
    positive_class = df.groupby([variable])[flag].sum() # 分箱变量每个值的正样本数
    positive_class = pd.DataFrame({'positive_class':positive_class})
    regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True, how='inner')
    regroup.reset_index(inplace=True) # groupby处理之后正好是按数值型变量大小排序的
    regroup['negative_class'] = regroup['total_num'] - regroup['positive_class'] # 分箱变量每个值的负样本数
    regroup = regroup.drop('total_num', axis=1) # 去除total_num,方便后面计算卡方值(用到不同组的正样本数和负样本数)
    np_regroup = np.array(regroup) # 把DataFrame转换为numpy，提高运行效率
    
    pos_class_totalcnt = regroup.positive_class.sum()
    neg_class_totalcnt = regroup.negative_class.sum()
    
    # 处理没有正样本或负样本的区间，进行区间合并(如果某个组的正样本或负样本数为0，则计算WOE会出现ln函数中分子或者分母为0的情况，无法计算)
    i = 0 
    while(i <= np_regroup.shape[0] - 1):
        if((np_regroup[i,1] == 0 or np_regroup[i, 2]==0)):
            if i == 0:
                np_regroup[i, 1] = np_regroup[i, 1] + np_regroup[i+1, 1] # 正样本合并
                np_regroup[i, 2] = np_regroup[i, 2] + np_regroup[i+1, 2] # 负样本合并
                np_regroup[i, 0] = np_regroup[i+1, 0] # 更新分箱变量范围
                np_regroup = np.delete(np_regroup, i+1, axis=0)
            else:
                np_regroup[i-1, 1] = np_regroup[i-1, 1] + np_regroup[i, 1] # 正样本合并
                np_regroup[i-1, 2] = np_regroup[i-1, 2] + np_regroup[i, 2] # 负样本合并
                np_regroup[i-1, 0] = np_regroup[i, 0] # 更新分箱变量范围
                np_regroup = np.delete(np_regroup, i, axis=0) # 删除整行
            i = i - 1
        i = i + 1
        
        
    # 对相邻两个区间进行iv之差计算(因为数值型变量大小已排序，算相邻即可)
    ivdiff_table = np.array([]) 
    for i in np.arange(np_regroup.shape[0] - 1):
        ivdiff = abs((np_regroup[i+1, 1]/pos_class_totalcnt-np_regroup[i+1, 2]/neg_class_totalcnt)*np.log(((np_regroup[i+1, 1]/pos_class_totalcnt)/(np_regroup[i+1, 2]/neg_class_totalcnt)).astype(float)) -\
                     (np_regroup[i, 1]/pos_class_totalcnt-np_regroup[i, 2]/neg_class_totalcnt)*np.log(((np_regroup[i, 1]/pos_class_totalcnt)/(np_regroup[i, 2]/neg_class_totalcnt)).astype(float)))
        ivdiff_table = np.append(ivdiff_table, ivdiff)
    
    # 把iv之差绝对值最小的两个区间合并, 直到分箱数量小于等于设定的分箱数量bins
    while(1):
        if len(ivdiff_table) <= (bins-1):
            break
        ivdiff_min_index = np.argwhere(ivdiff_table == min(ivdiff_table))[0] # 找出WOE之差绝对值最小的索引位置
        np_regroup[ivdiff_min_index, 1] = np_regroup[ivdiff_min_index, 1] + np_regroup[ivdiff_min_index+1, 1] # 正样本合并
        np_regroup[ivdiff_min_index, 2] = np_regroup[ivdiff_min_index, 2] + np_regroup[ivdiff_min_index+1, 2] # 负样本合并
        np_regroup[ivdiff_min_index, 0] = np_regroup[ivdiff_min_index+1, 0] # 更新分箱变量的范围
        
        np_regroup = np.delete(np_regroup, ivdiff_min_index + 1, axis=0)
        
        # 更新 ivdiff_table
        if(ivdiff_min_index == np_regroup.shape[0] - 1): # 如果iv之差的绝对值最小值是最后两个区间的时候
            ivdiff_table[ivdiff_min_index - 1] = abs((np_regroup[ivdiff_min_index, 1]/pos_class_totalcnt - np_regroup[ivdiff_min_index, 2]/neg_class_totalcnt) * np.log(((np_regroup[ivdiff_min_index, 1]/pos_class_totalcnt)/(np_regroup[ivdiff_min_index, 2]/neg_class_totalcnt)).astype(float)) -\
                                 (np_regroup[ivdiff_min_index-1, 1]/pos_class_totalcnt - np_regroup[ivdiff_min_index-1, 2]/neg_class_totalcnt) * np.log(((np_regroup[ivdiff_min_index-1, 1]/pos_class_totalcnt)/(np_regroup[ivdiff_min_index-1, 2]/neg_class_totalcnt)).astype(float)))
            ivdiff_table = np.delete(ivdiff_table, ivdiff_min_index, axis=0)

        elif(ivdiff_min_index == 0): # 如果woe之差的绝对值最小值是最前面两个区间的时候
            ivdiff_table[ivdiff_min_index] = abs((np_regroup[ivdiff_min_index+1, 1]/pos_class_totalcnt - np_regroup[ivdiff_min_index+1, 2]/neg_class_totalcnt) * np.log(((np_regroup[ivdiff_min_index+1, 1]/pos_class_totalcnt)/(np_regroup[ivdiff_min_index+1, 2]/neg_class_totalcnt)).astype(float)) -\
                                 (np_regroup[ivdiff_min_index, 1]/pos_class_totalcnt - np_regroup[ivdiff_min_index, 2]/neg_class_totalcnt) * np.log(((np_regroup[ivdiff_min_index, 1]/pos_class_totalcnt)/(np_regroup[ivdiff_min_index, 2]/neg_class_totalcnt)).astype(float)))
            ivdiff_table = np.delete(ivdiff_table, ivdiff_min_index+1, axis=0)

        else:
            ivdiff_table[ivdiff_min_index - 1] = abs((np_regroup[ivdiff_min_index, 1]/pos_class_totalcnt - np_regroup[ivdiff_min_index, 2]/neg_class_totalcnt) * np.log(((np_regroup[ivdiff_min_index, 1]/pos_class_totalcnt)/(np_regroup[ivdiff_min_index, 2]/neg_class_totalcnt)).astype(float)) -\
                                 (np_regroup[ivdiff_min_index-1, 1]/pos_class_totalcnt - np_regroup[ivdiff_min_index-1, 2]/neg_class_totalcnt) * np.log(((np_regroup[ivdiff_min_index-1, 1]/pos_class_totalcnt)/(np_regroup[ivdiff_min_index-1, 2]/neg_class_totalcnt)).astype(float)))
            ivdiff_table[ivdiff_min_index] = abs((np_regroup[ivdiff_min_index+1, 1]/pos_class_totalcnt - np_regroup[ivdiff_min_index+1, 2]/neg_class_totalcnt) * np.log(((np_regroup[ivdiff_min_index+1, 1]/pos_class_totalcnt)/(np_regroup[ivdiff_min_index+1, 2]/neg_class_totalcnt)).astype(float)) -\
                                 (np_regroup[ivdiff_min_index, 1]/pos_class_totalcnt - np_regroup[ivdiff_min_index, 2]/neg_class_totalcnt) * np.log(((np_regroup[ivdiff_min_index, 1]/pos_class_totalcnt)/(np_regroup[ivdiff_min_index, 2]/neg_class_totalcnt)).astype(float)))
            ivdiff_table = np.delete(ivdiff_table, ivdiff_min_index+1, axis=0)  

        
    # 保存结果
    result_data = pd.DataFrame()
    result_data['variable'] = [variable] * np_regroup.shape[0]
    
    list_temp = []
    for i in np.arange(np_regroup.shape[0]):
        if i==0:
            if varInterval:
                x = '<=' + str(np_regroup[i, 0].right)
            else:
                x = '<=' + str(np_regroup[i, 0])
        elif i==np_regroup.shape[0] - 1:
            if varInterval:
                x = '>' + str(np_regroup[i-1, 0].right)
            else:
                x = '>' + str(np_regroup[i-1, 0])
        else:
            if varInterval:
                x = '(' + str(np_regroup[i-1, 0].right) + ',' + str(np_regroup[i, 0].right)  + ']'
            else:
                x = '(' + str(np_regroup[i-1, 0]) + ',' + str(np_regroup[i, 0])  + ']'
        list_temp.append(x)
        
    result_data['interval'] = list_temp
    result_data['flag_0'] = np_regroup[:, 2]
    result_data['flag_1'] = np_regroup[:, 1]
    
    return result_data

In [27]:
iv_merge = IVNumericalVarMerge(dat4, 'ScanCount', 'Label', bins=7)

In [28]:
iv_merge

Unnamed: 0,variable,interval,flag_0,flag_1
0,ScanCount,<=-3.0,220.0,20.0
1,ScanCount,"(-3.0,-2.0]",738.0,64.0
2,ScanCount,"(-2.0,-1.0]",13139.0,1027.0
3,ScanCount,"(-1.0,1.0]",407629.0,150083.0
4,ScanCount,"(1.0,2.0]",35729.0,17195.0
5,ScanCount,"(2.0,3.0]",6326.0,3102.0
6,ScanCount,>3.0,5173.0,2480.0


In [29]:
def getMaxIVSplit(dat, pos_col, neg_col):     
    if dat.shape[0] <= 1:
        return None, None
    else:
        iv_list = np.array([]) 
        for i in range(dat.shape[0]-1):
            p1 = np.sum(dat[0:(i+1), pos_col])
            n1 = np.sum(dat[0:(i+1), neg_col])
            p2 = np.sum(dat[i+1:, pos_col])
            n2 = np.sum(dat[i+1:, neg_col])
            if (p1==0 or p2==0 or n1==0 or n2==0):
                iv_list = np.append(iv_list, -np.inf)
            else:
                iv1 = (p1/(p1+p2) - n1/(n1+n2)) * np.log((p1/(p1+p2))/(n1/(n1+n2)))
                iv2 = (p2/(p1+p2) - n2/(n1+n2)) * np.log((p2/(p1+p2))/(n2/(n1+n2)))
                iv_list = np.append(iv_list, iv1+iv2)
        
        iv_max = max(iv_list)
        
        iv_split_index = np.argwhere(iv_list == iv_max)[0]
        return iv_max, iv_split_index

In [30]:
# 利用IV信息量不断拆分

def IVNumericalSplit(df, variable, flag, bins=10, sample=None, varInterval=False):
    """
    param df:            DataFrame|数据集
    param varialbe:      str|数据集中需要分箱的列名（本函数适用对数值类型的列进行分箱）
    param flag:          str|正负样本标识的列名
    param bins:          int|分箱数量（要求分箱后数量小于等于此值）
    param sample:        int|抽样数目，默认不进行抽样
    """
    
    # 判断是否需要抽样操作
    if sample is not None:
        df = df.sample(n=sample)
        
    # 对数据进行预处理
    total_num = df.groupby([variable])[flag].count() # 分箱变量每个值的数目
    total_num = pd.DataFrame({'total_num':total_num})
    positive_class = df.groupby([variable])[flag].sum() # 分箱变量每个值的正样本数
    positive_class = pd.DataFrame({'positive_class':positive_class})
    regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True, how='inner')
    regroup.reset_index(inplace=True) # groupby处理之后正好是按数值型变量大小排序的
    regroup['negative_class'] = regroup['total_num'] - regroup['positive_class'] # 分箱变量每个值的负样本数
    regroup = regroup.drop(['total_num'], axis=1) # 去除total_num,方便后面计算卡方值(用到不同组的正样本数和负样本数)
    np_regroup = np.array(regroup) # 把DataFrame转换为numpy，提高运行效率
    
    split_table = np.array([])
    for t in range(bins-1):
        iv_best = None
        iv_best_index = None
        if t == 0:        
            iv_best, iv_best_index = getMaxIVSplit(np_regroup, pos_col=1, neg_col=2)
            split_table = np.append(split_table, iv_best_index)
        else:
            for s in range(len(split_table)):
                if s == 0:
                    start = 0
                    end = int(split_table[s])+1
                    iv_best, iv_best_index = getMaxIVSplit(np_regroup[start:end,:], pos_col=1, neg_col=2)
                elif s>=1 and s <= len(split_table) - 1:
                    start = int(split_table[s-1])+1
                    end = int(split_table[s])+1
                    iv_max, iv_split_index = getMaxIVSplit(np_regroup[start:end,:], pos_col=1, neg_col=2)
                    if iv_max is not None and iv_max > iv_best:
                        iv_best = iv_max
                        iv_best_index = iv_split_index + split_table[s-1] + 1
                # 如果是最后一个split        
                if s == len(split_table)-1:  
                    # 最后一段
                    start = int(split_table[s])+1
                    iv_max, iv_split_index = getMaxIVSplit(np_regroup[start:,:], pos_col=1, neg_col=2)
                    if iv_max is not None and iv_max > iv_best:
                        iv_best = iv_max
                        iv_best_index = iv_split_index + split_table[s] + 1
            split_table = np.append(split_table, iv_best_index)
            split_table = np.sort(split_table) # 每次加入后要重新排序
    
    # 保存结果
    result_data = pd.DataFrame()
    result_data['variable'] = [variable] * bins
    
    list_temp = []
    list_pos = []
    list_neg = []
    for s in range(len(split_table)):
        if s == 0:
            start = 0 
            end = int(split_table[s])+1
            if varInterval:
                x = '<=' + str(np_regroup[end-1, 0].right)
            else:
                x = '<=' + str(np_regroup[end-1, 0])
        elif s >= 1 and s <= len(split_table) - 1:
            start = int(split_table[s-1])+1
            end = int(split_table[s])+1
            if varInterval:
                x = '(' + str(np_regroup[start-1, 0].right) + ',' + str(np_regroup[end-1, 0].right)  + ']'
            else:
                x = '(' + str(np_regroup[start-1, 0]) + ',' + str(np_regroup[end-1, 0])  + ']'
        y = np.sum(np_regroup[start:end, 1])
        z = np.sum(np_regroup[start:end, 2])
        list_temp.append(x)
        list_pos.append(y)
        list_neg.append(z)
        if s == len(split_table) - 1:
            start = int(split_table[s])+1
            if varInterval:
                x = '>' + str(np_regroup[start-1, 0].right)
            else:
                x = '>' + str(np_regroup[start-1, 0])
            y = np.sum(np_regroup[start:, 1])
            z = np.sum(np_regroup[start:, 2])
            list_temp.append(x)
            list_pos.append(y)
            list_neg.append(z)
                    
    result_data['interval'] = list_temp
    result_data['flag_0'] = list_pos
    result_data['flag_1'] = list_neg
    
    return result_data

In [31]:
iv_split = IVNumericalSplit(dat4, 'ScanCount', 'Label', bins=7)

In [32]:
iv_split

Unnamed: 0,variable,interval,flag_0,flag_1
0,ScanCount,<=-4.0,6.0,97.0
1,ScanCount,"(-4.0,-3.0]",14.0,123.0
2,ScanCount,"(-3.0,-2.0]",64.0,738.0
3,ScanCount,"(-2.0,-1.0]",1027.0,13139.0
4,ScanCount,"(-1.0,1.0]",150083.0,407629.0
5,ScanCount,"(1.0,5.0]",22174.0,46088.0
6,ScanCount,>5.0,603.0,1140.0


In [33]:
# 上面ScanCount不是一个很好的例子，因为1的占比过大，分箱后箱子数量很少
# 这里换一个变量FinelinNumber进行分箱-合并-OneHot的说明（不过也不是一个好例子， 因为FinelineNumber不是一个数值型，强硬转换为float了）
dat5 = dat3[['FinelineNumber', 'Label']].copy()
dat5.loc[:, 'FinelineNumber'] = dat3.FinelineNumber.astype(float)

In [34]:
# 等宽分箱：把变量的取值范围分为k个等宽的区间
cut_by_width2, bin_bywidth2 = pd.cut(dat5['FinelineNumber'], bins=10, retbins=True, right=False) 
# 默认区间为左开有闭，这里设置right=False, 则分割区间为左币又开
bin_bywidth2

array([    0.   ,   999.8  ,  1999.6  ,  2999.4  ,  3999.2  ,  4999.   ,
        5998.8  ,  6998.6  ,  7998.4  ,  8998.2  , 10007.998])

In [35]:
# 等深分箱：把变量的取值个数等分为k个部分
cut_by_depth2, bin_bydepth2 = pd.qcut(dat5['FinelineNumber'], q=10, retbins=True)
bin_bydepth2

array([   0.,  276., 1025., 1703., 2602., 3352., 4005., 4900., 6268.,
       8101., 9998.])

In [36]:
dat5.loc[:, 'sc_bin_width'] = cut_by_width2
dat5.loc[:, 'sc_bin_depth'] = cut_by_depth2
dat5.head()

Unnamed: 0,FinelineNumber,Label,sc_bin_width,sc_bin_depth
0,1000.0,0.0,"[999.8, 1999.6)","(276.0, 1025.0]"
1,8931.0,0.0,"[7998.4, 8998.2)","(8101.0, 9998.0]"
2,4504.0,0.0,"[3999.2, 4999.0)","(4005.0, 4900.0]"
3,3565.0,0.0,"[2999.4, 3999.2)","(3352.0, 4005.0]"
4,1017.0,0.0,"[999.8, 1999.6)","(276.0, 1025.0]"


In [37]:
dat5.sc_bin_width.value_counts() # 等宽

[0.0, 999.8)           119685
[2999.4, 3999.2)       104785
[999.8, 1999.6)         92718
[3999.2, 4999.0)        68963
[1999.6, 2999.4)        66969
[4999.0, 5998.8)        44784
[8998.2, 10007.998)     41829
[6998.6, 7998.4)        37957
[7998.4, 8998.2)        32649
[5998.8, 6998.6)        32586
Name: sc_bin_width, dtype: int64

In [38]:
dat5.sc_bin_depth.value_counts() # 等深

(6268.0, 8101.0]    65668
(3352.0, 4005.0]    64735
(2602.0, 3352.0]    64535
(-0.001, 276.0]     64375
(1025.0, 1703.0]    64337
(1703.0, 2602.0]    64324
(276.0, 1025.0]     64253
(4900.0, 6268.0]    64093
(4005.0, 4900.0]    63699
(8101.0, 9998.0]    62906
Name: sc_bin_depth, dtype: int64

In [39]:
# 在分箱的基础上再进行合并，已减少箱子的数量
# 例如在等深分箱的基础上合并，这时候sc_bin_depth实际上已经是类别变量，但是是有序的，要进行有序合并
dat5.groupby('sc_bin_depth').Label.count() # 等深

sc_bin_depth
(-0.001, 276.0]     64375
(276.0, 1025.0]     64253
(1025.0, 1703.0]    64337
(1703.0, 2602.0]    64324
(2602.0, 3352.0]    64535
(3352.0, 4005.0]    64735
(4005.0, 4900.0]    63699
(4900.0, 6268.0]    64093
(6268.0, 8101.0]    65668
(8101.0, 9998.0]    62906
Name: Label, dtype: int64

In [40]:
ChiNumericalVarMerge(dat5, 'sc_bin_depth', 'Label', bins=5)

Unnamed: 0,variable,interval,flag_0,flag_1
0,sc_bin_depth,"<=(276.0, 1025.0]",105440,23188
1,sc_bin_depth,"((276.0, 1025.0],(1703.0, 2602.0]]",90956,37705
2,sc_bin_depth,"((1703.0, 2602.0],(4005.0, 4900.0]]",128681,64288
3,sc_bin_depth,"((4005.0, 4900.0],(4900.0, 6268.0]]",49901,14192
4,sc_bin_depth,">(4900.0, 6268.0]",93976,34598


In [41]:
ChiNumericalVarMerge(dat5, 'sc_bin_depth', 'Label', bins=5, varInterval=True)

Unnamed: 0,variable,interval,flag_0,flag_1
0,sc_bin_depth,<=1025.0,105440,23188
1,sc_bin_depth,"(1025.0,2602.0]",90956,37705
2,sc_bin_depth,"(2602.0,4900.0]",128681,64288
3,sc_bin_depth,"(4900.0,6268.0]",49901,14192
4,sc_bin_depth,>6268.0,93976,34598


In [42]:
PctNumericalVarMerge(dat5, 'sc_bin_depth', 'Label', bins=5, varInterval=True)

Unnamed: 0,variable,interval,flag_0,flag_1
0,sc_bin_depth,<=1025.0,105440,23188
1,sc_bin_depth,"(1025.0,4900.0]",219637,101993
2,sc_bin_depth,"(4900.0,6268.0]",49901,14192
3,sc_bin_depth,"(6268.0,8101.0]",46584,19084
4,sc_bin_depth,>8101.0,47392,15514


In [43]:
IVNumericalVarMerge(dat5, 'sc_bin_depth', 'Label', bins=5, varInterval=True)

Unnamed: 0,variable,interval,flag_0,flag_1
0,sc_bin_depth,<=276.0,52046,12329
1,sc_bin_depth,"(276.0,1025.0]",53394,10859
2,sc_bin_depth,"(1025.0,2602.0]",90956,37705
3,sc_bin_depth,"(2602.0,4005.0]",85005,44265
4,sc_bin_depth,>4005.0,187553,68813


In [44]:
IVNumericalSplit(dat5, 'sc_bin_depth', 'Label', bins=7, varInterval=True)

Unnamed: 0,variable,interval,flag_0,flag_1
0,sc_bin_depth,<=276.0,12329.0,52046.0
1,sc_bin_depth,"(276.0,1025.0]",10859.0,53394.0
2,sc_bin_depth,"(1025.0,2602.0]",37705.0,90956.0
3,sc_bin_depth,"(2602.0,4900.0]",64288.0,128681.0
4,sc_bin_depth,"(4900.0,6268.0]",14192.0,49901.0
5,sc_bin_depth,"(6268.0,8101.0]",19084.0,46584.0
6,sc_bin_depth,>8101.0,15514.0,47392.0
