In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
import seaborn as sns
import sklearn.tree as tree
def split_cat_num(data, cat=15):
    '''对特征进行分类，得到数值特征和类别特征，对于数值特征中取值较少的特征，将其
    归为类别特征中。'''
    categorical = data.select_dtypes(include='object')
    numerical = data.select_dtypes(exclude='object')
    nunique = numerical.nunique().sort_values(ascending=True)
    n_index = nunique[nunique<cat].index
    num = numerical.columns.difference(n_index)
    category = categorical.columns
    return category, num, n_index

In [3]:
import os
os.chdir(r'C:\test\data\santander-customer-transaction-prediction')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
y = train.pop('target')
_ = train.pop('ID_code')
_ = test.pop('ID_code')

#### 无监督分箱
数值特征进行分箱
- 等频分箱
- 等距分箱

In [4]:
class UnsupervisedSplit():
    '''对连续值进行无监督的分箱操作，分为等宽和等频，types控制分箱的类型，取值为
    len:等宽，默认为等宽， freq:等频；bins控制分箱的数目。默认为10'''
    def __init__(self, bins=10, types='freq'):
        self.bins = bins
        self.types = types
        self.mapping = {}
        self.dics = {}
    def equalfreq(self,features): #等频
        result = features.copy()
        for each in features.columns:
            # If bin edges are not unique, raise ValueError or drop non-uniques.
            res = pd.qcut(features[each], q=self.bins, duplicates='drop')
            # 得到切分点
            splitpoint = res.cat.categories.left.tolist()
            splitpoint.append(np.inf)
            splitpoint[0] = -np.inf
            self.mapping[each] = splitpoint
            # 使用自然数编码。
            dic = {key:value for value,key in enumerate(res.cat.categories)}
            result[each] = res.cat.rename_categories(dic)
            self.dics[each]=dic
        return result
    
    def equallen(self, features): # 等宽
        result = features.copy()
        for each in features.columns:
            res = pd.cut(features[each], bins=self.bins, 
                         duplicates='drop')
            splitpoint = res.cat.categories.left.tolist()
            # 防止测试集出现比训练集更小或更大的值
            splitpoint.append(np.inf)
            splitpoint[0] = -np.inf
            self.mapping[each] = splitpoint
            dic = {key:value for value,key in enumerate(res.cat.categories)}
            result[each] = res.cat.rename_categories(dic)
            self.dics[each]=dic
        return result
        
    def __call__(self, features,y=None):
        if y is None: # 对测试集进行分箱
            result = features.copy()
            for each in features.columns:
                res = pd.cut(features[each],
                      bins=self.mapping[each], duplicates='drop')
                dic = {key:value for value,key in enumerate(res.cat.categories)}
                result[each] = res.cat.rename_categories(dic)
            return result
        else:
            if self.types == 'len':
                return self.equallen(features)
            else:
                return self.equalfreq(features)

In [5]:
unsplitor = UnsupervisedSplit()
X = train.iloc[:5000,:10]
X_test = test.iloc[:5000,:10]
y1 = y[:5000]
X1 = unsplitor(X,y1)
X1.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9
0,2,1,6,2,5,3,3,7,0,0
1,6,2,8,2,7,9,6,4,7,6
2,2,4,6,6,3,3,9,3,0,1
3,5,4,2,5,8,6,6,3,0,6
4,4,5,7,4,7,8,7,7,9,5


In [7]:
X2 = unsplitor(X_test)
X2.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9
0,5,9,8,8,5,6,6,6,6,7
1,2,7,6,2,1,5,7,7,0,1
2,0,0,4,5,3,9,3,8,6,6
3,2,5,6,4,0,8,3,8,7,4
4,6,6,9,6,1,3,9,0,7,3


#### 树分箱:gini和信息率
数值特征进行分箱

In [37]:
class TreeSplit():
    def __init__(self, bins=10, types='gini'):
        self.bins=bins
        self.types = types 
        self.mapping = {}
        self.dics={}
    def gini(self, features, y):
        result = features.copy()
        for each in features.columns:
            # 建立树模型并进行训练
            estimator = tree.DecisionTreeClassifier(criterion=self.types, 
                                                max_leaf_nodes=self.bins)
            X = features[each].values[:,None]
            estimator.fit(X, y)
            # 得到每个切分节点的阈值
            threshold = estimator.tree_.threshold
            thre = threshold[threshold!=-2]
            thre.sort()
            # 得到树模型的切分点
            bins_ = np.zeros(self.bins+1)
            bins_[0] = -np.inf
            bins_[1:-1] = thre
            bins_[-1] = np.inf
            # 对连续值进行切分
            self.mapping[each] = bins_
            res = pd.cut(features[each], bins=bins_, 
                                  duplicates='drop')
            # 使用自然数编码
            dic = {key:value for value,key in enumerate(res.cat.categories)}
            result[each] = res.cat.rename_categories(dic)
            self.dics[each]=dic
        return result
    def __call__(self,features, y=None):
        if y is None:
            result = features.copy()
            for each in features.columns:
                res = pd.cut(features[each],
                        bins=self.mapping[each],duplicates='drop')
                result[each] = res.cat.rename_categories(self.dics[each])
            return result
        else: 
            return self.gini(features, y)

In [38]:
tsplit = TreeSplit()
tX = tsplit(X,y1)
tX.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9
0,7,0,0,0,6,0,2,0,0,4
1,7,1,4,0,6,7,2,0,0,8
2,7,5,0,5,0,0,4,0,0,4
3,7,7,0,3,6,2,2,0,0,8
4,7,7,4,3,6,5,2,0,9,8


In [40]:
tX_test = tsplit(X_test)
tX_test.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9
0,7,9,4,9,6,2,2,0,0,8
1,7,7,0,0,0,2,2,0,0,4
2,0,0,0,3,0,7,2,0,0,8
3,7,7,0,3,0,5,2,2,0,6
4,7,7,4,3,0,0,4,0,0,6


In [41]:
tX2 = tsplit(X)
(tX==tX2).all()

var_0    True
var_1    True
var_2    True
var_3    True
var_4    True
var_5    True
var_6    True
var_7    True
var_8    True
var_9    True
dtype: bool

#### WOE分箱
用于对数值特征进行分箱
- `split`:对数值特征采用等频切分，并保证每个分箱中都包含好坏样本
- `woe`：计算特征的每个分箱的woe值
- `merge`:在不满足woe值与分箱单调的条件下，对相邻分箱中woe值最接近的进行合并，直到满足终止条件。
- `WOEencoding`：对分箱后的分箱进行woe编码
- `compute_iv`：计算特征的IV值。

In [42]:
import scipy.stats as stats
import pandas as pd
class WOESplit:
    def __init__(self,bins=10):
        self.bins = bins
        self.mapping = { }
        self.woemapping = { }
    def split(self,X,Y,splits=30):# 保证分箱之后每个箱内都包含正负样本。
        flag=True
        while flag:
            d1 = pd.DataFrame({"X": X, "Y": Y, 
                               "Bucket": pd.qcut(X, splits, 
                                        duplicates='drop')})
            d2 = d1.groupby('Bucket')['Y'].mean()
            d2 = d2.fillna(0)
            flag = d2.isin([0,1]).any()
            splits = d1['Bucket'].nunique()-1
        return d1
    
    def woe(self,d1): #计算特征的每一个分箱的woe值
        good = d1['Y'].sum()
        bad = d1['Y'].count()-good
        d2 = d1.groupby('Bucket')['Y'].mean()
        woe = np.log((d2/(1-d2))/(good/bad))
        return woe
    
    def merge(self,d1):  # 对单个特征进行woe分箱。
        unique = d1['Bucket'].cat.categories
        bins = len(unique)
        mapping = {key:value 
                   for key, value in zip(range(0,bins),unique)}
        re_map = {value:key 
                  for key, value in zip(range(0,bins),unique)}
        # 对分箱后的数据进行自然数编码
        d1['Bucket'] = d1['Bucket'].map(lambda x: re_map[x])
        mins=0
        while mins==0:  #将分箱后woe值相同的箱进行合并。
            d2 = d1.groupby('Bucket')['Y'].mean()
            diff = np.abs(d2-d2.shift(-1)).dropna()
            mins = diff.min()
            if mins == 0:
                idx = diff.idxmin()
                li = list(d1['Bucket'].cat.categories)
                item = li[li.index(idx)+1]
                d1.loc[d1['Bucket'].isin([idx]+[item]),'Bucket']=idx
                d1['Bucket'] = d1['Bucket'].cat.remove_unused_categories()
        r = 0
        while np.abs(r).round(2) < 1:
            woe = self.woe(d1)
            woe = woe.reset_index()
            r, p = stats.spearmanr(woe['Bucket'].values, 
                                   woe['Y'].values)
            if np.abs(r).round(2) < 1: #如果不是单调的
                diff = np.abs(woe['Y']-woe['Y'].shift(-1)).dropna()
                idx = woe.loc[diff.idxmin(),'Bucket'] #合并woe值最接近的分箱
                li = list(woe['Bucket'].cat.categories)
                item = li[li.index(idx)+1]
                d1.loc[d1['Bucket'].isin([idx]+[item]),'Bucket']=idx
                d1['Bucket'] = d1['Bucket'].cat.remove_unused_categories()
        # 获取切分点。
        index = d1['Bucket'].unique().sort_values()
        splitpoint=[]
        for i, each in enumerate(index):
            splitpoint.append(mapping[each].left)
        splitpoint.append(np.inf)
        splitpoint[0] = -np.inf
        return d1, splitpoint
    
    def WoeEncoding(self,features,y): #对特征进行woe分箱并woe编码
        result = features.copy()
        for each in features.columns:
            d1 = self.split(features[each],y)
            d2, splitpoint = self.merge(d1)
            self.mapping[each] = splitpoint
            woe = self.woe(d2).to_dict()
            self.woemapping[each] = woe
            result[each] = d2['Bucket'].map(lambda x:woe[x])
        return result
    
    def compute_iv(self,features,y): #计算特征的IV值
        category, num, n_index = split_cat_num(features, self.bins)
        result = features.copy( )
        result.loc[:,num] = self.WoeEncoding(result.loc[:,num],y)
        ivs=[]
        badT, goodT = y.sum(), features.shape[0] - y.sum()
        for each in result.columns:
            temp = pd.concat([result[each],y],axis=1)
            temp.columns=['A','B']
            group = temp.groupby('A',as_index=False)['B']
            df = group.count()
            df.rename(columns={'B':'total','A':'woe'},inplace=True) # 不同类别的总数
            df['bad'] = group.sum()['B'] # 坏样本数目
            df['good'] = df['total'] - df['bad'] # 好样本数目
            df['mul'] = df['bad']/badT - df['good']/goodT
            df['woe'] = df['woe'].astype(float)
            iv = (df['mul'] * df['woe']).sum()
            ivs.append(iv)
        return ivs
    
    def __call__(self,features,y=None):
        if y is None:  #对测试集进行woe分箱并赋值
            result = features.copy()
            for each in features.columns:
                result[each] = pd.cut(features[each], bins=self.mapping[each],
                                labels=self.woemapping[each].values())
            return result
        else:
            return self.WoeEncoding(features,y)

In [44]:
wsplitor = WOESplit()
wX = wsplitor(X,y1)
wX.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9
0,0.023661,-0.277452,0.198409,0.051256,-0.066428,-0.039046,-0.060345,0.042902,0.016984,0.051085
1,0.023661,-0.277452,0.198409,0.051256,0.142504,0.602575,-0.060345,0.042902,0.016984,-0.040544
2,0.023661,0.118341,0.198409,0.051256,-0.066428,-0.039046,0.2908,-0.050932,0.016984,0.051085
3,0.023661,0.118341,-0.176458,0.051256,0.142504,-0.039046,-0.060345,-0.050932,0.016984,-0.040544
4,0.023661,0.118341,0.198409,0.051256,0.142504,-0.039046,-0.060345,0.042902,-0.638917,-0.040544


In [45]:
wX_test = wsplitor(X_test)
wX_test.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9
0,0.023661,0.118341,0.198409,0.051256,-0.066428,-0.039046,-0.060345,0.042902,0.016984,-0.040544
1,0.023661,0.118341,0.198409,0.051256,-0.066428,-0.039046,-0.060345,0.042902,0.016984,0.051085
2,-0.099453,-0.277452,-0.176458,0.051256,-0.066428,0.602575,-0.060345,0.042902,0.016984,-0.040544
3,0.023661,0.118341,0.198409,0.051256,-0.066428,-0.039046,-0.060345,0.042902,0.016984,-0.040544
4,0.023661,0.118341,0.198409,0.051256,-0.066428,-0.039046,0.2908,-0.050932,0.016984,0.051085


In [47]:
wsplitor.compute_iv(X,y1)

[0.0023526778433860104,
 0.032744351591635794,
 0.03490922714832599,
 0.011754758872212116,
 0.00945872283462432,
 0.042711958216587194,
 0.03163812895932184,
 0.0021846939123763612,
 0.010841706340286519,
 0.002070800634256611]

In [48]:
wsplitor.mapping

{'var_0': [-inf, 8.027, inf],
 'var_1': [-inf, -3.619, inf],
 'var_2': [-inf, 11.018, inf],
 'var_3': [-inf, 4.896, inf],
 'var_4': [-inf, 12.031, inf],
 'var_5': [-inf, -18.593, 6.193, inf],
 'var_6': [-inf, 6.546, 7.067, inf],
 'var_7': [-inf, 16.2, inf],
 'var_8': [-inf, 5.902, inf],
 'var_9': [-inf, 7.419, inf]}

In [49]:
wsplitor.woemapping

{'var_0': {0: -0.09945325078340726, 6: 0.02366075629294781},
 'var_1': {0: -0.27745165162422647, 10: 0.11834082923964263},
 'var_2': {0: -0.1764584063419745, 17: 0.1984094626410883},
 'var_3': {0: -0.22955890704387402, 6: 0.05125602172648278},
 'var_4': {0: -0.06642756103637527, 21: 0.14250383862538568},
 'var_5': {0: -0.638917178946336,
  1: -0.03904593855192594,
  28: 0.6025746801555167},
 'var_6': {0: -0.060345210318993606,
  27: 0.29079962785628716,
  29: 0.7076697325784417},
 'var_7': {0: -0.0509321626933189, 14: 0.04290200130840181},
 'var_8': {0: 0.016984119949264107, 29: -0.638917178946336},
 'var_9': {0: 0.051084715516213984, 13: -0.040543595345029854}}

#### 卡方分箱
可以用来处理有序的类别特征和数值特征，无法处理无序的类别特征
- bins：分箱的数目
- confindenceVal:分箱的终止条件，卡方分布的90%的点
- is_split:保存特征是否可以进行切分，是否是满足条件的数值特征。

`split`：对数值特征采取等频切分，并记录是否可以切分，并对切分后的数据自然数编码

`chi`：计算相邻分箱的卡方值

`chimerge`：卡方分箱函数，在不满足终止条件的情况下，合并相邻分箱卡方值最小的。直到满足终止条件。

`binning`:利用卡方分箱之后的特征确定切分点，并将切分点保存，使用切分点对原数值特征进行切分，并采用自然数编码。

`code`:对非数值的有序类别特征进行编码。

In [50]:
class ChiMerge:
    def __init__(self, confidenceVal=3.841, bins=10):
        self.confidenceVal = confidenceVal
        self.bins = bins
        self.maps={ }
        self.splitpoints = { }
        self.is_split={ }
    def split(self, df, variable, flag): #对数值特征进行分箱
        df = df.copy()
        try:
            df[variable] = pd.qcut(df[variable], 30)
            self.mapping = {key:value for key, value in zip(range(0,30),
                                        df[variable].cat.categories)}
            self.is_split[variable]=True   
        except:
            items = df[variable].unique()
            items.sort()
            self.mapping = {key:value 
                       for key, value in zip(range(len(items)),items)}
            self.is_split[variable]=False
        remap = {key: value for value,key in self.mapping.items()}
        df[variable] = df[variable].map(lambda x:remap[x])
        return df.groupby(variable,as_index=False)
        
    def chi2(self, arr): #计算相邻分箱的卡方值
        a = arr.sum(axis=0)
        b = arr.sum(axis=1)
        demonitor = a.prod()*b.prod()
        nomitor = (arr[0,0]*arr[1,1]-arr[0,1]*arr[1,0])**2*arr.sum()
        return nomitor/demonitor
    
    def chimerge(self, df, variable, flag):
        group = self.split(df,variable,flag)
        # 统计每个箱内的信息
        regroup = group[flag].count()
        regroup.rename(columns={flag:'total_num'}, inplace=True)
        regroup['positive_class'] = group[flag].sum()[flag]
        regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']
        regroup = regroup.drop('total_num', axis=1)
        data = regroup.values
        i=0
        while i<= data.shape[0]-2:# 合并相邻类别为0的区间。
            temp = data[i:i+2,1:].sum(axis=0)
            if (temp[0]==0) or (temp[1]==0):
                data[i,1] = data[i,1] + data[i+1,1]
                data[i,2] = data[i,2] + data[i+1, 2]
                data[i,0] = data[i+1,0]
                data = np.delete(data, i+1, axis=0)
                i-=1
            i+=1
            
        table = np.array([])  # 创建一个数组保存相邻两个区间的卡方值
        for i in np.arange(data.shape[0] - 1):
            table = np.append(table, self.chi2(data[i:i+2,1:]))
        # 合并相邻卡方值最小的分箱，直到满足终止条件。
        while True:
            if len(table)<=(self.bins-1) or min(table) >= self.confidenceVal:
                break
            index = np.argmin(table)
            data[index, 1] = data[index, 1]+data[index + 1, 1]
            data[index, 2] = data[index, 2] + data[index + 1, 2]
            data[index, 0] = data[index + 1, 0]
            data = np.delete(data, index + 1, 0)
            if (index == data.shape[0] - 1): 
                table[index - 1] = self.chi2(data[index-1:,1:])
                table = np.delete(table, index, axis=0)
            elif index == 0:
                table[index] = self.chi2(data[index:index+2, 1:])
                table = np.delete(table, index+1, axis=0)
            else:
                table[index-1] = self.chi2(data[index-1:index+1, 1:])
                table[index] = self.chi2(data[index:index+2, 1:])
                table = np.delete(table, index + 1, axis=0)
        result = pd.DataFrame(data,columns=['label','good','bad']) 
        return result
    
    def binning(self,df, variable, flag):
        result = self.chimerge(df,variable,flag)
        if self.is_split[variable]: # 如果是数值特征
            indexs = result['label'].unique()
            points = [-np.inf]
            for each in indexs:
                points.append(self.mapping[each].left)
            points.append(np.inf)
            self.splitpoints[variable] = points
            res = pd.cut(df[variable],bins=points,
                                  labels=range(len(points)-1))
            return res
        else: # 如果为类别特征。
            indexs = list(result['label'].unique())
            points = pd.cut(range(len(self.mapping)),
                            bins=[-1]+indexs,labels=indexs)
            from collections import defaultdict
            maps = defaultdict(list)
            for i,point in zip(range(len(self.mapping)),points):
                maps[self.mapping[point]].append(self.mapping[i])
            self.maps[variable] = maps
            res = self.coding(df,variable)
            return res
        
    def coding(self, df, variable):
        def onemap(x):
            for key,value in self.maps[variable].items():
                if x in value:
                    return key
        return df[variable].map(onemap)
    
    def __call__(self,features,y=None):
        result = features.copy()
        if y is None:
            for each in features.columns:
                if self.is_split[each]:
                    result[each]=pd.cut(features[each],
                                bins=self.splitpoints[each],
                        labels=range(len(self.splitpoints[each])-1))
                else:
                    result[each] = self.coding(features, each)
        else:
            df = pd.concat([features,y],axis=1)
            for each in features.columns:
                result[each] = self.binning(df, each, y.name)
        return result

In [51]:
csplit = ChiMerge()

In [53]:
tX = csplit(X,y1)
tX.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9
0,5,2,6,3,6,3,3,7,0,1
1,6,5,8,4,8,9,5,6,7,4
2,4,6,6,8,2,3,9,5,0,1
3,6,6,1,5,8,5,5,5,0,4
4,5,6,8,5,8,7,5,7,10,4


In [56]:
tX_test = csplit(X_test)
tX_test.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9
0,6,10,8,9,6,5,5,7,7,5
1,4,8,6,4,2,5,6,7,0,1
2,0,0,3,5,2,10,3,8,6,4
3,4,6,6,5,1,8,3,8,7,4
4,6,6,9,8,1,3,9,1,7,3


In [57]:
tX1 = csplit(X)
(tX1==tX).all()

var_0    True
var_1    True
var_2    True
var_3    True
var_4    True
var_5    True
var_6    True
var_7    True
var_8    True
var_9    True
dtype: bool