In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

In [2]:
# Data source for this example: https://www.kaggle.com/c/walmart-recruiting-trip-type-classification/data
# The identifier-like columns are read with explicit category/object dtypes instead of inferred ones.
dat = pd.read_csv('./data/train.csv', dtype={'TripType':'category'
                                             ,'VisitNumber':object
                                             , 'Upc':object
                                             , 'FinelineNumber':object})

dat2 = dat.dropna(axis=0, how='any') # drop every row that contains at least one missing value

###  1. 种类变量之间有顺序关系:map | LabelEncoder | OrdinalEncoder

In [3]:
dat3 = dat2.copy() # work on a copy so that adding columns does not trigger pandas' SettingWithCopyWarning

weekday_mapper = {
    "Monday":1,
    "Tuesday":2,
    "Wednesday":3,
    "Thursday":4,
    "Friday":5,
    "Saturday":6,
    "Sunday":7
}

wm_map = dat2.Weekday.map(weekday_mapper) # map lets you choose the ordering explicitly
dat3.loc[:,'Weekday_mapper'] = wm_map

# LabelEncoder / OrdinalEncoder can also be used, but their codes follow the sorted (alphabetical)
# order of the class values, not the weekday order: "Friday" is encoded as 0 in the output below.
wm_le = LabelEncoder().fit_transform(dat2.Weekday)# the input may be a 1-D array
dat3.loc[:, 'Weekday_le'] = wm_le

wm_oe = OrdinalEncoder().fit_transform(dat2[['Weekday']]) # the input must be 2-D
dat3.loc[:, 'Weekday_oe'] = wm_oe

In [4]:
dat3[[i for i in dat3.columns if 'Weekday' in i]].head()

Unnamed: 0,Weekday,Weekday_mapper,Weekday_le,Weekday_oe
0,Friday,5,0,0.0
1,Friday,5,0,0.0
2,Friday,5,0,0.0
3,Friday,5,0,0.0
4,Friday,5,0,0.0


### 2. 种类变量之间无顺序关系且种类数目较少： get_dummies 或者 OneHotEncoder

In [5]:
# 1. Using pd.get_dummies
depart_by_dummies = pd.get_dummies(dat2.DepartmentDescription) 
depart_by_dummies2 = pd.get_dummies(dat2.DepartmentDescription
                        , drop_first=True)  # drop the first level to avoid collinearity

In [6]:
depart_by_dummies.head()

Unnamed: 0,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
depart_by_dummies2.head()

Unnamed: 0,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,CAMERAS AND SUPPLIES,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# 2. Using OneHotEncoder
depart_by_ohe = OneHotEncoder(handle_unknown='ignore').fit_transform(dat2[["DepartmentDescription"]]).toarray() # the input must be 2-D
pd.DataFrame(depart_by_ohe).head()

# Alternative (kept for reference): the same encoding through a ColumnTransformer.
# from sklearn.compose import ColumnTransformer
# categorical_features = ['DepartmentDescription']
# cat = ColumnTransformer(transformers=[
#     ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])
# tmp2 = cat.fit_transform(dat2).toarray()
# pd.DataFrame(tmp2).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,58,59,60,61,62,63,64,65,66,67
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Attach the dummy columns to the working data set and remove the original categorical column.
# NOTE(review): this cell is not idempotent -- dat3 is overwritten, so running it a second time
# appends the dummy columns again and the drop below fails because the column is already gone.
dat3 = pd.concat([dat3, depart_by_dummies], axis=1)
# dat3 = pd.concat([dat3, pd.DataFrame(depart_by_ohe)], axis=1)
dat3.drop(columns=['DepartmentDescription'], inplace=True)
dat3.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,FinelineNumber,Weekday_mapper,Weekday_le,Weekday_oe,1-HR PHOTO,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
0,999,5,Friday,68113152929,-1,1000,5,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1,30,7,Friday,60538815980,1,8931,5,0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
2,30,7,Friday,7410811099,1,4504,5,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,26,8,Friday,2238403510,2,3565,5,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,26,8,Friday,2006613744,2,1017,5,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


### 3. 种类变量之间无顺序关系且种类数目较多： 聚类后OneHot | 用WOE代替类别 | Embedding | Hash

#### 3.1 聚类后OneHot

In [10]:
# Turn the task into a binary classification problem: TripType '40' versus every other type.
# Only the column that is needed is copied, and the label is built in one vectorised step
# (1.0 for TripType '40', 0.0 otherwise).
dat4 = dat2[['DepartmentDescription']].copy()
dat4['Label'] = (dat2['TripType'] == '40').astype(float)

In [11]:
dat4.head()

Unnamed: 0,DepartmentDescription,Label
0,FINANCIAL SERVICES,0.0
1,SHOES,0.0
2,PERSONAL CARE,0.0
3,PAINT AND ACCESSORIES,0.0
4,PAINT AND ACCESSORIES,0.0


In [12]:
# Per-department label statistics: positive count, negative count and positive rate.
label_by_dept = dat4.groupby(['DepartmentDescription'])['Label']
m1 = label_by_dept.count().to_frame('total')
m2 = label_by_dept.sum().to_frame('positive_class')
mm = m1.join(m2, how='inner')
mm = mm.assign(
    negative_class=lambda d: d['total'] - d['positive_class'],
    positive_pct=lambda d: d['positive_class'] / d['total'],
)
# 'total' was only needed to derive the two columns above
mm = mm.drop(columns='total').reset_index()

In [13]:
mm.head()

Unnamed: 0,DepartmentDescription,positive_class,negative_class,positive_pct
0,1-HR PHOTO,1.0,393.0,0.002538
1,ACCESSORIES,115.0,1203.0,0.087253
2,AUTOMOTIVE,410.0,5004.0,0.07573
3,BAKERY,1693.0,5475.0,0.236189
4,BATH AND SHOWER,544.0,4038.0,0.118725


In [14]:
mm.shape

(68, 4)

#### 3.1.1 聚类方法：
#### 方法a) 绘制正样本占比的散点图，肉眼观察分几类合适；或者聚合正样本占比之差接近的类
#### 方法b) 使用聚类算法，例如KMeans
#### 方法c) 先按正样本率从小到大进行排序，再利用Fisher精准检验或者卡方检验合并相近的类别
#### 方法d) 先按正样本率从小到大进行排序，先将所有类别视为一类，利用IV(如果是多分类使用一致性比率)找出最优的二元分割，分为2组，后续不断迭代，直至分裂后的组数达到预先设定的个数

#### 参考BLog：http://blog.sina.com.cn/s/blog_13ec1876a0102x5j3.html

In [15]:
import matplotlib.pyplot as plt
# Draw every department's positive rate on one horizontal line (all y values are 0)
# so that natural clusters of rates can be judged by eye.
fig, ax = plt.subplots(figsize=(15, 1)) 
x = mm.positive_pct.values
y = [0]*mm.shape[0]
ax.scatter(x, y, marker='o')

<matplotlib.collections.PathCollection at 0x1a24000588>

In [16]:
#mm.sort_values(by='positive_pct')['positive_pct'].values

In [17]:
# a) Bucket the departments using cut points read off the scatter plot
intervals = [0.03, 0.13, 0.15, 0.20, 0.30, 0.40]

# np.digitize returns k with intervals[k-1] <= rate < intervals[k]: 0 below the first cut point
# and len(intervals) from the last cut point upwards. The ids are stored as floats.
mm['GroupByScatter'] = np.digitize(mm['positive_pct'].values, intervals).astype(float)

# group id -> list of the departments that fall into that bucket
scatter_group = {
    group: list(mm.loc[mm['GroupByScatter'] == group, 'DepartmentDescription'].values)
    for group in mm['GroupByScatter'].unique()
}

In [18]:
mm.groupby(['GroupByScatter'])['DepartmentDescription'].count()

GroupByScatter
0.0     9
1.0    32
2.0     6
3.0     5
4.0     5
5.0     7
6.0     4
Name: DepartmentDescription, dtype: int64

In [19]:
scatter_group[6]

['FROZEN FOODS', 'GROCERY DRY GOODS', 'PRE PACKED DELI', 'SEAFOOD']

In [20]:
def PctCategoryVarMerge(df, variable, flag, bins=10, sample=None):
    """
    Merge the levels of a categorical column into at most ``bins`` groups by repeatedly
    joining the two neighbouring groups whose positive rates are closest.

    The levels are first ordered by positive rate (positives / total); only groups that
    are adjacent in that ordering are ever merged.

    param df:            DataFrame|input data set
    param variable:      str|name of the categorical column to group; its values must be
                         strings because a merged group is labelled by joining the level
                         names with '|'
    param flag:          str|name of the label column; its sum within a level is used as
                         the number of positive samples (i.e. 0/1 labels)
    param bins:          int|maximum number of groups to keep (must be >= 1)
    param sample:        int|number of rows to sample before grouping; None uses every row

    return:              DataFrame with one row per final group and the columns
                         'variable', 'interval' ('|'-joined level names),
                         'flag_0' (negative count) and 'flag_1' (positive count)
    raises ValueError:   if bins is smaller than 1
    """
    if bins < 1:
        raise ValueError("bins must be at least 1")

    # Optional down-sampling
    if sample is not None:
        df = df.sample(n=sample)

    # Per-level counts, ordered by positive rate
    by_level = df.groupby(variable)[flag]
    stats = pd.DataFrame({'total_num': by_level.count(),
                          'positive_class': by_level.sum()})
    stats['negative_class'] = stats['total_num'] - stats['positive_class']
    stats['positive_pct'] = stats['positive_class'] / stats['total_num']
    stats = stats.sort_values(by='positive_pct')

    # Plain Python lists: cheap to merge/delete and no object-array indexing tricks needed
    labels = stats.index.tolist()
    pos = stats['positive_class'].tolist()
    neg = stats['negative_class'].tolist()

    def rate_gap(k):
        """Absolute difference between the positive rates of groups k and k+1."""
        return abs(pos[k + 1] / (pos[k + 1] + neg[k + 1])
                   - pos[k] / (pos[k] + neg[k]))

    gaps = [rate_gap(k) for k in range(len(labels) - 1)]

    # Merge the closest pair until at most `bins` groups are left
    while len(gaps) > bins - 1:
        k = int(np.argmin(gaps))  # first occurrence of the smallest gap
        pos[k] += pos.pop(k + 1)
        neg[k] += neg.pop(k + 1)
        labels[k] = labels[k] + '|' + labels.pop(k + 1)

        # The merged pair no longer has a gap; only the gaps that touch the merged
        # group have changed and need to be recomputed.
        del gaps[k]
        if k > 0:
            gaps[k - 1] = rate_gap(k - 1)
        if k < len(gaps):
            gaps[k] = rate_gap(k)

    return pd.DataFrame({
        'variable': [variable] * len(labels),
        'interval': labels,
        'flag_0': neg,
        'flag_1': pos,
    })

In [21]:
# Merge the departments with the closest positive rates into 7 groups.
pct_merge = PctCategoryVarMerge(dat4, 'DepartmentDescription', 'Label', bins=7)

# group id -> member departments; every row of mm is also tagged with its group id
pct_groups = {}
for i, interval in enumerate(pct_merge['interval'].values):
    v = interval.split('|')
    pct_groups[i] = v
    mm.loc[mm['DepartmentDescription'].isin(v), 'GroupByPct'] = i

In [22]:
# b) Cluster the positive rates with KMeans
from sklearn.cluster import KMeans

# KMeans starts from a random initialisation. Without a fixed seed the cluster ids
# (and therefore GroupByKMeans and everything derived from it) change between runs.
estimator = KMeans(n_clusters=7, random_state=42)
estimator.fit(mm[['positive_pct']])
mm['GroupByKMeans'] = estimator.labels_

# cluster id -> list of the departments assigned to that cluster
kmeans_group = {
    label: list(mm.loc[mm['GroupByKMeans'] == label, 'DepartmentDescription'].values)
    for label in mm['GroupByKMeans'].unique()
}

In [23]:
mm.head()

Unnamed: 0,DepartmentDescription,positive_class,negative_class,positive_pct,GroupByScatter,GroupByPct,GroupByKMeans
0,1-HR PHOTO,1.0,393.0,0.002538,0.0,0.0,5
1,ACCESSORIES,115.0,1203.0,0.087253,1.0,0.0,2
2,AUTOMOTIVE,410.0,5004.0,0.07573,1.0,0.0,2
3,BAKERY,1693.0,5475.0,0.236189,4.0,3.0,4
4,BATH AND SHOWER,544.0,4038.0,0.118725,1.0,1.0,6


In [24]:
mm.groupby(['GroupByKMeans']).positive_pct.count()

GroupByKMeans
0     5
1     7
2    22
3     4
4     5
5     9
6    16
Name: positive_pct, dtype: int64

In [25]:
mm.groupby(['GroupByKMeans']).positive_pct.min()

GroupByKMeans
0    0.181433
1    0.324255
2    0.048218
3    0.427457
4    0.236150
5    0.000000
6    0.098514
Name: positive_pct, dtype: float64

In [26]:
mm.groupby(['GroupByKMeans']).positive_pct.max()

GroupByKMeans
0    0.196044
1    0.396029
2    0.096839
3    0.463609
4    0.286036
5    0.025290
6    0.148185
Name: positive_pct, dtype: float64

In [27]:
kmeans_group[6]

['BATH AND SHOWER',
 'BOOKS AND MAGAZINES',
 'GIRLS WEAR, 4-6X  AND 7-14',
 'HARDWARE',
 'HOME DECOR',
 'HOME MANAGEMENT',
 'HORTICULTURE AND ACCESS',
 'IMPULSE MERCHANDISE',
 'INFANT APPAREL',
 'LADIES SOCKS',
 'LAWN AND GARDEN',
 'OFFICE SUPPLIES',
 'PLUS AND MATERNITY',
 'SEASONAL',
 'SLEEPWEAR/FOUNDATIONS',
 'TOYS']

In [28]:
# c) Merge levels with the chi-square test
def ChiCategoryVarMerge(df, variable, flag, confidenceVal=3.841, bins=10, sample=None):
    """
    Merge the levels of a categorical column ChiMerge-style: levels are ordered by
    positive rate and the neighbouring pair with the smallest chi-square statistic is
    merged until at most ``bins`` groups remain AND every remaining neighbouring pair
    has a statistic of at least ``confidenceVal``.

    param df:            DataFrame|input data set
    param variable:      str|name of the categorical column to group; its values must be
                         strings because a merged group is labelled by joining the level
                         names with '|'
    param flag:          str|name of the label column; its sum within a level is used as
                         the number of positive samples (i.e. 0/1 labels)
    param confidenceVal: float|chi-square critical value (3.841 is the 95% value for one
                         degree of freedom); pairs below it are considered mergeable
    param bins:          int|maximum number of groups to keep
    param sample:        int|number of rows to sample before grouping; None uses every row

    return:              DataFrame with one row per final group and the columns
                         'variable', 'interval' ('|'-joined level names),
                         'flag_0' (negative count) and 'flag_1' (positive count).
                         A single row is returned when no pair of groups is
                         significantly different.
    """
    # Optional down-sampling
    if sample is not None:
        df = df.sample(n=sample)

    # Per-level counts, ordered by positive rate
    by_level = df.groupby(variable)[flag]
    stats = pd.DataFrame({'total_num': by_level.count(),
                          'positive_class': by_level.sum()})
    stats['negative_class'] = stats['total_num'] - stats['positive_class']
    stats['positive_pct'] = stats['positive_class'] / stats['total_num']
    stats = stats.sort_values(by='positive_pct')

    labels = stats.index.tolist()
    pos = stats['positive_class'].tolist()
    neg = stats['negative_class'].tolist()

    def merge_with_next(k):
        """Fold group k+1 into group k."""
        pos[k] += pos.pop(k + 1)
        neg[k] += neg.pop(k + 1)
        labels[k] = labels[k] + '|' + labels.pop(k + 1)

    def chi_square(k):
        """Chi-square statistic of the 2x2 table formed by groups k and k+1."""
        a, b, c, d = pos[k], neg[k], pos[k + 1], neg[k + 1]
        return ((a * d - b * c) ** 2 * (a + b + c + d)
                / ((a + b) * (c + d) * (a + c) * (b + d)))

    # Two neighbours that both lack positives (or both lack negatives) would make the
    # chi-square denominator zero, so such runs are merged up front.
    k = 0
    while k <= len(labels) - 2:
        if (pos[k] == 0 and pos[k + 1] == 0) or (neg[k] == 0 and neg[k + 1] == 0):
            merge_with_next(k)  # stay on k: the merged group may need merging again
        else:
            k += 1

    chi_values = [chi_square(k) for k in range(len(labels) - 1)]

    # Keep merging while there are too many groups or some pair is still below the
    # critical value. Stop as soon as a single group is left (no pairs to compare);
    # calling min() on the empty list would raise otherwise.
    while chi_values and (len(chi_values) > bins - 1 or min(chi_values) < confidenceVal):
        k = int(np.argmin(chi_values))  # first occurrence of the smallest statistic
        merge_with_next(k)

        # Only the statistics that involve the merged group have to be refreshed.
        del chi_values[k]
        if k > 0:
            chi_values[k - 1] = chi_square(k - 1)
        if k < len(chi_values):
            chi_values[k] = chi_square(k)

    return pd.DataFrame({
        'variable': [variable] * len(labels),
        'interval': labels,
        'flag_0': neg,
        'flag_1': pos,
    })

In [29]:
# Merge departments with the chi-square criterion (at most 7 groups).
chi_merge = ChiCategoryVarMerge(dat4, 'DepartmentDescription', 'Label', bins=7, sample=None)

# group id -> member departments; every row of mm is also tagged with its group id
chi_groups = {}
for i, interval in enumerate(chi_merge['interval'].values):
    v = interval.split('|')
    chi_groups[i] = v
    mm.loc[mm['DepartmentDescription'].isin(v), 'GroupByChiSquare'] = i

In [30]:
mm.head()

Unnamed: 0,DepartmentDescription,positive_class,negative_class,positive_pct,GroupByScatter,GroupByPct,GroupByKMeans,GroupByChiSquare
0,1-HR PHOTO,1.0,393.0,0.002538,0.0,0.0,5,0.0
1,ACCESSORIES,115.0,1203.0,0.087253,1.0,0.0,2,0.0
2,AUTOMOTIVE,410.0,5004.0,0.07573,1.0,0.0,2,0.0
3,BAKERY,1693.0,5475.0,0.236189,4.0,3.0,4,3.0
4,BATH AND SHOWER,544.0,4038.0,0.118725,1.0,1.0,6,1.0


In [31]:
mm.GroupByChiSquare.value_counts()

0.0    38
1.0     9
2.0     5
3.0     5
6.0     4
4.0     4
5.0     3
Name: GroupByChiSquare, dtype: int64

In [32]:
def getMaxIVSplit(dat, pos_col, neg_col):
    """
    Find the two-way split of the rows of ``dat`` that maximises the information value (IV).

    A split after row i puts rows 0..i in the first part and the remaining rows in the
    second part. A split in which either part has no positives or no negatives is scored
    as -inf.

    param dat:      2-D array|one row per group, in the order in which it may be split
    param pos_col:  int|index of the column holding the positive counts
    param neg_col:  int|index of the column holding the negative counts

    return:         (best IV, length-1 integer array holding the index of the last row of
                    the first part), or (None, None) when ``dat`` has fewer than two rows
    """
    n_rows = dat.shape[0]
    if n_rows <= 1:
        return None, None

    def split_iv(cut):
        """IV of the split that puts the first `cut` rows into the first part."""
        p1 = np.sum(dat[0:cut, pos_col])
        n1 = np.sum(dat[0:cut, neg_col])
        p2 = np.sum(dat[cut:, pos_col])
        n2 = np.sum(dat[cut:, neg_col])
        if any(count == 0 for count in (p1, p2, n1, n2)):
            return -np.inf
        iv1 = (p1/(p1+p2) - n1/(n1+n2)) * np.log((p1/(p1+p2))/(n1/(n1+n2)))
        iv2 = (p2/(p1+p2) - n2/(n1+n2)) * np.log((p2/(p1+p2))/(n2/(n1+n2)))
        return iv1 + iv2

    iv_list = np.array([split_iv(cut) for cut in range(1, n_rows)], dtype=float)

    iv_max = max(iv_list)
    # first position at which the maximum is reached
    iv_split_index = np.argwhere(iv_list == iv_max)[0]
    return iv_max, iv_split_index

In [33]:
# d) Split repeatedly on the best information value (IV)

def IVSplit(df, variable, flag, bins=10, sample=None):
    """
    Group the levels of a categorical column top-down: the levels are ordered by positive
    rate and treated as one group, then the existing group whose best two-way split has
    the highest IV (as computed by getMaxIVSplit) is split, until ``bins`` groups exist
    or no group can be split any further.

    param df:            DataFrame|input data set
    param variable:      str|name of the categorical column to group; its values must be
                         strings because a group is labelled by joining the level names
                         with '|'
    param flag:          str|name of the label column; its sum within a level is used as
                         the number of positive samples (i.e. 0/1 labels)
    param bins:          int|maximum number of groups to produce
    param sample:        int|number of rows to sample before grouping; None uses every row

    return:              DataFrame with one row per group and the columns
                         'variable', 'interval' ('|'-joined level names),
                         'flag_0' (negative count) and 'flag_1' (positive count).
                         Fewer than ``bins`` rows are returned when there are not enough
                         levels to split.
    """
    # Optional down-sampling
    if sample is not None:
        df = df.sample(n=sample)

    # Per-level counts, ordered by positive rate
    by_level = df.groupby(variable)[flag]
    stats = pd.DataFrame({'total_num': by_level.count(),
                          'positive_class': by_level.sum()})
    stats['negative_class'] = stats['total_num'] - stats['positive_class']
    stats['positive_pct'] = stats['positive_class'] / stats['total_num']
    stats = stats.sort_values(by='positive_pct')

    labels = stats.index.tolist()
    # column 0 = positives, column 1 = negatives
    counts = stats[['positive_class', 'negative_class']].to_numpy()
    n_levels = len(labels)

    cuts = []  # sorted row indices; each one is the LAST row of a group

    def segment_bounds():
        """(start, end) row ranges (end exclusive) of the current groups."""
        edges = [0] + [cut + 1 for cut in cuts] + [n_levels]
        return list(zip(edges[:-1], edges[1:]))

    for _ in range(bins - 1):
        best_iv, best_cut = None, None
        for start, end in segment_bounds():
            iv, rel_index = getMaxIVSplit(counts[start:end, :], pos_col=0, neg_col=1)
            if iv is None:
                continue  # a group with fewer than two levels cannot be split
            # the first candidate wins ties, later ones must be strictly better
            if best_iv is None or iv > best_iv:
                best_iv = iv
                best_cut = start + int(np.ravel(rel_index)[0])
        if best_cut is None:
            break  # nothing left to split
        cuts.append(best_cut)
        cuts.sort()

    segments = segment_bounds() if n_levels else []

    # flag_0 holds the negatives and flag_1 the positives
    result_data = pd.DataFrame({
        'variable': [variable] * len(segments),
        'interval': ['|'.join(labels[start:end]) for start, end in segments],
        'flag_0': [counts[start:end, 1].sum() for start, end in segments],
        'flag_1': [counts[start:end, 0].sum() for start, end in segments],
    })

    return result_data

In [34]:
# Split the departments top-down by information value into 7 groups.
iv_split = IVSplit(dat4, 'DepartmentDescription', 'Label', bins=7, sample=None)

# group id -> member departments; every row of mm is also tagged with its group id
iv_split_groups = {}
for i, interval in enumerate(iv_split['interval'].values):
    v = interval.split('|')
    iv_split_groups[i] = v
    mm.loc[mm['DepartmentDescription'].isin(v), 'GroupByIVSplit'] = i

In [35]:
mm.head()

Unnamed: 0,DepartmentDescription,positive_class,negative_class,positive_pct,GroupByScatter,GroupByPct,GroupByKMeans,GroupByChiSquare,GroupByIVSplit
0,1-HR PHOTO,1.0,393.0,0.002538,0.0,0.0,5,0.0,0.0
1,ACCESSORIES,115.0,1203.0,0.087253,1.0,0.0,2,0.0,2.0
2,AUTOMOTIVE,410.0,5004.0,0.07573,1.0,0.0,2,0.0,2.0
3,BAKERY,1693.0,5475.0,0.236189,4.0,3.0,4,3.0,5.0
4,BATH AND SHOWER,544.0,4038.0,0.118725,1.0,1.0,6,1.0,3.0


In [36]:
def IVCategoryVarMerge(df, variable, flag, bins=10, sample=None):
    """
    Merge the levels of a categorical column into at most ``bins`` groups by repeatedly
    joining the two neighbouring groups whose information-value (IV) contributions are
    closest. Levels are first ordered by positive rate; only neighbours are merged.

    The IV contribution of a group is
        (pos / total_pos - neg / total_neg) * ln((pos / total_pos) / (neg / total_neg)),
    with the totals taken over the whole data set.

    param df:            DataFrame|input data set
    param variable:      str|name of the categorical column to group; its values must be
                         strings because a merged group is labelled by joining the level
                         names with '|'
    param flag:          str|name of the label column; its sum within a level is used as
                         the number of positive samples (i.e. 0/1 labels)
    param bins:          int|maximum number of groups to keep (must be >= 1)
    param sample:        int|number of rows to sample before grouping; None uses every row

    return:              DataFrame with one row per final group and the columns
                         'variable', 'interval' ('|'-joined level names),
                         'flag_0' (negative count) and 'flag_1' (positive count)
    raises ValueError:   if bins is smaller than 1
    """
    if bins < 1:
        raise ValueError("bins must be at least 1")

    # Optional down-sampling
    if sample is not None:
        df = df.sample(n=sample)

    # Per-level counts, ordered by positive rate
    by_level = df.groupby(variable)[flag]
    stats = pd.DataFrame({'total_num': by_level.count(),
                          'positive_class': by_level.sum()})
    stats['negative_class'] = stats['total_num'] - stats['positive_class']
    stats['positive_pct'] = stats['positive_class'] / stats['total_num']
    stats = stats.sort_values(by='positive_pct')

    labels = stats.index.tolist()
    pos = stats['positive_class'].tolist()
    neg = stats['negative_class'].tolist()

    # Overall totals; they do not change when groups are merged
    pos_class_totalcnt = float(stats['positive_class'].sum())
    neg_class_totalcnt = float(stats['negative_class'].sum())

    def merge_with_next(k):
        """Fold group k+1 into group k."""
        pos[k] += pos.pop(k + 1)
        neg[k] += neg.pop(k + 1)
        labels[k] = labels[k] + '|' + labels.pop(k + 1)

    # A group without positives or without negatives has no finite IV term (log of zero
    # or division by zero), so it is folded into a neighbour first: the first group into
    # the next one, any other group into the previous one. When only one group is left
    # there is nothing to merge with, and no IV term is needed for it either.
    k = 0
    while k < len(labels) and len(labels) > 1:
        if pos[k] == 0 or neg[k] == 0:
            merge_with_next(0 if k == 0 else k - 1)
            # do not advance: position k now holds a group that has not been checked
        else:
            k += 1

    def iv_term(k):
        """IV contribution of group k."""
        pos_share = pos[k] / pos_class_totalcnt
        neg_share = neg[k] / neg_class_totalcnt
        return (pos_share - neg_share) * np.log(pos_share / neg_share)

    def iv_gap(k):
        """Absolute difference between the IV contributions of groups k and k+1."""
        return abs(iv_term(k + 1) - iv_term(k))

    ivdiff_table = [iv_gap(k) for k in range(len(labels) - 1)]

    # Merge the closest pair until at most `bins` groups are left
    while len(ivdiff_table) > bins - 1:
        k = int(np.argmin(ivdiff_table))  # first occurrence of the smallest gap
        merge_with_next(k)

        # Only the gaps that touch the merged group have to be refreshed.
        del ivdiff_table[k]
        if k > 0:
            ivdiff_table[k - 1] = iv_gap(k - 1)
        if k < len(ivdiff_table):
            ivdiff_table[k] = iv_gap(k)

    return pd.DataFrame({
        'variable': [variable] * len(labels),
        'interval': labels,
        'flag_0': neg,
        'flag_1': pos,
    })

In [37]:
# Merge the departments with the closest IV contributions into 7 groups.
iv_merge = IVCategoryVarMerge(dat4, 'DepartmentDescription', 'Label', bins=7, sample=None)

# group id -> member departments; every row of mm is also tagged with its group id
iv_merge_groups = {}
for i, interval in enumerate(iv_merge['interval'].values):
    v = interval.split('|')
    iv_merge_groups[i] = v
    mm.loc[mm['DepartmentDescription'].isin(v), 'GroupByIVMerge'] = i

In [38]:
mm.head()

Unnamed: 0,DepartmentDescription,positive_class,negative_class,positive_pct,GroupByScatter,GroupByPct,GroupByKMeans,GroupByChiSquare,GroupByIVSplit,GroupByIVMerge
0,1-HR PHOTO,1.0,393.0,0.002538,0.0,0.0,5,0.0,0.0,0.0
1,ACCESSORIES,115.0,1203.0,0.087253,1.0,0.0,2,0.0,2.0,1.0
2,AUTOMOTIVE,410.0,5004.0,0.07573,1.0,0.0,2,0.0,2.0,0.0
3,BAKERY,1693.0,5475.0,0.236189,4.0,3.0,4,3.0,5.0,5.0
4,BATH AND SHOWER,544.0,4038.0,0.118725,1.0,1.0,6,1.0,3.0,4.0


In [39]:
# mm.to_csv('category_split.csv', index=False)

In [40]:
# Attach the chosen grouping to the original rows (the KMeans grouping is used here as an example);
# the join key is DepartmentDescription.
dat4_merge_group = pd.merge(dat4, mm[['DepartmentDescription', 'GroupByKMeans']], on='DepartmentDescription')

In [41]:
dat4_merge_group.GroupByKMeans.value_counts()

1    229142
3    102478
6     84900
2     84399
4     72567
0     66544
5      2895
Name: GroupByKMeans, dtype: int64

#### 3.2 WOE编码：将类别转换为数字

In [42]:
def categoryToWOE(df, variable, flag, sample=None):
    """
    Encode every level of a categorical column as its Weight of Evidence (WOE).

    Levels are ordered by positive rate. A level that has no positive or no
    negative samples is merged into a neighbouring level so that the logarithm
    is defined; every level merged into a group receives that group's WOE:

        WOE = ln((group_pos / total_pos) / (group_neg / total_neg))

    param df:            DataFrame|input data set
    param variable:      str|name of the categorical column to encode
    param flag:          str|name of the label column; expected to be 0/1 so that
                         its per-level sum is the positive-sample count
    param sample:        int|number of rows to sample before computing;
                         default None means no sampling
    return:              DataFrame|columns [variable, 'WOE'], one row per level
                         of `variable`, ordered by ascending positive rate
    raise ValueError:    the (sampled) data contain no positive or no negative
                         samples, so no WOE can be computed
    """
    # Optional down-sampling.
    if sample is not None:
        df = df.sample(n=sample)

    # Per-level sample count and positive count (groupby yields levels in sorted order).
    by_level = df.groupby([variable])[flag]
    stats = pd.DataFrame({'total_num': by_level.count(),
                          'positive_class': by_level.sum()}).reset_index()
    stats['negative_class'] = stats['total_num'] - stats['positive_class']
    stats['positive_pct'] = stats['positive_class'] / stats['total_num']
    stats = stats.sort_values(by='positive_pct')

    pos_class_totalcnt = stats['positive_class'].sum()
    neg_class_totalcnt = stats['negative_class'].sum()

    # One entry per group: [member levels, positive count, negative count].
    # Members are kept as a list (not a '|'-joined string) so that labels
    # containing '|' and non-string labels survive merging unchanged.
    groups = [
        [[level], pos, neg]
        for level, pos, neg in zip(stats[variable].tolist(),
                                   stats['positive_class'].tolist(),
                                   stats['negative_class'].tolist())
    ]

    # Merge away groups without positives or without negatives: ln() would
    # otherwise receive 0 or a division by 0.
    i = 0
    while i < len(groups):
        if groups[i][1] == 0 or groups[i][2] == 0:
            if len(groups) == 1:
                # Everything has been merged and one class is still absent.
                raise ValueError(
                    'cannot compute WOE for {!r}: column {!r} has no positive '
                    'or no negative samples'.format(variable, flag))
            if i == 0:
                # First group has no predecessor: absorb the following group.
                target, absorbed = groups[0], groups.pop(1)
            else:
                # Otherwise fold the current group into its predecessor.
                target, absorbed = groups[i - 1], groups.pop(i)
            target[0] = target[0] + absorbed[0]
            target[1] = target[1] + absorbed[1]
            target[2] = target[2] + absorbed[2]
            # Do not advance: position i now holds either the enlarged first
            # group or the next unchecked group.
            continue
        i += 1

    # Compute the WOE of each group and assign it to every member level.
    levels, woes = [], []
    for members, pos, neg in groups:
        poscnt_pct = pos / pos_class_totalcnt
        negcnt_pct = neg / neg_class_totalcnt
        woe = np.log(poscnt_pct / negcnt_pct)
        levels.extend(members)
        woes.extend([woe] * len(members))

    return pd.DataFrame({variable: levels, 'WOE': pd.Series(woes, dtype=float)},
                        columns=[variable, 'WOE'])

In [43]:
# WOE value for every DepartmentDescription level, using Label as the 0/1 flag.
woe = categoryToWOE(df=dat4, variable='DepartmentDescription', flag='Label')

In [44]:
# Attach each row's WOE value by joining on the department name (inner join).
processed = dat4.merge(woe, on='DepartmentDescription', how='inner')

In [45]:
# Preview the first rows of the data set with the WOE column attached.
processed.head()

Unnamed: 0,DepartmentDescription,Label,WOE
0,FINANCIAL SERVICES,0.0,-1.978945
1,FINANCIAL SERVICES,0.0,-1.978945
2,FINANCIAL SERVICES,0.0,-1.978945
3,FINANCIAL SERVICES,0.0,-1.978945
4,FINANCIAL SERVICES,0.0,-1.978945


#### 3.3 Embedding (TODO)

#### 3.4 Hash (TODO)