https://mp.weixin.qq.com/s?__biz=MzIwOTc2MTUyMg==&mid=2247496832&idx=3&sn=692d498b8c074c8b528a280afe8ef89c&chksm=976c511da01bd80becff8e852c316ae386b7915aa64eac6c351849b517457d5ee1096c05e4df&scene=21#wechat_redirect

#### 1.2 Static Categorical Variables 静态类别变量

In [1]:
# 1.2.1 Ordinal Encoding 序数编码 序数编码将类别变量转化为一列序数变量，包含从1到类别数量的整数
import numpy as np
import pandas as pd
from category_encoders import OrdinalEncoder

In [2]:
# Toy training set: five rows, two categorical columns. Because the rows go
# through np.array, the integer codes are coerced to strings, so both columns
# hold string categories.
train_set = pd.DataFrame(
    data=np.array([
        ['male', 10],
        ['female', 20],
        ['male', 10],
        ['female', 20],
        ['female', 15],
    ]),
    columns=['Sex', 'Type'],
)
# Boolean target, aligned row-by-row with train_set.
train_y = np.array([False, True, True, False, False])
# Toy test set that deliberately contains categories never seen in training
# ('others' for Sex; 40 and 25 for Type).
test_set = pd.DataFrame(
    data=np.array([
        ['female', 20],
        ['male', 20],
        ['others', 15],
        ['male', 20],
        ['female', 40],
        ['male', 25],
    ]),
    columns=['Sex', 'Type'],
)
# Overwrite Type in row 4 with NaN to simulate a missing value.
test_set.loc[4, 'Type'] = np.nan

In [5]:
# Configure the ordinal encoder for both categorical columns.
# handle_unknown='value': categories unseen during fit are encoded as -1.
# handle_missing='value': missing values are encoded as -2.
# Other accepted settings: 'error' (raise) and 'return_nan' (encode the
# unknown/missing value as NaN).
encoder = OrdinalEncoder(
    cols=['Sex', 'Type'],
    handle_unknown='value',
    handle_missing='value',
)
# Fit on the training data, then encode both data sets.
encoder = encoder.fit(train_set, train_y)
encoded_train = encoder.transform(train_set)
encoded_test = encoder.transform(test_set)

In [10]:
# 在序数编码中：
  
# 变量Sex中: 'male' => 1.0, 'female' => 2.0, 未知 => -1.0, 缺失值 => -2.0
# （事实上，测试集中完全有可能出现未知与缺失情况）
# 在我们的例子中, Sex这一变量中的'others' 类别从未在训练集中出现过
  
# 变量 Type 中: 10 => 1.0, 20 => 2.0, 15 => 3.0, 未知 => -1.0, 缺失值 => -2.0
# 因为在训练集中未出现25，所以测试集中的25被标记为未知

In [11]:
# Display the encoded training set with every column cast to float.
encoded_train.astype(float)

Unnamed: 0,Sex,Type
0,1.0,1.0
1,2.0,2.0
2,1.0,1.0
3,2.0,2.0
4,2.0,3.0


In [12]:
# 1.2.2 One-hot Encoding 独热编码

In [13]:
# 1.2.3 Hashing Encoding 哈希编码
'''
哈希编码基于特征哈希的方法。它将哈希函数应用于变量，将任意数量的变量以一定的规则映射到给定数量的变量。
特征哈希可能会导致要素之间发生冲突。但哈希编码的优点是它不需要制定和维护原变量与新变量之间的映射关系。
因此，哈希编码器的大小及复杂程度不随数据类别的增多而增多。
'''
import numpy as np
import pandas as pd
from category_encoders.hashing import HashingEncoder
# category_encoders 直接支持dataframe

# Toy training set: five rows, two categorical columns. Because the rows go
# through np.array, the integer codes are coerced to strings, so both columns
# hold string categories.
train_set = pd.DataFrame(
    data=np.array([
        ['male', 10],
        ['female', 20],
        ['male', 10],
        ['female', 20],
        ['female', 15],
    ]),
    columns=['Sex', 'Type'],
)
# Boolean target, aligned row-by-row with train_set.
train_y = np.array([False, True, True, False, False])

# Toy test set that deliberately contains categories never seen in training
# ('others' for Sex; 40 and 25 for Type).
test_set = pd.DataFrame(
    data=np.array([
        ['female', 20],
        ['male', 20],
        ['others', 15],
        ['male', 20],
        ['female', 40],
        ['male', 25],
    ]),
    columns=['Sex', 'Type'],
)
# Overwrite Type in row 4 with NaN to simulate a missing value.
test_set.loc[4, 'Type'] = np.nan

In [None]:
# Hash the two categorical columns into 5 output columns.
# The encoding does not depend on the contents of the training/test set: it
# is determined by the hash function alone, so the encoder can be applied to
# any new data set whose column names match.
# Hashing is normally applied to much higher-dimensional, sparser inputs; two
# columns are used here only as a small example.
encoder = HashingEncoder(cols=['Sex', 'Type'], n_components=5)
encoder = encoder.fit(train_set, train_y)
encoded_train = encoder.transform(train_set)  # encode the training set
encoded_test = encoder.transform(test_set)  # encode the test set

# The test-set result is the one used as the example.

In [1]:
# 1.2.4 Helmert Encoding Helmert编码
'''
Helmert编码通常在计量经济学中使用。在Helmert编码（分类特征中的每个值对应于Helmert矩阵中的一行）之后，
线性模型中编码后的变量系数可以反映在给定该类别变量某一类别值的情形下因变量的平均值与给定该类
别其他类别值的情形下因变量的平均值的差值。在category_encoders包中实现的Helmert编码为
反向Helmert编码
'''
import numpy as np
import pandas as pd
from category_encoders import HelmertEncoder
# category_encoders 直接支持dataframe

# Toy training set: five rows, two categorical columns. Because the rows go
# through np.array, the integer codes are coerced to strings, so both columns
# hold string categories.
train_set = pd.DataFrame(
    data=np.array([
        ['male', 10],
        ['female', 20],
        ['male', 10],
        ['female', 20],
        ['female', 15],
    ]),
    columns=['Sex', 'Type'],
)
# Boolean target, aligned row-by-row with train_set.
train_y = np.array([False, True, True, False, False])

# Toy test set that deliberately contains categories never seen in training
# ('others' for Sex; 40 and 25 for Type).
test_set = pd.DataFrame(
    data=np.array([
        ['female', 20],
        ['male', 20],
        ['others', 15],
        ['male', 20],
        ['female', 40],
        ['male', 25],
    ]),
    columns=['Sex', 'Type'],
)
# Overwrite Type in row 4 with NaN to simulate a missing value.
test_set.loc[4, 'Type'] = np.nan

In [3]:
# Fit the Helmert encoder on the training set; unknown and missing values
# each get their own indicator level.
encoder = HelmertEncoder(
    cols=['Sex', 'Type'],
    handle_unknown='indicator',
    handle_missing='indicator',
)
encoder = encoder.fit(train_set, train_y)
encoded_train = encoder.transform(train_set)  # encode the training set
encoded_test = encoder.transform(test_set)  # encode the test set

# Helmert codes produced here (first entry of each vector is the intercept).
#
# Sex -> 4 new variables, intercept included:
#     'male'   => [ 1. -1. -1. -1.]
#     'female' => [ 1.  1. -1. -1.]
#     unknown  => [ 1.  0.  0.  3.]
#     missing  => [ 1.  0.  2. -1.]
#
# Type -> 5 new variables, intercept included:
#     10       => [ 1. -1. -1. -1. -1.]
#     20       => [ 1.  1. -1. -1. -1.]
#     15       => [ 1.  0.  2. -1. -1.]
#     unknown  => [ 1.  0.  0.  0.  4.]
#     missing  => [ 1.  0.  0.  3. -1.]

# Show the test-set result as the example.
encoded_test

Unnamed: 0,intercept,Sex_0,Sex_1,Sex_2,Type_0,Type_1,Type_2,Type_3
0,1,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0
1,1,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0
2,1,0.0,0.0,3.0,0.0,2.0,-1.0,-1.0
3,1,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0
4,1,1.0,-1.0,-1.0,0.0,0.0,3.0,-1.0
5,1,-1.0,-1.0,-1.0,0.0,0.0,0.0,4.0


In [4]:
# 1.2.5 Sum(Deviation)Encoding 偏差编码
'''
偏差编码也通常在计量经济学中被使用。偏差编码后，
线性模型的系数可以反映给定该类别变量某一取值的情况下，因变量的平均值与全局因变量平均值的差异
'''
import numpy as np
import pandas as pd
from category_encoders.sum_coding import SumEncoder
# category_encoders 直接支持dataframe

# Toy training set: five rows, two categorical columns. Because the rows go
# through np.array, the integer codes are coerced to strings, so both columns
# hold string categories.
train_set = pd.DataFrame(
    data=np.array([
        ['male', 10],
        ['female', 20],
        ['male', 10],
        ['female', 20],
        ['female', 15],
    ]),
    columns=['Sex', 'Type'],
)
# Boolean target, aligned row-by-row with train_set.
train_y = np.array([False, True, True, False, False])

# Toy test set that deliberately contains categories never seen in training
# ('others' for Sex; 40 and 25 for Type).
test_set = pd.DataFrame(
    data=np.array([
        ['female', 20],
        ['male', 20],
        ['others', 15],
        ['male', 20],
        ['female', 40],
        ['male', 25],
    ]),
    columns=['Sex', 'Type'],
)
# Overwrite Type in row 4 with NaN to simulate a missing value.
test_set.loc[4, 'Type'] = np.nan

In [5]:
# Fit the sum (deviation) encoder on the training set.
# handle_unknown='indicator': an extra level/column flags unknown categories.
# handle_missing='indicator': an extra level/column flags missing values.
# Other accepted settings for handle_unknown / handle_missing:
#   'error'      -> raise;
#   'return_nan' -> encode the unknown/missing value as NaN;
#   'value'      -> encode the unknown/missing value as 0.
encoder = SumEncoder(cols=['Sex', 'Type'],
                     handle_unknown='indicator',
                     handle_missing='indicator').fit(train_set, train_y)
encoded_train = encoder.transform(train_set)  # encode the training set
encoded_test = encoder.transform(test_set)  # encode the test set

# Show the test-set result as the example.
encoded_test

# Sum (deviation) codes produced here (first entry of each vector is the
# intercept). The original note mislabelled this table as Helmert encoding.
#
# Sex -> 4 new variables, intercept included:
#     'male'   => [ 1.  1.  0.  0.]
#     'female' => [ 1.  0.  1.  0.]
#     unknown  => [ 1. -1. -1. -1.]
#     missing  => [ 1.  0.  0.  1.]
#
# Type -> 5 new variables, intercept included:
#     10       => [ 1.  1.  0.  0.  0.]
#     20       => [ 1.  0.  1.  0.  0.]
#     15       => [ 1.  0.  0.  1.  0.]
#     unknown  => [ 1. -1. -1. -1. -1.]
#     missing  => [ 1.  0.  0.  0.  1.]

Unnamed: 0,intercept,Sex_0,Sex_1,Sex_2,Type_0,Type_1,Type_2,Type_3
0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,-1.0,-1.0,-1.0,0.0,0.0,1.0,0.0
3,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0
5,1,1.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0


In [6]:
# 可以通过如下代码计算变量Type的Deviation 矩阵
from patsy.contrasts import Sum
# Five levels: 3 observed values + 1 unknown + 1 missing.
levels = list(range(1, 6))
contrast = Sum().code_with_intercept(levels)
# The first column of the printed matrix is the intercept.
print(contrast.matrix)

[[ 1.  1.  0.  0.  0.]
 [ 1.  0.  1.  0.  0.]
 [ 1.  0.  0.  1.  0.]
 [ 1.  0.  0.  0.  1.]
 [ 1. -1. -1. -1. -1.]]


In [7]:
# 1.2.6 TargetEncoding 目标编码
import numpy as np
import pandas as pd
from category_encoders.target_encoder import TargetEncoder
# category_encoders 直接支持dataframe

# Toy training set: five rows, two categorical columns. Because the rows go
# through np.array, the integer codes are coerced to strings, so both columns
# hold string categories.
train_set = pd.DataFrame(
    data=np.array([
        ['male', 10],
        ['female', 20],
        ['male', 10],
        ['female', 20],
        ['female', 15],
    ]),
    columns=['Sex', 'Type'],
)
# Boolean target, aligned row-by-row with train_set.
train_y = np.array([False, True, True, False, False])

# Toy test set that deliberately contains categories never seen in training
# ('others' for Sex; 40 and 25 for Type).
test_set = pd.DataFrame(
    data=np.array([
        ['female', 20],
        ['male', 20],
        ['others', 15],
        ['male', 20],
        ['female', 40],
        ['male', 25],
    ]),
    columns=['Sex', 'Type'],
)
# Overwrite Type in row 4 with NaN to simulate a missing value.
test_set.loc[4, 'Type'] = np.nan

In [8]:
# Fit the target encoder on the training set.
# handle_unknown / handle_missing accept only 'error', 'return_nan' and
# 'value' for this encoder; with 'value', unknown categories and missing
# values are filled with the mean of the training target.
# min_samples_leaf and smoothing are pinned to the values the manual
# verification in this notebook assumes, instead of relying on library
# defaults.
# NOTE(review): the defaults are reported to differ between category_encoders
# releases -- confirm against the installed version.
encoder = TargetEncoder(
    cols=['Sex', 'Type'],
    handle_unknown='value',
    handle_missing='value',
    min_samples_leaf=1,
    smoothing=1.0,
).fit(train_set, train_y)
encoded_train = encoder.transform(train_set)  # encode the training set
encoded_test = encoder.transform(test_set)  # encode the test set

# The encoded frame has as many columns as there were categorical columns.
encoded_test

Unnamed: 0,Sex,Type
0,0.34128,0.473106
1,0.473106,0.473106
2,0.4,0.4
3,0.473106,0.473106
4,0.34128,0.4
5,0.473106,0.4


In [9]:
# Verify the encoder output by hand: in the test set the category 'male' is
# encoded as 0.473106.
prior = train_y.mean()  # prior probability of a positive target
# The original note states 1.0 was the encoder default for both settings.
# NOTE(review): confirm for the installed category_encoders version.
min_samples_leaf = 1.0
smoothing = 1.0
n = 2  # two training rows carry the label 'male'
n_positive = 1  # only one of those two rows has a positive target

# ASCII spelling of the smoothing weight. The original used Mathematical
# Italic Unicode letters, which Python normalizes (NFKC) to this same name.
smoove = 1 / (1 + np.exp(-(n - min_samples_leaf) / smoothing))
# Blend the global prior with the per-category positive rate.
male_encode = prior * (1 - smoove) + smoove * n_positive / n
male_encode  # 0.4731058578630005, matching the value being verified

0.4731058578630005