# 数据预处理

样本：每行表示一个样本  
特征：每列表示一个特征  

## 一、均值移除
把每个特征的平均值移除，保证处理后的特征均值为零，标准差为1，关注不同样本同一个特征的偏差

In [15]:
import numpy as np
import sklearn.preprocessing as sp
def std_scale(raw_samples):
    std_samples = raw_samples.copy()
    cols = std_samples.shape[1]
    for col in range(cols):
        col_samples = std_samples[:,col]
        col_mean = col_samples.mean()
        col_std = col_samples.std()
        col_samples -= col_mean
        col_samples /= col_std
    print(std_samples)
    return std_samples   

In [17]:
def main():
    raw_samples = np.array([
        [3,-1.5,2,-5.4],
        [0,4,-0.3,2.1],
        [1,3.3,-1.9,-4.3]
    ])
    #求每一列的平均值
    raw_means = raw_samples.mean(axis=0)
    print(raw_means)
    # std_samples = std_scale(raw_samples)
    # 调用sklearn中的函数
    std_samples = sp.scale(raw_samples)
    std_means = std_samples.mean(axis=0)
    print(std_means)
    std_stds = std_samples.std(axis=0)
    print(std_stds)
    return 0 

if __name__ == '__main__':
    main()
    

[ 1.33333333  1.93333333 -0.06666667 -2.53333333]
[ 5.55111512e-17 -1.11022302e-16 -7.40148683e-17 -7.40148683e-17]
[1. 1. 1. 1.]


## 二、范围缩放
* 把每个特征的最大值和最小值线性缩放到一个给定的范围  
原始样本  
x1　　　　　　　　y1  
x2　　　　　　　　y2  
x3－｛ｋ／ｂ｝－＞y3  
...　　　　　　　　...  
xn　　　　　　　　yn  
xmin * k + b = min    
xmax * k + b = max  
/ xmin 1 \ * / k \ = / min \  
\ xmax 1 /  &nbsp;\ b /  &nbsp; \ max /  
　　Ａ　　ｘ　　　ｂ  
x = np.linalg.lstsq(A,b)

In [31]:
import numpy as np
import sklearn.preprocessing as sp
def mmx_scale(raw_samples,min,max):
    mmx_samples = raw_samples.copy()
    cols = mmx_samples.shape[1]
    for col in range(cols):
        col_samples = mmx_samples[:,col]
        col_min = col_samples.min()
        col_max = col_samples.max()
        ## np.linalg.lstsq求解
        k,b = np.linalg.lstsq(
            np.array([[col_min,1],[col_max,1]]),
            np.array([min,max]))[0]
        col_samples *= k
        col_samples += b
    return mmx_samples

def main():
    raw_samples = np.array([
        [3,-1.5,2,-5.4],
        [0,4,-0.3,2.1],
        [1,3.3,-1.9,-4.3]
    ])
    mmx_samples = mmx_scale(raw_samples,0,1)
    print(mmx_samples)
    # MinMaxScaler范围缩放函数
    # 先生成一个scaler
    mmx = sp.MinMaxScaler(feature_range=(0,1))
    # 通过fit_transform函数进行转变
    mmx_samples = mmx.fit_transform(raw_samples)
    print(mmx_samples)
    return 0

if __name__ == '__main__':
    main()

[[ 1.00000000e+00 -2.77555756e-16  1.00000000e+00 -2.22044605e-16]
 [ 1.85037171e-17  1.00000000e+00  4.10256410e-01  1.00000000e+00]
 [ 3.33333333e-01  8.72727273e-01 -2.22044605e-16  1.46666667e-01]]
[[1.         0.         1.         0.        ]
 [0.         1.         0.41025641 1.        ]
 [0.33333333 0.87272727 0.         0.14666667]]


  del sys.path[0]


In [32]:
# help(sp.MinMaxScaler)

## 三、归一化
将每个样本的各个特征值按照一定的比例缩放到[0,1]区间，且该特征值的绝对值之和为1
* 需要归一化：基于参数的模型或基于距离的模型，都是要进行特征的归一化 
* 不需要归一化：基于树的方法是不需要进行特征的归一化，例如随机森林，bagging 和 boosting等

In [73]:
import numpy as np
import sklearn.preprocessing as sp
def normalize(raw_samples):
    nor_samples = raw_samples.copy()
    rows = nor_samples.shape[0]
    for row in range(rows):
        # 是将每个样本按比例缩放
        row_samples = nor_samples[row]
        row_abs = abs(row_samples)
        row_abs_sum = row_abs.sum()
        row_samples /= row_abs_sum
    return nor_samples

def main():
    raw_samples = np.array([
        [3,-1.5,2,-5.4],
        [0,4,-0.3,2.1],
        [1,3.3,-1.9,-4.3]
    ])
    nor_samples = normalize(raw_samples)
    print(nor_samples)
    for row in range(nor_samples.shape[0]):
        row_samples = nor_samples[row]
        abs_samples = abs(row_samples)
        sum_samples = abs_samples.sum()
        print(sum_samples)
    # l1即L1范数，矢量中各元素绝对值之和。 
    # l2即L2范数，矢量元素绝对值的平方和再开方
    nor_samples = sp.normalize(raw_samples,norm='l1')
    print(nor_samples)
    for row in range(nor_samples.shape[0]):
        row_samples = nor_samples[row]
        abs_samples = abs(row_samples)
        sum_samples = abs_samples.sum()
        print(sum_samples)
    return 0

if __name__ == '__main__':
    main()

[[ 0.25210084 -0.12605042  0.16806723 -0.45378151]
 [ 0.          0.625      -0.046875    0.328125  ]
 [ 0.0952381   0.31428571 -0.18095238 -0.40952381]]
1.0
1.0
1.0
[[ 0.25210084 -0.12605042  0.16806723 -0.45378151]
 [ 0.          0.625      -0.046875    0.328125  ]
 [ 0.0952381   0.31428571 -0.18095238 -0.40952381]]
1.0
1.0
1.0


## 四、二值化
根据一个预先设定的阈值，小于等于阈值统一置0，大于阈值统一置1  
** 二值化方法不可逆 **

In [43]:
import numpy as np
import sklearn.preprocessing as sp
def binarize(raw_samples,threshold):
    bin_samples = raw_samples.copy()
    bin_samples[bin_samples <= threshold] = 0
    bin_samples[bin_samples > threshold] = 1
    return bin_samples

def main():
    raw_samples = np.array([
        [3,-1.5,2,-5.4],
        [0,4,-0.3,2.1],
        [1,3.3,-1.9,-4.3]
    ])
    print(raw_samples)
    bin_samples = binarize(raw_samples,1.4)
    print(bin_samples)
    # 生成一个二值化器
    bin = sp.Binarizer(threshold=1.4)
    # 用二值化器进行转化
    bin_samples = bin.transform(raw_samples)
    print(bin_samples)
    return 0

if __name__ == '__main__':
    main()

[[ 3.  -1.5  2.  -5.4]
 [ 0.   4.  -0.3  2.1]
 [ 1.   3.3 -1.9 -4.3]]
[[1. 0. 1. 0.]
 [0. 1. 0. 1.]
 [0. 1. 0. 0.]]
[[1. 0. 1. 0.]
 [0. 1. 0. 1.]
 [0. 1. 0. 0.]]


## 五、独热编码
只能有一个1  
０　　　０　　　３  
１　　　１　　　０  
０　　　２　　　１  
１　　　０　　　２  
 ---    ----  ----- 
01　　012　　　0123  
2　　　3　　　　4  
0-10　0-100　　0-1000  
1-01　1-010　　1-0100  
　　　2-001　　2-0010  
　　　　　　　 3-0001  
---   ---     ---- 
编码字典：  
101000001  
010101000  
100010100  
011000010    
* 独热编码器.fit_transform(原始样本矩阵) –> return：独热编码后的样本矩阵，同时构建编码表字典，   
* 独热编码器.transform(原始样本矩阵) –> return：独热编码后的样本矩阵，使用已有的编码表字典.  

In [77]:
import sklearn.preprocessing as sp
def onehot_encode(raw_samples):
    code_tables = []
    for colume in raw_samples.T:
        code_table = {}
        for value in colume:
            code_table[value] = None
        code_tables.append(code_table)
    
    for code_table in code_tables:
        size = len(code_table)
        # enumerate生成索引和键组成元组的列表
        for one,key in enumerate(sorted(code_table.keys())):
            code_table[key] = np.zeros(shape=size,dtype=int)
            code_table[key][one] = 1
            # print(code_table)
    # print(code_tables)
    ohe_samples = []
    for raw_sample in raw_samples:
        ohe_sample = np.array([],dtype=int)
        for colume,feature in enumerate(raw_sample):
            ohe_sample = np.hstack((ohe_sample,
                                   code_tables[colume][feature]))
        ohe_samples.append(ohe_sample)
    return np.array(ohe_samples)

def main():
    raw_samples = np.array([
        [0,0,3],
        [1,1,0],
        [0,2,1],
        [1,0,2]
    ])
    print(raw_samples)
    ohe_samples = onehot_encode(raw_samples)
    print(ohe_samples)
    # 构造一个独热编码器
    ohe = sp.OneHotEncoder(sparse=False,dtype=int)
    ohe_samples = ohe.fit_transform(raw_samples)
    test_sample1 = np.array([[0,1,2]])
    test_sample2 = np.array([[0,4,5]])
    # 使用已有的编码表字典
    ohe_samples1 = ohe.transform(test_sample1)
    # 使用已构建过的独热编码字典进行编码,前提是特征中的状态必须是已有编码字典里的状态，如果存在未出现过的状态，则编码会出现错误
    # 无法编码4和5
    # ohe_samples2 = ohe.transform(test_sample2)
    print(ohe_samples)
    print(ohe_samples1)
    # print(ohe_samples2)

if __name__ == '__main__':
    main()

[[0 0 3]
 [1 1 0]
 [0 2 1]
 [1 0 2]]
[[1 0 1 0 0 0 0 0 1]
 [0 1 0 1 0 1 0 0 0]
 [1 0 0 0 1 0 1 0 0]
 [0 1 1 0 0 0 0 1 0]]
[[1 0 1 0 0 0 0 0 1]
 [0 1 0 1 0 1 0 0 0]
 [1 0 0 0 1 0 1 0 0]
 [0 1 1 0 0 0 0 1 0]]
[[1 0 0 1 0 0 0 1 0]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [54]:
code_table={'0':None,'1':None}
print(list(enumerate(sorted(code_table.keys()))))

[(0, '0'), (1, '1')]


## 六、标记编码
样本的特征值如果已经是数字，则直接使用它们，如果是字符串，则可以通过标记编码得到与之对应的唯一数字，以方便后续处理

In [72]:
import sklearn.preprocessing as sp

def main():
    raw_labels = np.array([
        ['audi','ford','audi','toyota','ford','bmw',
        'toyota','ford','audi'],
        ['Benze','audi','audi','bmw','toyota','ZH','Benze',
        'bmw','audi']
    ])
    print(raw_labels)
    # 生成标签编码器
    codec = sp.LabelEncoder()
    enc_labels = []
    codec = sp.LabelEncoder()
    for raw_label in raw_labels:
        enc_label = codec.fit_transform(raw_label)
        enc_labels.append(enc_label)
    print(enc_labels)
    # enc_labels = codec.fit_transform(raw_labels)
    print(enc_labels)
    # 用inverse_transform进行解码
    dec_labels = codec.inverse_transform(enc_labels[0])
    print(dec_labels)
    

if __name__ == '__main__':
    main()

[['audi' 'ford' 'audi' 'toyota' 'ford' 'bmw' 'toyota' 'ford' 'audi']
 ['Benze' 'audi' 'audi' 'bmw' 'toyota' 'ZH' 'Benze' 'bmw' 'audi']]
[array([0, 2, 0, 3, 2, 1, 3, 2, 0]), array([0, 2, 2, 3, 4, 1, 0, 3, 2])]
[array([0, 2, 0, 3, 2, 1, 3, 2, 0]), array([0, 2, 2, 3, 4, 1, 0, 3, 2])]
['Benze' 'audi' 'Benze' 'bmw' 'audi' 'ZH' 'bmw' 'audi' 'Benze']
