# 1. 最小最大值归一化 (Min-Max Normalization)

In [123]:
import numpy as np
from sklearn import datasets

In [124]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

In [125]:
# Take the feature matrix X and the label vector y from the dataset.
# X is a copy: the following cells normalise X in place, and without the copy
# those writes would also overwrite `iris.data`.
X = iris.data.copy()  # 4 features: sepal length, sepal width, petal length, petal width
y = iris.target  # labels: 0 = setosa, 1 = versicolor, 2 = virginica

In [126]:
# Preview the first five rows of the raw feature matrix.
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [127]:
# Largest and smallest value of the first feature (column 0), shown as (max, min).
X[:, 0].max(), X[:, 0].min()

(np.float64(7.9), np.float64(4.3))

In [128]:
# Min-max normalise the first feature in place.
# Formula: x = (x - min) / (max - min)
first_min = np.min(X[:, 0])
first_range = np.max(X[:, 0]) - first_min
X[:, 0] = (X[:, 0] - first_min) / first_range

In [129]:
# First five values of the first feature after min-max normalisation.
X[:5,0]

array([0.22222222, 0.16666667, 0.11111111, 0.08333333, 0.19444444])

In [130]:
# Min-max normalise the remaining three features (columns 1-3) in one
# vectorised step; the per-column minima and ranges broadcast across the rows.
remaining = X[:, 1:4]
rest_min = remaining.min(axis=0)
rest_range = remaining.max(axis=0) - rest_min
X[:, 1:4] = (remaining - rest_min) / rest_range

In [131]:
# First five rows after all four features have been min-max normalised.
X[:5]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667]])

# 2. 零均值归一化 (Z-score 标准化)

In [132]:
# Reload the dataset to get back the raw (un-normalised) values.
iris = datasets.load_iris()
# Copy the features: X is standardised in place further down, and without the
# copy those writes would also overwrite `iris.data`.
X = iris.data.copy()
y = iris.target

In [133]:
# First five rows of the freshly reloaded, un-normalised data.
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [134]:
# Mean and standard deviation of the first feature (column 0).
X[:, 0].mean(), X[:, 0].std()

(np.float64(5.843333333333334), np.float64(0.8253012917851409))

In [135]:
# Standardise the first feature in place.
# Formula: x = (x - mean) / std
first_mean = np.mean(X[:, 0])
first_std = np.std(X[:, 0])
X[:, 0] = (X[:, 0] - first_mean) / first_std

In [136]:
# First five values of the first feature after standardisation.
X[:5,0]

array([-0.90068117, -1.14301691, -1.38535265, -1.50652052, -1.02184904])

In [137]:
# Verify the mean and standard deviation of the first feature after normalisation.
np.mean(X[:,0])  # close to 0

np.float64(-4.736951571734001e-16)

In [138]:
np.std(X[:,0])   # close to 1

np.float64(1.0)

In [139]:
# Standardise the remaining three features (columns 1-3) in place,
# one column at a time, with the formula x = (x - mean) / std.
for col in (1, 2, 3):
    feature = X[:, col]
    X[:, col] = (feature - np.mean(feature)) / np.std(feature)

In [140]:
# First five rows after all four features have been standardised.
X[:5]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

# 3. scikit-learn 中的 StandardScaler

In [141]:
# Start again from the raw dataset for the scikit-learn example.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [142]:
# First five rows of the reloaded raw data.
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [143]:
from sklearn.preprocessing import StandardScaler

In [144]:
# Create a StandardScaler with its default settings.
standard_scaler = StandardScaler()

In [145]:
# Fit the StandardScaler (computes the per-feature mean and standard deviation of X).
standard_scaler.fit(X)

In [146]:
# Per-feature means learned by fit.
standard_scaler.mean_

array([5.84333333, 3.05733333, 3.758     , 1.19933333])

In [147]:
# Per-feature standard deviations learned by fit.
standard_scaler.scale_

array([0.82530129, 0.43441097, 1.75940407, 0.75969263])

In [148]:
# Standardise the data with the statistics learned by `fit`.
# The raw `iris.data` (the array the scaler was fitted on) is transformed rather
# than `X` itself, so re-running this cell cannot apply the raw-data mean and
# scale to values that have already been standardised.
X = standard_scaler.transform(iris.data)

In [149]:
# First five rows of the standardised data.
X[:5]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

# 4. 使用归一化

In [150]:
# Split the dataset into a training set (80 %) and a test set with train_test_split;
# the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    train_size=0.8,
    random_state=666,
)

In [151]:
from sklearn.preprocessing import StandardScaler

In [152]:
# A fresh scaler, to be fitted on the training split only.
standard_scaler = StandardScaler()

In [153]:
# Fit on the training split, so the scaler holds the training mean and standard deviation.
# Both the training data and the test data are later normalised with these training statistics.
standard_scaler.fit(X_train)

In [154]:
# Per-feature means of the training split.
standard_scaler.mean_

array([5.83416667, 3.08666667, 3.70833333, 1.17      ])

In [155]:
# Per-feature standard deviations of the training split.
standard_scaler.scale_

array([0.81019502, 0.44327067, 1.76401924, 0.75317107])

In [156]:
# Normalise the training data (the test data is transformed in the next cell).
X_train_standard = standard_scaler.transform(X_train)

In [157]:
# Normalise the test data with the scaler that was fitted on the training split.
X_test_standard = standard_scaler.transform(X_test)

In [158]:
from sklearn.neighbors import KNeighborsClassifier

In [159]:
# k-nearest-neighbours classifier configured with 5 neighbours.
knn_classifier = KNeighborsClassifier(n_neighbors=5)

In [160]:
# Train the classifier on the normalised training data.
knn_classifier.fit(X_train_standard,y_train)

In [161]:
# Evaluate the model on the normalised test set.
knn_classifier.score(X_test_standard, y_test)

1.0