1.导入库

In [10]:
import pandas as pd
import numpy as np

2.导入数据包

In [11]:
dataset = pd.read_csv('../datasets/Data.csv')
# 选择所有行，除了最后一列的所有列
X = dataset.iloc[:, :-1].values
# 选择所有行，最后一列
y = dataset.iloc[:, -1].values
print("Dataset :\n"+str(dataset))
print("X :\n"+str(X))
print("y :\n"+str(y))


Dataset :
   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes
X :
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
y :
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


3. 处理缺失数据

In [12]:
#导入Imputer 类，用于处理数据中的缺失值。
from sklearn.impute import SimpleImputer
# 创建一个SimpleImputer对象，指定缺失值为np.nan，使用均值策略来填补缺失值
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean")
# 适配Imputer对象到数据集的第2和第3列（索引1和2），计算这些列的均值。
imputer = imputer.fit(X[ : , 1:3])
# 用Imputer对象转换后的结果替换原始数据集中的缺失值。
X[ : , 1:3] = imputer.transform(X[ : , 1:3])
print("X :\n"+str(X))

X :
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


4. 编码分类数据

In [13]:
#导入分类数据编码的库，用于将分类变量转换为数值变量。
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
# 对数据集的第一列（索引为0）进行标签编码。按字母顺序排序。
X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])
print("X :\n"+str(X))

X :
[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 63777.77777777778]
 [0 35.0 58000.0]
 [2 38.77777777777778 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]


In [14]:
#导入OneHotEncoder类，用于将分类变量转换为独热编码形式。
from sklearn.preprocessing import OneHotEncoder
# 导入ColumnTransformer类，用于将不同的转换应用于数据集的不同列。
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# 对数据集的第一列（索引为0）进行独热编码。
X = np.array(ct.fit_transform(X))
print("X :\n"+str(X))
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(y)
print("Y :\n"+str(Y))

X :
[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
Y :
[0 1 0 0 1 1 0 1 0 1]


5. 拆分数据集为训练集和测试集

In [17]:
# 导入train_test_split函数，用于将数据集划分为训练集和测试集。
from sklearn.model_selection import train_test_split
# 将数据集划分为训练集和测试集，测试集占20%，随机种子设为0以确保结果可复现。
X_train, X_test, Y_train, Y_test = train_test_split( X , Y , test_size = 0.2, random_state = 0)
print("X_train :\n"+str(X_train))
print("X_test :\n"+str(X_test))
print("Y_train :\n"+str(Y_train))
print("Y_test :\n"+str(Y_test))

X_train :
[[0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
X_test :
[[0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 50.0 83000.0]]
Y_train :
[1 1 1 0 1 0 0 1]
Y_test :
[0 0]


6. 特征缩放

In [18]:
#导入StandardScaler类，用于对特征进行标准化处理。
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# 对训练集和测试集的特征进行标准化处理。
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

print("X_train :\n"+str(X_train))
print("X_test :\n"+str(X_test))


X_train :
[[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]
X_test :
[[ 0.  0.  0. -1. -1.]
 [ 0.  0.  0.  1.  1.]]
