### First step:导入库

In [96]:
import numpy as np 
import pandas as pd

### Second step:读取数据集Data.csc

In [97]:
#typedict = {'Age':pd.Int64Dtype(),'Salary':pd.Int64Dtype()}     # 在列包含空值时，pandas无法读取为int类型，我们可以指定为Int64Dtype
datasets = pd.read_csv('Data.csv')   # 该数据集最后想要学得的模型是预测购买情况
#print(datasets.iloc[0])             # dataframe.iloc[index]返回索引为index行的数据
#print(datasets.iloc[0].values)      # .values返回给定dataframe的numpy形式
X = datasets.iloc[:,:-1].values      # X代表数据集中各属性的取值，如城市，薪水，年龄等
Y = datasets.iloc[:,-1].values       # Y代表数据集中标签：如是否购买

In [98]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### Third step：缺失值处理

In [99]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan,strategy='mean')    # median代表中位数
imp_mean.fit(X[:,1:3])
X[:,1:3] = imp_mean.transform(X[:,1:3])
X[:,1:3] = X[:,1:3].astype(int)                                    # 将年龄与薪水转换为int类型
print(X)

[['France' 44 72000]
 ['Spain' 27 48000]
 ['Germany' 30 54000]
 ['Spain' 38 61000]
 ['Germany' 40 63777]
 ['France' 35 58000]
 ['Spain' 38 52000]
 ['France' 48 79000]
 ['Germany' 50 83000]
 ['France' 37 67000]]


### fourth step:编码分类数据

In [100]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder = LabelEncoder()
labelencoder.fit(X[:,0])
X[:,0] = labelencoder.transform(X[:,0])
labelencoder.fit(Y)
Y = labelencoder.transform(Y)
print(X)

[[0 44 72000]
 [2 27 48000]
 [1 30 54000]
 [2 38 61000]
 [1 40 63777]
 [0 35 58000]
 [2 38 52000]
 [0 48 79000]
 [1 50 83000]
 [0 37 67000]]


In [101]:
# 利用OnehotEnconder将属性值转化为对应维数的值

ct = ColumnTransformer([("Country", OneHotEncoder(), [0])], remainder = 'passthrough')
X = ct.fit_transform(X)
# onehotencoder = OneHotEncoder(categorical_features = [0])     # categorical_features参数已经取消
# X = onehotencoder.fit_transform(X).toarray()
print(X)

[[1.0 0.0 0.0 44 72000]
 [0.0 0.0 1.0 27 48000]
 [0.0 1.0 0.0 30 54000]
 [0.0 0.0 1.0 38 61000]
 [0.0 1.0 0.0 40 63777]
 [1.0 0.0 0.0 35 58000]
 [0.0 0.0 1.0 38 52000]
 [1.0 0.0 0.0 48 79000]
 [0.0 1.0 0.0 50 83000]
 [1.0 0.0 0.0 37 67000]]


### Fifth step:划分数据集

In [105]:
from sklearn.model_selection import train_test_split
# test_size代表测试集的比例，random_state代表随机种子,保证每次产生的划分是相同的
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)
print(X_train,X_test,Y_train,Y_test)

[[0.0 1.0 0.0 40 63777]
 [1.0 0.0 0.0 37 67000]
 [0.0 0.0 1.0 27 48000]
 [0.0 0.0 1.0 38 52000]
 [1.0 0.0 0.0 48 79000]
 [0.0 0.0 1.0 38 61000]
 [1.0 0.0 0.0 44 72000]
 [1.0 0.0 0.0 35 58000]] [[0.0 1.0 0.0 30 54000]
 [0.0 1.0 0.0 50 83000]] [1 1 1 0 1 0 0 1] [0 0]


### Sixth step:特征缩放

In [106]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
print(X_train)
print(X_test)

[[-1.          2.64575131 -0.77459667  0.27978024  0.12374357]
 [ 1.         -0.37796447 -0.77459667 -0.23673712  0.4617671 ]
 [-1.         -0.37796447  1.29099445 -1.95846165 -1.53092514]
 [-1.         -0.37796447  1.29099445 -0.06456467 -1.11141099]
 [ 1.         -0.37796447 -0.77459667  1.65715986  1.72030956]
 [-1.         -0.37796447  1.29099445 -0.06456467 -0.16750414]
 [ 1.         -0.37796447 -0.77459667  0.96847005  0.98615979]
 [ 1.         -0.37796447 -0.77459667 -0.58108203 -0.48213975]]
[[ 0.  0.  0. -1. -1.]
 [ 0.  0.  0.  1.  1.]]
