# 1. 加载数据集

In [109]:
import numpy as np
from sklearn import datasets

In [110]:
# Load the iris dataset. With as_frame=True the returned object exposes its
# data as pandas objects (used below via iris.frame, iris.data, iris.target).
iris = datasets.load_iris(as_frame=True)

In [111]:
# Show the dataset as a pandas DataFrame (feature columns plus the target column).
iris.frame

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [112]:
# List the fields available on the loaded dataset object.
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [113]:
# Shape of the feature table.
iris.data.shape

(150, 4)

In [114]:
# Names of the feature columns.
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [115]:
# X holds the feature values as a NumPy array; y (defined in the next cell)
# holds the target labels.
X = iris.data.to_numpy()
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [116]:
# Target labels. Also show the type of y: it stays a pandas object here,
# unlike X, which was converted to a NumPy array.
y = iris.target
y[:5],type(y)

(0    0
 1    0
 2    0
 3    0
 4    0
 Name: target, dtype: int64,
 pandas.core.series.Series)

In [117]:
# Shape of the target labels.
iris.target.shape

(150,)

In [118]:
# Names of the target classes.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

# 2. 拆分数据集

In [119]:
# Shuffle the sample positions 0..len(y)-1 so the manual split below is random.
# A seeded Generator makes the permutation (and everything derived from it)
# identical on every Restart & Run All; 666 matches the random_state passed to
# train_test_split further down in this notebook.
shuffle_index = np.random.default_rng(666).permutation(len(y))
shuffle_index

array([ 54,  60, 137,  78,  18,  44,   8,  76,  66, 131, 134,  28, 125,
       139,  19,  59, 133,  63,  73,  29,  22, 110,  97, 119, 120, 108,
       135,  23,  15,  37,  90,  69,   9, 100,  85,  74,  20,  35,  86,
        70, 109,  64,  81,  25,  79,  42, 130, 103,   1,  87, 148,  48,
         5, 143,   7,  10,  27,  12, 112,  34, 140,  55,  83,  94, 117,
        40,  75,   4,  80,  41,  38,  14, 124, 142, 118,  65,  98, 116,
       136,  77, 105, 113, 123,  45,  52,  50, 126,  56,  46,  92,  33,
        91,  68,  31, 107, 102, 127,  96,  99,  39, 121,  13,  51, 115,
       144,  67,  17,  32,  89,  62, 106,  36,  53,  43,  11, 132,  88,
        84,  72,  57,   0, 145,  26,  16, 104,  58,   6, 101, 149,  71,
        93,  95,  61, 129, 138,   3,  24,   2, 146, 147, 114, 141, 128,
        21,  49,  82, 122,  30, 111,  47])

In [120]:
# Fraction of the samples used for training.
train_ratio = 0.8

In [121]:
# Number of training samples: the training fraction of the dataset, truncated to an int.
train_size = int(len(X)*train_ratio)
train_size

120

In [122]:
# The first train_size shuffled positions form the training set;
# the remaining positions form the test set.
train_index = shuffle_index[:train_size]
test_index = shuffle_index[train_size:]

In [123]:
# Build the training and test splits from the shuffled positions.
# X is a NumPy array, so integer-array indexing selects rows by position.
X_train = X[train_index]
# y is a pandas Series: plain y[...] with integers looks rows up by index
# *label*, which only coincides with position while the index is the default
# 0..n-1. .iloc selects by position explicitly, so labels always line up with X.
y_train = y.iloc[train_index]

X_test = X[test_index]
y_test = y.iloc[test_index]
y_test[:5]

0      0
145    2
26     0
16     0
104    2
Name: target, dtype: int64

In [124]:
# Sizes of the manually built training split (features, then labels).
print(X_train.shape)
print(y_train.shape)

(120, 4)
(120,)


In [125]:
# Sizes of the manually built test split (features, then labels).
print(X_test.shape)
print(y_test.shape)

(30, 4)
(30,)


In [126]:
from sklearn.model_selection import train_test_split

# Same split done by scikit-learn; this replaces the manually built
# X_train/X_test/y_train/y_test. Reuse train_ratio (defined earlier) instead of
# repeating the literal 0.8 so both approaches keep the same proportion.
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_ratio, random_state=666
)

In [127]:
# Sizes of the training split returned by train_test_split (features, then labels).
print(X_train.shape)
print(y_train.shape)

(120, 4)
(120,)


In [128]:
# Sizes of the test split returned by train_test_split (features, then labels).
print(X_test.shape)
print(y_test.shape)

(30, 4)
(30,)


# 3. 预测

In [129]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier configured with 5 neighbours.
knn_classifier = KNeighborsClassifier(n_neighbors=5)

In [130]:
# Fit the classifier on the training features and labels.
knn_classifier.fit(X_train, y_train)

In [131]:
# Predict a class label for every sample in the test split.
y_predict = knn_classifier.predict(X_test)
y_predict

array([1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0,
       2, 0, 1, 1, 0, 1, 2, 2])

# 4. 评价

In [139]:
# Show the first five predictions next to the first five true labels.
y_predict[:5],y_test[:5]

(array([1, 2, 1, 2, 0]),
 66     1
 114    2
 93     1
 101    2
 3      0
 Name: target, dtype: int64)

In [140]:
# First five true labels as a plain NumPy array (without the pandas index).
y_test.values[:5]

array([1, 2, 1, 2, 0])

In [141]:
# y_predict is a numpy.ndarray while y_test is a pandas Series; y_test.values
# exposes the Series' data as a numpy.ndarray. The `==` used in the following
# cells is an element-wise comparison of two equal-length sequences of labels
# (no broadcasting is involved, since both sides have the same length).
type(y_predict),type(y_test),type(y_test.values)

(numpy.ndarray, pandas.core.series.Series, numpy.ndarray)

In [133]:
# Number of test samples whose predicted label equals the true label.
np.sum(y_predict == y_test)

np.int64(30)

In [134]:
# Accuracy = number of correctly predicted samples / number of test samples.
# (Variable renamed from the misspelled `accutacy`.)
accuracy = np.sum(y_predict == y_test) / len(y_test)
accuracy

np.float64(1.0)

In [None]:
# accuracy_score measures how closely one set of labels matches another
# (here: the true labels vs. the predicted labels).
from sklearn.metrics import accuracy_score

In [136]:
# Same accuracy as the manual computation above, via scikit-learn.
accuracy_score(y_test,y_predict)

1.0