## 测试我们的算法

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets 

In [2]:
# Load the iris dataset bundled with scikit-learn and list the fields it exposes.
iris_data = datasets.load_iris()
iris_data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [3]:
# Feature matrix, taken from the dataset's `data` field.
X = iris_data.data

In [4]:
# Display the dimensions of the feature matrix.
X.shape

(150, 4)

In [5]:
# Label vector, taken from the dataset's `target` field.
y = iris_data.target

In [6]:
# Display the dimensions of the label vector.
y.shape

(150,)

### train_test_split

如果我们用全部数据 X 来训练 kNN 算法，再用同一份 X 来评估它，那么当 k=1 时，每个样本的最近邻就是它自己，算法在 X 上的准确率是 100%。这样的结果无法反映算法在未见过的数据上的真实表现。

解决方案：分离出一部分数据做训练，另外一部分数据做测试。

In [7]:
# Draw a random ordering of the row indices 0 .. n_samples-1.
# No seed is set here, so the ordering changes on every run.
shuffled_indexes = np.random.permutation(X.shape[0])
shuffled_indexes

array([ 80,  75,  49,  20,  99,  22,  86,  42, 130, 118,  91,  77, 120,
        98,  87,  34, 136, 147, 138, 121,  93,  74, 133,  23, 140,   1,
         3,  66,  30,  71,  18, 131, 101,  37, 143,  11, 149,  45,  32,
        81,  27,  53, 103,  38, 125,  41,  69,  90,   9,  68,  56, 141,
        50,  92, 126, 107,  65,  17,  52,  63, 139, 110,  26, 148,  54,
        95,  43,  72,  40, 127, 119,  82, 108, 112,  55,  35,  70, 122,
       116,   5,   7,  94,  21,  58,   2,  79,  61, 124, 113, 137,  39,
       144,  12,  36, 109, 111, 146,  24, 114, 135, 145, 132,   4,  28,
        67,   0,  85,   6,  48, 129,  96,  29,  25,  47,  62, 123,  31,
        44,   8,  64,  60,  46,  89, 115,  19,  13,  88, 128,  83,  57,
        59,  78,  10,  76, 117,  14, 100, 106,  15,  73,  33,  84, 134,
        51, 104, 105,  97,  16, 102, 142])

In [8]:
# Fraction of the samples held out for testing.
test_ratio = 0.2
# Number of test samples; int() truncates any fractional part.
test_size = int(test_ratio * X.shape[0])

In [9]:
# The first `test_size` shuffled indices form the test set; the rest form the training set.
test_indexes, train_indexes = (
    shuffled_indexes[:test_size],
    shuffled_indexes[test_size:],
)

In [10]:
# Select features and labels with the same index arrays so each row stays paired with its label.
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

In [11]:
# Report the shapes of the training features and training labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [12]:
# Report the shapes of the test features and test labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


#### 封装

In [13]:
# Use the project's own train_test_split helper in place of the manual split steps.
from playML.model_selection import train_test_split

# The results are unpacked in the order (X_train, X_test, y_train, y_test).
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [14]:
# Report the shapes of the training features and training labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [15]:
# Report the shapes of the test features and test labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


#### seed 的作用

In [16]:
# Peek at the first five training rows of the current split.
X_train[:5]

array([[ 4.6,  3.1,  1.5,  0.2],
       [ 5.7,  2.8,  4.1,  1.3],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 5.1,  2.5,  3. ,  1.1],
       [ 5.2,  3.5,  1.5,  0.2]])

In [17]:
# Unpack in the order the helper returns its results: (X_train, X_test, y_train, y_test).
# Swapping the two middle names would leave y_train holding test features and X_test
# holding training labels.
# No seed is passed, so this split is expected to differ from the previous one.
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train[:5]

array([[ 6.3,  2.9,  5.6,  1.8],
       [ 6. ,  2.7,  5.1,  1.6],
       [ 5. ,  3.5,  1.3,  0.3],
       [ 5.6,  2.8,  4.9,  2. ],
       [ 6.7,  3.3,  5.7,  2.5]])

In [18]:
# Unpack in the order the helper returns its results: (X_train, X_test, y_train, y_test).
# Swapping the two middle names would leave y_train holding test features and X_test
# holding training labels.
# A fixed seed is passed so that the split can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
X_train[:5]

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 5.7,  2.8,  4.1,  1.3],
       [ 6.2,  3.4,  5.4,  2.3],
       [ 5.1,  2.5,  3. ,  1.1]])

In [19]:
# Unpack in the order the helper returns its results: (X_train, X_test, y_train, y_test).
# Swapping the two middle names would leave y_train holding test features and X_test
# holding training labels.
# Calling again with the same seed should reproduce exactly the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
X_train[:5]

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 5.7,  2.8,  4.1,  1.3],
       [ 6.2,  3.4,  5.4,  2.3],
       [ 5.1,  2.5,  3. ,  1.1]])

#### sklearn中的train_test_split

In [20]:
# NOTE: this import rebinds the name `train_test_split`, replacing the playML helper
# imported earlier in the notebook.
from sklearn.model_selection import train_test_split

# Display the function object to confirm which implementation the name now refers to.
train_test_split

<function sklearn.model_selection._split.train_test_split>

In [46]:
# Split with scikit-learn's implementation: test_size=0.5 holds out half of the samples,
# and random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=666)
X_train[:5]

array([[ 5.7,  3.8,  1.7,  0.3],
       [ 6.6,  2.9,  4.6,  1.3],
       [ 6.7,  3.3,  5.7,  2.5],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 5. ,  3.6,  1.4,  0.2]])

#### 测试我们的算法

In [47]:
# Report the shapes of the training features and training labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(75, 4)
(75,)


In [48]:
from playML.kNN import KNNClassifier

# Build the project's kNN classifier with k=3, fit it on the training split,
# then predict labels for the held-out test samples.
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train, y_train)
y_predict = my_knn_clf.predict(X_test)

In [49]:
# Predicted labels for the test samples.
y_predict

array([1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0, 2,
       0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 2, 1, 1, 2, 2, 0, 2, 2, 2, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 2, 1, 1, 1, 0, 0, 2, 2, 1, 2, 0, 2, 1, 0, 1, 0, 0, 2,
       2, 2, 1, 1, 2, 2])

In [50]:
# True labels for the test samples, shown for comparison with the predictions.
y_test

array([1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0, 2,
       0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 2, 1, 1, 2, 2, 0, 1, 2, 2, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 2, 1, 1, 1, 0, 0, 2, 2, 1, 2, 0, 1, 1, 0, 1, 0, 0, 2,
       2, 2, 1, 1, 2, 2])

In [53]:
# Count the correct predictions with a vectorized element-wise comparison instead of a
# Python-level index loop. int() keeps the displayed result a plain Python integer.
int(np.sum(y_predict == y_test))

73

In [54]:
# Accuracy is the fraction of test samples predicted correctly. The mean of the boolean
# match array is exactly that fraction. float() keeps the displayed result a plain Python float.
float(np.mean(y_predict == y_test))

0.9733333333333334