### 测试我们的算法

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the iris dataset through sklearn.datasets.
iris = datasets.load_iris()

In [3]:
# Feature matrix and label vector taken from the loaded iris dataset.
X, y = iris.data, iris.target

In [4]:
# Dimensions of the feature matrix.
X.shape

(150, 4)

In [5]:
# Dimensions of the label vector.
y.shape

(150,)

### train_test_split

In [6]:
# Show the raw labels. In the output they are grouped by class (all 0s, then 1s, then 2s),
# which is why the samples are shuffled before being split into train and test sets.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [7]:
# Shuffle only the indices, not the data itself.
# np.random.permutation(n) returns a random ordering of the integers 0 .. n-1
# (e.g. permutation(100) is a random arrangement of 0-99).
shuffle_indexes = np.random.permutation(X.shape[0])

In [8]:
# Inspect the shuffled index array.
shuffle_indexes

array([125,  77,  65,  49,  85,  56, 146,  91,  64,  58, 121, 109,  80,
        41, 103, 137, 127,  96, 113,  54,  93,  69, 126,  48,  51,  35,
        26,  99,  78,  14,  63,  31, 110,   3,   8, 123,  15, 147, 100,
        53, 133, 107, 106,  30,  60,  18, 102,  20,  59,  75,  70,  25,
       120,  95,   5, 144,  97,  28, 128,  73,  43, 141, 138,   2,  66,
        21,  23,  92,  10,  33,  52,  16, 115, 135, 122, 101,  71,  32,
        67,  61,  44,  19, 149,   9,  22,  84,  24,  47,  82,  42, 118,
         0, 108,  46, 142,   4, 117,  11, 119, 139,  12,  57,  72,  27,
        40,  68,  76, 105, 124, 148,  37, 140,  29,  36, 111,  34,  94,
       129,  81, 112,  74,  39,  86,  90,   1,  55, 132, 104,  45,  83,
       116, 134,  79,  89,   6,  50, 136,   7,  87, 145, 131, 130,  17,
        62,  38,  98,  13,  88, 143, 114])

In [9]:
# Fraction of the samples held out for testing.
test_ratio = 0.2
# int() truncates toward zero, so the test set size is rounded down.
test_size = int(test_ratio * len(X))

In [10]:
# Number of samples reserved for the test set.
test_size

30

In [11]:
# The first test_size shuffled indices form the test set; the rest form the training set.
test_indexes, train_indexes = (
    shuffle_indexes[:test_size],
    shuffle_indexes[test_size:],
)

In [12]:
# Build the training and test sets by indexing with the index arrays (fancy indexing).
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

In [13]:
# Shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [14]:
# Shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


### 使用我们的算法

In [15]:
from playML.model_selection import train_test_split

In [16]:
# Repeat the split with the train_test_split helper imported from playML.model_selection,
# relying on its default arguments (only X and y are passed).
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [17]:
# Shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [30]:
# Shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


In [31]:
from playML.kNN import KNNClassifier

In [32]:
# Instantiate the playML kNN classifier with k=3.
my_knn_clf = KNNClassifier(k=3)

In [33]:
# Fit the classifier on the training split.
my_knn_clf.fit(X_train, y_train)

playML(K = 3)

In [34]:
# Predict labels for the held-out test samples.
y_predict = my_knn_clf.predict(X_test)

In [35]:
# Inspect the predicted labels.
y_predict

array([1, 1, 1, 0, 2, 2, 0, 1, 0, 1, 0, 1, 1, 2, 0, 0, 1, 0, 2, 1, 1, 2,
       2, 0, 2, 0, 0, 1, 0, 0])

In [36]:
# Dimensions of the prediction vector.
y_predict.shape

(30,)

In [37]:
# True labels of the test samples, for comparison with y_predict.
y_test

array([1, 1, 1, 0, 2, 2, 0, 1, 0, 1, 0, 1, 1, 2, 0, 0, 1, 0, 2, 1, 1, 2,
       2, 0, 2, 0, 0, 1, 0, 0])

In [38]:
# Count the correct predictions, i.e. the number of True entries in y_predict == y_test.
# np.sum reduces the boolean array inside NumPy instead of iterating over it
# element by element the way the built-in sum does.
np.sum(y_predict == y_test)

30

In [40]:
# Accuracy: the number of correct predictions divided by the number of test samples.
# np.sum performs the count as a vectorised reduction rather than a Python-level loop.
np.sum(y_predict == y_test) / len(y_test)

1.0

### sklearn 中的 train_test_split

In [41]:
from sklearn.model_selection import train_test_split

In [43]:
# This call uses sklearn's train_test_split (imported in the cell above).
# When neither test_size nor train_size is given, sklearn defaults test_size to 0.25 (not 0.2),
# so 0.2 is passed explicitly here.
# random_state is the name sklearn's train_test_split uses for the random seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

In [45]:
# Shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [46]:
# Shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)
