In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits

In [2]:
digits=load_digits()

In [3]:
# Features and target labels of the digits dataset, unpacked in one step.
X, y = digits.data, digits.target

In [4]:
X.shape

(1797, 64)

In [5]:
y.shape

(1797,)

## 测试集合和训练集合

In [6]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 40% of the samples as the test split; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=666,
)

In [13]:
X_train.shape

(1078, 64)

In [14]:
X_test.shape

(719, 64)

## KNN识别手写

In [10]:
from sklearn.neighbors import KNeighborsClassifier

In [15]:
# Exhaustive search over k (number of neighbours) and p (Minkowski exponent),
# keeping the pair with the highest accuracy on the held-out test split.
# NOTE: ranking candidates by their test-split score tunes the model to the
# test data, so the reported best score is an optimistic estimate.
best_score = 0.0
best_p = 0
best_k = 0

for k, p in ((k, p) for k in range(2, 10) for p in range(1, 6)):
    knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
    score = knn_clf.fit(X_train, y_train).score(X_test, y_test)

    # Strict comparison: on a tie the earliest (k, p) pair is kept.
    if score > best_score:
        best_score, best_p, best_k = score, p, k

print("best score:{}, best p:{}, best k:{}".format(best_score, best_p, best_k))

best score:0.9860917941585535, best p:4, best k:3


## 使用交叉验证

In [16]:
from sklearn.model_selection import cross_val_score

In [17]:
# Baseline: a default-parameter KNN evaluated by cross-validation on the
# training split only; the test split is not used here.
knn_clf2 = KNeighborsClassifier()

# cv is pinned to 3 so the cell returns three fold scores regardless of the
# installed scikit-learn version (the default changed from 3 to 5 folds in 0.22).
cross_val_score(knn_clf2, X_train, y_train, cv=3)

array([ 0.98895028,  0.97777778,  0.96629213])

In [18]:
# Search over k (number of neighbours) and p (Minkowski exponent), ranking
# each candidate by its mean 3-fold cross-validation accuracy on the training
# split. The test split is never consulted during the search.
best_score, best_p, best_k = 0.0, 0, 0

for k in range(2, 10):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        # No explicit fit() here: cross_val_score fits its own clone of the
        # estimator on every fold, so fitting beforehand is wasted work.
        # cv=3 is explicit because the default fold count is version-dependent.
        scores = cross_val_score(knn_clf, X_train, y_train, cv=3)
        score = np.mean(scores)

        # Strict comparison: on a tie the earliest (k, p) pair is kept.
        if score > best_score:
            best_score, best_p, best_k = score, p, k

print("best score:{}, best p:{}, best k:{}".format(best_score, best_p, best_k))

best score:0.9823599874006478, best p:2, best k:2


## 可以看出，交叉验证和train_test_split的结果不一样。并且交叉验证的score会低一些。
## 这里，我们只是想获得对应的参数k和p，并以此构建最佳模型

In [19]:
# Build the final model from the hyperparameters selected by the
# cross-validated search (best_k, best_p) instead of retyping the printed
# values by hand, so this cell stays in sync if the search result changes.
best_knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=best_k, p=best_p)

In [20]:
# Train on the whole training split, then report accuracy on the held-out
# test split (fit returns the estimator itself, so the calls can be chained).
best_knn_clf.fit(X_train, y_train).score(X_test, y_test)

0.98052851182197498

## 可以看出：使用3折交叉验证选出参数k=2、p=2的模型，在测试数据上的准确率（accuracy）是0.98

## 回顾网格搜索

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Search space for the grid search: distance-weighted voting, k from 2 to 9
# and Minkowski exponent p from 1 to 5 (8 * 5 = 40 combinations).
param_grid = [
    dict(
        weights=['distance'],
        n_neighbors=list(range(2, 10)),
        p=list(range(1, 6)),
    )
]
# Default-parameter estimator whose hyperparameters the grid search varies.
knn_clf = KNeighborsClassifier()
# cv=3 is spelled out so every candidate is evaluated with 3 folds regardless
# of the installed scikit-learn version (the default changed from 3 to 5 folds
# in 0.22); verbose=1 prints the fit count and elapsed time.
grid = GridSearchCV(knn_clf, param_grid, cv=3, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.7min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'p': [1, 2, 3, 4, 5], 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9], 'weights': ['distance']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

## 上述输出的意思是：共有40组参数组合，每组参数都进行3折交叉验证。因此，一共是120次fit

In [23]:
grid.best_score_

0.98237476808905377

In [24]:
grid.best_params_

{'n_neighbors': 2, 'p': 2, 'weights': 'distance'}

In [26]:
# The grid search ran with refit=True, so best_estimator_ has already been
# refit on the whole training split with the winning parameters; no extra
# fit call is needed before scoring on the test split.
best_knn_clf = grid.best_estimator_

best_knn_clf.score(X_test, y_test)

0.98052851182197498

In [27]:
# 5-fold cross-validation on the training split (cv=5 yields five fold scores).
# NOTE(review): knn_clf is the default-parameter estimator that was handed to
# GridSearchCV (its repr still shows n_neighbors=5, weights='uniform'), not the
# tuned best_knn_clf — confirm this is the model meant to be evaluated here.
cross_val_score(knn_clf, X_train, y_train, cv=5)

array([ 0.99543379,  0.97716895,  0.97685185,  0.98130841,  0.97142857])