## 引入超参数 - 2点之间的距离

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Load the handwritten-digits data and hold out 20% of it for testing.
# The fixed random_state keeps the split identical between runs.
digits = datasets.load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.2, random_state=666
)

# Best combination seen so far. `best_method` is initialised here (the
# original initialised an unused `best_distance` instead), so the summary
# prints below cannot hit a NameError when no score beats the initial 0.0.
best_method = ''
best_score = 0.0
best_k = -1

# Exhaustive search over the vote-weighting scheme and the neighbour count.
for method in ['uniform', 'distance']:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        # Strict '>' keeps the earliest combination when scores tie.
        if score > best_score:
            best_score = score
            best_k = k
            best_method = method

print('Best method: ' + best_method)
print('Best k:', best_k)
print('Best score: ', best_score)

## 距离种类 - 欧氏距离（欧几里得距离），曼哈顿距离，闵可夫斯基距离 - Minkowski distance

In [24]:
%%time

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Reload the digits data and rebuild the same 80/20 split (seed 666).
digits = datasets.load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.2,
    random_state=666,
)

# Running best: Minkowski exponent p, neighbour count k, and the score
# obtained on the test split.
best_p, best_k, best_score = 1, -1, 0.0

# Enumerate every (p, k) pair with p varying slowest, which is the same
# visiting order as two nested loops.
candidates = [(_p, k) for _p in range(1, 6) for k in range(1, 11)]

for _p, k in candidates:
    knn_clf = KNeighborsClassifier(n_neighbors=k, weights='distance', p=_p)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    # Only a strictly better score replaces the current best, so the
    # earliest pair wins ties.
    if score > best_score:
        best_p, best_k, best_score = _p, k, score

print('Best p: ', best_p)
print('Best k:', best_k)
print('Best score: ', best_score)

Best p:  2
Best k: 3
Best score:  0.9916666666666667
CPU times: total: 55.4 s
Wall time: 10.7 s


## Grid search 网格搜索

In [25]:
# Show the parameters of `knn_clf`, i.e. the last classifier built by the
# search loop (n_neighbors=10, p=5), not the best-scoring one.
knn_clf.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 10,
 'p': 5,
 'weights': 'distance'}

In [26]:
# Two small point sets for the distance experiments below:
# X holds 3 points and Y holds 2 points, each with 2 coordinates.
X = np.array([[1, 2], [3, 4], [5, 6]])
Y = np.array([[7, 8], [9, 10]])



In [27]:
X

array([[1, 2],
       [3, 4],
       [5, 6]])

In [28]:
Y

array([[ 7,  8],
       [ 9, 10]])

In [29]:
# `*` is element-wise: a (3, 2) array and the (2, 2) transpose of Y cannot be
# broadcast together, so this raises ValueError.
X*Y.T

ValueError: operands could not be broadcast together with shapes (3,2) (2,2) 

In [30]:
Y.T

array([[ 7,  9],
       [ 8, 10]])

In [31]:
# Parenthesising the transpose changes nothing: this is still an element-wise
# product of (3, 2) and (2, 2) arrays and raises the same ValueError.
X*(Y.T)

ValueError: operands could not be broadcast together with shapes (3,2) (2,2) 

In [32]:
# Matrix product of X (3, 2) with Y.T (2, 2): entry [i, j] is the dot product
# of X[i] and Y[j], giving a (3, 2) result.
np.dot(X, Y.T)
    

array([[ 23,  29],
       [ 53,  67],
       [ 83, 105]])

In [35]:
# sqrt is used by the scalar checks in the following cells.
from math import sqrt
# Keep the (3, 2) matrix of dot products between rows of X and rows of Y.
result = np.dot(X, Y.T)

In [36]:
result

array([[ 23,  29],
       [ 53,  67],
       [ 83, 105]])

In [37]:
sqrt(23)

4.795831523312719

In [39]:
# DistanceMetric computes pairwise distances between two point sets.
from sklearn.metrics import DistanceMetric
# Manhattan distance: sum of absolute coordinate differences.
dist = DistanceMetric.get_metric('manhattan')
# Entry [i, j] is the distance from X[i] to Y[j], so the result is 3 x 2.
dist.pairwise(X, Y)

array([[12., 16.],
       [ 8., 12.],
       [ 4.,  8.]])

In [40]:
# Rebind X and Y as plain nested lists holding the same points as before.
X = [[1, 2], [3, 4], [5, 6]]
Y = [[7, 8], [9, 10]]
# Euclidean distance: square root of the sum of squared differences.
dist = DistanceMetric.get_metric('euclidean')
# Same layout as before: entry [i, j] is the distance from X[i] to Y[j].
dist.pairwise(X, Y)


array([[ 8.48528137, 11.3137085 ],
       [ 5.65685425,  8.48528137],
       [ 2.82842712,  5.65685425]])

In [44]:
# Manual check of one entry: [1, 2] -> [9, 10] differs by 8 in each
# coordinate, and 8**2 + 8**2 = 128, matching the 11.3137 value above.
sqrt(128)




11.313708498984761

In [46]:
# Recompute every Euclidean distance by hand. Y drives the outer loop, so the
# values print column by column relative to the pairwise(X, Y) matrix.
for y in Y:
    for x in X:
        dx = y[0] - x[0]
        dy = y[1] - x[1]
        squared = dx**2 + dy**2
        print(sqrt(squared))


8.48528137423857
5.656854249492381
2.8284271247461903
11.313708498984761
8.48528137423857
5.656854249492381
