In [2]:
from __future__ import print_function
import numpy as np
from time import time # for comparing running time
# Seed NumPy's global RNG so the synthetic benchmark data (and therefore the
# reported 'Result difference') is the same on every run of this cell.
np.random.seed(0)
d, N = 1000, 10000 # dimension, number of training points
X = np.random.randn(N, d) # N d-dimensional points, one per row
z = np.random.randn(d) # a single d-dimensional query point
# naively compute the squared distance between two vectors
def dist_pp(z, x):
    """Return the squared Euclidean distance between two points.

    Parameters
    ----------
    z : ndarray
        Reference point; its shape is the one the difference is taken in.
    x : ndarray
        Second point. It is reshaped to ``z.shape`` first, so it must hold
        the same number of elements as ``z``.

    Returns
    -------
    scalar
        Sum of the squared element-wise differences between ``z`` and ``x``.
    """
    x_like_z = x.reshape(z.shape)  # e.g. lets a column vector meet a flat one
    diff = z - x_like_z
    return np.sum(np.square(diff))

# from one point to each point in a set, naive
def dist_ps_naive(z, X):
    """Squared distances from ``z`` to every row of ``X``, one row at a time.

    This is the deliberately slow baseline: it loops over the rows of ``X``
    in Python and calls ``dist_pp`` for each of them.

    Parameters
    ----------
    z : ndarray
        The query point.
    X : ndarray
        Points to compare against, one per row (``X.shape[0]`` rows).

    Returns
    -------
    ndarray of shape (1, X.shape[0])
        Entry ``[0, i]`` is the squared distance between ``z`` and ``X[i]``.
    """
    res = np.zeros((1, X.shape[0]))
    for idx, row in enumerate(X):
        res[0, idx] = dist_pp(z, row)
    return res

# from one point to each point in a set, fast
def dist_ps_fast(z, X):
    """Squared distances from ``z`` to every row of ``X``, without a Python loop.

    Uses the expansion ``||x - z||^2 = ||x||^2 + ||z||^2 - 2 x.z`` so that the
    whole computation is a row-wise sum plus one matrix-vector product.

    Parameters
    ----------
    z : ndarray
        The query point.
    X : ndarray
        Points to compare against, one per row (must be at least 2-D because
        the norms are summed along axis 1).

    Returns
    -------
    ndarray
        For a 2-D ``X`` and 1-D ``z``: a 1-D array with one squared distance
        per row of ``X``.
    """
    row_sq_norms = np.sum(X * X, axis=1)  # squared l2 norm of each ROW of X
    z_sq_norm = np.sum(z * z)             # squared l2 norm of z (a scalar)
    cross_term = X.dot(z)
    # z_sq_norm is added to every entry alike, so it could be dropped when
    # only the ordering of the distances matters.
    return row_sq_norms + z_sq_norm - 2 * cross_term

# Time the loop-based implementation.
tic = time()
D1 = dist_ps_naive(z, X)
elapsed_ps_naive = time() - tic
print('naive point2set, running time:', elapsed_ps_naive, 's')

# Time the vectorised implementation on the same inputs.
tic = time()
D2 = dist_ps_fast(z, X)
elapsed_ps_fast = time() - tic
print('fast point2set , running time:', elapsed_ps_fast, 's')

# Both implementations should agree up to floating-point round-off.
result_gap = np.linalg.norm(D1 - D2)
print('Result difference:', result_gap)

naive point2set, running time: 0.11594009399414062 s
fast point2set , running time: 0.1159212589263916 s
Result difference: 1.7371408325665523e-11


In [4]:
M = 100 # number of points in the second set
Z = np.random.standard_normal(size=(M, d)) # M d-dimensional points, one per row
# from each point in one set to each point in another set, half fast
def dist_ss_0(Z, X):
    """Squared distances between every row of ``Z`` and every row of ``X``.

    "Half fast": the rows of ``Z`` are still visited in a Python loop, but
    each row is handled by the vectorised ``dist_ps_fast``.

    Parameters
    ----------
    Z : ndarray
        First set of points, one per row.
    X : ndarray
        Second set of points, one per row.

    Returns
    -------
    ndarray of shape (Z.shape[0], X.shape[0])
        Row ``i`` holds the result of ``dist_ps_fast(Z[i], X)``.
    """
    res = np.zeros((Z.shape[0], X.shape[0]))
    for row_idx, query in enumerate(Z):
        res[row_idx] = dist_ps_fast(query, X)
    return res

# from each point in one set to each point in another set, fast
def dist_ss_fast(Z, X):
    """Squared distances between every row of ``Z`` and every row of ``X``.

    Fully vectorised: uses ``||z - x||^2 = ||z||^2 + ||x||^2 - 2 z.x`` with a
    single matrix product for all the cross terms.

    Parameters
    ----------
    Z : ndarray
        First set of points, one per row.
    X : ndarray
        Second set of points, one per row, with as many columns as ``Z``.

    Returns
    -------
    ndarray of shape (Z.shape[0], X.shape[0])
        Entry ``[i, j]`` is the squared distance between ``Z[i]`` and ``X[j]``.
    """
    X_sq_norms = np.sum(X * X, axis=1)  # squared l2 norm of each ROW of X
    Z_sq_norms = np.sum(Z * Z, axis=1)  # squared l2 norm of each ROW of Z
    cross_terms = Z.dot(X.T)
    # Column vector + row vector broadcasts to the full (rows of Z, rows of X) grid.
    return Z_sq_norms[:, np.newaxis] + X_sq_norms[np.newaxis, :] - 2 * cross_terms

# Time the version that loops over the rows of Z.
tic = time()
D3 = dist_ss_0(Z, X)
elapsed_ss_half = time() - tic
print('half fast set2set running time:', elapsed_ss_half, 's')

# Time the fully vectorised version on the same inputs.
tic = time()
D4 = dist_ss_fast(Z, X)
elapsed_ss_fast = time() - tic
print('fast set2set running time', elapsed_ss_fast, 's')

half fast set2set running time: 7.072688341140747 s
fast set2set running time 0.20053696632385254 s


In [15]:
from __future__ import print_function
import numpy as np
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split # for splitting data
from sklearn.metrics import accuracy_score # for evaluating results
# Seed NumPy's global RNG before splitting (train_test_split below is not
# given a random_state of its own).
np.random.seed(7)
iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target
print('Labels:', np.unique(iris_y))

# Hold out 130 samples for testing; the remaining ones form the training set.
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=130)
print('Train size:', X_train.shape[0], ', test size:', X_test.shape[0])

# Each entry: (report format, classifier settings on top of p = 2).
# The runs are executed in this order, so `model` and `y_pred` end up holding
# the last (distance-weighted) configuration.
knn_runs = [
    ("Accuracy of 1NN: %.2f %%", dict(n_neighbors = 1)),
    ("Accuracy of 7NN with major voting: %.2f %%", dict(n_neighbors = 7)),
    ("Accuracy of 7NN (1 / distance weights): %.2f %%", dict(n_neighbors = 7, weights = 'distance')),
]
for report_fmt, knn_kwargs in knn_runs:
    model = neighbors.KNeighborsClassifier(p = 2, **knn_kwargs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(report_fmt % (100 * accuracy_score(y_test, y_pred)))

def myweight(distances, sigma2=.4):
    """Gaussian-shaped neighbour weights: ``exp(-distances**2 / sigma2)``.

    Written to be passed as the ``weights`` callable of
    ``KNeighborsClassifier``, which calls it with the distances only, so the
    default ``sigma2`` applies there. To try another value, wrap it, e.g.
    ``weights=lambda dist: myweight(dist, sigma2=1.0)``.

    Parameters
    ----------
    distances : ndarray
        Distances to the neighbours; any shape, processed element-wise.
    sigma2 : float, optional
        Scale of the kernel (default 0.4). Larger values make the weights
        decay more slowly with distance.

    Returns
    -------
    ndarray
        Weights with the same shape as ``distances``; a distance of 0 maps
        to a weight of 1.
    """
    return np.exp(-distances**2/sigma2)

# 7-NN once more, this time weighting the neighbours with myweight.
model = neighbors.KNeighborsClassifier(n_neighbors = 7, p = 2, weights = myweight).fit(X_train, y_train)
y_pred = model.predict(X_test)
custom_weight_accuracy = 100 * accuracy_score(y_test, y_pred)
print("Accuracy of 7NN (customized weights): %.2f %%" % custom_weight_accuracy)

Labels: [0 1 2]
Train size: 20 , test size: 130
Accuracy of 1NN: 92.31 %
Accuracy of 7NN with major voting: 93.85 %
Accuracy of 7NN (1 / distance weights): 94.62 %
Accuracy of 7NN (customized weights): 95.38 %
