# Tính khoảng cách

## Setup

In [1]:
from time import perf_counter, time

import numpy as np

## Khoảng cách từ một điểm tới từng điểm trong một tập hợp

In [7]:
# Seed NumPy's global RNG so the synthetic data below is identical on every
# fresh run of the notebook.
np.random.seed(0)

d = 1000   # dimensionality of every point
N = 10000  # number of points in the reference set
X = np.random.randn(N, d)  # reference set, one point per row
z = np.random.randn(d)     # single query point

def dist_pp(z, x):
    """Return the squared Euclidean distance between ``z`` and ``x``.

    The element-wise difference is squared and summed over all entries;
    no square root is taken.
    """
    delta = z - x
    return np.sum(np.square(delta))

def dist_ps_naive(z, X):
    """Squared distance from ``z`` to each row of ``X``, one row at a time.

    Baseline implementation: calls ``dist_pp`` once per row of ``X`` and
    collects the results in a float array with ``X.shape[0]`` entries.
    """
    n_rows = X.shape[0]
    per_row = (dist_pp(z, X[k]) for k in range(n_rows))
    return np.fromiter(per_row, dtype=np.float64, count=n_rows)

def dist_ps_fast(z, X):
    """Squared Euclidean distance from ``z`` to every row of ``X``, vectorised.

    Uses the expansion ||x - z||^2 = ||x||^2 + ||z||^2 - 2 x.z, so the whole
    computation is one matrix-vector product instead of a Python loop.

    Parameters
    ----------
    z : ndarray
        The query point (used in the notebook with shape ``(d,)``).
    X : ndarray
        The reference points, one per row (used with shape ``(N, d)``).

    Returns
    -------
    ndarray
        One squared distance per row of ``X``; every entry is >= 0.
    """
    X2 = np.sum(X*X, axis=1)  # squared norm of each row of X
    z2 = np.sum(z*z)          # squared norm of the query point
    D = X2 + z2 - 2*X.dot(z)
    # The expanded form subtracts nearly equal numbers when z is (almost) a
    # row of X, so round-off can leave a tiny negative value. A squared
    # distance cannot be negative, so clip at zero.
    return np.maximum(D, 0)

# Time both implementations with perf_counter(): it is a monotonic,
# high-resolution clock meant for measuring short intervals, whereas the
# wall-clock time() can be coarse and may jump if the system clock changes.
t1 = perf_counter()
D1 = dist_ps_naive(z, X)
print('Naive point2set, running time:', perf_counter() - t1, 's')

t2 = perf_counter()
D2 = dist_ps_fast(z, X)
print('Fast point2set, running time:', perf_counter() - t2, 's')
# Norm of the gap between the two results; only round-off should remain.
print('Result difference:', np.linalg.norm(D1 - D2))

Naive point2set, running time: 0.12997221946716309 s
Fast point2set, running time: 0.05001544952392578 s
Result difference: 2.2936583651096028e-11


## Khoảng cách giữa từng cặp điểm trong hai tập hợp

In [11]:
# Second point set: 100 random points with the same dimensionality d as X.
n_queries = 100
Z = np.random.randn(n_queries, d)

def dist_ss_half_fast(Z, X):
    """Pairwise squared distances between rows of ``Z`` and rows of ``X``.

    "Half fast": Python still loops over the rows of ``Z``, but each row is
    handled by the vectorised ``dist_ps_fast``. Entry ``[i, j]`` of the
    result is the value ``dist_ps_fast`` gives for ``Z[i]`` and ``X[j]``.
    """
    n_z, n_x = Z.shape[0], X.shape[0]
    # Every row is overwritten below, so the buffer needs no initial values.
    table = np.empty((n_z, n_x))
    for row_idx, point in enumerate(Z):
        table[row_idx] = dist_ps_fast(point, X)
    return table

def dist_ss_fast(Z, X):
    """Pairwise squared Euclidean distances between two point sets, vectorised.

    Uses ||z - x||^2 = ||z||^2 + ||x||^2 - 2 z.x for all pairs at once, so
    the heavy lifting is a single matrix product ``Z @ X.T``.

    Parameters
    ----------
    Z : ndarray of shape (M, d)
        First point set, one point per row.
    X : ndarray of shape (N, d)
        Second point set, one point per row.

    Returns
    -------
    ndarray of shape (M, N)
        Entry ``[i, j]`` is the squared distance between ``Z[i]`` and
        ``X[j]``; every entry is >= 0.
    """
    X2 = np.sum(X*X, axis=1)  # squared norms of the rows of X, shape (N,)
    Z2 = np.sum(Z*Z, axis=1)  # squared norms of the rows of Z, shape (M,)
    # Column vector + row vector broadcasts to the full (M, N) grid.
    D = Z2.reshape(-1, 1) + X2.reshape(1, -1) - 2*Z.dot(X.T)
    # Cancellation in the expanded form can leave tiny negative values for
    # (near-)identical points; clip them since squared distances are >= 0.
    return np.maximum(D, 0)

# Time both set-to-set implementations with perf_counter(), a monotonic
# high-resolution clock intended for interval measurement (the wall-clock
# time() can be coarse and is affected by system clock adjustments).
t3 = perf_counter()
D3 = dist_ss_half_fast(Z, X)
print('Half fast set2set running time:', perf_counter() - t3, 's')

t4 = perf_counter()
D4 = dist_ss_fast(Z, X)
print('Fast set2set running time:', perf_counter() - t4, 's')
# Norm of the gap between the two results; only round-off should remain.
print('Result difference:', np.linalg.norm(D3 - D4))

Half fast set2set running time: 6.878211259841919 s
Fast set2set running time: 0.17335176467895508 s
Result difference: 9.597350996352788e-11


# Iris

In [23]:
import numpy as np
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Seed NumPy's global RNG before the random train/test split below.
np.random.seed(7)

iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target

# An integer test_size is an absolute count: 130 samples are held out for
# testing and the remainder is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    iris_X,
    iris_y,
    test_size=130,
)

# Fit and score the same kNN pipeline under three settings, all using the
# l2 norm (p=2):
#   - 1NN
#   - 7NN with the default vote
#   - 7NN with weights='distance'
knn_settings = [
    ('Accuracy of 1NN:', dict(n_neighbors=1)),
    ('Accuracy of 7NN:', dict(n_neighbors=7)),
    ('Accuracy of 7NN (1/distance weights):', dict(n_neighbors=7, weights='distance')),
]
for label, knn_kwargs in knn_settings:
    model = neighbors.KNeighborsClassifier(p=2, **knn_kwargs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(label, 100*accuracy_score(y_test, y_pred), '%')

def myweights(distances, sigma2=0.4):
    """Gaussian-kernel neighbour weights: ``exp(-distances**2 / sigma2)``.

    A zero distance gets weight 1 and the weight decays smoothly as the
    distance grows. It can be passed as a ``weights=`` callable, which is
    invoked with the distances only, so ``sigma2`` then keeps its default.

    Parameters
    ----------
    distances : ndarray
        Distances to the neighbours; the result has the same shape.
    sigma2 : float, optional
        Kernel width (a variance-like scale). Larger values flatten the
        weighting; defaults to 0.4.

    Returns
    -------
    ndarray
        Element-wise weights, same shape as ``distances``.
    """
    return np.exp(-distances**2/sigma2)

# 7NN again, this time weighting each neighbour's vote with the custom
# myweights function instead of a built-in scheme.
model = neighbors.KNeighborsClassifier(weights=myweights, n_neighbors=7, p=2)
y_pred = model.fit(X_train, y_train).predict(X_test)
print('Accuracy of 7NN (customized weights):', 100*accuracy_score(y_test, y_pred), '%')

Accuracy of 1NN: 92.3076923076923 %
Accuracy of 7NN: 93.84615384615384 %
Accuracy of 7NN (1/distance weights): 94.61538461538461 %
Accuracy of 7NN (customized weights): 95.38461538461539 %
