$\def\*#1{\mathbf{#1}}$
$\DeclareMathOperator*{\argmax}{arg\,max}$

# Distance Methods

In [None]:
import numpy as np

import matplotlib.pyplot as plt
%matplotlib notebook

from sklearn.model_selection import train_test_split
import sklearn.datasets as datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

## Data Matrix

* Each row is a **record** (instance, object, point,...).
* Each column is a **feature** (attribute, dimension,...).

$$
D = 
\left(
\begin{array}{c|cccc}
        & X_1 & X_2 & \cdots & X_d\\
        \hline
  \*x_1 & x_{1,1} & x_{1,2} & \cdots & x_{1,d} \\
  \*x_2 & x_{2,1} & x_{2,2} & \cdots & x_{2,d} \\
  \vdots & \vdots  & \vdots  & \ddots & \vdots  \\
  \*x_n & x_{n,1} & x_{n,2} & \cdots & x_{n,d} 
\end{array}
\right)
$$

* Each row is a point in a $d$-dimensional geometric space.


## Measuring distance

The well-known Euclidean distance is defined as follows:

$$d(p, q) = \sqrt{\sum_{i=1}^d(p_i - q_i)^2}$$

In [None]:
fig, ax = plt.subplots()
plt.prism() # set the default colormap to prism

fig.suptitle('Euclidean distances from (0, 0)')

# Sample the square [-10, 10] x [-10, 10] on a 100 x 100 grid.
x = np.linspace(-10, 10, 100)
y = np.linspace(-10, 10, 100)

xv, yv = np.meshgrid(x, y)

# Euclidean (L2) norm of every grid point, i.e. its distance from the origin.
l2_norm = np.sqrt(xv**2 + yv**2)

# Use the same scale on both axes so the iso-distance curves are drawn as
# true circles instead of being stretched by the figure's aspect ratio.
ax.set_aspect('equal')

# One labelled iso-distance curve per integer level 0..10.
contour_set = ax.contour(xv, yv, l2_norm, levels=np.arange(11))
ax.clabel(contour_set)

## Distance Metrics

A *metric* satisfies the following properties:

* *Positivity* : $d(x, y) \geqslant 0$, for all $x$ and $y$.
* *Identity* : $d(x, y) = 0$, if and only if $x = y$.
* *Symmetry* : $d(x, y) = d(y, x)$, for all $x$ and $y$.
* *Triangle inequality* : $d(x, y) \leqslant d(x, z) + d(z, y)$ for all $x$, $y$, and $z$.

## The $L_k$ Distance Metric

$$d_k(p, q) = \Big(\sum_{i=1}^d\big|p_i - q_i\big|^k\Big)^{\frac{1}{k}}$$

* $k = 1$ : Manhattan distance
* $k = 2$ : Euclidean distance
* $k = \infty$ : Maximum component

In [None]:
fig, ax = plt.subplots()

fig.suptitle('Manhattan distances from (0, 0)')

# Sample the square [-10, 10] x [-10, 10] on a 100 x 100 grid.
x = np.linspace(-10, 10, 100)
y = np.linspace(-10, 10, 100)

xv, yv = np.meshgrid(x, y)

# Manhattan (L1) norm of every grid point: sum of absolute coordinates.
l1_norm = np.abs(xv) + np.abs(yv)

# Use the same scale on both axes so the iso-distance curves keep their true
# diamond shape instead of being stretched by the figure's aspect ratio.
ax.set_aspect('equal')

# One labelled iso-distance curve per integer level 0..10.
contour_set = ax.contour(xv, yv, l1_norm, levels=np.arange(11))
ax.clabel(contour_set)

In [None]:
fig, ax = plt.subplots()

fig.suptitle('Maximum component from (0, 0)')

# Sample the square [-10, 10] x [-10, 10] on a 100 x 100 grid.
x = np.linspace(-10, 10, 100)
y = np.linspace(-10, 10, 100)

xv, yv = np.meshgrid(x, y)

# L-infinity norm of every grid point: the larger absolute coordinate.
linfty_norm = np.maximum(np.abs(xv), np.abs(yv))

# Use the same scale on both axes so the iso-distance curves are drawn as
# true squares instead of being stretched by the figure's aspect ratio.
ax.set_aspect('equal')

# One labelled iso-distance curve per integer level 0..10.
contour_set = ax.contour(xv, yv, linfty_norm, levels=np.arange(11))
ax.clabel(contour_set)

## Nearest Neighbor Classification

#### Generate data points forming two different circles

In [None]:
# Draw 100 noisy samples from two concentric circles; `y` holds the label
# of the circle each row of `X` was drawn from.
X, y = datasets.make_circles(
    n_samples=100,
    noise=0.1,
    factor=0.5,
)
print(f"X.shape: {X.shape}")
print(f"unique labels:  {np.unique(y)}")

#### Plot the data

In [None]:
# Scatter the samples (first column vs. second column), coloured by label.
fig, ax = plt.subplots()
ax.scatter(*X.T, c=y)

#### Consider a new point

In [None]:
# Query point whose neighbours are searched for in the data set.
p = np.array([0.0, 0.5])

In [None]:
fig, ax = plt.subplots()
# Data set coloured by label ...
ax.scatter(*X.T, c=y)
# ... with the query point drawn on top as a blue triangle.
ax.scatter(*p, color='blue', marker='^')

#### Find the nearest neighbor

In [None]:
# Euclidean distance from every row of X to the query point p
# (p is broadcast against the rows before squaring and summing per row).
l2 = np.sqrt(np.square(X - p).sum(axis=1))

In [None]:
fig, ax = plt.subplots()

# Boolean masks selecting the samples of each class.
positive = y == 1
negative = y == 0

# Both classes are coloured by their distance to p. The colour scale is pinned
# to the distance range of the whole data set: without explicit limits each
# scatter call would scale its colours to its own subset, so the same colour
# would stand for different distances in the two classes.
ax.scatter(X[positive, 0], X[positive, 1], c=l2[positive], cmap='hot',
           vmin=l2.min(), vmax=l2.max(), marker='+')
ax.scatter(X[negative, 0], X[negative, 1], c=l2[negative], cmap='hot',
           vmin=l2.min(), vmax=l2.max())

# Query point.
ax.scatter(p[0], p[1], marker='^', color='blue')

In [None]:
# Row index of the sample with the smallest distance to p.
nearest_neighbor = l2.argmin()
nearest_neighbor

In [None]:
# Indices of every sample except the nearest neighbour.
others = np.setdiff1d(np.arange(len(X)), nearest_neighbor)

In [None]:
fig, ax = plt.subplots()

# All remaining samples in black, the query point as a blue triangle.
ax.scatter(X[others, 0], X[others, 1], c='black')
ax.scatter(p[0], p[1], marker='^', color='blue')

# The nearest neighbour is coloured by its class label. The colour scale is
# pinned to the label range of the whole data set: a single value scaled on
# its own has vmin == vmax, so its colour would not depend on its class.
ax.scatter(
    X[nearest_neighbor, 0], X[nearest_neighbor, 1],
    c=y[nearest_neighbor], marker='+',
    vmin=y.min(), vmax=y.max(),
)

## $k$-Nearest Neighbors

#### Find the 30 nearest neighbors

In [None]:
# Indices of the 30 samples closest to p, ordered by increasing distance.
k_nearest_neighbors = l2.argsort()[:30]
k_nearest_neighbors

In [None]:
# Indices of every sample that is not among the k nearest neighbours.
others = np.setdiff1d(np.arange(len(X)), k_nearest_neighbors)

In [None]:
fig, ax = plt.subplots()

# All remaining samples in black, the query point as a blue triangle.
ax.scatter(X[others, 0], X[others, 1], c='black')
ax.scatter(p[0], p[1], marker='^', color='blue')

# Neighbours are coloured by class label. The colour scale is pinned to the
# label range of the whole data set so a class keeps the same colour even
# when all selected neighbours happen to share one label.
ax.scatter(
    X[k_nearest_neighbors, 0], X[k_nearest_neighbors, 1],
    c=y[k_nearest_neighbors], marker='+',
    vmin=y.min(), vmax=y.max(),
)

#### Split the data set : training and test sets

In [None]:
# Hold out a test_size=0.33 share of the samples as the test set; the rest
# forms the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
)

#### Classify the points (in the two classes) using a k-nn classifier:

In [None]:
# k-NN classifier configured with n_neighbors=5: this instance always uses
# 5 neighbors.
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit on the training split only.
knn.fit(X_train, y_train)

# Predicted labels for both splits.
y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

# Report the classifier's score on each split.
print("KNN validation accuracy on training set: ", knn.score(X_train, y_train), '\n')
print("KNN validation accuracy on test set: ", knn.score(X_test, y_test), '\n')

# Confusion matrix of true vs. predicted labels on the test split.
C = confusion_matrix(y_test, y_pred_test)
print("Confusion matrix: \n\n", C)

#### Nearest-neighbors classifiers lead to decision boundaries

In [None]:
# Evaluate the fitted classifier on a regular grid over [-1, 1] x [-1, 1]
# (np.linspace's default number of samples per axis).
# Grid-specific names are used on purpose: binding the grid to `X` / `y`
# would overwrite the data set and its labels, so re-running any earlier
# cell that reads them would fail or silently operate on the grid.
grid_axis = np.linspace(-1, 1)
xv, yv = np.meshgrid(grid_axis, grid_axis)

# One row per grid point: (x-coordinate, y-coordinate).
grid_points = np.stack([xv.ravel(), yv.ravel()], axis=1)
classes = knn.predict(grid_points)

# Colour every grid point by its predicted class to reveal the decision regions.
fig, ax = plt.subplots()
ax.pcolormesh(xv, yv, classes.reshape(xv.shape))