### Similarity and Nearest Neighbor (NN)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
iris = sns.load_dataset('iris')
iris.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


### Similarity

***The distance between two vectors is a measure of how similar are the corresponding data points.***
- this is the basis for the nearest neighbors based algorithms

 - let's pick two datapoints from the iris dataset

In [3]:
iris.iloc[[15, 106]]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
15,5.7,4.4,1.5,0.4,setosa
106,4.9,2.5,4.5,1.7,virginica


- these two datapoints are represented by two 4-dimensional vectors

In [4]:
U = iris.iloc[15, :-1].to_numpy(dtype = 'float')
V = iris.iloc[106, :-1].to_numpy(dtype = 'float')
np.array([U, V])

array([[5.7, 4.4, 1.5, 0.4],
       [4.9, 2.5, 4.5, 1.7]])

#### example 1. 
find the most similar datapoint to $\vec{U}=$ iris.iloc[15]

In [5]:
# build the vector matrix of the iris dataset
I = iris.iloc[:, :-1].to_numpy(dtype = 'float')

In [6]:
# find distance of all vetors to U
distances_to_U = np.sqrt(np.sum((U -I)**2, axis = 1))

In [7]:
# find closest datapoint
closest_to_U = np.argmin(distances_to_U)
print('the closest datapoint is %d at a distance of %6.4f' %(closest_to_U, distances_to_U[closest_to_U]))
print('vector U    : %s' %U)
print('vector I[%d]: %s' %(closest_to_U, iris.iloc[closest_to_U].to_numpy()))
print('obviously, the closest datapoint to U is itself')

the closest datapoint is 15 at a distance of 0.0000
vector U    : [5.7 4.4 1.5 0.4]
vector I[15]: [5.7 4.4 1.5 0.4 'setosa']
obviously, the closest datapoint to U is itself


#### example 2. 
find the most similar datapoint to $\vec{U}=$ iris.iloc[15], excluding itself

In [8]:
# build the vector matrix of the iris dataset excluding datapoint 15
J = iris.drop(15, axis = 0).iloc[:, :-1].to_numpy(dtype = 'float')

In [9]:
# find distance of all vetors to U
d2U = np.sqrt(np.sum((U -J)**2, axis = 1))

In [10]:
# find the nearest neighbor
nnU = np.argmin(distances_to_U)
NNU = iris.drop(15, axis = 0).iloc[nnU].to_numpy()
print('nearest neighbor of U is datapoint %d at a distance of %6.4f' %(nnU, d2U[nnU]))
print('vector U    : %s' %U)
print('vector J[%d]: %s' %(nnU, NNU[ :-1]))
print('nearest neighbor of U belongs to class ... %s' %NNU[-1])

nearest neighbor of U is datapoint 15 at a distance of 0.6164
vector U    : [5.7 4.4 1.5 0.4]
vector J[15]: [5.4 3.9 1.3 0.4]
nearest neighbor of U belongs to class ... setosa


#### example 3. 
find the most similar datapoint to a new random observation

In [11]:
# get new random vetor
def new_vector(series):
    if series.dtype == 'float64':
        return series.min() +np.random.rand() *(series.max() -series.min())

In [12]:
# define new random observation
iris.apply(new_vector, axis = 0)

sepal_length    6.346408
sepal_width     4.310430
petal_length    4.061999
petal_width     1.008278
species              NaN
dtype: float64

In [13]:
# get new observation
X = iris.apply(new_vector, axis = 0)[:-1].to_numpy()
# find the nearest neighbor to unknown observation
nnX = np.argmin(np.sqrt(np.sum((X -I)**2, axis = 1)))
NNX = iris.iloc[nnX]
print('vector X    : %s' %np.round(X, 1))
print('NNX is I[%d]: %s' %(nnX, iris.iloc[nnX].to_numpy()[:-1]))
print('NNX belongs to class ... %s' %NNX[-1])

vector X    : [4.7 3.2 4.8 0.7]
NNX is I[84]: [5.4 3.0 4.5 1.5]
NNX belongs to class ... versicolor
