**Similarity and dissimilarity (distance) measures**

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris

# Load the bundled iris dataset and keep only its feature matrix
iris = load_iris()
iris_data = iris.data

# preview the first five samples (all feature columns)
iris_data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [2]:
# Min-max normalization: rescale each feature column of iris_data
# using the minimum and maximum observed in that column
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
iris_norm = min_max_scaler.fit_transform(iris_data)

# preview the first five normalized samples
iris_norm[:5]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667]])

Distances computed on continuous-valued attributes

In [3]:
# Euclidean distance between the first five normalized samples
from scipy.spatial.distance import pdist, squareform

# pdist yields the condensed (one value per pair) distance vector
dist = pdist(iris_norm[:5], metric='euclidean')

# expand the condensed vector into the full symmetric matrix for display
squareform(dist)

# Alternative (not executed): scikit-learn returns the square matrix directly
#from sklearn.metrics.pairwise import euclidean_distances

#dist=euclidean_distances(iris_norm[0:5, :], iris_norm[0:5, :])
#dist

array([[0.        , 0.21561354, 0.16810102, 0.21761244, 0.0500771 ],
       [0.21561354, 0.        , 0.10157824, 0.09469862, 0.25153848],
       [0.16810102, 0.10157824, 0.        , 0.06047157, 0.18710825],
       [0.21761244, 0.09469862, 0.06047157, 0.        , 0.23671867],
       [0.0500771 , 0.25153848, 0.18710825, 0.23671867, 0.        ]])

In [4]:
# Manhattan (city-block, L1) distance between the first five normalized samples
from scipy.spatial.distance import pdist, squareform

# condensed vector of pairwise distances
dist = pdist(iris_norm[:5], metric='cityblock')

# full symmetric matrix for display
squareform(dist)

array([[0.        , 0.26388889, 0.25306026, 0.32250471, 0.06944444],
       [0.26388889, 0.        , 0.15583804, 0.14194915, 0.27777778],
       [0.25306026, 0.15583804, 0.        , 0.10334275, 0.26694915],
       [0.32250471, 0.14194915, 0.10334275, 0.        , 0.3363936 ],
       [0.06944444, 0.27777778, 0.26694915, 0.3363936 , 0.        ]])

In [5]:
# Mahalanobis distance between the first five normalized samples
import numpy as np
from scipy.spatial.distance import pdist, squareform

# covariance of the features, estimated from ALL normalized samples
# (rowvar=False: each column is a variable, each row an observation)
cov = np.cov(iris_norm, rowvar=False)

# the metric expects the INVERSE covariance matrix (VI)
dist = pdist(iris_norm[:5], metric='mahalanobis', VI=np.linalg.inv(cov))

# full symmetric matrix for display
squareform(dist)

array([[0.        , 1.35445724, 0.96872982, 1.40572532, 0.58991096],
       [1.35445724, 0.        , 0.96979054, 1.4527546 , 1.81068895],
       [0.96872982, 0.96979054, 0.        , 0.71700087, 1.12534403],
       [1.40572532, 1.4527546 , 0.71700087, 0.        , 1.32932197],
       [0.58991096, 1.81068895, 1.12534403, 1.32932197, 0.        ]])

In [6]:
# Mahalanobis distance on the RAW (un-normalized) data.
# Normalization is not necessary for this metric: the covariance already accounts
# for each feature's scale, so the matrix matches the one from the normalized data.
import numpy as np
from scipy.spatial.distance import pdist, squareform

# covariance of the raw features (columns are variables)
cov = np.cov(iris_data, rowvar=False)

# the metric expects the INVERSE covariance matrix (VI)
dist = pdist(iris_data[:5], metric='mahalanobis', VI=np.linalg.inv(cov))

# full symmetric matrix for display
squareform(dist)

array([[0.        , 1.35445724, 0.96872982, 1.40572532, 0.58991096],
       [1.35445724, 0.        , 0.96979054, 1.4527546 , 1.81068895],
       [0.96872982, 0.96979054, 0.        , 0.71700087, 1.12534403],
       [1.40572532, 1.4527546 , 0.71700087, 0.        , 1.32932197],
       [0.58991096, 1.81068895, 1.12534403, 1.32932197, 0.        ]])

Distance calculation based on binary attributes

In [7]:
# Simple matching (Hamming) distance for symmetric binary attributes:
# the fraction of positions in which the two vectors disagree,
# i.e. 1 - simple matching coefficient
from scipy.spatial.distance import hamming

obj1 = [1, 0, 1, 0, 0, 0]
obj2 = [1, 0, 1, 0, 1, 0]

dist = hamming(obj1, obj2)
dist

0.16666666666666666

In [8]:
# Jaccard distance (asymmetric binary attributes): among the positions where at
# least one vector is 1, the fraction in which the two vectors disagree.
# Note: this is NOT the Dice distance, which weights the 1-1 matches twice.
from scipy.spatial.distance import jaccard

obj1 = [1, 0, 1, 0, 0, 0]
obj2 = [1, 0, 1, 0, 1, 0]

dist = jaccard(obj1, obj2)
dist

0.3333333333333333

In [9]:
# Cosine distance (1 - cosine similarity) between the rows of A
from scipy.spatial.distance import pdist, squareform

A = np.array([
    [10, 20, 0, 0],
    [30, 60, 0, 0],
    [60, 30, 0, 0],
    [0, 0, 10, 20],
])

# condensed vector holding one distance per pair of rows
dist = pdist(A, metric='cosine')

# print the full symmetric matrix
print(f'Cosinus distance:\n {squareform(dist)}\n')

Cosinus distance:
 [[0.  0.  0.2 1. ]
 [0.  0.  0.2 1. ]
 [0.2 0.2 0.  1. ]
 [1.  1.  1.  0. ]]



In [10]:
# Jaccard distance computed with scikit-learn: 1 - Jaccard similarity score.
# (This is not the simple matching coefficient; the value equals the
# scipy `jaccard` distance of the same two vectors.)
from sklearn.metrics import jaccard_score

obj1 = [1, 0, 1, 0, 0, 0]
obj2 = [1, 0, 1, 0, 1, 0]

dist = 1 - jaccard_score(obj1, obj2)
dist

0.33333333333333337

Gower distance

In [11]:
!pip install gower

Collecting gower
  Downloading gower-0.1.2-py3-none-any.whl (5.2 kB)
Installing collected packages: gower
Successfully installed gower-0.1.2


In [12]:
# Small mixed-type example frame: two numeric columns and one categorical column
df = pd.DataFrame(
    [
        [1, 2.6, 'A'],
        [12, 5, 'X'],
        [4, 7, 'A'],
    ],
    columns=['Num_1', 'Num_2', 'Cat_1'],
)
df

Unnamed: 0,Num_1,Num_2,Cat_1
0,1,2.6,A
1,12,5.0,X
2,4,7.0,A


In [13]:
# Pairwise Gower distance matrix for the rows of the mixed-type frame `df`
# (numeric and categorical columns are combined into one distance per pair).
# The result is a square, symmetric array with one row/column per row of `df`.
import gower
gower.gower_matrix(df)

array([[0.        , 0.8484848 , 0.4242424 ],
       [0.8484848 , 0.        , 0.72727275],
       [0.4242424 , 0.72727275, 0.        ]], dtype=float32)