In [2]:
import pandas as pd
import numpy as np
import scipy.spatial.distance as sp_dist
import itertools

# Exercise 3.2
Consider two vectors: `v1 = c(1,2,3,4,5,6)`, `v2 = c(-4,-5,-6,-6,6,7)`. Compute the distance between them using:

* Euclidean metric
* Canberra metric
* Minkowski metrics (p = 2, p = 3)
* Manhattan metric
* Maximum metric

Compare the results. Explain the similarity between the Euclidean distance and the Minkowski distance for p = 2, and between the Manhattan distance and the Minkowski distance for p = 1. Why is the computation in R feasible even if the vectors have different lengths?


In [3]:
# Input vectors from the exercise statement.
v1 = np.array([1, 2, 3, 4, 5, 6])
v2 = np.array([-4, -5, -6, -6, 6, 7])

# Reference values computed with scipy.spatial.distance.
# Minkowski family first: Manhattan corresponds to p = 1, Euclidean to p = 2,
# and the maximum (Chebyshev) metric to the limit p -> infinity.
sp_manhattan = sp_dist.cityblock(v1, v2)
sp_euclidean = sp_dist.euclidean(v1, v2)
sp_minkowski_p2 = sp_dist.minkowski(v1, v2, p=2)
sp_minkowski_p3 = sp_dist.minkowski(v1, v2, p=3)
sp_maximum = sp_dist.chebyshev(v1, v2)

# Canberra is not a Minkowski metric, so it is kept apart from the others.
sp_canberra = sp_dist.canberra(v1, v2)

In [12]:
# own implementations
def euclidean(v1, v2):
    """Computes the euclidean distance between vectors v1 and v2.

    Args:
        v1, v2: array-likes of numbers with matching shapes (NumPy arrays,
            lists, tuples, ...).

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # Plain Python sequences do not support element-wise subtraction, so
    # convert both operands to arrays first (a no-op for NumPy arrays).
    diff = np.asarray(v1) - np.asarray(v2)
    return np.sqrt(np.sum(np.square(diff)))

def manhattan(v1, v2):
    """Computes the manhattan (city block) distance between vectors v1 and v2.

    Args:
        v1, v2: array-likes of numbers with matching shapes (NumPy arrays,
            lists, tuples, ...).

    Returns:
        The sum of absolute element-wise differences.
    """
    # Convert first so that plain lists/tuples can be subtracted element-wise
    # (a no-op for NumPy arrays).
    diff = np.asarray(v1) - np.asarray(v2)
    return np.sum(np.abs(diff))

def minkowski(v1, v2, p=2):
    """Computes the minkowski distance of order p between vectors v1 and v2.

    Args:
        v1, v2: array-likes of numbers with matching shapes (NumPy arrays,
            lists, tuples, ...).
        p: order of the distance; must be greater than 0. p = 1 yields the
            manhattan distance and p = 2 the euclidean distance.

    Returns:
        (sum(|v1 - v2| ** p)) ** (1 / p)

    Raises:
        ValueError: if p is not greater than 0.
    """
    # p = 0 would divide by zero in the root below and negative orders do not
    # define a distance, so reject both up front.
    if p <= 0:
        raise ValueError("p must be greater than 0")
    # Convert first so that plain lists/tuples can be subtracted element-wise
    # (a no-op for NumPy arrays).
    abs_diff = np.abs(np.asarray(v1) - np.asarray(v2))
    return np.power(np.sum(np.power(abs_diff, p)), 1 / p)

# Evaluate the hand-written metrics on the exercise vectors.
own_manhattan = manhattan(v1, v2)
own_euclidean = euclidean(v1, v2)
own_minkowski_p2 = minkowski(v1, v2, p=2)
own_minkowski_p3 = minkowski(v1, v2, p=3)

In [19]:
# present the results: one line for scipy, one for the own implementations,
# each showing the same four metrics in the same order
for template, values in (
    (
        'Framework results\t: euclidean={:.2f}, manhattan={:.2f}, minkowski_p2={:.2f}, minkowski_p3={:.2f}',
        (sp_euclidean, sp_manhattan, sp_minkowski_p2, sp_minkowski_p3),
    ),
    (
        'Own results\t\t: euclidean={:.2f}, manhattan={:.2f}, minkowski_p2={:.2f}, minkowski_p3={:.2f}',
        (own_euclidean, own_manhattan, own_minkowski_p2, own_minkowski_p3),
    ),
):
    print(template.format(*values))

Framework results	: euclidean=16.03, manhattan=33.00, minkowski_p2=16.03, minkowski_p3=13.00
Own results		: euclidean=16.03, manhattan=33.00, minkowski_p2=16.03, minkowski_p3=13.00


## Exercise 3.4

Let the following ordinal attributes be given:

* cloud.cover, levels: clear, scattered, broken, overcast
* precipitation, levels: none, light, moderate, heavy, extreme
* wind, levels: calm, light (breeze), moderate (breeze), strong (breeze), gale, storm
* temperature, levels: cold, chilly, normal, warm, heat

In all cases the levels are given from the smallest to the largest.

The following objects contain information about weather conditions in Krakow (8 days of November):

| cloud.cover | precipitation | Wind | Temperature |
|:-----------:|:-------------:|:----:|:-----------:|
|overcast	|light	|strong	|chilly|
|overcast	|heavy	|moderate	|chilly|
|scattered	|light|	light|	cold|
|broken|	none|	light	|cold|
|broken	|light	|light	|cold|
|overcast	|moderate|	light	|chilly|
|scattered	|none|	calm	|chilly|
|overcast|	light	|calm	|normal|

1. Define 4 factors for attributes: cloud.cover, precipitation, wind, and temperature.
2. Define weather data frame that contains data from the above table.
3. Determine the dissimilarity matrix for these objects using Euclidean metric.

## Exercise 3.9

Consider the following table briefly describing 3 Polish cities in the 'Pomeranian Voivodeship'. Compute both the symmetric and asymmetric dissimilarities between them. Find the most similar cities among them.

|City\Property | >250.000 people | German location | History > 1000 year |Part of 'Trójmiasto' |
|:---------:|:---------------:|:--------------:|:----------------:|---------------|
|     Gdańsk      |        1         |             1   |         1         |    1           |
|     Gdynia      |           1      |  0              |  0                |1               |
|     Sopot      |           0      |    1            |  0                |  1             |


In [14]:
# First, let's define our data
# One binary feature vector per city; the values follow the column order
# given to the data frame below.
gdansk = [1, 1, 1, 1]
gdynia = [1, 0, 0, 1]
sopot = [0, 1, 0, 1]
cities = [gdansk, gdynia, sopot]

# Rows follow the order of `cities`: 0 = gdansk, 1 = gdynia, 2 = sopot.
cities_df = pd.DataFrame(
    data=cities,
    columns=['>250k', 'german location', 'history>1000', 'part of trojmiasto'],
)

In [17]:
# Next, let's define our dissimilarity function
def calc_dissimilarity(vec_1, vec_2, mode='symmetric'):
    """Computes the dissimilarity between two binary vectors.

    The vectors are compared position by position and counted into
    f_11 (both true), f_10 (only vec_1 true), f_01 (only vec_2 true) and
    f_00 (both false).

    Args:
        vec_1, vec_2: array-likes of the same shape; entries are cast to bool.
        mode: 'symmetric'  -> (f_01 + f_10) / (f_11 + f_10 + f_01 + f_00)
              'asymmetric' -> (f_01 + f_10) / (f_11 + f_10 + f_01),
              i.e. positions where both vectors are false are ignored.

    Returns:
        The dissimilarity as a float between 0 and 1. When the denominator is
        zero (empty vectors, or all-false vectors in 'asymmetric' mode) the
        vectors cannot differ in any counted position and 0.0 is returned.

    Raises:
        ValueError: if mode is not 'symmetric' or 'asymmetric', or if the
            vectors have different shapes.
    """
    # Validate the mode first; otherwise an unknown mode would only surface
    # as an UnboundLocalError at the return statement.
    if mode not in ('symmetric', 'asymmetric'):
        raise ValueError(
            "mode must be 'symmetric' or 'asymmetric', got {!r}".format(mode)
        )

    vec_1 = np.asarray(vec_1, dtype=bool)
    vec_2 = np.asarray(vec_2, dtype=bool)
    # Without this check NumPy could silently broadcast e.g. a length-1
    # vector against a longer one and produce a meaningless result.
    if vec_1.shape != vec_2.shape:
        raise ValueError(
            'vec_1 and vec_2 must have the same shape, got {} and {}'.format(
                vec_1.shape, vec_2.shape
            )
        )

    f_11 = np.count_nonzero(vec_1 & vec_2)
    f_10 = np.count_nonzero(vec_1 & ~vec_2)
    f_01 = np.count_nonzero(~vec_1 & vec_2)

    mismatches = f_01 + f_10
    denominator = f_11 + mismatches
    if mode == 'symmetric':
        # Joint absences only count as agreement in the symmetric variant.
        denominator += np.count_nonzero(~(vec_1 | vec_2))

    if denominator == 0:
        # 0 / 0: nothing to compare, so the vectors are treated as identical.
        return 0.0
    return mismatches / denominator

In [15]:
# Finally, let's compute the matrices for symmetric and asymmetric dissimilarity
# Pairwise symmetric dissimilarity: entry [i, j] compares row i with row j of
# cities_df. The size is taken from the data frame rather than hard-coded, so
# the cell keeps working if rows are added to the table.
n_cities = len(cities_df)
disymm_symmetric_matrix = np.zeros((n_cities, n_cities))
for i, j in itertools.product(range(n_cities), repeat=2):
    disymm_symmetric_matrix[i, j] = calc_dissimilarity(cities_df.iloc[i], cities_df.iloc[j], 'symmetric')
disymm_symmetric_matrix

array([[0. , 0.5, 0.5],
       [0.5, 0. , 0.5],
       [0.5, 0.5, 0. ]])

In [16]:
# Pairwise asymmetric dissimilarity: entry [i, j] compares row i with row j of
# cities_df. The size is taken from the data frame rather than hard-coded, so
# the cell keeps working if rows are added to the table.
n_cities = len(cities_df)
disymm_asymmetric_matrix = np.zeros((n_cities, n_cities))
for i, j in itertools.product(range(n_cities), repeat=2):
    disymm_asymmetric_matrix[i, j] = calc_dissimilarity(cities_df.iloc[i], cities_df.iloc[j], 'asymmetric')
disymm_asymmetric_matrix

array([[0.        , 0.5       , 0.5       ],
       [0.5       , 0.        , 0.66666667],
       [0.5       , 0.66666667, 0.        ]])