# Q5

## Import the libraries

In [None]:
import scipy.spatial.distance as dist
import matplotlib.pyplot as plt
import numpy as np
import os
%matplotlib inline

## Load the data

In [None]:
# Read the seeds CSV. The first row supplies the attribute names; every later
# row is one sample. The last column (class_field) is parsed as an int and
# all other columns as floats.
csv_path = os.path.join('Data', 'seeds.csv')
with open(csv_path) as fp:
    # Drop blank lines (e.g. a trailing newline at end of file); otherwise
    # they would reach float('') below and raise ValueError.
    rows = [line.strip().split(',') for line in fp if line.strip()]

headers = rows[0]
class_field = len(headers) - 1
data = np.asarray([
    [int(cell) if col == class_field else float(cell)
     for col, cell in enumerate(row)]
    for row in rows[1:]
])

print('Attributes - ')
print(', '.join(headers))
print()
print('Shape of loaded dataset - ')
print(data.shape)

## (A) Select the attributes and normalize/standardize

In [None]:
# Keep two attribute columns (indices 0 and 4) together with their names.
picked = [0, 4]
select_headers = [headers[col] for col in picked]
select = data[:, picked]

# Same five output lines as separate print() calls, emitted in one call.
print(
    'Attributes - ',
    ', '.join(select_headers),
    '',
    'Shape of subset - ',
    select.shape,
    sep='\n',
)

In [None]:
def _print_range(title, values):
    """Print *title* and the per-column min/max of *values*.

    Returns the (min, max) pair so the caller can reuse it.
    """
    print(title)
    low = np.amin(values, axis=0)
    high = np.amax(values, axis=0)
    print('\t' + '\t'.join(select_headers))
    print('min\t' + '\t'.join('{0:.3f}'.format(v) for v in low))
    print('max\t' + '\t'.join('{0:.3f}'.format(v) for v in high))
    return low, high


amin, amax = _print_range('[ORIGINAL] Range of values - ', select)
print()

# Min-max normalisation: (x - min) / (max - min) per column. The arithmetic
# already allocates a new array, so no explicit copy of `select` is needed.
# A column whose min equals its max would divide by zero here.
normal = (select - amin) / (amax - amin)

amin, amax = _print_range('[NORMALIZED] Range of values - ', normal)
print()

# Z-score standardisation: (x - mean) / std per column, using np.std's
# default ddof=0.
amean = np.mean(select, axis=0)
astd = np.std(select, axis=0)
standard = (select - amean) / astd

amin, amax = _print_range('[STANDARDIZED] Range of values - ', standard)

# Leave amean/astd holding the statistics of the standardized data, which is
# the state this cell has always finished in.
amean = np.mean(standard, axis=0)
astd = np.std(standard, axis=0)

## (B) Distance Metrics

In [None]:
# One row of three scatter plots, one per variant of the selected columns.
fig, axes = plt.subplots(nrows=1, ncols=3)
fig.set_size_inches(24, 8)  # (width, height)

panels = (
    ('[ORIGINAL]', select),
    ('[NORMALIZED]', normal),
    ('[STANDARDIZED]', standard),
)
for ax, (label, pts) in zip(axes, panels):
    ax.set_title(label)
    ax.scatter(pts[:, 0], pts[:, 1])

plt.show()

### (i) Scatter Plot Analysis

TODO: Write up the analysis of the three scatter plots above.

In [None]:
# Per-column mean and standard deviation of each variant. These names are
# read again by later cells, so they are bound individually.
rmean, rstd = np.mean(select, axis=0), np.std(select, axis=0)
nmean, nstd = np.mean(normal, axis=0), np.std(normal, axis=0)
smean, sstd = np.mean(standard, axis=0), np.std(standard, axis=0)

column_line = '\t' + '\t'.join(select_headers)
reports = (
    ('[ORIGINAL] Mean and Standard Deviation - ', rmean, rstd),
    ('[NORMALIZED] Mean and Standard Deviation - ', nmean, nstd),
    # The standardized mean is printed as an absolute value -- presumably so
    # a tiny negative value does not show up as "-0.000".
    ('[STANDARDIZED] Mean and Standard Deviation - ', np.abs(smean), sstd),
)
for pos, (title, centre, spread) in enumerate(reports):
    if pos:
        print()  # blank line between consecutive reports
    print(title)
    print(column_line)
    print('mean\t' + '\t'.join('{0:.3f}'.format(v) for v in centre))
    print('st.dev\t' + '\t'.join('{0:.3f}'.format(v) for v in spread))

# Same three scatter plots as before, with each variant's mean overlaid in red.
fig, axes = plt.subplots(nrows=1, ncols=3)
fig.set_size_inches(24, 8)  # (width, height)

overlays = (
    ('[ORIGINAL] [MEAN IN RED]', select, rmean),
    ('[NORMALIZED] [MEAN IN RED]', normal, nmean),
    ('[STANDARDIZED] [MEAN IN RED]', standard, smean),
)
for ax, (label, pts, centre) in zip(axes, overlays):
    ax.set_title(label)
    ax.scatter(pts[:, 0], pts[:, 1])
    ax.scatter(centre[0], centre[1], s=100, c='red')

plt.show()

### (ii) Mean and Standard Deviations

TODO: Write up the discussion of the means and standard deviations reported above.

In [None]:
# Distance from each variant's mean to every one of its points, for seven
# metrics. Every result is a 2-D array with one row per variant, in the
# order: original, normalized, standardized.
_variants = ((rmean, select), (nmean, normal), (smean, standard))


def _distances_to_mean(metric, extra_args=None):
    """Evaluate metric(mean, point, ...) for every point of every variant.

    extra_args, when given, holds one extra positional argument per variant
    (same order as ``_variants``) that is passed to *metric* after the point.
    Returns an array with one row of distances per variant.
    """
    table = []
    for pos, (centre, points) in enumerate(_variants):
        tail = () if extra_args is None else (extra_args[pos],)
        table.append([metric(centre, p, *tail) for p in points])
    return np.asarray(table)


eu = _distances_to_mean(dist.euclidean)

# scipy's mahalanobis(u, v, VI) expects the INVERSE of the covariance matrix
# as VI. Invert np.cov() once per variant instead of passing the covariance
# itself (and recomputing it for every single point).
_inv_covs = [np.linalg.inv(np.cov(points.T)) for _, points in _variants]
mh = _distances_to_mean(dist.mahalanobis, _inv_covs)

cb = _distances_to_mean(dist.cityblock)

# Minkowski distance of order p = 3 for all three variants.
mn3 = _distances_to_mean(dist.minkowski, [3, 3, 3])

cy = _distances_to_mean(dist.chebyshev)

# NOTE(review): `standard` is mean-centred, so smean should be numerically
# close to the zero vector; the cosine and Canberra rows for the standardized
# variant are then likely dominated by round-off -- confirm before
# interpreting them.
cs = _distances_to_mean(dist.cosine)

cnb = _distances_to_mean(dist.canberra)

In [None]:
# How many of the smallest distances lie strictly below the cut-off used to
# pick the points nearest the mean.
N_NEAREST = 10


def _closest_to_mean(dist_by_variant, k=N_NEAREST):
    """Select, per variant, the points nearest to that variant's mean.

    dist_by_variant has one row of distances per variant (original,
    normalized, standardized). It is transposed so each variant becomes a
    column, and the cut-off for a column is its (k+1)-th smallest distance.
    Points whose distance is strictly below the cut-off are kept, which is
    at most k points per variant (fewer when distances tie at the cut-off).

    Returns (transposed distances, per-variant cut-offs,
    (original subset, normalized subset, standardized subset)).
    Raises IndexError when a variant has k points or fewer.
    """
    per_point = np.transpose(dist_by_variant)
    cutoff = np.sort(per_point, axis=0)[k]
    subsets = tuple(
        points[per_point[:, col] < cutoff[col]]
        for col, points in enumerate((select, normal, standard))
    )
    return per_point, cutoff, subsets


# Each distance table is rebound to its transposed layout, so this cell must
# not be re-run without first re-running the cell that computes the distances.
eu, seu, (eur, eun, eus) = _closest_to_mean(eu)
mh, smh, (mhr, mhn, mhs) = _closest_to_mean(mh)
cb, scb, (cbr, cbn, cbs) = _closest_to_mean(cb)
mn3, smn3, (mn3r, mn3n, mn3s) = _closest_to_mean(mn3)
cy, scy, (cyr, cyn, cys) = _closest_to_mean(cy)
cs, scs, (csr, csn, css) = _closest_to_mean(cs)
cnb, scnb, (cnbr, cnbn, cnbs) = _closest_to_mean(cnb)

In [None]:
# Grid of scatter plots: one row per distance metric, one column per variant.
# Each panel shows the points selected as nearest to the mean under that
# metric, with the variant's mean drawn as a red cross.
metric_rows = (
    ('EUCLIDEAN', (eur, eun, eus)),
    ('MAHALANOBIS', (mhr, mhn, mhs)),
    ('CITY BLOCK', (cbr, cbn, cbs)),
    ('MINKOWSKI (R=3)', (mn3r, mn3n, mn3s)),
    ('CHEBYSHEV', (cyr, cyn, cys)),
    ('COSINE', (csr, csn, css)),
    ('CANBERRA', (cnbr, cnbn, cnbs)),
)
variant_cols = (
    ('ORIGINAL', rmean),
    ('NORMALIZED', nmean),
    ('STANDARDIZED', smean),
)

fig, axes = plt.subplots(nrows=len(metric_rows), ncols=len(variant_cols))
fig.set_size_inches(24, 56)  # (width, height)

for row, (metric_name, subsets) in enumerate(metric_rows):
    for col, ((variant_name, centre), pts) in enumerate(zip(variant_cols, subsets)):
        ax = axes[row, col]
        ax.set_title('[{0}] [{1}] [MEAN IN RED]'.format(variant_name, metric_name))
        ax.scatter(pts[:, 0], pts[:, 1])
        ax.scatter(centre[0], centre[1], s=100, c='red', marker='x')

plt.show()