# Q5

## Import the libraries

In [None]:
import scipy.spatial.distance as dist
import matplotlib.pyplot as plt
import numpy as np
import os
%matplotlib inline

## Load the data

In [None]:
# Read the seeds CSV. The first row holds the column names; the last column
# (class_field) is parsed as int and every other column as float.
with open(os.path.join('Data', 'seeds.csv')) as fp:
    # Skip blank lines (e.g. a trailing newline at end of file): an empty row
    # would otherwise fail the numeric conversion below.
    data = [line.strip().split(',') for line in fp if line.strip()]

headers = data[0]
class_field = len(headers)-1
data = [[int(v) if i == class_field else float(v) for i, v in enumerate(row)]
        for row in data[1:]]
data = np.asarray(data)

# Preview: header names truncated to 6 characters, then the first 10 rows.
print('Attributes - ')
print('\t'.join([x[:6] for x in headers]))
for row in data[:10]:
    print('\t'.join(['{0:.3f}'.format(x) for x in row]))
print('...')
# Never report a negative count when there are fewer than 10 data rows.
print(str(max(len(data)-10, 0)) + ' more rows.')

In [None]:
# Names of the three data variants, in display order.
type_names = ['[ORIGINAL]', '[NORMALIZED]', '[STANDARDIZED]']
# Variant name -> data array. Every entry starts as None and is assigned
# once the corresponding variant has been computed.
types = dict.fromkeys(type_names)

# Distance metrics as two parallel tuples: display label and SciPy function.
dist_names = (
    '[EUCLIDEAN]',
    '[MAHALANOBIS]',
    '[CITY BLOCK]',
    '[MINKOWSKI (R=3)]',
    '[CHEBYSHEV]',
    '[COSINE]',
    '[CANBERRA]',
)
dist_funcs = (
    dist.euclidean,
    dist.mahalanobis,
    dist.cityblock,
    dist.minkowski,
    dist.chebyshev,
    dist.cosine,
    dist.canberra,
)
# Label -> function lookup derived from the parallel tuples above.
dists = dict(zip(dist_names, dist_funcs))

## (A) Select the attributes and normalize/standardize

In [None]:
# Keep only columns 0 and 4 of the dataset, together with their header names.
selected_columns = [0, 4]
select_headers = [headers[col] for col in selected_columns]
select = data[:, selected_columns]

# Preview: header names truncated to 6 characters, then the first 10 rows.
print('Attributes - ')
print('\t'.join(name[:6] for name in select_headers))
for row in select[:10]:
    print('\t'.join('{0:.3f}'.format(value) for value in row))
print('...')
print('{0} more rows.'.format(len(data) - 10))

In [None]:
def printrange(val, headers, dtype):
    """Print a min / max / range table for the columns of ``val``.

    Args:
        val: 2-D numeric array; statistics are taken along axis 0.
        headers: column names printed above the table.
        dtype: label of the data variant, used as the prefix of the title line.

    Returns:
        Tuple ``(amin, amax)`` with the per-column minimum and maximum.
    """
    def table_row(label, numbers):
        # One tab-separated row: label followed by values with 3 decimals.
        return label + '\t' + '\t'.join('{0:.3f}'.format(n) for n in numbers)

    print(dtype + ' Range of values - ')
    lowest = np.amin(val, axis=0)
    highest = np.amax(val, axis=0)
    print('\t' + '\t'.join(headers))
    print(table_row('min', lowest))
    print(table_row('max', highest))
    print(table_row('range', highest - lowest))
    # Printing '\n' emits two line breaks, leaving a gap after the table.
    print('\n')
    return lowest, highest

In [None]:
# Original data: report the range and register the selected columns as-is.
amin, amax = printrange(select, select_headers, '[ORIGINAL]')
types['[ORIGINAL]'] = select

# Min-max normalization: rescale every column using its own min and max.
normal = (select - amin) / (amax - amin)
types['[NORMALIZED]'] = normal
amin, amax = printrange(normal, select_headers, '[NORMALIZED]')

# Standardization: subtract the column mean and divide by the column
# standard deviation (np.std default, i.e. ddof=0).
amean = select.mean(axis=0)
astd = select.std(axis=0)
standard = (select - amean) / astd
types['[STANDARDIZED]'] = standard
amin, amax = printrange(standard, select_headers, '[STANDARDIZED]')

In [None]:
# Side-by-side preview of the first 20 rows of all three data variants.
print('\t'.join(type_names))
short_headers = '\t'.join(name[:6] for name in select_headers)
print('\t'.join([short_headers] * 3))
for row_idx in range(len(select[:20])):
    # One tab-joined cell group per variant, in type_names order.
    groups = [
        '\t'.join('{0:.3f}'.format(value) for value in types[name][row_idx])
        for name in type_names
    ]
    print('\t'.join(groups))
print('...')
print('{0} more rows.'.format(len(select) - 20))

## (B) Distance Metrics

### (i) Scatter Plots

In [None]:
# One scatter panel per data variant, laid out side by side.
fig, axes = plt.subplots(nrows=1, ncols=len(type_names))
fig.set_figheight(8)
fig.set_figwidth(24)

# The loop variable is deliberately not called ``data``: that name holds the
# loaded dataset array, and rebinding it to a string here would break any
# earlier cell that is re-run afterwards.
for ax, type_name in zip(axes, type_names):
    points = types[type_name]
    ax.set_title(type_name)
    ax.set_xlabel(select_headers[0])
    ax.set_ylabel(select_headers[1])
    ax.scatter(points[:, 0], points[:, 1], c='orange')

plt.show()

#### Analysis

TODO

### (ii) Mean values for all categories

In [None]:
def getmeanstd(val, headers, dtype):
    """Print and return the per-column mean and standard deviation of ``val``.

    Args:
        val: 2-D numeric array; statistics are taken along axis 0.
        headers: column names printed above the table.
        dtype: label of the data variant, used as the prefix of the title line.

    Returns:
        Tuple ``(mean, std)`` of per-column arrays (``np.std`` default ddof).
    """
    def table_row(label, numbers):
        # One tab-separated row: label followed by values with 3 decimals.
        return label + '\t' + '\t'.join('{0:.3f}'.format(n) for n in numbers)

    print(dtype + ' Mean and Standard Deviation - ')
    col_mean = np.mean(val, axis=0)
    col_std = np.std(val, axis=0)
    print('\t' + '\t'.join(headers))
    print(table_row('mean', col_mean))
    print(table_row('st.dev', col_std))
    print()
    return col_mean, col_std

In [None]:
# Per-variant column means and standard deviations, keyed by variant name.
mean = {}
std = {}

# The loop variables are deliberately not called ``data``: that name holds
# the loaded dataset array, and rebinding it to a string here would break any
# earlier cell that is re-run afterwards.
for type_name in types:
    mean[type_name], std[type_name] = getmeanstd(types[type_name], select_headers, type_name)

# One scatter panel per data variant with its mean marked by a black 'x'.
fig, axes = plt.subplots(nrows=1, ncols=len(type_names))
fig.set_figheight(8)
fig.set_figwidth(24)

for ax, type_name in zip(axes, type_names):
    points = types[type_name]
    center = mean[type_name]
    ax.set_title(type_name + ' [MEAN (X) IN BLACK]')
    ax.set_xlabel(select_headers[0])
    ax.set_ylabel(select_headers[1])
    ax.scatter(points[:, 0], points[:, 1], c='orange')
    ax.scatter(center[0], center[1], s=100, c='black', marker='x')

plt.show()

### (iii) Distance from mean for all categories over all distance metrics

In [None]:
def getdists(types, mean, dist_funcs, dist_names, type_names):
    """Compute the distance from each variant's mean to every one of its points.

    Args:
        types: mapping of variant name -> 2-D array of points (one per row).
        mean: mapping of variant name -> mean point of that variant.
        dist_funcs: sequence of distance callables ``f(u, v, ...)``.
        dist_names: labels parallel to ``dist_funcs``; ``'[MAHALANOBIS]'`` and
            ``'[MINKOWSKI (R=3)]'`` select the calls that need a third argument.
        type_names: variant names, giving the column order of the result.

    Returns:
        List with one array per metric, each of shape
        ``(n_points, len(type_names))``.

    Raises:
        numpy.linalg.LinAlgError: if a variant's covariance matrix is singular
            when the Mahalanobis distance is requested.
    """
    t = []
    for func, name in zip(dist_funcs, dist_names):
        ret = []
        for x in type_names:
            points = types[x]
            center = mean[x]
            if name == '[MAHALANOBIS]':
                # SciPy's mahalanobis expects the INVERSE covariance matrix
                # (VI), not the covariance itself. It is the same for every
                # point of a variant, so compute it once per variant.
                inv_cov = np.linalg.inv(np.cov(points.T))
                ret.append([func(center, y, inv_cov) for y in points])
            elif name == '[MINKOWSKI (R=3)]':
                # Third positional argument is the Minkowski order p.
                ret.append([func(center, y, 3) for y in points])
            else:
                ret.append([func(center, y) for y in points])
        # Rows were collected per variant; transpose so rows are points.
        t.append(np.transpose(np.asarray(ret)))
    return t

In [None]:
# One distance array per metric, with one column per data variant.
alldists = getdists(
    types=types,
    mean=mean,
    dist_funcs=dist_funcs,
    dist_names=dist_names,
    type_names=type_names,
)

### (iv) Top 10 nearest points for each distance metric

In [None]:
def gettop10s(alldists, type_names, k=10):
    """Find the row indices of the ``k`` smallest distances in every column.

    Args:
        alldists: sequence of 2-D arrays (one per metric), rows are points and
            columns follow the order of ``type_names``.
        type_names: variant names, one per column.
        k: how many nearest points to keep (default 10). When an array has
            fewer than ``k`` rows, all of its rows are returned.

    Returns:
        List with one dict per metric mapping variant name -> ascending array
        of row indices whose distance is <= the k-th smallest value of that
        column. Ties with the k-th value are all included, so an entry can
        hold more than ``k`` indices.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be at least 1')
    t = []
    for x in alldists:
        x = np.asarray(x)
        n_rows = x.shape[0]
        if n_rows == 0:
            # Nothing to rank: every variant gets an empty index array.
            t.append({key: np.array([], dtype=np.intp) for key in type_names})
            continue
        # Per-column cutoff: the k-th smallest distance, clamped to the last
        # row so short inputs do not raise IndexError.
        cutoff = np.sort(x, axis=0)[min(k, n_rows) - 1]
        t.append({key: np.where(x[:, i] <= cutoff[i])[0]
                  for i, key in enumerate(type_names)})
    return t

In [None]:
# Per metric: variant name -> row indices of the nearest points to the mean.
top10s = gettop10s(
    alldists=alldists,
    type_names=type_names,
)

In [None]:
# For every metric, list the nearest-point row indices of each data variant.
for metric_idx, metric_name in enumerate(dist_names):
    print(metric_name)
    nearest_by_type = top10s[metric_idx]
    for type_name in type_names:
        indices = ', '.join(str(n) for n in nearest_by_type[type_name])
        print('\t' + type_name + ' : ' + indices)
    print()

### (v) Plot top 10 nearest points

In [None]:
def gplot(axes, i, j, title, dist, mean, headers, original, labels):
    """Draw one annotated scatter panel at ``axes[i, j]``.

    Args:
        axes: 2-D grid of axes, indexed as ``axes[i, j]``.
        i: row of the target panel.
        j: column of the target panel.
        title: panel title.
        dist: 2-D array of the points to draw (x in column 0, y in column 1).
        mean: point drawn as a large black 'x' marker.
        headers: x and y axis labels (first two entries).
        original: not used by this function.
        labels: one annotation per row of ``dist``, in the same order.
    """
    panel = axes[i, j]
    panel.set_title(title)
    panel.scatter(dist[:, 0], dist[:, 1], c='orange')
    panel.scatter(mean[0], mean[1], s=100, c='black', marker='x')
    panel.set_xlabel(headers[0])
    panel.set_ylabel(headers[1])
    # Write each label next to the point it belongs to.
    for point, label in enumerate(labels):
        panel.annotate(label, (dist[point, 0], dist[point, 1]))

In [None]:
# Grid of panels: one row per distance metric, one column per data variant.
fig, axes = plt.subplots(nrows=len(alldists), ncols=len(type_names))
fig.set_figheight(56)
fig.set_figwidth(24)

for row in range(len(alldists)):
    for col, type_name in enumerate(type_names):
        nearest = top10s[row][type_name]
        points = types[type_name]
        title = type_name + ' ' + dist_names[row] + ' [MEAN IN BLACK]'
        # Plot only the nearest points, labelled with their row indices.
        gplot(axes, row, col, title, points[nearest], mean[type_name],
              select_headers, points, nearest)

plt.show()

### (vi) Verify whether the nearest points are similar across all distance metrics

### (vii) Results