# An exploration of clustering techniques

## Agglomerative clustering with complete linkage


## April 2018

In [None]:
# Render our plots inline
%matplotlib inline

from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

# 1.- Generate some sample data 

In [None]:
# Seed the global NumPy RNG so the sample below is reproducible
np.random.seed(123)

variables = ['X', 'Y', 'Z']
labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']

# One row per sample ID, one column per variable; values are uniform in [0, 10)
X = 10 * np.random.random_sample((len(labels), len(variables)))
df = pd.DataFrame(data=X, index=labels, columns=variables)
df

# 2.- Compute distance matrix

In [None]:
from scipy.spatial.distance import pdist, squareform

# pdist yields the condensed pairwise distances; squareform expands them
# into a full square matrix so it can be labelled by sample ID on both axes.
pairwise_sq = squareform(pdist(df, metric='euclidean'))
row_dist = pd.DataFrame(pairwise_sq, index=labels, columns=labels)
row_dist

# 3.- Compute linkage matrix

In [None]:
from scipy.cluster.hierarchy import linkage

# Feed linkage() the condensed distance vector from pdist, not the square
# row_dist matrix built earlier.
condensed_dist = pdist(df, metric='euclidean')
row_clusters = linkage(condensed_dist, method='complete')

In [None]:
# Raw linkage matrix produced by linkage() in the previous cell
row_clusters

In [None]:
# Present the linkage matrix as a labelled table, one row per merge step
# (rows are numbered starting from 1).
n_merges = row_clusters.shape[0]
pd.DataFrame(
    row_clusters,
    columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],
    index=['cluster %d' % step for step in range(1, n_merges + 1)],
)

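The first and second columns give the indices of the two clusters merged at each step, and the third column reports the distance between them; with complete linkage, this is the distance between the most dissimilar members of the two clusters. The last column returns the count of the members in each newly formed cluster.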

# 4.- Dendrogram

In [None]:
from scipy.cluster.hierarchy import dendrogram

# Optional: for an all-black dendrogram, uncomment the two lines below and
# additionally pass color_threshold=np.inf to dendrogram().
# from scipy.cluster.hierarchy import set_link_color_palette
# set_link_color_palette(['black'])
row_dendr = dendrogram(row_clusters, labels=labels)

# Set the axis label *before* tight_layout() so the layout pass takes the
# label into account instead of potentially clipping it.
plt.ylabel('Euclidean distance')
plt.tight_layout()
plt.show()

# 5.- Attaching dendrogram to a heat map

In [None]:
# Axes rectangles are [left, bottom, width, height] in figure fractions;
# the dendrogram gets a narrow strip on the left-hand side of the figure.
dendr_rect = [0.09, 0.1, 0.2, 0.6]

fig = plt.figure(figsize=(12, 12), facecolor='white')
axd = fig.add_axes(dendr_rect)
row_dendr = dendrogram(row_clusters, orientation='left')

We reorder the data in our initial DataFrame according to the clustering order, which can be accessed from the dendrogram object via its `leaves` key.

In [None]:
# Row positions in dendrogram leaf order, reversed
leaf_order = row_dendr['leaves'][::-1]
df_rowclust = df.iloc[leaf_order]

In [None]:
# Show the DataFrame with its rows rearranged into the reversed leaf order
df_rowclust

We construct the heat map from the reordered DataFrame and position it next to the dendrogram

In [None]:
# Heat-map axes: [left, bottom, width, height] in figure fractions
heatmap_rect = [0.23, 0.1, 0.6, 0.6]
axm = fig.add_axes(heatmap_rect)

In [None]:
# Draw the reordered data as a heat map; keep the returned image handle so a
# colour bar can be attached to it later.
cax = axm.matshow(
    df_rowclust,
    cmap='hot_r',
    interpolation='nearest',
)

We will modify the aesthetics of the dendrogram by removing the axis ticks and hiding the axis spines. Also, we will add a color bar and assign the feature and sample names to the x and y axis tick labels, respectively

In [None]:
# Strip ticks and spines from the dendrogram axes so only the tree is drawn
axd.set_xticks([])
axd.set_yticks([])
for spine in axd.spines.values():
    spine.set_visible(False)

fig.colorbar(cax)

# Pin exactly one tick per column/row before assigning labels. Prepending a
# dummy '' label instead depends on where the automatic tick locator happens
# to place its ticks, which is fragile and triggers a FixedFormatter warning
# on recent matplotlib versions.
axm.set_xticks(np.arange(df_rowclust.shape[1]))
axm.set_xticklabels(list(df_rowclust.columns))
axm.set_yticks(np.arange(df_rowclust.shape[0]))
axm.set_yticklabels(list(df_rowclust.index))


Putting everything together:

In [None]:
# Build the complete figure in one cell: dendrogram on the left, heat map of
# the reordered data next to it, and a colour bar.
fig = plt.figure(figsize=(12, 12), facecolor='white')

# Dendrogram axes; rectangles are [left, bottom, width, height] in figure fractions
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])
row_dendr = dendrogram(row_clusters, orientation='left')

# Reorder the rows of the data by the reversed dendrogram leaf order
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]

# Heat-map axes and image
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')

# Strip ticks and spines from the dendrogram axes so only the tree is drawn
axd.set_xticks([])
axd.set_yticks([])
for spine in axd.spines.values():
    spine.set_visible(False)

fig.colorbar(cax)

# Pin exactly one tick per column/row before assigning labels. Prepending a
# dummy '' label instead depends on where the automatic tick locator happens
# to place its ticks, which is fragile and triggers a FixedFormatter warning
# on recent matplotlib versions.
axm.set_xticks(np.arange(df_rowclust.shape[1]))
axm.set_xticklabels(list(df_rowclust.columns))
axm.set_yticks(np.arange(df_rowclust.shape[0]))
axm.set_yticklabels(list(df_rowclust.index))