## Lab 9: Hierarchical and Spectral Clustering
You can use external libraries for linear algebra operations but you are expected to write your own algorithms.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder

import warnings
# NOTE(review): called without a category/message/module filter, so this hides
# every warning raised for the rest of the session (deprecation notices from the
# libraries imported above included). Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Exercise 1
Use the ```iris``` dataset available at [this link](https://archive.ics.uci.edu/dataset/53/iris).
- Apply **your own** PCA function to the dataset. Plot the data in two dimensions, coloring by class.

In [None]:
# Load the raw UCI file. `header=None` tells pandas the file has no header row,
# so the column names are supplied explicitly (spelling fixed: "length").
iris = pd.read_csv(
    "../Datasets/iris.data",
    sep=",",
    header=None,
    names=["sepal_length", "sepal_width", "petal_length", "petal_width", "class"],
)

# Attribute Information:
# 0. sepal length in cm
# 1. sepal width in cm
# 2. petal length in cm
# 3. petal width in cm
# 4. class:
#     -- Iris Setosa
#     -- Iris Versicolour
#     -- Iris Virginica


In [None]:
# Quick sanity check of the loaded frame: its (rows, columns) shape first,
# then pandas' text rendering of the table on the following line.
print(iris.shape, iris, sep="\n")

In [None]:
# Separate the target column from the measurement columns.
label = iris.loc[:, "class"]
X = iris.drop(columns=["class"])

In [None]:
# Map the class names to numeric codes so they can drive the scatter colours.
# The encoder works on 2-D input, hence the reshape to a single column.
encoder = OrdinalEncoder()
y = encoder.fit_transform(np.asarray(label).reshape(-1, 1))

In [None]:
from sklearn.decomposition import PCA

In [None]:
class NumpyPCA:
    """Minimal PCA written with NumPy only, as the exercise asks for an own implementation.

    The principal axes are the eigenvectors of the sample covariance matrix,
    ordered by decreasing eigenvalue.

    Parameters
    ----------
    n_components : int
        Number of principal components to keep.

    Attributes (set by ``fit``)
    ---------------------------
    mean_ : ndarray of shape (n_features,)
        Per-feature mean removed before projecting.
    components_ : ndarray of shape (n_components, n_features)
        Principal axes, one per row.
    explained_variance_ : ndarray of shape (n_components,)
        Variance of the data along each kept axis.
    """

    def __init__(self, n_components):
        self.n_components = n_components

    def fit(self, X):
        """Estimate the mean and the principal axes from ``X`` (n_samples, n_features)."""
        data = np.asarray(X, dtype=float)
        self.mean_ = data.mean(axis=0)
        centered = data - self.mean_
        covariance = centered.T @ centered / (data.shape[0] - 1)

        # eigh is meant for symmetric matrices and returns eigenvalues in
        # ascending order, so reverse to get the largest-variance axes first.
        eigvals, eigvecs = np.linalg.eigh(covariance)
        order = np.argsort(eigvals)[::-1][: self.n_components]
        axes = eigvecs[:, order].T

        # Eigenvectors are defined up to sign; make the largest-magnitude
        # loading of every axis positive so the projection is deterministic.
        pivots = np.abs(axes).argmax(axis=1)
        signs = np.sign(axes[np.arange(axes.shape[0]), pivots])
        signs[signs == 0] = 1.0

        self.components_ = axes * signs[:, None]
        self.explained_variance_ = eigvals[order]
        return self

    def transform(self, X):
        """Project ``X`` onto the fitted principal axes."""
        return (np.asarray(X, dtype=float) - self.mean_) @ self.components_.T

    def fit_transform(self, X):
        """Fit on ``X`` and return its projection, shape (n_samples, n_components)."""
        return self.fit(X).transform(X)


pca = NumpyPCA(n_components=2)
X_new = pca.fit_transform(X)

In [None]:
# 2-D PCA projection, one marker per sample, coloured by the encoded class.
fig, ax = plt.subplots(figsize=(10, 8))
pc1, pc2 = X_new[:, 0], X_new[:, 1]
ax.scatter(pc1, pc2, c=y, s=40)
ax.set(title="Iris dataset (PCA)", xlabel="1st PC", ylabel="2nd PC")
plt.show()

- Perform hierarchical clustering with single linkage and Ward's linkage using [```scipy``` functions](https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree

In [None]:
# Agglomerative clustering with single linkage; preview the first five merges.
Z_s = linkage(X, method="single")
Z_s[:5]  # each row: [idx1, idx2, dist, sample_count]

In [None]:
# Agglomerative clustering with Ward's linkage; preview the first five merges.
Z_w = linkage(X, method="ward")
Z_w[:5]  # each row: [idx1, idx2, dist, sample_count]

- Plot the dendrogram and the data points in 2D, now coloring by cluster (cut the dendrogram so that the number of clusters matches the ground truth).

In [None]:
# Untruncated single-linkage dendrogram on a wide canvas so leaf labels stay legible.
fig, ax = plt.subplots(figsize=(25, 10))
dendrogram(Z_s, leaf_rotation=90., leaf_font_size=8., ax=ax)
ax.set_title('Single linkage')
ax.set_xlabel('index')
ax.set_ylabel('distance')
plt.show()

In [None]:
# Condensed view of the single-linkage tree: truncate_mode='lastp' with p=3
# keeps only the top of the tree, and the leaf labels show point counts.
fig, ax = plt.subplots()
dendrogram(
    Z_s,
    truncate_mode='lastp',
    p=3,
    show_leaf_counts=True,
    leaf_rotation=90.,
    leaf_font_size=12.,
    ax=ax,
)
ax.set_title('Single linkage (truncated)')
ax.set_xlabel('# points')
ax.set_ylabel('distance')
plt.show()

In [None]:
# Flatten the single-linkage tree into 3 clusters (the number of iris classes).
clusters = cut_tree(Z=Z_s, n_clusters=3)
clusters

In [None]:
# PCA projection coloured by the single-linkage cluster assignment.
fig, ax = plt.subplots(figsize=(10, 8))
pc1, pc2 = X_new[:, 0], X_new[:, 1]
ax.scatter(pc1, pc2, c=clusters, s=40)
ax.set(title="Single linkage clusters", xlabel="1st PC", ylabel="2nd PC")
plt.show()

In [None]:
# Untruncated Ward dendrogram on a wide canvas so leaf labels stay legible.
fig, ax = plt.subplots(figsize=(25, 10))
dendrogram(Z_w, leaf_rotation=90., leaf_font_size=8., ax=ax)
ax.set_title("Ward's linkage")
ax.set_xlabel('index')
ax.set_ylabel('distance')
plt.show()

In [None]:
# Condensed view of the Ward tree: truncate_mode='lastp' with p=3 keeps only
# the top of the tree, and the leaf labels show point counts.
fig, ax = plt.subplots()
dendrogram(
    Z_w,
    truncate_mode='lastp',
    p=3,
    show_leaf_counts=True,
    leaf_rotation=90.,
    leaf_font_size=12.,
    ax=ax,
)
ax.set_title("Ward's linkage (truncated)")
ax.set_xlabel('# points')
ax.set_ylabel('distance')
plt.show()

In [None]:
# Flatten the Ward tree into 3 clusters (the number of iris classes).
clusters = cut_tree(Z=Z_w, n_clusters=3)
clusters

In [None]:
# PCA projection coloured by the Ward-linkage cluster assignment.
fig, ax = plt.subplots(figsize=(10, 8))
pc1, pc2 = X_new[:, 0], X_new[:, 1]
ax.scatter(pc1, pc2, c=clusters, s=40)
ax.set(title="Ward's linkage clusters", xlabel="1st PC", ylabel="2nd PC")
plt.show()

- Implement your own version of spectral clustering 
- Apply it to the dataset, setting $k_C$ equal to the number of classes in the ground truth (build the graph with $k$-NN using $k_{NN}=5$). Plot the data points in 2D, coloring by cluster.

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
class KnnSpectralClustering:
    """Spectral clustering on a k-nearest-neighbour graph, written with NumPy only.

    The exercise asks for an own implementation, so nothing here relies on
    scikit-learn. Pipeline:

    1. connect every sample to its ``n_neighbors`` nearest samples (Euclidean)
       and symmetrise the graph (an edge exists if either endpoint chose it);
    2. build the symmetric normalised Laplacian  L = I - D^{-1/2} W D^{-1/2};
    3. embed the samples with the eigenvectors of the ``n_clusters`` smallest
       eigenvalues and normalise every embedded row to unit length;
    4. group the embedded rows with Lloyd's k-means, restarted ``n_init``
       times, keeping the run with the lowest inertia.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to extract (k_C).
    n_neighbors : int
        Neighbours used per sample when building the graph (k_NN).
    n_init : int, default 10
        Number of k-means restarts.
    max_iter : int, default 300
        Iteration cap for each k-means run.
    seed : int, default 0
        Seed of the random generator used for k-means initialisation, so that
        re-running the notebook reproduces the same labels.

    Attributes (set by ``fit``)
    ---------------------------
    affinity_matrix_ : ndarray of shape (n_samples, n_samples)
        Symmetric 0/1 adjacency matrix of the k-NN graph.
    labels_ : ndarray of shape (n_samples,)
        Cluster index assigned to every sample.
    """

    def __init__(self, n_clusters, n_neighbors, n_init=10, max_iter=300, seed=0):
        self.n_clusters = n_clusters
        self.n_neighbors = n_neighbors
        self.n_init = n_init
        self.max_iter = max_iter
        self.seed = seed

    def __repr__(self):
        return (f"KnnSpectralClustering(n_clusters={self.n_clusters}, "
                f"n_neighbors={self.n_neighbors})")

    @staticmethod
    def _sq_dist_to(points, centers):
        """Squared Euclidean distance from every point to every center."""
        diff = points[:, None, :] - centers[None, :, :]
        return np.einsum("ijk,ijk->ij", diff, diff)

    def _kmeans(self, points):
        """Lloyd's k-means with random restarts; returns the best label vector."""
        rng = np.random.default_rng(self.seed)
        best_labels, best_inertia = None, np.inf
        for _ in range(self.n_init):
            start = rng.choice(len(points), size=self.n_clusters, replace=False)
            centers = points[start]
            for _ in range(self.max_iter):
                labels = self._sq_dist_to(points, centers).argmin(axis=1)
                # An empty cluster keeps its previous center instead of becoming NaN.
                updated = np.array([
                    points[labels == j].mean(axis=0) if np.any(labels == j) else centers[j]
                    for j in range(self.n_clusters)
                ])
                if np.allclose(updated, centers):
                    break
                centers = updated
            dist = self._sq_dist_to(points, centers)
            labels = dist.argmin(axis=1)
            inertia = dist.min(axis=1).sum()
            if inertia < best_inertia:
                best_labels, best_inertia = labels, inertia
        return best_labels

    def fit(self, X):
        """Cluster the rows of ``X`` (n_samples, n_features); returns ``self``.

        Raises
        ------
        ValueError
            If ``n_neighbors`` is not in ``[1, n_samples - 1]``.
        """
        data = np.asarray(X, dtype=float)
        n_samples = data.shape[0]
        if not 0 < self.n_neighbors < n_samples:
            raise ValueError("n_neighbors must be between 1 and n_samples - 1")

        # Pairwise squared distances; the diagonal is pushed to +inf so that a
        # sample is never selected as its own neighbour.
        sq_norms = np.einsum("ij,ij->i", data, data)
        sq_dist = sq_norms[:, None] + sq_norms[None, :] - 2.0 * (data @ data.T)
        np.fill_diagonal(sq_dist, np.inf)
        neighbours = np.argsort(sq_dist, axis=1, kind="stable")[:, : self.n_neighbors]

        directed = np.zeros((n_samples, n_samples))
        rows = np.repeat(np.arange(n_samples), self.n_neighbors)
        directed[rows, neighbours.ravel()] = 1.0
        affinity = np.maximum(directed, directed.T)

        # Every node has at least n_neighbors edges, so the degree is never zero.
        inv_sqrt_degree = 1.0 / np.sqrt(affinity.sum(axis=1))
        laplacian = (np.eye(n_samples)
                     - inv_sqrt_degree[:, None] * affinity * inv_sqrt_degree[None, :])

        # eigh returns eigenvalues in ascending order: the first columns are
        # the eigenvectors of the smallest eigenvalues.
        _, eigvecs = np.linalg.eigh(laplacian)
        embedding = eigvecs[:, : self.n_clusters]
        row_norms = np.linalg.norm(embedding, axis=1, keepdims=True)
        embedding = embedding / np.where(row_norms > 0, row_norms, 1.0)

        self.affinity_matrix_ = affinity
        self.labels_ = self._kmeans(embedding)
        return self


sc = KnnSpectralClustering(n_clusters=3,   # k_C
                           n_neighbors=5)  # k_NN
sc.fit(X)

In [None]:
# Cluster index assigned to each sample by the fitted spectral model.
# Note: this rebinds `clusters`, which earlier cells used for the hierarchical results.
clusters = sc.labels_

In [None]:
plt.figure(figsize=(10,8))
plt.scatter(X_new[:,0], X_new[:,1], c=clusters, s = 40)
plt.title("Spectral clustering")
plt.xlabel("1st PC")
plt.ylabel("2nd PC")
plt.show()