# SDA - lecture 13 - Clustering

In [None]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Use a fixed seed for NumPy's global random generator so that values drawn
# through np.random (e.g. np.random.randint) are reproducible across runs
np.random.seed(42)

from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

import scipy.cluster.hierarchy as shc
from scipy.spatial.distance import cdist

%matplotlib inline

### Hierarchical clustering using sklearn

Author: Yuval Samoilov-Katz

Cluster the students based on their grades.

Use different linkage criteria to generate different hierarchies and compare them.

In [None]:
# Synthetic grades: a 10x10 integer matrix with values in [60, 100),
# one column per student.
X = np.random.randint(60, 100, size=(10, 10))
students = ['Shir', 'Nadav', 'Yuval', 'Yarden', 'Noam',
            'Yonatan', 'Adi', 'Ori', 'Tal', 'Shuki']
df = pd.DataFrame(data=X, columns=students)
display(df.head(3))

# One panel per linkage criterion, laid out side by side
fig, axes = plt.subplots(figsize=(12, 8), nrows=1, ncols=3)

# Linkage methods and the human-readable description shown in each panel title
methods = ['average', 'complete', 'ward']
exp = ['Average Distances', 'Maximum Distances', 'Minimum Variance']

# Iterate over different linkage criteria
for ax, m, description in zip(axes, methods, exp):
    # X.T has one row per student, so the students are the observations
    # being merged in the hierarchy (and the leaves labelled in the dendrogram).
    shc.dendrogram(shc.linkage(X.T, method=m), labels=students,
                   orientation='right', ax=ax)
    ax.set_title(f'{description} Hierarchy')
plt.tight_layout()

### DBSCAN using sklearn

Author: Yuval Samoilov-Katz

* Credit to the sklearn DBSCAN demo, with a few adjustments for exploring different epsilon values

In [None]:
# Generate sample data: 750 points drawn around three fixed centers
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)

# Standardize each feature before clustering
X = StandardScaler().fit_transform(X)

# Epsilon values to compare (min_samples is held fixed at 10 below)
epes = [.1, .2, .3]

for e in epes:
    # Fit the model
    db = DBSCAN(eps=e, min_samples=10).fit(X)
    # Boolean mask that is True for the samples DBSCAN marked as core samples
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Label -1 marks noise, so it is excluded from the cluster count
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print(f'epsilon:{e}')
    print(f'Estimated number of clusters: {n_clusters_}')
    print(f'Estimated number of noise points: {n_noise_}')
    print("Homogeneity: {:.3f}".format(metrics.homogeneity_score(labels_true, labels)))

    # One color per distinct label, spread evenly over the Spectral colormap
    unique_labels = set(labels)
    colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]

    # Draw on an explicit, fresh Axes for each epsilon instead of relying on
    # pyplot's implicit "current figure", which may be a figure left open earlier.
    fig, ax = plt.subplots()

    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]

        class_member_mask = (labels == k)

        # Core samples of this label: large markers
        xy = X[class_member_mask & core_samples_mask]
        ax.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                markeredgecolor='k', markersize=14)

        # Non-core samples of this label: small markers
        xy = X[class_member_mask & ~core_samples_mask]
        ax.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                markeredgecolor='k', markersize=6)

    ax.set_title(f'epsilon:{e}\nEstimated number of clusters: {n_clusters_}')
    plt.show()