# Clustering

In [None]:
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering
# `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# `make_blobs` is importable from `sklearn.datasets` in old and new releases alike.
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics.pairwise import pairwise_distances_argmin

%matplotlib inline

## 1. Iris dataset

### 1.1. k-means Clustering

In [None]:
## Load the iris dataset bundled with scikit-learn. The cells below read its
## `data`, `target`, `feature_names` and `target_names` attributes.
iris = datasets.load_iris()

In [None]:
# Feature names; used below as the column labels of the DataFrame.
iris.feature_names

In [None]:
# Class label of every sample; used below as the ground truth `y`.
iris.target

In [None]:
# Class names of the dataset (presumably in the order of the label codes in iris.target).
iris.target_names

In [None]:
## Tabulate the measurements (one column per feature) together with the class label
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

## Scatter-plot matrix of every pair of columns, coloured by class label
sns.pairplot(data, hue='target')
plt.show()

In [None]:
# Feature matrix and ground-truth labels shared by every clustering cell below.
X, y = iris.data, iris.target

In [None]:
# IPython help syntax: shows the KMeans docstring (exploration aid, not needed for the analysis).
KMeans?

In [None]:
# Fit k-means for k = 2..9 and score every clustering.
k_set = range(2, 10)
names = []              # display label of each run
models = []             # fitted KMeans estimators
results = []            # cluster id assigned to every sample, one array per run
silhouette_scores = []  # internal quality: computed from X and the cluster ids only
mutual_scores = []      # external quality: agreement with the true labels y
for k in k_set:
    names.append('KMeans_with_k=%d' % k)
    # `n_jobs` is intentionally not passed: it was deprecated in scikit-learn 0.23
    # and removed in 1.0, where passing it raises TypeError.
    # `random_state` pins the random centroid initialisation so that re-running
    # the notebook reproduces the same scores.
    # `verbose` is left at its default so per-iteration logs do not flood the output.
    model = KMeans(n_clusters=k, max_iter=30, n_init=10, random_state=0)
    # Fit the model and get the cluster id of every sample in one call.
    result = model.fit_predict(X)

    models.append(model)
    results.append(result)
    # Silhouette score
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
    silhouette_scores.append(silhouette_score(X, result, metric='euclidean'))
    # Adjusted mutual information between the true labels and the cluster ids
    mutual_scores.append(adjusted_mutual_info_score(y, result))

In [None]:
# Re-wrap both score lists as Series labelled by run name.
silhouette_scores, mutual_scores = (
    pd.Series(scores, index=names)
    for scores in (silhouette_scores, mutual_scores)
)

In [None]:
# Silhouette score of each k-means run, indexed by run name.
silhouette_scores

In [None]:
# Adjusted mutual information of each k-means run against the true labels, indexed by run name.
mutual_scores

Clustering 결과를 평가하는 방법은 매우 다양합니다. 자세한 내용은 scikit-learn 공식 문서의 [Clustering performance evaluation](http://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation)을 참고해주세요.

### 1.2. Hierarchical Agglomerative Clustering

In [None]:
# Ward-linkage agglomerative clustering for k = 2..9, with the same two scores per run.
k_set = range(2, 10)
names = []              # display label of each run
models = []             # fitted AgglomerativeClustering estimators
results = []            # cluster id assigned to every sample, one array per run
silhouette_scores = []  # internal quality: computed from X and the cluster ids only
mutual_scores = []      # external quality: agreement with the true labels y
for k in k_set:
    names.append('HC_k=%d' % k)
    # The distance keyword is omitted on purpose: Ward linkage only supports
    # Euclidean distance, which is already the default, and the `affinity`
    # keyword was deprecated in scikit-learn 1.2 and removed in 1.4
    # (renamed to `metric`). Omitting it works on old and new releases.
    model = AgglomerativeClustering(n_clusters=k, linkage='ward')
    # Fit the model and get the cluster id of every sample in one call.
    result = model.fit_predict(X)
    models.append(model)
    results.append(result)
    # Silhouette score
    silhouette_scores.append(silhouette_score(X, result, metric='euclidean'))
    # Adjusted mutual information between the true labels and the cluster ids
    mutual_scores.append(adjusted_mutual_info_score(y, result))

In [None]:
# Label each score list with the run names so it displays and plots as a Series.
silhouette_scores = pd.Series(data=silhouette_scores, index=names)
mutual_scores = pd.Series(data=mutual_scores, index=names)

In [None]:
# Silhouette score of each agglomerative run, indexed by run name.
silhouette_scores

In [None]:
# Adjusted mutual information of each agglomerative run against the true labels, indexed by run name.
mutual_scores

In [None]:
# Silhouette score per run. The figure is titled and labelled so it can be read
# on its own, and plt.show() keeps the Axes repr out of the cell output.
ax = silhouette_scores.plot(marker='o', title='Silhouette score per run')
ax.set_xlabel('run')
ax.set_ylabel('silhouette score')
plt.show()

In [None]:
# Adjusted mutual information per run. The figure is titled and labelled so it can
# be read on its own, and plt.show() keeps the Axes repr out of the cell output.
ax = mutual_scores.plot(marker='o', title='Adjusted mutual information per run')
ax.set_xlabel('run')
ax.set_ylabel('adjusted mutual information')
plt.show()

- scikit-learn 내에 있는 AgglomerativeClustering은 dendrogram을 지원하지 않습니다.
- scipy를 활용하여 그림을 그려봅시다.

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Build the Ward linkage matrix for the iris samples.
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
Z = linkage(X, method='ward')

In [None]:
# Shape of the linkage matrix (per the scipy linkage docs: one row per merge, four columns).
Z.shape

In [None]:
# First five rows of the linkage matrix.
Z[:5]

In [None]:
# Draw the complete dendrogram of the linkage matrix Z.
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
# leaf_rotation / leaf_font_size: rotate and shrink the x axis labels
dendrogram(Z, leaf_rotation=90., leaf_font_size=8.)
plt.show()

In [None]:
# Draw a truncated dendrogram: only the last p = 12 merged clusters are shown.
# The title and x label say so explicitly, because in this mode a leaf label in
# parentheses is the size of a collapsed cluster rather than a sample index.
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index or (cluster size)')
plt.ylabel('distance')
dendrogram(
    Z,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=12,                   # number of merged clusters kept as leaves
    leaf_rotation=90.,      # rotates the x axis labels
    leaf_font_size=8.,      # font size for the x axis labels
    show_contracted=True,   # mark the heights of the collapsed merges on the branches
)
plt.show()

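Next we use the `fancy_dendrogram` helper from [this tutorial](https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/). It wraps scipy's `dendrogram`, annotates each merge with its distance, and draws a horizontal cut-off line at `max_d`.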

In [None]:
def fancy_dendrogram(*args, **kwargs):
    """Draw a scipy dendrogram annotated with merge distances and a cut-off line.

    All positional and keyword arguments are forwarded unchanged to
    ``scipy.cluster.hierarchy.dendrogram``, except the two extra keywords below,
    which are consumed here.

    Keyword Args:
        max_d: Optional cut-off distance. When given, a horizontal black line is
            drawn at this height and, unless ``color_threshold`` is passed
            explicitly, it is also used as the colour threshold.
        annotate_above: Only merges whose distance is strictly greater than this
            value get a marker and a distance label (default 0).

    Returns:
        The dictionary returned by ``dendrogram``.
    """
    max_d = kwargs.pop('max_d', None)
    # Compare against None instead of relying on truthiness, so that an explicit
    # max_d=0 is honoured rather than silently treated as "not given".
    if max_d is not None and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    if not kwargs.get('no_plot', False):
        # Decorate the axes the dendrogram was drawn on: the caller's `ax` if one
        # was passed through to dendrogram, otherwise the current pyplot axes.
        ax = kwargs.get('ax')
        if ax is None:
            ax = plt.gca()
        ax.set_title('Hierarchical Clustering Dendrogram (truncated)')
        ax.set_xlabel('sample index or (cluster size)')
        ax.set_ylabel('distance')
        for xs, ys, color in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
            # Each link is drawn through four points; points 1 and 2 form its
            # horizontal bar, so annotate the middle of that bar at its height.
            x = 0.5 * sum(xs[1:3])
            y = ys[1]
            if y > annotate_above:
                ax.plot(x, y, 'o', c=color)
                ax.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                            textcoords='offset points',
                            va='top', ha='center')
        if max_d is not None:
            ax.axhline(y=max_d, c='k')
    return ddata

In [None]:
# Full dendrogram with merge distances annotated and a cut-off line at max_d.
plt.figure(figsize=(25, 10))
fancy_dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
    max_d=15,           # cut-off distance: drawn as a horizontal line
)
# fancy_dendrogram sets its own title and axis labels ("... (truncated)"), which
# would overwrite anything set before the call. This plot is not truncated, so
# the intended labels are applied afterwards.
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
plt.show()

## 참고 (in official website of `scikit-learn`)
- Comparing different clustering algorithms on toy datasets: [Link](http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py)
- Empirical evaluation of the impact of k-means initialization: [Link](http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_stability_low_dim_dense.html)