In [1]:
%matplotlib inline

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_wine
from sklearn.cluster import AgglomerativeClustering, KMeans

# Agglomerative Clustering Demo

Wine data description in [here](https://scikit-learn.org/stable/datasets/toy_dataset.html#wine-dataset).

The data is the results of a chemical analysis of wines grown in the same region in Italy by three different cultivators. There are thirteen different measurements taken for different constituents found in the three types of wine.

**Objective** of the clustering analysis is to group the wines per cultivator based on the chemical data.

In [16]:
load_wine().keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [17]:
wine = load_wine()
attributes = wine.data
classes = wine.target

In [19]:
attributes.shape

(178, 13)

In [22]:
np.unique(classes)

array([0, 1, 2])

In [23]:
aggl = AgglomerativeClustering(n_clusters=3, )

In [24]:
predicted_classes = aggl.fit_predict(attributes)

In [41]:
def rename_unique(arr):
    unique_values = pd.unique(arr)
    mapping = {value: i for i, value in enumerate(unique_values)}
    new_arr = np.array([mapping[value] for value in arr])
    
    return new_arr

In [48]:
deltas = rename_unique(predicted_classes) - rename_unique(classes)
correct = pd.value_counts(deltas, normalize=True)[0]

0.47752808988764045

In [52]:
linkages = ['ward', 'complete', 'average', 'single']

for linkage in linkages:
    aggl = AgglomerativeClustering(n_clusters=3, linkage=linkage)
    predicted_classes = aggl.fit_predict(attributes)
    diffs = rename_unique(predicted_classes) - rename_unique(classes)
    accuracy = pd.value_counts(diffs, normalize=True)[0]
    print(f'{linkage} - {accuracy:.2f}')

ward - 0.48
complete - 0.48
average - 0.49
single - 0.30


In [55]:
km = KMeans(n_clusters=3, init='k-means++')
predicted_classes = km.fit_predict(attributes)
diffs = rename_unique(predicted_classes) - rename_unique(classes)
accuracy = pd.value_counts(diffs, normalize=True)[0]
print(f'k-means++ - {accuracy:.2f}')

k-means++ - 0.48


In [59]:
linkages = ['complete', 'average', 'single']
affinities = ["euclidean", "l1", "l2", "manhattan", "cosine"]

for linkage in linkages:
    for affinity in affinities:
        aggl = AgglomerativeClustering(n_clusters=3, linkage=linkage, affinity=affinity)
        predicted_classes = aggl.fit_predict(attributes)
        diffs = rename_unique(predicted_classes) - rename_unique(classes)
        accuracy = pd.value_counts(diffs, normalize=True)[0]
        print(f'{linkage} \t {affinity} {accuracy:.2f}')

complete 	 euclidean 0.48
complete 	 l1 0.48
complete 	 l2 0.48
complete 	 manhattan 0.48
complete 	 cosine 0.47
average 	 euclidean 0.49
average 	 l1 0.40
average 	 l2 0.49
average 	 manhattan 0.40
average 	 cosine 0.48
single 	 euclidean 0.30
single 	 l1 0.33
single 	 l2 0.30
single 	 manhattan 0.33
single 	 cosine 0.41
