# Agglomerative Clustering
### Data
* https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz

### Library Import

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt

### Data Loading


In [None]:
# Load the training set from the course server into a DataFrame and preview
# its first rows.
source_url = 'https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz'
train_data = pd.read_csv(source_url)
train_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,doy_108,price_locality_name_median,target
0,105.0,3.0,3,50.0,95000,2.456912,0,2.261905,1
1,40.0,3.0,1,19.200001,25000,3.028689,0,1.0,2
2,37.599998,2.64,0,19.0,26000,3.091993,0,0.619048,2
3,80.0,3.0,3,49.0,35000,3.10101,0,1.25,2
4,100.0,3.0,3,49.0,80000,2.495468,0,1.904762,3


### Sample Splitting

In [None]:
# Hold out 20% of the rows for testing. Each part is wrapped in its own
# DataFrame object -- presumably so that new columns can be assigned to it
# later without pandas copy warnings (TODO confirm).
train, test = (
    pd.DataFrame(part)
    for part in train_test_split(train_data, test_size=0.2)
)

### Agglomerative Clustering
At each step, the two nearest clusters (initially individual points) are merged into one.



In [None]:
# Number of flat clusters to cut the hierarchy into.
clusters_total = 100
# Single linkage: the distance between two clusters is the distance between
# their closest members.
aglo = AgglomerativeClustering(n_clusters=clusters_total, linkage="single")
# To construct a dendrogram, fit the full tree instead:
# aglo = AgglomerativeClustering(n_clusters=None, distance_threshold=0, linkage="single")
# Cluster on the features only. The class label ("target") is what the
# clusters are later scored against, so it must not take part in the distance
# computation; a "target_cluster" column left over from a previous run of this
# cell must not be fed back in as a feature either.
features = train.drop(columns=["target", "target_cluster"], errors="ignore")
train["target_cluster"] = aglo.fit_predict(features)

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw a SciPy dendrogram for a fitted hierarchical clustering model.

    Builds a linkage matrix with one row per merge, laid out as
    ``[child_a, child_b, distance, sample_count]``, and hands it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        model: Fitted estimator exposing ``children_``, ``labels_`` and
            ``distances_``.
        **kwargs: Forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)
    counts = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        # Ids below leaf_total are original samples (size 1); larger ids
        # refer to the cluster created by merge number (id - leaf_total).
        counts[row] = sum(
            1 if node < leaf_total else counts[node - leaf_total]
            for node in pair
        )
    columns = [merges, model.distances_, counts]
    linkage_matrix = np.column_stack(columns).astype(float)
    dendrogram(linkage_matrix, **kwargs)

In [None]:
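# Uncomment to draw the dendrogram. This needs a model that exposes
# `distances_`, e.g. one fitted with n_clusters=None, distance_threshold=0.
#plt.title('Hierarchical Dendrogram')
#plot_dendrogram(aglo)
#plt.xlabel("Number of data objects")
#plt.show()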

Calculate the "purity" of the clustering: for each cluster, the share of its items that belong to its most common class, averaged over all clusters.

In [None]:
# Count rows for every (cluster, class) pair; the result is a Series indexed
# by (target_cluster, target).
groups = train.groupby(["target_cluster", "target"]).count()["total_area"]
clusters_items = [0] * clusters_total    # total rows in each cluster
clusters_popular = [0] * clusters_total  # rows of the most common class
clusters_class = [0] * clusters_total    # label of the most common class
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# ((cluster, class), count) pairs on old and new pandas versions alike.
for group in groups.items():
    print(group)
    (cluster, target_class), items = group
    clusters_items[cluster] += items
    if items > clusters_popular[cluster]:
        clusters_popular[cluster] = items
        clusters_class[cluster] = target_class
# Purity: share of the most common class in each cluster, averaged over all
# clusters with equal weight.
purity = 0
for i in range(clusters_total):
    purity += clusters_popular[i] / clusters_items[i] / clusters_total
print("Purity: ", purity)  # 0.83

((0, 1), 4)
((0, 3), 2)
((0, 4), 1)
((0, 5), 7)
((1, 1), 4)
((1, 2), 1)
((1, 3), 12)
((1, 4), 15)
((1, 5), 36)
((2, 5), 2)
((3, 1), 1)
((3, 3), 2)
((3, 4), 3)
((3, 5), 6)
((4, 1), 6)
((4, 2), 8)
((4, 3), 19)
((4, 4), 29)
((4, 5), 111)
((5, 3), 1)
((5, 5), 2)
((6, 1), 1)
((6, 4), 2)
((6, 5), 2)
((7, 1), 58005)
((7, 2), 64721)
((7, 3), 61979)
((7, 4), 39862)
((7, 5), 59833)
((8, 1), 3)
((8, 2), 1)
((8, 3), 6)
((8, 4), 1)
((8, 5), 13)
((9, 2), 1)
((9, 3), 1)
((9, 4), 1)
((9, 5), 14)
((10, 2), 1)
((10, 4), 1)
((10, 5), 8)
((11, 1), 1)
((11, 2), 1)
((11, 4), 1)
((11, 5), 2)
((12, 1), 1)
((12, 2), 1)
((12, 3), 1)
((12, 5), 10)
((13, 3), 1)
((13, 5), 4)
((14, 2), 1)
((14, 5), 1)
((15, 5), 3)
((16, 2), 3)
((16, 3), 1)
((16, 4), 1)
((16, 5), 10)
((17, 1), 3)
((17, 3), 10)
((17, 4), 6)
((17, 5), 13)
((18, 1), 4)
((18, 2), 5)
((18, 3), 9)
((18, 4), 12)
((18, 5), 61)
((19, 2), 1)
((19, 3), 2)
((19, 4), 1)
((19, 5), 1)
((20, 1), 2)
((20, 2), 3)
((20, 3), 5)
((20, 4), 4)
((20, 5), 25)
((21, 2), 1)
(