In [None]:
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt, cm
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_samples
from sklearn.model_selection import GridSearchCV
from yellowbrick.cluster import silhouette_visualizer
from scipy.cluster.hierarchy import dendrogram, ward, linkage

In [None]:
# Read the three CSV files of the dataset and keep only their underlying
# NumPy arrays. Rows of `images` are later reshaped to 32x32x3 for display;
# presumably X holds the features and y the labels -- TODO confirm.
X, y, images = (
    pd.read_csv("./dataset/X.csv").values,
    pd.read_csv("./dataset/y.csv").values,
    pd.read_csv("./dataset/images.csv").values,
)


In [None]:
def shape_parameters(n_clusters):
    """Wrap ``n_clusters`` in a one-entry dict keyed by ``"n_clusters"``.

    The result has the shape expected by ``run_grid_kmeans`` as its
    ``hyper_parameters`` argument (a parameter-name -> value(s) mapping).
    """
    return {"n_clusters": n_clusters}


def run_grid_kmeans(hyper_parameters, X_data):
    """Run a 2-fold grid search over KMeans hyper-parameters.

    Parameters
    ----------
    hyper_parameters : dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    X_data : array-like
        Samples the search is fitted on (no targets are passed).

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # No `scoring` is given, so the ranking presumably relies on the
    # estimator's own `score` method -- verify this is the intended metric.
    search = GridSearchCV(estimator=KMeans(), param_grid=hyper_parameters, cv=2)
    search.fit(X_data)
    return search


def display_dendrogram(X_data, linkage_type):
    """Compute a hierarchical linkage of ``X_data`` and plot its dendrogram.

    Parameters
    ----------
    X_data : array-like
        Observations passed to ``scipy.cluster.hierarchy.linkage``.
    linkage_type : str
        Linkage method name forwarded to ``linkage`` (the notebook uses
        'ward', 'single' and 'complete').
    """
    linkage_array = linkage(X_data, linkage_type)
    # Only the top 10 levels of the tree are drawn to keep the plot readable.
    dendrogram(linkage_array, truncate_mode='level', p=10)
    # `plt.show` takes no positional arguments; the dendrogram has already
    # been drawn on the current axes by the call above.
    plt.show()


def display_images(image_collection, image_indices):
    """Show the selected images side by side in a single row.

    Parameters
    ----------
    image_collection : indexable of array-like
        Collection of flattened images; each selected entry is reshaped to
        32x32x3 before being drawn.
    image_indices : sequence of int
        Positions in ``image_collection`` of the images to show. May hold
        any number of entries, including one or none.
    """
    n_images = len(image_indices)
    if n_images == 0:
        # A figure with zero columns cannot be created; nothing to draw.
        print("No images to display.")
        return

    # squeeze=False always yields a 2-D array of axes, so a single image is
    # handled the same way as several.
    fig, axes = plt.subplots(ncols=n_images, squeeze=False)
    for ax, image_index in zip(axes[0], image_indices):
        ax.imshow(image_collection[image_index].reshape(32, 32, 3))
        # Strip every axis decoration so that only the picture remains.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.tick_params(
            bottom=False, labelbottom=False,
            left=False, labelleft=False
        )
        ax.grid(False)
    plt.show()


def display_silhouette_plot(X_data, agg_labels):
    """Draw a silhouette plot for a clustering of ``X_data``.

    Each cluster is drawn as a band of horizontal bars, one bar per sample,
    sorted by silhouette coefficient (Euclidean metric). A dashed red
    vertical line marks the mean coefficient over all samples, and the
    y-axis ticks carry the cluster labels shifted by one.

    Parameters
    ----------
    X_data : array-like
        Samples that were clustered.
    agg_labels : numpy.ndarray
        Cluster label of every sample.
    """
    unique_labels = np.unique(agg_labels)
    label_count = unique_labels.shape[0]
    sample_scores = silhouette_samples(X_data, agg_labels, metric='euclidean')

    band_start = 0
    tick_positions = []
    for position, label in enumerate(unique_labels):
        band_scores = np.sort(sample_scores[agg_labels == label])
        band_end = band_start + len(band_scores)
        plt.barh(
            range(band_start, band_end),
            band_scores,
            height=1.0,
            edgecolor='none',
            color=cm.jet(float(position) / label_count)
        )
        # The tick sits in the middle of the band belonging to this cluster.
        tick_positions.append((band_start + band_end) / 2.)
        band_start = band_end

    plt.axvline(np.mean(sample_scores), color='red', linestyle='--')
    plt.yticks(tick_positions, unique_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette Coefficient')
    plt.tight_layout()
    plt.show()


def index_from_value(value_list_primary, value_list_refined):
    """Locate each refined value inside the primary array.

    For every entry of ``value_list_refined`` the position of its first
    exact match in ``value_list_primary`` is returned, in the same order.
    An ``IndexError`` is raised when a value has no match.
    """
    return [
        np.nonzero(value_list_primary == wanted)[0][0]
        for wanted in value_list_refined
    ]

def get_core_indices(silhouette_values, pred_labels, core, top_n=5):
    """Return the indices of the highest-silhouette samples of one cluster.

    Parameters
    ----------
    silhouette_values : array-like of float
        Silhouette coefficient of every sample.
    pred_labels : array-like
        Cluster label of every sample, aligned with ``silhouette_values``.
    core : label
        Cluster whose samples are ranked.
    top_n : int, default 5
        Maximum number of indices to return.

    Returns
    -------
    list of int
        Positions (into the full sample arrays) of up to ``top_n`` members
        of the cluster, ordered by decreasing silhouette coefficient. Ties
        are listed in increasing index order.
    """
    scores = np.asarray(silhouette_values)
    member_indices = np.flatnonzero(np.asarray(pred_labels) == core)
    # Rank positions rather than values: looking a value up afterwards could
    # land on a sample of another cluster, or repeat one index for ties.
    ranking = np.argsort(-scores[member_indices], kind='stable')
    return member_indices[ranking[:top_n]].tolist()

def get_boundary_indices(silhouette_values, pred_labels, core, tolerance=0.001):
    """Return the indices of one cluster's samples with near-zero silhouette.

    Parameters
    ----------
    silhouette_values : array-like of float
        Silhouette coefficient of every sample.
    pred_labels : array-like
        Cluster label of every sample, aligned with ``silhouette_values``.
    core : label
        Cluster whose samples are inspected.
    tolerance : float, default 0.001
        A sample counts as a boundary sample when its coefficient lies
        strictly between ``-tolerance`` and ``tolerance``.

    Returns
    -------
    list of int
        Positions (into the full sample arrays) of the matching samples,
        in increasing order.
    """
    scores = np.asarray(silhouette_values)
    in_cluster = np.asarray(pred_labels) == core
    near_zero = np.abs(scores) < tolerance
    # Combine both masks on positions; looking values up afterwards could
    # return a sample of another cluster sharing the same score (e.g. 0.0).
    return np.flatnonzero(in_cluster & near_zero).tolist()

## Ward linkage

ward dendrogram

In [None]:
# Dendrogram of the data under Ward linkage.
display_dendrogram(X_data=X, linkage_type='ward')


Ward silhouette

In [None]:
# Agglomerative clustering with Ward linkage, cut into five clusters;
# y_agg_ward holds the resulting cluster label of every sample.
agg_ward = AgglomerativeClustering(linkage='ward', n_clusters=5)
y_agg_ward = agg_ward.fit_predict(X)


In [None]:
# Silhouette plot of the Ward-linkage clustering.
display_silhouette_plot(X, y_agg_ward)


## Single-link
dendrogram

In [None]:
# Temporarily raise the interpreter recursion limit while the single-linkage
# dendrogram is drawn (presumably the tree is too deep for the default
# limit), then restore whatever limit was active before -- even on failure.
previous_recursion_limit = sys.getrecursionlimit()
sys.setrecursionlimit(9000)
print(sys.getrecursionlimit())
try:
    display_dendrogram(X, 'single')
finally:
    sys.setrecursionlimit(previous_recursion_limit)


single silhouette

In [None]:
# Single-linkage agglomerative clustering, run under a temporarily raised
# recursion limit that is restored afterwards even if fitting fails.
# NOTE(review): n_clusters is not set here, so the library default applies,
# whereas the Ward and complete-linkage runs use n_clusters=5 -- confirm
# this is intended.
previous_recursion_limit = sys.getrecursionlimit()
sys.setrecursionlimit(10000)
try:
    agg_single = AgglomerativeClustering(
        linkage='single'
    )
    y_agg_single = agg_single.fit_predict(X)
finally:
    sys.setrecursionlimit(previous_recursion_limit)


In [None]:
# Inspect the single-linkage result: the distinct labels, the label array
# itself, and how many samples were assigned to cluster 1.
print(np.unique(y_agg_single))
print(y_agg_single)

silhouette_values = silhouette_samples(X, y_agg_single, metric='euclidean')

c_silhouette_values = silhouette_values[y_agg_single == 1]
print(c_silhouette_values.shape[0])

In [None]:
# Silhouette plot of the single-linkage clustering.
display_silhouette_plot(X, y_agg_single)


## Complete linkage

In [None]:
# Dendrogram of the data under complete linkage.
display_dendrogram(X_data=X, linkage_type='complete')

In [None]:
# Agglomerative clustering with complete linkage, cut into five clusters;
# y_agg_complete holds the resulting cluster label of every sample.
agg_complete = AgglomerativeClustering(linkage='complete', n_clusters=5)
y_agg_complete = agg_complete.fit_predict(X)

In [None]:
# Silhouette plot of the complete-linkage clustering.
display_silhouette_plot(X, y_agg_complete)


In [None]:
# Per-sample silhouette coefficients (Euclidean metric) of the
# complete-linkage clustering; the core/boundary lookups below read them.
silhouette_values = silhouette_samples(X, y_agg_complete, metric='euclidean')

Look up the silhouette values at the core and at the boundary of each cluster.

Find the actual samples that lie at the core and at the boundary of each cluster.

In [None]:
# Core (highest-silhouette) samples of cluster 0.
indices_at_core_0 = get_core_indices(
    silhouette_values=silhouette_values,
    pred_labels=y_agg_complete,
    core=0,
)
print(indices_at_core_0)


In [None]:

# Show the core samples of cluster 0.
display_images(image_collection=images, image_indices=indices_at_core_0)

In [None]:
# Core (highest-silhouette) samples of cluster 1.
indices_at_core_1 = get_core_indices(
    silhouette_values=silhouette_values,
    pred_labels=y_agg_complete,
    core=1,
)
print(indices_at_core_1)


In [None]:
# Show the core samples of cluster 1.
display_images(image_collection=images, image_indices=indices_at_core_1)


In [None]:
# Core (highest-silhouette) samples of cluster 2.
indices_at_core_2 = get_core_indices(
    silhouette_values=silhouette_values,
    pred_labels=y_agg_complete,
    core=2,
)
print(indices_at_core_2)

In [None]:
# Show the core samples of cluster 2.
display_images(image_collection=images, image_indices=indices_at_core_2)

In [None]:
# Core (highest-silhouette) samples of cluster 3.
indices_at_core_3 = get_core_indices(
    silhouette_values=silhouette_values,
    pred_labels=y_agg_complete,
    core=3,
)
print(indices_at_core_3)



In [None]:
# Show the core samples of cluster 3.
display_images(image_collection=images, image_indices=indices_at_core_3)

In [None]:
# Core (highest-silhouette) samples of cluster 4.
indices_at_core_4 = get_core_indices(
    silhouette_values=silhouette_values,
    pred_labels=y_agg_complete,
    core=4,
)
print(indices_at_core_4)

In [None]:
# Show the core samples of cluster 4.
display_images(image_collection=images, image_indices=indices_at_core_4)

In [None]:
# Boundary (near-zero silhouette) samples of cluster 1.
indices_at_boundary_1 = get_boundary_indices(
    silhouette_values=silhouette_values,
    pred_labels=y_agg_complete,
    core=1,
)
print(indices_at_boundary_1)

In [None]:
# Show the sixth to tenth boundary samples of cluster 1.
display_images(
    image_collection=images,
    image_indices=indices_at_boundary_1[5:10],
)


In [None]:
# Boundary (near-zero silhouette) samples of cluster 0.
indices_at_boundary_0 = get_boundary_indices(
    silhouette_values=silhouette_values,
    pred_labels=y_agg_complete,
    core=0,
)
print(indices_at_boundary_0)


In [None]:
# Show the first two boundary samples of cluster 0.
display_images(
    image_collection=images,
    image_indices=indices_at_boundary_0[0:2],
)


In [None]:
# Boundary (near-zero silhouette) samples of cluster 1, computed again with
# the same arguments as the earlier lookup for this cluster.
indices_at_boundary_1 = get_boundary_indices(
    silhouette_values=silhouette_values,
    pred_labels=y_agg_complete,
    core=1,
)
print(indices_at_boundary_1)


In [None]:
# Show the first five boundary samples of cluster 1.
display_images(
    image_collection=images,
    image_indices=indices_at_boundary_1[0:5],
)

In [None]:
# Boundary (near-zero silhouette) samples of cluster 3.
indices_at_boundary_3 = get_boundary_indices(
    silhouette_values=silhouette_values,
    pred_labels=y_agg_complete,
    core=3,
)
print(indices_at_boundary_3)


In [None]:
# Show the first five boundary samples of cluster 3.
display_images(
    image_collection=images,
    image_indices=indices_at_boundary_3[0:5],
)

In [None]:
# Boundary (near-zero silhouette) samples of cluster 4.
indices_at_boundary_4 = get_boundary_indices(
    silhouette_values=silhouette_values,
    pred_labels=y_agg_complete,
    core=4,
)
print(indices_at_boundary_4)


In [None]:
# Show the first five boundary samples of cluster 4.
display_images(
    image_collection=images,
    image_indices=indices_at_boundary_4[0:5],
)

In [None]:
# Show the sixth to tenth boundary samples of cluster 4.
display_images(
    image_collection=images,
    image_indices=indices_at_boundary_4[5:10],
)