# Demo: K-means

Introduce a scroll bar to control the number of clusters

In [16]:
%matplotlib qt
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider, Button, RadioButtons

import time
import warnings
from itertools import cycle, islice

from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

## Generate datasets

In [17]:
n_samples = 1000
seed = 30

noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05, random_state=seed)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05, random_state=seed)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=seed)
rng = np.random.RandomState(seed)
no_structure = rng.rand(n_samples, 2), None

# Anisotropicly distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# blobs with varied variances
varied = datasets.make_blobs(
    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
)

datasets = {
    "data 1":(
        noisy_circles,
        {
            "damping": 0.77,
            "preference": -240,
            "quantile": 0.2,
            "n_clusters": 2,
            "min_samples": 7,
            "xi": 0.08,
        },
    ),
    "data 2":(
        noisy_moons,
        {
            "damping": 0.75,
            "preference": -220,
            "n_clusters": 2,
            "min_samples": 7,
            "xi": 0.1,
        },
    ),
    "data 3":(
        varied,
        {
            "eps": 0.18,
            "n_neighbors": 2,
            "min_samples": 7,
            "xi": 0.01,
            "min_cluster_size": 0.2,
        },
    ),
    "data 4":(
        aniso,
        {
            "eps": 0.15,
            "n_neighbors": 2,
            "min_samples": 7,
            "xi": 0.1,
            "min_cluster_size": 0.2,
        },
    ),
    "data 5":(blobs, {"min_samples": 7, "xi": 0.1, "min_cluster_size": 0.2}),
    "data 6":(no_structure, {}),
}

# Initiate classifier

In [18]:
default_base = {
    "quantile": 0.3,
    "eps": 0.3,
    "damping": 0.9,
    "preference": -200,
    "n_neighbors": 3,
    "n_clusters": 2,
    "min_samples": 7,
    "xi": 0.05,
    "min_cluster_size": 0.1,
    "allow_single_cluster": True,
    "hdbscan_min_cluster_size": 15,
    "hdbscan_min_samples": 3,
    "random_state": None,
}

def get_clustering_algorithms(algo_params):
    params = default_base.copy()
    params.update(algo_params)

    two_means = cluster.MiniBatchKMeans(
            n_clusters=params["n_clusters"],
            random_state=params["random_state"],
        )

    spectral = cluster.SpectralClustering(
            n_clusters=params["n_clusters"],
            eigen_solver="arpack",
            affinity="nearest_neighbors",
            random_state=params["random_state"],
        )

    #dbscan = cluster.DBSCAN(eps=params["eps"])

    # hdbscan = cluster.HDBSCAN(
    #         min_samples=params["hdbscan_min_samples"],
    #         min_cluster_size=params["hdbscan_min_cluster_size"],
    #         allow_single_cluster=params["allow_single_cluster"],
    #     )

    # optics = cluster.OPTICS(
    #         min_samples=params["min_samples"],
    #         xi=params["xi"],
    #         min_cluster_size=params["min_cluster_size"],
    #     )

    # affinity_propagation = cluster.AffinityPropagation(
    #         damping=params["damping"],
    #         preference=params["preference"],
    #         random_state=params["random_state"],
    #     )

    birch = cluster.Birch(n_clusters=params["n_clusters"])

    gmm = mixture.GaussianMixture(
            n_components=params["n_clusters"],
            covariance_type="full",
            random_state=params["random_state"],
        )

    clustering_algorithms = {
            "MiniBatch\nKMeans":two_means,
            #"Affinity\nPropagation":affinity_propagation,
            "Spectral\nClustering":spectral,
            #"DBSCAN":dbscan,
            #"HDBSCAN":hdbscan,
            #"OPTICS":optics,
            "BIRCH":birch,
            "Gaussian\nMixture":gmm,
    }

    return clustering_algorithms

## Control panel

In [19]:
def colors_from_lbs(lbs, colors=None):
    mpl_20 = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
          '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
          '#3397dc', '#ff993e', '#3fca3f', '#df5152', '#a985ca',
          '#ad7165', '#e992ce', '#999999', '#dbdc3c', '#35d8e9']
    
    if colors is None:
        colors = np.array(mpl_20)
    else:
        colors = np.array(colors)
    lbs = np.array(lbs) % len(colors)
    #print(lbs)
    return colors[lbs]

In [20]:
fig, ax = plt.subplots()
plt.subplots_adjust(left=0.25, bottom=0.25)
control_element_list = []

dataset, algo_params = datasets[list(datasets.keys())[0]]
clustering_algorithms = get_clustering_algorithms(algo_params=algo_params)
algorithm = clustering_algorithms[list(clustering_algorithms.keys())[0]]

X, _ = dataset

x0, y0 = np.mean(X[:,0]), np.mean(X[:,1])
_vx = np.max(np.abs(X[:,0]-x0))
_vy = np.max(np.abs(X[:,1]-y0))
X[:,0] = (X[:,0]-x0)/_vx
X[:,1] = (X[:,1]-y0)/_vx

algorithm.fit(X)
if hasattr(algorithm, "labels_"):
    y_pred = algorithm.labels_.astype(int)
else:
    y_pred = algorithm.predict(X)
 

pdata = plt.scatter(X[:,0], X[:,1], c=colors_from_lbs(y_pred))
plt.title(list(clustering_algorithms.keys())[0])
plt.axis([-1.1, 1.1, -1.1, 1.1])
plt.xticks([-1,0,1])
plt.yticks([-1,0,1])

axcolor = 'lightgoldenrodyellow'
rax_dataset = plt.axes([0.025, 0.24, 0.15, 0.2], facecolor=axcolor)
s_dataset = RadioButtons(rax_dataset, ["{0}".format(name) for name in datasets], active=0)
dataset, algo_params = datasets[s_dataset.value_selected]

rax_algorithm = plt.axes([0.025, 0.55, 0.15, 0.3], facecolor=axcolor)
s_algorithm = RadioButtons(rax_algorithm, ["{0}".format(name) for name in clustering_algorithms], active=0)


def dataset_func(label):
    dataset, algo_params = datasets[label]
    clustering_algorithms = get_clustering_algorithms(algo_params=algo_params)
    algorithm = clustering_algorithms[s_algorithm.value_selected]
    
    #fig.clear()
    X, _ = dataset
    x0, y0 = np.mean(X[:,0]), np.mean(X[:,1])
    _vx = np.max(np.abs(X[:,0]-x0))
    _vy = np.max(np.abs(X[:,1]-y0))
    X[:,0] = (X[:,0]-x0)/_vx
    X[:,1] = (X[:,1]-y0)/_vx

    algorithm.fit(X)
    if hasattr(algorithm, "labels_"):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)
    pdata.set_offsets(X)
    pdata.set_color(colors_from_lbs(y_pred))

    #pdata = plt.scatter(X[:,0], X[:,1], c=colors_from_lbs(y_pred))
    # plt.axis([-_vx, _vx, -_vy, _vy])
    # plt.xticks([-_vx,0,_vx])
    # plt.yticks([-_vy,0,_vy])
    fig.canvas.draw()
s_dataset.on_clicked(dataset_func)
#control_element_list.append(s_dataset)

def algorithm_func(label):
    # dataset, algo_params = datasets[label]
    clustering_algorithms = get_clustering_algorithms(algo_params=algo_params)
    algorithm = clustering_algorithms[label]
    
    #fig.clear()
    # X, _ = dataset
    # x0, y0 = np.mean(X[:,0]), np.mean(X[:,1])
    # _vx = np.max(np.abs(X[:,0]-x0))
    # _vy = np.max(np.abs(X[:,1]-y0))
    # X[:,0] = (X[:,0]-x0)/_vx
    # X[:,1] = (X[:,1]-y0)/_vx

    algorithm.fit(X)
    if hasattr(algorithm, "labels_"):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)
    # pdata.set_offsets(X)
    pdata.set_color(colors_from_lbs(y_pred))

    ax.set_title(label)

    #pdata = plt.scatter(X[:,0], X[:,1], c=colors_from_lbs(y_pred))
    # plt.axis([-_vx, _vx, -_vy, _vy])
    # plt.xticks([-_vx,0,_vx])
    # plt.yticks([-_vy,0,_vy])
    fig.canvas.draw()

s_algorithm.on_clicked(algorithm_func)
#control_element_list.append(s_algorithm)


# if name == "MiniBatch\nKMeans":
#     #axcolor = 'lightgoldenrodyellow'
ax_n_cluster = plt.axes([0.25, 0.1, 0.65, 0.03], facecolor=axcolor)
#ax_random_state = plt.axes([0.25, 0.15, 0.65, 0.03], facecolor=axcolor)
allowed_amplitudes = [i for i in range(2,10+1)]
s_n_clusters = Slider(ax_n_cluster, 'N. of clusters', 2, 10, valinit=2, valstep=allowed_amplitudes)
#s_random_state = Slider(ax_random_state, 'Random state', 1, 100, valinit=40)
control_element_list.append(s_n_clusters)

def update(val):
    algorithm.n_clusters = s_n_clusters.val
    #algorithm.random_state = s_random_state.val 
    algorithm.fit(X)
    if hasattr(algorithm, "labels_"):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)

    pdata.set_color(colors_from_lbs(y_pred))
    fig.canvas.draw_idle()

s_n_clusters.on_changed(update)
#s_random_state.on_changed(update)

resetax = plt.axes([0.8, 0.025, 0.1, 0.04])
button_reset = Button(resetax, 'Reset', color=axcolor, hovercolor='0.975')

def reset(event):
    for control_element in control_element_list:
        control_element.reset()

    # clustering_algorithms = get_clustering_algorithms(algo_params=algo_params)
    # name, algorithm = clustering_algorithms[0]

    X, _ = dataset
    algorithm.fit(X)
    if hasattr(algorithm, "labels_"):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)

    pdata.set_color(colors_from_lbs(y_pred))
    fig.canvas.draw_idle()

button_reset.on_clicked(reset)

# rax = plt.axes([0.025, 0.5, 0.15, 0.15], facecolor=axcolor)
# radio = RadioButtons(rax, ('red', 'blue', 'green'), active=0)

plt.show()


  super()._check_params_vs_input(X, default_n_init=3)


  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_i