### Basic imports and plotting functionality

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets

from matplotlib import pyplot as plt
import plotly.express as px
from plotly.offline import plot

import requests
from urllib.request import urlopen
from io import BytesIO
from io import StringIO

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans, DBSCAN
import persistable

# this library gives an easy way to create color palettes with
# very different colors. This is useful for coloring clusterings.
import glasbey

# a simple wrapper for glasbey
def cluster_colors(n_classes):
    """Return a NumPy array holding a glasbey palette of `n_classes` colors.

    The palette is wrapped in an array so that it can be indexed with an
    array of integer cluster labels.
    """
    palette = glasbey.create_palette(palette_size=n_classes)
    return np.array(palette)

# a function that will plot a 2D dataset and, optionally, color it according
# to a user given clustering. If no clustering is given, all data points are
# colored with the same color.
# Moreover, the cluster label is allowed to be -1. In that case the point is
# considered "unclustered" and is colored in gray
def plot_clustering(X, clustering=None, limits=None, sizes=(4, 2), noise_points=True, axis=True):
    """Scatter-plot the first two columns of `X`, colored by cluster label.

    Parameters
    ----------
    X : array-like, indexable as X[:, 0] and X[:, 1]
        The data points, one per row.
    clustering : sequence of int, optional
        One label per row of `X`. The label -1 marks a point as
        "unclustered"; such points are drawn in grey. If omitted, every
        point receives the same label (and hence the same color).
    limits : sequence of four numbers, optional
        Plot limits as (xmin, xmax, ymin, ymax).
    sizes : pair of numbers
        Marker sizes: sizes[0] for clustered points, sizes[1] for
        unclustered points.
    noise_points : bool
        Whether the unclustered points are drawn at all.
    axis : bool
        If False, the axes are hidden.
    """
    # Accept plain lists as well as arrays: the boolean masks below need
    # elementwise comparison, which Python lists do not provide.
    X = np.asarray(X)
    if clustering is None:
        clustering = np.ones(X.shape[0], dtype=int)
    else:
        clustering = np.asarray(clustering)

    is_noise = clustering == -1
    X_clustered = X[~is_noise]
    X_unclustered = X[is_noise]
    clustering_clustered = clustering[~is_noise]

    # Noise is drawn first so that clustered points end up on top of it.
    if noise_points:
        plt.scatter(
            X_unclustered[:, 0],
            X_unclustered[:, 1],
            s=sizes[1],
            c="grey",
        )

    # When nothing is clustered (all labels are -1, or the input is empty)
    # there is no palette to build; requesting one of size 0 or taking the
    # max of an empty array would fail.
    if clustering_clustered.size > 0:
        palette = cluster_colors(int(clustering.max()) + 1)
        plt.scatter(
            X_clustered[:, 0],
            X_clustered[:, 1],
            s=sizes[0],
            c=palette[clustering_clustered],
        )

    if limits is not None:
        plt.xlim([limits[0], limits[1]])
        plt.ylim([limits[2], limits[3]])
    if not axis:
        plt.axis("Off")

# k-means

In [None]:
# Sample 500 two-dimensional points with a fixed seed (only the coordinates
# are kept, not the generated labels) and plot them in a single color.
data_blobs = datasets.make_blobs(
    n_samples=500,
    n_features=2,
    random_state=10,
)[0]
plot_clustering(data_blobs, axis=False)

### k-means parameter selection with "elbow method"

In [None]:
def elbow_method(dataset, max_k, random_state=0):
    """Plot the k-means inertia of `dataset` for k = 1, ..., max_k.

    Parameters
    ----------
    dataset : array-like
        The data passed to `KMeans.fit`.
    max_k : int
        Largest number of clusters to try.
    random_state : int or None
        Seed forwarded to every `KMeans` run so that the curve is the same
        each time the notebook is re-executed. Pass None to get a fresh
        random initialization on every call.
    """
    ks = list(range(1, max_k + 1))
    inertias = [
        KMeans(n_clusters=k, n_init="auto", random_state=random_state).fit(dataset).inertia_
        for k in ks
    ]

    plt.figure(figsize=(5, 3))
    plt.title("Inertia as function of k")
    plt.xlabel("k")
    plt.ylabel("inertia")
    # Assign the return value so the list of Line2D objects is not echoed.
    _ = plt.plot(ks, inertias, "-o")

In [None]:
# Inertia curve for the blobs, k = 1..10.
elbow_method(dataset=data_blobs, max_k=10)

In [None]:
# Seed k-means so that the plotted clustering is identical on every run.
kmeans_clusterer = KMeans(n_clusters=3, n_init="auto", random_state=0)
plot_clustering(data_blobs, clustering=kmeans_clusterer.fit_predict(data_blobs), axis=False)

### k-means with far-away outliers

In [None]:
# Append one point at (100, 100) to the blobs and plot with the axes visible.
data_blobs_and_outlier = np.concatenate(
    (data_blobs, np.array([[100, 100]])),
    axis=0,
)
plot_clustering(data_blobs_and_outlier, axis=True)

In [None]:
# Inertia curve for the blobs plus the outlier, k = 1..10.
elbow_method(dataset=data_blobs_and_outlier, max_k=10)

In [None]:
# Seed k-means so that the plotted clustering is identical on every run.
kmeans_clusterer = KMeans(n_clusters=3, n_init="auto", random_state=0)
plot_clustering(data_blobs_and_outlier, kmeans_clusterer.fit_predict(data_blobs_and_outlier), axis=True)

In [None]:
# Seed k-means so that the plotted clustering is identical on every run.
kmeans_clusterer = KMeans(n_clusters=4, n_init="auto", random_state=0)
plot_clustering(data_blobs_and_outlier, kmeans_clusterer.fit_predict(data_blobs_and_outlier), axis=True)

### k-means when clusters are not convex

In [None]:
# 500 points from make_moons with noise 0.05 and a fixed seed; only the
# coordinates are kept.
data_moons = datasets.make_moons(
    n_samples=500,
    noise=0.05,
    random_state=0,
)[0]
plot_clustering(data_moons, axis=False)

In [None]:
# Inertia curve for the moons, k = 1..10.
elbow_method(dataset=data_moons, max_k=10)

In [None]:
# Seed k-means so that the plotted clustering is identical on every run.
kmeans_clusterer = KMeans(n_clusters=4, n_init="auto", random_state=0)
plot_clustering(data_moons, clustering=kmeans_clusterer.fit_predict(data_moons), axis=False)

# Rips clustering (Single linkage)

### SL on non-convex clusters

In [None]:
# Show the moons again, unlabeled, before clustering them with single linkage.
plot_clustering(X=data_moons, axis=False)

In [None]:
# Compute the single-linkage hierarchy of the moons, then draw its dendrogram
# on a wide figure.
single_linkage_hierarchy = linkage(data_moons, method="single")
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(
    single_linkage_hierarchy,
    no_labels=True,
    orientation="right",
)

In [None]:
# Cut the hierarchy at the chosen distance threshold and plot the flat clustering.
distance_scale = 0.2
plot_clustering(
    data_moons,
    fcluster(single_linkage_hierarchy, t=distance_scale, criterion="distance"),
)

### SL with far-away outliers

In [None]:
# Single-linkage hierarchy and dendrogram for the blobs plus the outlier.
single_linkage_hierarchy = linkage(data_blobs_and_outlier, method="single")
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(
    single_linkage_hierarchy,
    no_labels=True,
    orientation="right",
)

In [None]:
# Redraw the same dendrogram, restricting the x-axis to [0, 10].
dn = dendrogram(
    single_linkage_hierarchy,
    no_labels=True,
    orientation="right",
)
_ = plt.xlim(0, 10)

In [None]:
# Cut the hierarchy at the chosen distance threshold and plot the flat clustering.
distance_scale = 2
plot_clustering(
    data_blobs_and_outlier,
    fcluster(single_linkage_hierarchy, t=distance_scale, criterion="distance"),
)

### SL with scatter noise

In [None]:
# A larger (2000 points) and noisier (noise 0.15) moons sample, fixed seed.
data_moons_noisy = datasets.make_moons(
    n_samples=2000,
    noise=0.15,
    random_state=0,
)[0]
plot_clustering(X=data_moons_noisy)

In [None]:
# Single-linkage hierarchy and dendrogram for the noisy moons.
single_linkage_hierarchy = linkage(data_moons_noisy, method="single")
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(
    single_linkage_hierarchy,
    no_labels=True,
    orientation="right",
)

In [None]:
# Cut the hierarchy at the chosen distance threshold and plot the flat clustering.
distance_scale = 0.1
plot_clustering(
    data_moons_noisy,
    fcluster(single_linkage_hierarchy, t=distance_scale, criterion="distance"),
)

### SL with multiscale structure

In [None]:
# A second, small blob (20 points, cluster_std=5), shifted by +30 in every
# coordinate and stacked under the original blobs.
data_blobs_2 = datasets.make_blobs(
    n_samples=20,
    centers=1,
    cluster_std=5,
    n_features=2,
    random_state=1,
)[0]
data_blobs_different_scales = np.concatenate((data_blobs, data_blobs_2 + 30), axis=0)
plot_clustering(data_blobs_different_scales, axis=True)

In [None]:
# Single-linkage hierarchy and dendrogram for the two-scale blobs.
single_linkage_hierarchy = linkage(data_blobs_different_scales, method="single")
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(
    single_linkage_hierarchy,
    no_labels=True,
    orientation="right",
)

In [None]:
# Cut the hierarchy at the chosen distance threshold and plot the flat clustering.
distance_scale = 4
plot_clustering(
    data_blobs_different_scales,
    fcluster(single_linkage_hierarchy, t=distance_scale, criterion="distance"),
)

# Degree-Rips clustering (DBSCAN)

### DBSCAN with scatter noise

In [None]:
# Run DBSCAN on the noisy moons and plot the labels it returns.
dbscan_clusterer = DBSCAN(eps=0.1, min_samples=10)
plot_clustering(
    data_moons_noisy,
    clustering=dbscan_clusterer.fit_predict(data_moons_noisy),
    axis=False,
)

### DBSCAN with variable density

In [None]:
url = "https://github.com/scikit-learn-contrib/hdbscan/blob/4052692af994610adc9f72486a47c905dd527c94/notebooks/clusterable_data.npy?raw=true"
# Download inside a `with` block so the HTTP response is closed once the
# payload has been read; the timeout keeps the cell from hanging forever
# on a stalled connection.
with urlopen(url, timeout=60) as f:
    rf = f.read()
data_varying_density = np.load(BytesIO(rf))

plot_clustering(data_varying_density)

In [None]:
# Run DBSCAN on the variable-density data and plot the labels it returns.
dbscan_clusterer = DBSCAN(eps=0.05, min_samples=50)
plot_clustering(
    data_varying_density,
    clustering=dbscan_clusterer.fit_predict(data_varying_density),
    axis=False,
)

# Persistable

### Persistable with scatter noise

In [None]:
# Build a Persistable object on the noisy moons, wrap it in the interactive
# front end and start the UI with inline=True.
# `pi` is read by the next cell to obtain the clustering.
p = persistable.Persistable(data_moons_noisy)
pi = persistable.PersistableInteractive(p)
pi.start_UI(inline=True)

In [None]:
# Plot the labels produced by the interactive Persistable session
# (presumably these reflect the parameters selected in the UI; confirm
# against the persistable documentation).
plot_clustering(
    data_moons_noisy,
    clustering=pi.cluster(),
)

### Persistable with variable density

In [None]:
# Build a Persistable object on the variable-density data, wrap it in the
# interactive front end and start the UI (here without inline=True).
# `pi` is read by the next cell to obtain the clustering.
p = persistable.Persistable(data_varying_density)
pi = persistable.PersistableInteractive(p)
pi.start_UI()

In [None]:
# Plot the labels produced by the interactive Persistable session
# (presumably these reflect the parameters selected in the UI; confirm
# against the persistable documentation).
plot_clustering(
    data_varying_density,
    clustering=pi.cluster(),
)

### k-means with density-sensitive task

In [None]:
# load uber pickup data from https://github.com/fivethirtyeight/uber-tlc-foil-response
url = "https://raw.githubusercontent.com/fivethirtyeight/uber-tlc-foil-response/master/uber-trip-data/uber-raw-data-apr14.csv"
# The timeout keeps the cell from hanging on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
as_pd = pd.read_csv(StringIO(response.text), sep=',')
# Columns 1 and 2 of the table are used as (lat, lon) below.
data_uber = np.asarray(as_pd)[:, 1:3].astype(float)

# plot on map
fig = px.scatter_mapbox(
    lat=data_uber[:, 0], lon=data_uber[:, 1], zoom=9, mapbox_style="carto-positron")
_ = plot(fig, auto_open=True)

# dataset size
data_uber.shape

In [None]:
# Inertia curve for the Uber pickups, k = 1..20.
elbow_method(dataset=data_uber, max_k=20)

In [None]:
# cluster and plot on map

# Seed k-means so that the map is identical on every run.
kmeans_clusterer = KMeans(n_clusters=7, n_init="auto", random_state=0)
cluster_labels = kmeans_clusterer.fit_predict(data_uber)

df = pd.DataFrame(data_uber, columns=["lat", "lon"])
# Labels are stored as strings so that plotly treats them as categories.
df["cluster"] = np.array(cluster_labels, dtype=str)

n_clusters = int(np.amax(cluster_labels)) + 1
palette = cluster_colors(n_clusters) if n_clusters > 0 else []
# Tie every label to its color explicitly: "-1" (unclustered) is grey and
# cluster i gets palette[i]. With a positional color sequence the assignment
# depends on which labels are actually present, so grey could end up on a
# real cluster whenever there are no "-1" points (as is the case for k-means).
color_map = {"-1": "#808080"}
color_map.update({str(label): str(palette[label]) for label in range(n_clusters)})
category_orders = {"cluster": [str(label) for label in range(-1, n_clusters)]}
fig = px.scatter_mapbox(df, lat="lat", lon="lon", color="cluster", category_orders=category_orders,
                        color_discrete_map=color_map, zoom=9, mapbox_style="carto-positron")
_ = plot(fig, auto_open=True)

### Persistable with density-sensitive task

In [None]:
# Build a Persistable object on the Uber pickups, wrap it in the interactive
# front end and start the UI (here without inline=True).
# NOTE(review): presumably `subsample=30000` restricts the computation to a
# subset of the points and `n_neighbors=300` sets the neighborhood size;
# confirm against the persistable documentation.
p = persistable.Persistable(data_uber, n_neighbors=300, subsample=30000)
pi = persistable.PersistableInteractive(p)
pi.start_UI()

Let `k` be the port number returned by the previous cell. To use Persistable interactively, open the address `localhost:k` in a new browser window.

In [None]:
# cluster and plot on map
cluster_labels = pi.cluster()

df = pd.DataFrame(data_uber, columns=["lat", "lon"])
# Labels are stored as strings so that plotly treats them as categories.
df["cluster"] = np.array(cluster_labels, dtype=str)

# n_clusters is 0 when every point is labeled -1; no palette is built then.
n_clusters = int(np.amax(cluster_labels)) + 1
palette = cluster_colors(n_clusters) if n_clusters > 0 else []
# Tie every label to its color explicitly: "-1" (unclustered) is grey and
# cluster i gets palette[i]. With a positional color sequence the assignment
# depends on which labels are actually present, so grey could end up on a
# real cluster whenever the clustering contains no "-1" points.
color_map = {"-1": "#808080"}
color_map.update({str(label): str(palette[label]) for label in range(n_clusters)})
category_orders = {"cluster": [str(label) for label in range(-1, n_clusters)]}
fig = px.scatter_mapbox(df, lat="lat", lon="lon", color="cluster", category_orders=category_orders,
                        color_discrete_map=color_map, zoom=9, mapbox_style="carto-positron")
_ = plot(fig, auto_open=True)

# To play around: olive oil data

Can you find a "better" clustering than k-means using one of the density-based clustering algorithms we saw today?

In [None]:
# data from "A Generalized Single Linkage Method for Estimating the Cluster Tree of a Density" Werner Stuetzle, Rebecca Nugent

from olive_oil_data import olive_oil_scaled
from olive_oil_data import olive_oil_areas

from sklearn.metrics import confusion_matrix

In [None]:
# Shift the area codes down by one to obtain the reference labels.
true_labels = np.array(olive_oil_areas) - 1

# Seed k-means so that the confusion matrix is identical on every run.
kmeans_clusterer = KMeans(n_clusters=9, n_init="auto", random_state=0)
cl = kmeans_clusterer.fit_predict(olive_oil_scaled)
confusion_matrix(true_labels, cl)