In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.datasets import get_rdataset
from sklearn.preprocessing import StandardScaler
from ISLP import load_data

In [None]:
from sklearn.cluster import \
     (KMeans,
      AgglomerativeClustering)
from scipy.cluster.hierarchy import \
     (dendrogram,
      cut_tree)
from ISLP.cluster import compute_linkage

In [None]:
# Load the Auto data set through ISLP's load_data and display it

Auto = load_data('Auto')
Auto

In [None]:
# We can also impute the missing values of "horsepower" (recorded as '?') as we did previously.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# Auto['horsepower'] is chained assignment, which pandas warns about and which
# leaves Auto unchanged when Copy-on-Write is enabled.

Auto['horsepower'] = pd.to_numeric(Auto['horsepower'].replace('?', '104'))
Auto.dtypes

In [None]:
# We won't use "name" in clustering--there are way too many levels to be useful.
# Instead, we'll make the name of each car the index, so that we can use it in dendrograms.
# set_index moves the column into the index in a single step; the guard keeps the
# cell safe to re-run (a second run would otherwise fail because "name" is gone).

if 'name' in Auto.columns:
    Auto = Auto.set_index('name')
Auto

In [None]:
# Let's use just a sample of 10 vehicles to start
# We'll use this to get a sense of what the clusters seem to look like

import random
random.seed(10) # ALWAYS set the seed! Otherwise we can't replicate our results

# Create a random sample by randomly choosing row indices
index = random.sample(range(Auto.shape[0]), 10)

# Take an explicit copy: later cells add and drop columns on sample10, and doing
# that on a selection of Auto triggers pandas' SettingWithCopyWarning.
sample10 = Auto.iloc[index].copy()
sample10

## Hierarchical Clustering

In [None]:
# First pass: cluster the sampled cars on the raw (unscaled) columns,
# using complete linkage

HClust = AgglomerativeClustering
# n_clusters=None together with distance_threshold=0 is the configuration whose
# fitted model is handed to compute_linkage for the dendrogram in the next cell
hc_comp = HClust(linkage='complete',
                 n_clusters=None,
                 distance_threshold=0)
hc_comp.fit(sample10)

In [None]:
# Draw the dendrogram with matplotlib, labelling each leaf with the car's name

linkage_comp = compute_linkage(hc_comp)
fig, ax = plt.subplots(figsize=(8, 8))
dendrogram(linkage_comp,
           labels=sample10.index,
           leaf_rotation=90,
           color_threshold=-np.inf,       # threshold below every merge, so no link is colored
           above_threshold_color='black',
           ax=ax);

In [None]:
# Same dendrogram, now coloring the clusters that form below a chosen height

fig, ax = plt.subplots(figsize=(8, 8))
dendrogram(linkage_comp,
           labels=sample10.index,
           leaf_rotation=90,
           color_threshold=1000,          # this is the important part
           above_threshold_color='black',
           ax=ax);

In [None]:
# We can cut the tree (by number of clusters or by height) to get an array of cluster assignments.
# The result is transposed (.T) before it is displayed.

cut_tree(linkage_comp, n_clusters=3).T
# Alternative: cut at a given height instead of asking for a number of clusters
#cut_tree(linkage_comp, height=1000)

In [None]:
# Store each car's cluster number (3-cluster cut) as a new column,
# then show the rows ordered by cluster

sample10['cluster'] = cut_tree(linkage_comp, n_clusters=3)
sample10.sort_values(by='cluster')

In [None]:
# Hmm, our clusters seem to be highly influenced by "weight".
# Show the correlation matrix (cluster column included) as a heat-mapped table.

(
    sample10
    .corr()
    .style
    .background_gradient(cmap='coolwarm')
)

In [None]:
# Take the cluster assignment column back out so we can start over...
# (columns= already names the axis, so no separate axis argument is needed)

sample10.drop(columns='cluster', inplace=True)

In [None]:
# Standardize the sampled data before clustering:
# fit the scaler on the sample, then transform that same sample

scaler = StandardScaler()
scaler.fit(sample10)
sample10_scale = scaler.transform(sample10)
sample10_scale

In [None]:
# Re-run complete-linkage clustering, this time on the standardized sample
# Can we update this block to add colors for 3 clusters?

hc_comp_scale = HClust(linkage='complete',
                       n_clusters=None,
                       distance_threshold=0)
hc_comp_scale.fit(sample10_scale)
linkage_comp_scale = compute_linkage(hc_comp_scale)

fig, ax = plt.subplots(figsize=(8, 8))
dendrogram(linkage_comp_scale,
           labels=sample10.index,
           leaf_rotation=90,
           color_threshold=-np.inf,
           above_threshold_color='black',
           ax=ax)
ax.set_title("Hierarchical Clustering with Scaled Features");

In [None]:
# Put the 3-cluster assignments from the unscaled and the scaled trees
# side by side so we can compare them

sample10['cluster'] = cut_tree(linkage_comp, n_clusters=3)
sample10['cluster_scale'] = cut_tree(linkage_comp_scale, n_clusters=3)
sample10.sort_values(by='cluster')

In [None]:
# Other types of linkage: single and average, both on the standardized sample

hc_sing_scale = HClust(linkage='single',
                       n_clusters=None,
                       distance_threshold=0)
hc_sing_scale.fit(sample10_scale)
linkage_sing_scale = compute_linkage(hc_sing_scale)

hc_avg_scale = HClust(linkage='average',
                      n_clusters=None,
                      distance_threshold=0)
hc_avg_scale.fit(sample10_scale)
linkage_avg_scale = compute_linkage(hc_avg_scale)

In [None]:
# Compare complete, single and average linkage side by side.
# Each panel has its own color threshold.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

panels = [("Complete linkage", linkage_comp_scale, 5),
          ("Single linkage", linkage_sing_scale, 2.6),
          ("Average linkage", linkage_avg_scale, 3.5)]

for panel_ax, (linkage_name, linkage_matrix, threshold) in zip(axes, panels):
    dendrogram(linkage_matrix, ax=panel_ax,
               leaf_rotation=90, labels=sample10.index,
               color_threshold=threshold, above_threshold_color='black')
    panel_ax.set_title(linkage_name)

# Title the figure itself: `ax` still refers to the axes of the earlier
# single-panel figure, so ax.set_title would not label this one.
fig.suptitle("Hierarchical Clustering with Scaled Features");

In [None]:
# The hierarchical cluster assignment columns are no longer needed;
# remove both before moving on to K-means

sample10.drop(columns=['cluster', 'cluster_scale'], inplace=True)

## $K$-Means Clustering

In [None]:
# One K-means run (a single initialization) with K=3 on the standardized sample

kmeans = KMeans(n_clusters=3, n_init=1, random_state=1).fit(sample10_scale)

In [None]:
# Collect the cluster assignments in a one-column table indexed by car name

cluster_km = pd.DataFrame(kmeans.labels_, index=sample10.index)
cluster_km

In [None]:
# Attach the K-means assignments to the sample and show it ordered by cluster

sample10['cluster_km'] = cluster_km
sample10.sort_values(by='cluster_km')

In [None]:
# Even though the data is 8-dimensional, we can plot pairs at a time:
# here mpg against weight, with one color per cluster

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(sample10['mpg'], sample10['weight'], c=sample10['cluster_km'])
ax.set(title="K-Means Clustering Results with K=3",
       xlabel='mpg',
       ylabel='weight');

In [None]:
# What happens if we change the initial randomization?
# Same single-initialization run as before, but with a different random_state

kmeans2 = KMeans(n_clusters=3, n_init=1, random_state=2).fit(sample10_scale)
cluster_km2 = pd.DataFrame(kmeans2.labels_, index=sample10.index)
cluster_km2

In [None]:
# That wasn't helpful at all...let's append that column of cluster assignments to the existing data
# and look at the rows ordered by the new assignment

sample10['cluster_km2'] = cluster_km2
sample10.sort_values(by='cluster_km2')

In [None]:
# Here is a plot of the new clusters produced with a different randomization. Totally different!

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(sample10['mpg'], sample10['weight'], c=sample10['cluster_km2'])
ax.set(title="K-Means Clustering Results with K=3",
       xlabel='mpg',
       ylabel='weight');

In [None]:
# We can automatically have the algorithm repeat itself many times and then take the optimal answer...
# here we use 100 random initializations (n_init=100).

kmeans3 = KMeans(n_clusters=3, n_init=100, random_state=1).fit(sample10_scale)
cluster_km3 = pd.DataFrame(kmeans3.labels_, index=sample10.index)
cluster_km3

In [None]:
# Attach the assignments from the 100-initialization run and show the rows by cluster
sample10['cluster_km3'] = cluster_km3
sample10.sort_values(by='cluster_km3')

In [None]:
# Here is the plot for the 'optimal' run (best of the 100 initializations)

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(sample10['mpg'], sample10['weight'], c=sample10['cluster_km3'])
ax.set(title="K-Means Clustering Results with K=3",
       xlabel='mpg',
       ylabel='weight');

In [None]:
# We can plot by different variables, too (here: year against origin).
# Can you find a pair of variables that show a meaningful difference between the clusters?

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(sample10['year'], sample10['origin'], c=sample10['cluster_km3'])
ax.set(title="K-Means Clustering Results with K=3",
       xlabel='year',
       ylabel='origin');

In [None]:
# We can create a function to run k-means and make these plots automatically

def kmeans_plot(data=sample10,n_clusters=3,var1='mpg',var2='weight'):
    """Run K-means on standardized data and scatter-plot two variables colored by cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations to cluster. Columns whose name starts with "cluster" are
        treated as cluster assignments attached by earlier cells and are left
        out of the clustering (the default, sample10, carries several by the
        time this function is defined).
    n_clusters : int
        Number of clusters K passed to KMeans.
    var1, var2 : str
        Columns of `data` shown on the x axis and y axis respectively.

    Returns
    -------
    None
        The function only draws an 8x8 figure.
    """
    # Earlier cluster labels must not be fed back in as features.
    label_cols = [col for col in data.columns if str(col).startswith('cluster')]
    features = data.drop(columns=label_cols)

    # Use a fresh scaler so a call does not refit the notebook-level `scaler`.
    data_scale = StandardScaler().fit_transform(features)

    # Fixed seed and 100 initializations, as in the cells above.
    model = KMeans(n_clusters=n_clusters,
                   random_state=1,
                   n_init=100).fit(data_scale)

    fig, ax = plt.subplots(1, 1, figsize=(8,8))
    # labels_ is a 1-D array with one entry per row of `data`.
    ax.scatter(data[var1], data[var2], c=model.labels_)
    ax.set_title("K-Means Clustering Results with K={}".format(n_clusters))
    ax.set_xlabel(var1)
    ax.set_ylabel(var2);

In [None]:
# Try the function on the full Auto data; this call uses 4 clusters.
# Can you find a pair of variables that are important when you have 5 different clusters?

kmeans_plot(data=Auto, n_clusters=4, var1='displacement', var2='mpg')

In [None]:
# Five clusters on the full Auto data, viewed through horsepower and acceleration
kmeans_plot(data=Auto, n_clusters=5, var1='horsepower', var2='acceleration')