# Clustering with Dimensionality Reduction

In [None]:
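We reduce the digits data to two dimensions with UMAP (a PCA helper, `run_pca`, is also imported but not used below), then cluster the embedding using KMeans and HDBSCAN, evaluate both results, and plot them.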

In [None]:
import pandas as pd

In [None]:
from sklearn.datasets import load_digits

In [None]:
from src.dimensionality_reduction import run_pca, run_umap

In [None]:
from src.clustering import run_kmeans, run_hdbscan

In [None]:
from src.evaluation import evaluate_clustering

In [None]:
from src.visualization import plot_embedding, plot_clusters

# Load dataset

In [None]:
# Load scikit-learn's bundled digits dataset. With return_X_y=True the loader
# returns a (data, target) pair instead of a Bunch, so X holds the features
# and y the ground-truth labels (used below only to colour the embedding plot).
X, y = load_digits(return_X_y=True)

# Reduce with UMAP

In [None]:
# Reduce X to a 2-component embedding with the project's run_umap helper.
# Every later cell (clustering, evaluation, plotting) works on X_umap rather
# than on the original features.
# NOTE(review): seeding is not visible here — confirm whether run_umap fixes a
# random state if reproducible embeddings are required.
X_umap = run_umap(X, n_components=2)

In [None]:
# Plot the UMAP embedding coloured by the ground-truth labels, then display
# the object that plot_embedding hands back.
embedding_fig = plot_embedding(
    X_umap,
    labels=y,
    title='UMAP embedding with true labels',
)
embedding_fig.show()

# Apply clustering

In [None]:
# Cluster the UMAP embedding with KMeans via the project's helper.
# run_kmeans returns a pair; only the first element (the labels) is kept and
# the second is deliberately discarded.
# NOTE(review): n_clusters=10 presumably mirrors the ten digit classes, and the
# discarded value is presumably the fitted model or centroids — confirm in
# src.clustering.
labels_kmeans, _ = run_kmeans(X_umap, n_clusters=10)

In [None]:
# Cluster the same UMAP embedding with HDBSCAN via the project's helper,
# requesting a minimum cluster size of 30. Unlike run_kmeans, the result is
# bound to a single name (no tuple unpacking).
# NOTE(review): HDBSCAN implementations commonly mark noise points with the
# label -1 — confirm whether run_hdbscan does, and how the evaluation and
# plotting helpers treat such points.
labels_hdbscan = run_hdbscan(X_umap, min_cluster_size=30)

# Evaluate

In [None]:
# Score the KMeans assignment against the UMAP embedding, then print the
# result under a 'KMeans:' prefix.
kmeans_scores = evaluate_clustering(X_umap, labels_kmeans)
print('KMeans:', kmeans_scores)

In [None]:
# Score the HDBSCAN assignment against the UMAP embedding, then print the
# result under an 'HDBSCAN:' prefix.
hdbscan_scores = evaluate_clustering(X_umap, labels_hdbscan)
print('HDBSCAN:', hdbscan_scores)

# Visualize clusters

In [None]:
# Plot the UMAP embedding coloured by the KMeans labels, then display the
# object that plot_clusters hands back.
kmeans_fig = plot_clusters(
    X_umap,
    labels_kmeans,
    title='KMeans on UMAP',
)
kmeans_fig.show()

In [None]:
# Plot the UMAP embedding coloured by the HDBSCAN labels, then display the
# object that plot_clusters hands back.
hdbscan_fig = plot_clusters(
    X_umap,
    labels_hdbscan,
    title='HDBSCAN on UMAP',
)
hdbscan_fig.show()