# Standalone Clustering Metrics

This notebook loads precomputed cluster coordinates and labels, then computes a suite of external and internal clustering metrics for exploratory analysis and debugging.

In [18]:
### Parameters
vec_path = '/Users/jakedugan/Projects/corporate_media_risk/corp_speech_risk_dataset/data/clustering/concat_vectors.npy'  # Path to concatenated vectors
meta_path = '/Users/jakedugan/Projects/corporate_media_risk/corp_speech_risk_dataset/data/clustering/metadata.json'     # Path to metadata
supervision = 'categorical'                     # 'categorical' or 'continuous'
min_cluster_size = 50

In [19]:
from corp_speech_risk_dataset.clustering.pipeline import ClusterPipeline
import numpy as np

# Build pipeline and compute clusters & embeddings
pipe = ClusterPipeline(
    vec_path=vec_path,
    meta_path=meta_path,
    use_gpu=False,
    min_cluster_size=min_cluster_size,
    supervision_mode=supervision
)
pipe.build()
labels = pipe.cluster()
coords = pipe.reduce()
true_labels = pipe.buckets

Faiss build time: 0.1s




HDBSCAN clustering time: 35.5s


  warn(


UMAP reduction time for visualization: 40.5s


In [20]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.metrics import (
    silhouette_score, adjusted_rand_score,
    adjusted_mutual_info_score, normalized_mutual_info_score,
    calinski_harabasz_score, davies_bouldin_score,
    fowlkes_mallows_score
)
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import trustworthiness
from sklearn.metrics import (
    silhouette_score, adjusted_rand_score,
    adjusted_mutual_info_score, normalized_mutual_info_score,
    calinski_harabasz_score, davies_bouldin_score,
    fowlkes_mallows_score
)
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist, squareform
import numpy as np

# Purity
def purity_score(true, pred):
    contingency = pd.crosstab(pd.Series(true, name='true'), pd.Series(pred, name='pred'))
    return np.sum(np.max(contingency.values, axis=1)) / np.sum(contingency.values)

# Dunn Index
def dunn_index(X, labels):
    distances = squareform(pdist(X))
    delta = np.min([distances[i][j]
                    for i in range(len(X)) for j in range(len(X))
                    if labels[i] != labels[j]])
    big_delta = np.max([distances[i][j]
                        for i in range(len(X)) for j in range(len(X))
                        if labels[i] == labels[j]])
    return delta / big_delta

# Trustworthiness & Continuity
from sklearn.manifold import trustworthiness, _utils


In [21]:
# Compute metrics
metrics = {
    'Silhouette': silhouette_score(coords, true_labels),
    'ARI': adjusted_rand_score(true_labels, labels),
    'Purity': purity_score(true_labels, labels),
    'AMI': adjusted_mutual_info_score(true_labels, labels),
    'NMI': normalized_mutual_info_score(true_labels, labels),
    'Calinski-Harabasz': calinski_harabasz_score(coords, labels),
    'Davies-Bouldin': davies_bouldin_score(coords, labels),
    'Fowlkes-Mallows': fowlkes_mallows_score(true_labels, labels),
    'Dunn': dunn_index(coords, labels),
    # 'Trustworthiness': trustworthiness(true_labels, coords),
}

# Display results
pd.DataFrame.from_dict(metrics, orient='index', columns=['Value'])

Unnamed: 0,Value
Silhouette,0.213454
ARI,0.00735
Purity,0.550201
AMI,0.010333
NMI,0.010504
Calinski-Harabasz,4030.07525
Davies-Bouldin,3.472446
Fowlkes-Mallows,0.349469
Dunn,3e-06
