In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import GridSearchCV

def apply_dbscan(embeddings, eps=0.5, min_samples=5):
    """Fit a DBSCAN model on ``embeddings`` and return its ``fit_predict`` labels.

    ``eps`` and ``min_samples`` are forwarded unchanged to the DBSCAN
    constructor.
    """
    model = DBSCAN(eps=eps, min_samples=min_samples)
    return model.fit_predict(embeddings)

def apply_kmeans(embeddings, n_clusters):
    """Fit KMeans with ``n_clusters`` clusters and return its ``fit_predict`` labels.

    The estimator is seeded with ``random_state=42`` so repeated calls on the
    same data use the same random state.
    """
    estimator = KMeans(n_clusters=n_clusters, random_state=42)
    return estimator.fit_predict(embeddings)

def apply_agglomerative(embeddings, n_clusters):
    """Fit agglomerative clustering with ``n_clusters`` clusters.

    Returns the labels produced by ``AgglomerativeClustering.fit_predict``.
    """
    estimator = AgglomerativeClustering(n_clusters=n_clusters)
    return estimator.fit_predict(embeddings)

def compute_metrics(embeddings, labels):
    """Compute cluster-quality metrics for one labelling of ``embeddings``.

    Parameters
    ----------
    embeddings : array-like
        One row per sample; converted with ``np.asarray``.
    labels : array-like
        One cluster label per row of ``embeddings``. Every distinct value is
        treated as its own cluster.
        NOTE(review): DBSCAN reports noise as label ``-1``; that value is not
        special-cased here, so noise points are scored as one ordinary
        cluster -- confirm this is intended.

    Returns
    -------
    dict
        Keys, in order: ``silhouette_score``, ``calinski_harabasz_score``,
        ``davies_bouldin_score``, ``avg_intra_cluster_distance``,
        ``max_intra_cluster_distance``, ``min_inter_cluster_distance``.
        A value is NaN when the metric is undefined for the labelling:
        the three scikit-learn scores when the number of distinct labels is
        not in ``[2, n_samples - 1]``, the intra-cluster statistics when no
        cluster has two or more members, and the inter-cluster minimum when
        there are fewer than two clusters.
    """
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)
    n_samples = len(labels)
    nan = float("nan")

    # The scikit-learn scores raise ValueError unless
    # 2 <= n_clusters <= n_samples - 1 (e.g. DBSCAN labelling everything as
    # noise, or every sample in its own cluster), so report NaN instead of
    # aborting the whole analysis.
    if 1 < n_clusters < n_samples:
        metrics = {
            "silhouette_score": silhouette_score(embeddings, labels),
            "calinski_harabasz_score": calinski_harabasz_score(embeddings, labels),
            "davies_bouldin_score": davies_bouldin_score(embeddings, labels),
        }
    else:
        metrics = {
            "silhouette_score": nan,
            "calinski_harabasz_score": nan,
            "davies_bouldin_score": nan,
        }

    clusters = [embeddings[labels == label] for label in unique_labels]

    # Intra-cluster pairwise distances. pdist yields each unordered pair once,
    # and only running aggregates are kept, so neither a full square distance
    # matrix nor a Python list of every distance is materialised.
    pair_total = 0.0
    pair_count = 0
    pair_max = -np.inf
    for points in clusters:
        if len(points) < 2:
            continue  # a singleton cluster contributes no pairs
        distances = pdist(points)
        pair_total += distances.sum()
        pair_count += distances.size
        pair_max = np.maximum(pair_max, distances.max())

    if pair_count:
        metrics["avg_intra_cluster_distance"] = pair_total / pair_count
        metrics["max_intra_cluster_distance"] = pair_max
    else:
        metrics["avg_intra_cluster_distance"] = nan
        metrics["max_intra_cluster_distance"] = nan

    # Inter-cluster separation: smallest distance between two cluster centroids.
    if n_clusters > 1:
        centroids = np.array([points.mean(axis=0) for points in clusters])
        metrics["min_inter_cluster_distance"] = pdist(centroids).min()
    else:
        metrics["min_inter_cluster_distance"] = nan

    return metrics

def analyze_subclusters(embeddings, labels, true_labels):
    """Score how each reference label's samples were split by the clustering.

    For every distinct value in ``true_labels`` the samples carrying that
    value are selected and, when they were assigned to more than one cluster,
    ``compute_metrics`` is run on that subset with its predicted labels.

    Parameters
    ----------
    embeddings : array-like
        One row per sample.
    labels : array-like
        Predicted cluster label per sample.
    true_labels : array-like
        Reference label per sample.

    Returns
    -------
    dict
        Maps each reference label that qualified to the metrics dict returned
        by ``compute_metrics``. Reference labels whose samples all share one
        predicted cluster, or whose samples each sit in a different predicted
        cluster, are omitted.
    """
    # Boolean-mask indexing below needs real arrays, not lists.
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)
    true_labels = np.asarray(true_labels)

    subcluster_analysis = {}
    for true_label in np.unique(true_labels):
        mask = true_labels == true_label
        sub_labels = labels[mask]

        # The scikit-learn scores used by compute_metrics are only defined for
        # 2 <= n_clusters <= n_samples - 1; a group such as two samples in two
        # different clusters would raise ValueError, so it is skipped.
        n_sub_clusters = len(np.unique(sub_labels))
        if not 1 < n_sub_clusters < len(sub_labels):
            continue

        subcluster_analysis[true_label] = compute_metrics(embeddings[mask], sub_labels)

    return subcluster_analysis

def _best_kmeans_labels(embeddings, func, k_values=range(2, 21)):
    """Return the ``func(embeddings, k)`` labelling with the highest silhouette score."""
    n_samples = len(embeddings)
    best_score = -np.inf
    best_labels = None

    for k in k_values:
        # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1.
        if not 1 < k < n_samples:
            continue
        candidate = func(embeddings, k)
        # Duplicate points can leave fewer distinct labels than requested.
        if not 1 < len(np.unique(candidate)) < n_samples:
            continue
        score = silhouette_score(embeddings, candidate)
        if score > best_score:
            best_score, best_labels = score, candidate

    if best_labels is None:
        raise ValueError(
            "k-means model selection needs at least 3 samples and one k in "
            "[2, n_samples - 1] that yields a valid labelling"
        )
    return best_labels


def run_clustering(df, algorithms):
    """Run each clustering algorithm on the embeddings in ``df`` and score it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``"embedding"`` column, whose entries are stacked into
        one array with ``np.stack``, and a ``"label"`` column used as the
        reference labels for the per-label analysis.
    algorithms : iterable of (name, func, params)
        ``func(embeddings, **params)`` must return one cluster label per row.
        The entry named ``"k-means"`` is handled specially: its ``params`` are
        ignored and ``func(embeddings, k)`` is evaluated for k = 2..20
        (capped below the sample count); the labelling with the highest
        silhouette score is kept.

    Returns
    -------
    dict
        ``{name: {"metrics": ..., "subcluster_analysis": ...}}`` with the
        outputs of ``compute_metrics`` and ``analyze_subclusters``.

    Raises
    ------
    ValueError
        If the k-means search finds no k that produces a scorable labelling.
    """
    embeddings = np.stack(df["embedding"].to_numpy())
    true_labels = df["label"].to_numpy()
    results = {}

    for name, func, params in algorithms:
        if name == "k-means":
            # "silhouette_score" is not a registered scorer name for
            # GridSearchCV, and cross-validated scoring without targets is
            # not meaningful for it, so k is selected by scoring each
            # candidate labelling directly on the full data.
            labels = _best_kmeans_labels(embeddings, func)
        else:
            labels = func(embeddings, **params)

        results[name] = {
            "metrics": compute_metrics(embeddings, labels),
            "subcluster_analysis": analyze_subclusters(embeddings, labels, true_labels),
        }

    return results

# Usage
# Load the DataFrame from the pickle file (run_clustering reads its
# "embedding" and "label" columns).
df = pd.read_pickle("spac.pkl")

# (name, clustering function, keyword arguments) for every algorithm to run.
algorithms = [
    ("DBSCAN", apply_dbscan, dict(eps=0.5, min_samples=5)),
    ("k-means", apply_kmeans, dict()),  # number of clusters is chosen inside run_clustering
    ("Agglomerative", apply_agglomerative, dict(n_clusters=10)),  # adjust n_clusters as needed
]

results = run_clustering(df, algorithms)

# Report the overall metrics, then the per-true-label breakdown, per algorithm.
for algorithm in results:
    overall = results[algorithm]["metrics"]
    per_label = results[algorithm]["subcluster_analysis"]

    print(f"\nResults for {algorithm}:")
    print("Overall Metrics:")
    for metric in overall:
        print(f"  {metric}: {overall[metric]}")

    print("\nSubcluster Analysis:")
    for true_label in per_label:
        print(f"  True Label {true_label}:")
        for metric, value in per_label[true_label].items():
            print(f"    {metric}: {value}")
