In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score
# from sklearn.preprocessing import StandardScaler
# import matplotlib.pyplot as plt
# from tqdm import tqdm


# def find_optimal_clusters(embeddings, max_clusters=30):
#     """
#     Find the optimal number of clusters using the elbow method and silhouette analysis.

#     Args:
#     embeddings (np.array): The embedding vectors
#     max_clusters (int): The maximum number of clusters to consider

#     Returns:
#     int: The optimal number of clusters
#     """
#     scaler = StandardScaler()
#     scaled_embeddings = scaler.fit_transform(embeddings)

#     inertias = []
#     silhouette_scores = []

#     for k in tqdm(range(2, max_clusters + 1), desc="Finding optimal clusters", total=max_clusters + 1):
#         kmeans = KMeans(n_clusters=k, random_state=42)
#         kmeans.fit(scaled_embeddings)
#         inertias.append(kmeans.inertia_)
#         silhouette_scores.append(silhouette_score(scaled_embeddings, kmeans.labels_))

#     # Plot elbow curve
#     plt.figure(figsize=(12, 5))
#     plt.subplot(1, 2, 1)
#     plt.plot(range(2, max_clusters + 1), inertias, marker="o")
#     plt.xlabel("Number of clusters")
#     plt.ylabel("Inertia")
#     plt.title("Elbow Method")

#     # Plot silhouette scores
#     plt.subplot(1, 2, 2)
#     plt.plot(range(2, max_clusters + 1), silhouette_scores, marker="o")
#     plt.xlabel("Number of clusters")
#     plt.ylabel("Silhouette Score")
#     plt.title("Silhouette Analysis")

#     plt.tight_layout()
#     plt.show()

#     # Find the optimal number of clusters
#     optimal_clusters = silhouette_scores.index(max(silhouette_scores)) + 2
#     print(f"Optimal number of clusters: {optimal_clusters}")

#     return optimal_clusters


# def cluster_embeddings(embeddings, n_clusters):
#     """
#     Perform K-means clustering on the embeddings.

#     Args:
#     embeddings (np.array): The embedding vectors
#     n_clusters (int): The number of clusters to use

#     Returns:
#     np.array: The cluster labels for each embedding
#     """
#     scaler = StandardScaler()
#     scaled_embeddings = scaler.fit_transform(embeddings)

#     kmeans = KMeans(n_clusters=n_clusters, random_state=42)
#     labels = kmeans.fit_predict(scaled_embeddings)

#     return labels


# def analyze_clusters(df, embeddings, labels):
#     """
#     Analyze the resulting clusters.

#     Args:
#     df (pd.DataFrame): The original dataframe
#     embeddings (np.array): The embedding vectors
#     labels (np.array): The cluster labels

#     Returns:
#     None
#     """
#     df["cluster"] = labels

#     print("\nCluster Analysis:")
#     for cluster in range(max(labels) + 1):
#         cluster_df = df[df["cluster"] == cluster]
#         print(f"\nCluster {cluster}:")
#         print(f"  Size: {len(cluster_df)}")
#         print("  Top 5 most common original labels:")
#         print(cluster_df["label"].value_counts().head().to_string())

#     # Visualize clusters in 2D (you may want to use t-SNE or UMAP for high-dimensional data)
#     from sklearn.decomposition import PCA

#     pca = PCA(n_components=2)
#     embeddings_2d = pca.fit_transform(embeddings)

#     plt.figure(figsize=(12, 8))
#     scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=labels, cmap="viridis")
#     plt.colorbar(scatter)
#     plt.title("Cluster Visualization (PCA)")
#     plt.xlabel("First Principal Component")
#     plt.ylabel("Second Principal Component")
#     plt.show()


# # Main execution
# df = pd.read_pickle("merged.pkl")
# df = df[(df["model"] == "ViT-Finetuned") & (df["dataset"] == "SPAC")]
# df = df.reset_index(drop=True)

# embeddings = np.stack(df["embedding"].to_numpy())

# # Find optimal number of clusters
# optimal_clusters = find_optimal_clusters(embeddings, max_clusters=300)

# # Perform clustering
# labels = cluster_embeddings(embeddings, optimal_clusters)

# # Analyze clusters
# analyze_clusters(df, embeddings, labels)

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd
import numpy as np
from sklearn.calibration import LabelEncoder
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, MiniBatchKMeans
from sklearn.metrics import confusion_matrix, f1_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from tqdm import tqdm
from gorillatracker.classification.metrics import analyse_embedding_space, formatted_names
from scipy.optimize import linear_sum_assignment


def calculate_metrics(embeddings, labels, true_labels):
    """Evaluate one clustering: embedding-space metrics plus weighted F1 and precision.

    Wraps ``analyse_embedding_space`` (run on the embeddings paired with the
    cluster ``labels``) and extends its result with two scores computed against
    ``true_labels`` after the cluster ids have been aligned by ``match_labels``.

    Args:
        embeddings: Array of embedding vectors; must provide ``.tolist()``.
        labels: Array of cluster assignments, one per embedding; must provide ``.tolist()``.
        true_labels: Ground-truth labels, one per embedding.

    Returns:
        The mapping produced by ``analyse_embedding_space``, updated with the
        keys ``"weighted_f1_score"`` and ``"weighted_precision"``.
    """
    assert len(labels) == len(true_labels) == len(embeddings)

    frame = pd.DataFrame({"embedding": embeddings.tolist(), "label": labels.tolist()})
    result = analyse_embedding_space(frame)

    # Cluster ids are arbitrary, so align them with the true labels before scoring.
    aligned = match_labels(true_labels, labels)

    result.update(
        {
            "weighted_f1_score": f1_score(true_labels, aligned, average="weighted"),
            "weighted_precision": precision_score(true_labels, aligned, average="weighted"),
        }
    )
    return result


def match_labels(true_labels, predicted_labels):
    """
    Match predicted cluster labels to true labels using the Hungarian algorithm.

    NOTE(liamvdv): Necessary because cluster labels are arbitrary and may not match the true labels but represent the same clusters.

    A square confusion matrix is built over the sorted union of all label
    values (rows: true, columns: predicted). The assignment maximising the
    total overlap is computed on that matrix, and the resulting matrix
    *indices* are translated back into label *values*. This keeps the mapping
    correct for labels that are not exactly 0..n-1 (e.g. DBSCAN's -1 noise
    label, string ids, or non-contiguous ids).

    Args:
        true_labels: Sequence of ground-truth labels.
        predicted_labels: Sequence of cluster labels, same length as ``true_labels``.

    Returns:
        np.ndarray: ``predicted_labels`` with every value replaced by the label
        value it was matched to. Because the matrix is square, the mapping is a
        permutation of the label union, so distinct clusters never collapse
        onto the same output label.
    """
    assert len(true_labels) == len(predicted_labels)

    true_arr = np.asarray(true_labels).ravel()
    pred_arr = np.asarray(predicted_labels).ravel()
    if pred_arr.size == 0:
        return pred_arr.copy()

    # Encode both label sets against one shared, sorted vocabulary.
    classes, codes = np.unique(np.concatenate([true_arr, pred_arr]), return_inverse=True)
    codes = np.asarray(codes).ravel()
    true_idx = codes[: true_arr.size]
    pred_idx = codes[true_arr.size :]

    # Confusion matrix: cm[i, j] = number of samples with true class i and predicted class j.
    cm = np.zeros((classes.size, classes.size), dtype=np.int64)
    np.add.at(cm, (true_idx, pred_idx), 1)

    # Use the Hungarian algorithm to find the best matching (negated to maximise overlap).
    row_ind, col_ind = linear_sum_assignment(-cm)

    # row_ind / col_ind are positions in `classes`, not label values: invert
    # the assignment per predicted column, then map positions back to values.
    target_for_pred = np.empty(classes.size, dtype=np.intp)
    target_for_pred[col_ind] = row_ind

    return classes[target_for_pred[pred_idx]]


def _run_kmeans(args):
    """Fit K-means for a single ``k`` and return its evaluation metrics.

    Takes one packed tuple so the call can be submitted as a single argument
    to an executor.

    Args:
        args: Tuple ``(scaled_embeddings, k, true_labels)``.

    Returns:
        The metrics from ``calculate_metrics`` with ``"algorithm"`` set to
        ``"K-means"`` and ``"algorithm_arg"`` set to ``k``.
    """
    data, n_clusters, true_labels = args

    # Fixed seed keeps the runs reproducible across k and across processes.
    assignments = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(data)

    result = calculate_metrics(data, assignments, true_labels)
    result.update({"algorithm": "K-means", "algorithm_arg": n_clusters})
    return result


def find_optimal_kmeans_parallel(embeddings, true_labels, max_clusters=30, max_workers=10):
    """Evaluate K-means for k = 2..max_clusters using a pool of worker processes.

    The embeddings are standardised once, then one ``_run_kmeans`` job per k
    is submitted to a ``ProcessPoolExecutor``.

    Args:
        embeddings: Array of embedding vectors.
        true_labels: Ground-truth labels, one per embedding.
        max_clusters (int): Largest number of clusters to try (inclusive).
        max_workers: Number of worker processes handed to
            ``ProcessPoolExecutor``. Defaults to 10, the previously hard-coded
            value; ``None`` lets the executor choose.

    Returns:
        list: One metrics dict per successfully evaluated k, sorted by
        ``"algorithm_arg"`` in ascending order. A k whose job raised is
        reported on stdout and left out of the result.
    """
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)

    metrics = []

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Submit all jobs, remembering which k each future belongs to.
        future_to_k = {
            executor.submit(_run_kmeans, (scaled_embeddings, k, true_labels)): k
            for k in range(2, max_clusters + 1)
        }

        # Collect results as they complete (completion order is arbitrary).
        for future in tqdm(as_completed(future_to_k), total=len(future_to_k), desc="Running K-means"):
            k = future_to_k[future]
            try:
                metric = future.result()
            except Exception as exc:
                # Best effort: one failing k must not abort the whole sweep.
                print(f"K-means for k={k} generated an exception: {exc}")
            else:
                metrics.append(metric)

    # Sort metrics by number of clusters
    metrics.sort(key=lambda x: x["algorithm_arg"])

    return metrics


def find_optimal_kmeans(embeddings, true_labels, max_clusters=30):
    """Evaluate K-means sequentially for k = 2..max_clusters.

    Args:
        embeddings: Array of embedding vectors.
        true_labels: Ground-truth labels, one per embedding.
        max_clusters (int): Largest number of clusters to try (inclusive).

    Returns:
        list: One metrics dict per k in ascending order, each tagged with
        ``"algorithm": "K-means"`` and ``"algorithm_arg": k``.
    """
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)

    metrics = []

    # tqdm takes the total from len(range); the previous explicit
    # total=max_clusters was one too many, so the bar never reached 100%.
    for k in tqdm(range(2, max_clusters + 1), desc="Running K-means"):
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(scaled_embeddings)
        metric = calculate_metrics(scaled_embeddings, labels, true_labels)
        metric["algorithm"] = "K-means"
        metric["algorithm_arg"] = k
        metrics.append(metric)

    return metrics


def find_optimal_eps_for_dbscan(embeddings):
    """Interactively pick DBSCAN's ``eps`` from a k-distance graph.

    The embeddings are standardised, the nearest-neighbour distance of every
    point is computed and plotted in ascending order, and the user is asked to
    type the value read off the plot.

    How to read the graph: look for the "elbow", the point where the distance
    starts increasing more rapidly. It often indicates a good ``eps``. Points
    before the elbow are close to their neighbours and may form clusters;
    points after it are farther away and may be noise or outliers.

    Args:
        embeddings: Array of embedding vectors.

    Returns:
        float: The eps value entered by the user.
    """
    standardized = StandardScaler().fit_transform(embeddings)

    # Two neighbours are requested and only the second distance column is kept;
    # the first is presumably each point matching itself, since the query set
    # is the fitted set.
    knn = NearestNeighbors(n_neighbors=2).fit(standardized)
    neighbor_distances, _ = knn.kneighbors(standardized)
    nearest_sorted = np.sort(neighbor_distances[:, 1])

    plt.figure(figsize=(10, 5))
    plt.plot(nearest_sorted)
    plt.xlabel("Points")
    plt.ylabel("Distance")
    plt.title("K-distance Graph")
    plt.show()

    print("Please examine the K-distance graph and input the 'elbow' point for eps:")
    return float(input("Enter the eps value: "))


def find_optimal_dbscan(embeddings, true_labels, eps: float):
    """Evaluate DBSCAN for min_samples = 2..10 at a fixed ``eps``.

    Args:
        embeddings: Array of embedding vectors.
        true_labels: Ground-truth labels, one per embedding.
        eps (float): Neighbourhood radius passed to DBSCAN (applied to the
            standardised embeddings).

    Returns:
        list: One metrics dict per ``min_samples`` value in ascending order,
        each tagged with ``"algorithm": "DBSCAN"`` and
        ``"algorithm_arg": min_samples``.
    """
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)

    # Find optimal min_samples
    metrics = []
    min_samples_range = range(2, 11)

    for min_samples in min_samples_range:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(scaled_embeddings)
        metric = calculate_metrics(scaled_embeddings, labels, true_labels)
        # Was mislabelled "Agglomerative Clustering" (copy-paste), which put the
        # wrong algorithm name on every downstream plot title.
        metric["algorithm"] = "DBSCAN"
        metric["algorithm_arg"] = min_samples
        metrics.append(metric)

    return metrics


def find_optimal_agglomerative(embeddings, true_labels, max_clusters=30):
    """Evaluate Ward agglomerative clustering for k = 2..max_clusters.

    The merge tree is built once with scipy's Ward linkage and then cut at
    every requested k, so the expensive tree construction is not repeated.
    The previous implementation overwrote ``labels_`` with a single element
    of itself and therefore could not produce labels for any k other than
    ``max_clusters``.

    Args:
        embeddings: Array of embedding vectors.
        true_labels: Ground-truth labels, one per embedding.
        max_clusters (int): Largest number of clusters to try (inclusive).
            Values above the number of samples are capped, since a hierarchy
            cannot be cut into more clusters than there are points.

    Returns:
        list: One metrics dict per k in ascending order, each tagged with
        ``"algorithm": "Agglomerative Clustering"`` and ``"algorithm_arg": k``.
        Empty when fewer than two clusters are requested.
    """
    # Local import: keeps the block self-contained without touching file-level imports.
    from scipy.cluster.hierarchy import cut_tree, linkage

    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)

    upper = min(max_clusters, scaled_embeddings.shape[0])
    cluster_counts = list(range(2, upper + 1))
    if not cluster_counts:
        return []

    # Compute the full Ward merge tree once ...
    merge_tree = linkage(scaled_embeddings, method="ward")
    # ... and cut it at every k; column j holds the labels for cluster_counts[j].
    assignments = cut_tree(merge_tree, n_clusters=cluster_counts)

    metrics = []
    for column, k in enumerate(cluster_counts):
        metric = calculate_metrics(scaled_embeddings, assignments[:, column], true_labels)
        metric["algorithm"] = "Agglomerative Clustering"
        metric["algorithm_arg"] = k
        metrics.append(metric)

    return metrics


# agg_metrics = find_optimal_agglomerative(embeddings, true_labels, max_clusters=30)
# eps = find_optimal_eps_for_dbscan(embeddings)
# dbscan_metrics = find_optimal_dbscan(embeddings, true_labels, eps)

In [None]:
import matplotlib.pyplot as plt
import math


def visualize_alg_metrics(in_metrics, formatted_names):
    """
    Create a grid of charts where every metric is shown for in_metrics.

    Each chart plots one metric (y) against the run's "algorithm_arg" (x).

    Parameters:
    in_metrics (list): List of dictionaries containing metrics for each run.
        Every dictionary must carry "algorithm" and "algorithm_arg"; all other
        keys of the first dictionary are treated as metrics to plot.
    formatted_names (dict): Dictionary mapping metric names to formatted display names.
        Metrics without an entry are titled with their raw key.

    Returns:
    None (displays the plot)

    Raises:
    ValueError: If in_metrics is empty.
    """
    if not in_metrics:
        raise ValueError("in_metrics must contain at least one run")

    alg = in_metrics[0]["algorithm"]
    # Get the list of metrics (excluding 'algorithm' and 'algorithm_arg')
    metrics = [key for key in in_metrics[0] if key not in ("algorithm", "algorithm_arg")]

    # Size the grid by the number of metrics (one chart each), not by the
    # number of runs; keep at least one row so subplots() stays valid.
    n_cols = 3  # You can adjust this for a different layout
    n_rows = max(1, math.ceil(len(metrics) / n_cols))

    # squeeze=False always yields a 2-D array of axes, so flatten() is safe.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
    fig.suptitle(f"{alg} Clustering Metrics", fontsize=16)

    # Flatten the axs array for easier indexing
    axs = axs.flatten()

    # The x values are the same for every chart, so compute them once.
    x = [m["algorithm_arg"] for m in in_metrics]

    # Plot each metric
    for ax, metric in zip(axs, metrics):
        y = [m[metric] for m in in_metrics]

        ax.plot(x, y, marker="o")
        ax.set_title(formatted_names.get(metric, metric))
        ax.set_xlabel("Number of Clusters")
        ax.set_ylabel("Value")
        ax.grid(True)

    # Remove any unused subplots
    for ax in axs[len(metrics):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

In [None]:
# Load the merged embeddings table and keep only the ViT-Finetuned / SPAC rows.
# NOTE(review): read_pickle can execute code embedded in the file; only load
# "merged.pkl" from a trusted source.
df = pd.read_pickle("merged.pkl")
df = df[(df["dataset"] == "SPAC") & (df["model"] == "ViT-Finetuned")].reset_index(drop=True)

# One row per sample: stack the per-row embedding vectors into a single array.
embeddings = np.stack(df["embedding"].to_numpy())
true_labels = df["label"].to_numpy()

# Sweep K-means over k = 2..200 and chart every metric.
kmeans_metrics = find_optimal_kmeans(embeddings, true_labels, max_clusters=200)
visualize_alg_metrics(kmeans_metrics, formatted_names)

In [None]:

def with_min_label_count(df: pd.DataFrame, min: int) -> pd.DataFrame:
    """
    Return a copy of ``df`` restricted to labels that occur often enough.

    A row is kept when its 'label' value appears at least ``min`` times in
    ``df``. The input DataFrame is not modified and the original index values
    are preserved.

    Parameters:
    df (pd.DataFrame): Input DataFrame. Must have a 'label' column.
    min (int): Minimum number of occurrences for a label to be included.

    Returns:
    pd.DataFrame: A new DataFrame with filtered rows.

    Raises:
    ValueError: If 'label' column is not present in the DataFrame.
    """
    if "label" not in df.columns:
        raise ValueError("DataFrame must have a 'label' column")

    # Frequency of every label, then the set of labels meeting the threshold.
    occurrences = df["label"].value_counts()
    frequent_labels = occurrences.loc[occurrences >= min].index

    keep_row = df["label"].isin(frequent_labels)
    return df.loc[keep_row].copy()

# Repeat the K-means sweep on labels that occur at least three times.
m3df = with_min_label_count(df, min=3)
m3embeddings = np.stack(m3df["embedding"].to_numpy())
m3true_labels = m3df["label"].to_numpy()

m3kmeans_metrics = find_optimal_kmeans(
    embeddings=m3embeddings,
    true_labels=m3true_labels,
    max_clusters=200,
)
visualize_alg_metrics(in_metrics=m3kmeans_metrics, formatted_names=formatted_names)