In [6]:
import pandas as pd


def format_value(value):
    """Turn a single table cell into its printable string.

    Missing values (anything ``pd.isna`` reports, or the literal string
    ``"nan"``) become ``"nan"``; ``int``/``float`` values are rendered with
    three decimal places; everything else is passed through ``str``.
    """
    is_missing = pd.isna(value) or value == "nan"
    if is_missing:
        return "nan"

    if not isinstance(value, (int, float)):
        return str(value)

    return "{:.3f}".format(value)


def generate_single_embedding_space_graphic(df, dataset, model):
    """Summarise, per algorithm, the run selected by each internal validity index.

    The input is filtered to the rows whose ``dataset`` and ``model`` columns
    match the arguments. For every algorithm found in that subset, one row is
    selected per criterion (max silhouette coefficient, max Dunn index,
    min Davies-Bouldin index, max Calinski-Harabasz index) and the selected
    row's ``adjusted_rand_score``, ``homogeneity_score``,
    ``completeness_score``, ``v_measure_score`` and ``n_clusters`` are
    reported.

    Args:
        df: DataFrame holding the columns named above plus ``dataset``,
            ``model`` and ``algorithm``. It is not modified.
        dataset: Value to match in the ``dataset`` column.
        model: Value to match in the ``model`` column.

    Returns:
        DataFrame with columns ``Approach``, ``Algorithm``, ``Rand Score``,
        ``Homogeneity``, ``Completeness``, ``V-Measure`` and ``n_clusters``;
        four rows per algorithm, in the criterion order listed above. Empty
        (but with those columns) when nothing matches the filter.
    """
    # (label shown in the table, column used for selection, pick the maximum?)
    selection_criteria = [
        ("Silhouette Score (max)", "silhouette_coefficient", True),
        ("Dunn Index (max)", "dunn_index", True),
        ("Davies-Bouldin Index (min)", "davies_bouldin_index", False),
        ("Calinski-Harabasz Index (max)", "calinski_harabasz_index", True),
    ]
    reported_columns = [
        "adjusted_rand_score",
        "homogeneity_score",
        "completeness_score",
        "v_measure_score",
        "n_clusters",
    ]

    # Boolean indexing already yields a new frame, so the caller's df is untouched.
    # The index is rebuilt because idxmax()/idxmin() return *labels*: with repeated
    # labels, .loc[label] would hand back several rows instead of the single best one.
    subset = df[(df["dataset"] == dataset) & (df["model"] == model)].reset_index(drop=True)

    metrics_summary = []
    for algorithm in subset["algorithm"].unique():
        algorithm_df = subset[subset["algorithm"] == algorithm]

        for label, column, maximize in selection_criteria:
            scores = algorithm_df[column]
            best_label = scores.idxmax() if maximize else scores.idxmin()
            best_row = algorithm_df.loc[best_label]
            metrics_summary.append([label, algorithm, *[best_row[name] for name in reported_columns]])

    # Convert to DataFrame for easy display
    return pd.DataFrame(
        metrics_summary,
        columns=[
            "Approach",
            "Algorithm",
            "Rand Score",
            "Homogeneity",
            "Completeness",
            "V-Measure",
            "n_clusters",
        ],
    )


def _format_metric_cells(row):
    """Join one summary row's four scores and its cluster count into LaTeX cells."""
    return " & ".join(
        [
            format_value(row["Rand Score"]),
            format_value(row["Homogeneity"]),
            format_value(row["Completeness"]),
            format_value(row["V-Measure"]),
            # The cluster count is printed as-is rather than with three decimals.
            str(row["n_clusters"]),
        ]
    )


def generate_latex_table(df, dataset, model, ref="tab:clustering-metrics-spac", is_cosine=False):
    """Render a metrics summary as a LaTeX ``table`` environment.

    Rows are grouped by their ``Approach`` value, each group introduced by a
    bold full-width heading and separated from the next by ``\\midrule``.
    Rows whose ``Algorithm`` is ``"HDBSCAN"`` are pulled out of the groups and,
    if any exist, only the first one is printed in a final group headed "Any".

    Args:
        df: DataFrame with columns ``Approach``, ``Algorithm``, ``Rand Score``,
            ``Homogeneity``, ``Completeness``, ``V-Measure`` and ``n_clusters``.
        dataset: Dataset name, inserted verbatim into the caption.
        model: Model name, inserted verbatim into the caption.
        ref: Label written into ``\\label{...}``.
        is_cosine: Selects the caption suffix, "(Cosine Similarity)" when true
            and "(Euclidean Distance)" otherwise.

    Returns:
        The LaTeX source as a single newline-joined string.
    """
    latex_code = [
        r"\begin{table}[H]",
        r"    \centering",
        r"    \resizebox{\textwidth}{!}{%",
        r"    \begin{tabular}{llccccc}",
        r"    \toprule",
        r"    \multicolumn{2}{l}{Approaches} & \multicolumn{4}{c}{Metrics} \\",
        r"     \cmidrule(lr){3-6}",
        r"    & & ARI & Homogeneity & Completeness & V-Measure & Clusters Found \\",
        r"    \midrule",
    ]

    # Filter out "HDBSCAN" rows to append later
    hdbscan_rows = df[df["Algorithm"] == "HDBSCAN"]
    df = df[df["Algorithm"] != "HDBSCAN"]

    approaches = df["Approach"].unique()
    last_position = len(approaches) - 1
    for position, approach in enumerate(approaches):
        # Backslashes are doubled explicitly: "\m" is not a valid escape sequence.
        latex_code.append(f"    \\multicolumn{{7}}{{l}}{{\\textbf{{{approach}}}}} \\\\")
        group = df[df["Approach"] == approach]
        for _, row in group.iterrows():
            latex_code.append(f"    & {row['Algorithm']} & {_format_metric_cells(row)} \\\\")
        if position != last_position:
            latex_code.append(r"    \midrule")

    # Add the "HDBSCAN" row at the end with multicolumn "Any"
    if not hdbscan_rows.empty:
        latex_code.append(r"    \midrule")
        latex_code.append(r"    \multicolumn{7}{l}{\textbf{Any}} \\")
        for _, row in hdbscan_rows[:1].iterrows():
            latex_code.append(f"    & HDBSCAN & {_format_metric_cells(row)} \\\\")

    # Pick the suffix first: a conditional expression binds looser than "+", so
    # inlining it in the concatenation would swallow the prefix or the closing brace.
    distance_label = " (Cosine Similarity)" if is_cosine else " (Euclidean Distance)"
    caption = (
        r"    \caption{Comparison of Clustering Approaches and Algorithms across Various Metrics. Dataset is "
        + dataset
        + " and Model is "
        + model
        + distance_label
        + "}"
    )

    latex_code.extend(
        [
            r"    \bottomrule",
            r"    \end{tabular}%",
            r"    }",
            caption,
            r"    \label{" + ref + "}",
            r"\end{table}",
        ]
    )

    return "\n".join(latex_code)

In [7]:
def generate(
    ref: str = "tab:clustering-metrics-spac",
    is_cosine: bool = False,
    dataset: str = "SPAC+min3",
    model: str = "ViT-Finetuned",
) -> None:
    """Load the pickled clustering results and print their LaTeX summary table.

    Args:
        ref: LaTeX label for the generated table.
        is_cosine: When true, read the ``_cosine`` results pickle and caption
            the table as cosine similarity; otherwise read the ``_euclidean``
            pickle and caption it as Euclidean distance.
        dataset: Value of the ``dataset`` column to summarise.
        model: Value of the ``model`` column to summarise.
    """
    suffix = "_cosine" if is_cosine else "_euclidean"
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # result files produced by a trusted run.
    metrics_df = pd.read_pickle(f"/workspaces/gorillatracker/sep29_clustering_results{suffix}.pkl")
    metrics_summary_df = generate_single_embedding_space_graphic(metrics_df, dataset, model)
    # Forward the caller's flag so the caption matches the file that was loaded
    # (it used to be fixed to True, labelling Euclidean tables as cosine).
    print(generate_latex_table(metrics_summary_df, dataset, model, ref, is_cosine=is_cosine))


# SPAC+min3 / ViT-Finetuned, read from the Euclidean results pickle.
generate("tab:clustering-metrics-spac-euclidean", is_cosine=False, dataset="SPAC+min3", model="ViT-Finetuned")

\begin{table}[H]
    \centering
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llccccc}
    \toprule
    \multicolumn{2}{l}{Approaches} & \multicolumn{4}{c}{Metrics} \\
     \cmidrule(lr){3-6}
    & & ARI & Homogeneity & Completeness & V-Measure & Clusters Found \\
    \midrule
    \multicolumn{7}{l}{\textbf{Silhouette Score (max)}} \\
    & KMeans & 0.221 & 0.970 & 0.706 & 0.817 & 453 \\
    & AgglomerativeClustering & 0.221 & 0.987 & 0.710 & 0.826 & 463 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Dunn Index (max)}} \\
    & KMeans & 0.132 & 0.995 & 0.668 & 0.799 & 698 \\
    & AgglomerativeClustering & 0.119 & 0.999 & 0.666 & 0.799 & 703 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Davies-Bouldin Index (min)}} \\
    & KMeans & 0.127 & 0.997 & 0.665 & 0.798 & 718 \\
    & AgglomerativeClustering & 0.113 & 0.999 & 0.663 & 0.797 & 718 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Calinski-Harabasz Index (max)}} \\
    & KMeans & 0.047 & 0.161 & 0.629 & 0.256 & 3 \\
    & Agg

In [8]:
# SPAC+min3 / ViT-Finetuned, read from the cosine results pickle.
generate("tab:clustering-metrics-spac-cosine", is_cosine=True, dataset="SPAC+min3", model="ViT-Finetuned")

\begin{table}[H]
    \centering
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llccccc}
    \toprule
    \multicolumn{2}{l}{Approaches} & \multicolumn{4}{c}{Metrics} \\
     \cmidrule(lr){3-6}
    & & ARI & Homogeneity & Completeness & V-Measure & Clusters Found \\
    \midrule
    \multicolumn{7}{l}{\textbf{Silhouette Score (max)}} \\
    & KMeans & 0.148 & 0.981 & 0.676 & 0.800 & 583 \\
    & AgglomerativeClustering & 0.390 & 0.994 & 0.743 & 0.850 & 463 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Dunn Index (max)}} \\
    & KMeans & 0.117 & 0.989 & 0.661 & 0.793 & 698 \\
    & AgglomerativeClustering & 0.196 & 1.000 & 0.681 & 0.810 & 693 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Davies-Bouldin Index (min)}} \\
    & KMeans & 0.111 & 0.991 & 0.659 & 0.792 & 718 \\
    & AgglomerativeClustering & 0.188 & 1.000 & 0.677 & 0.807 & 718 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Calinski-Harabasz Index (max)}} \\
    & KMeans & 0.040 & 0.156 & 0.613 & 0.249 & 3 \\
    & Agg

In [9]:
# Bristol / ViT-Finetuned, read from the cosine results pickle.
generate("tab:clustering-metrics-bristol-cosine", is_cosine=True, dataset="Bristol", model="ViT-Finetuned")

\begin{table}[H]
    \centering
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llccccc}
    \toprule
    \multicolumn{2}{l}{Approaches} & \multicolumn{4}{c}{Metrics} \\
     \cmidrule(lr){3-6}
    & & ARI & Homogeneity & Completeness & V-Measure & Clusters Found \\
    \midrule
    \multicolumn{7}{l}{\textbf{Silhouette Score (max)}} \\
    & KMeans & 0.027 & 0.929 & 0.333 & 0.490 & 210 \\
    & AgglomerativeClustering & 0.035 & 0.064 & 0.240 & 0.101 & 3 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Dunn Index (max)}} \\
    & KMeans & 0.003 & 1.000 & 0.318 & 0.483 & 355 \\
    & AgglomerativeClustering & 0.001 & 1.000 & 0.316 & 0.481 & 365 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Davies-Bouldin Index (min)}} \\
    & KMeans & 0.000 & 1.000 & 0.315 & 0.480 & 370 \\
    & AgglomerativeClustering & 0.000 & 1.000 & 0.315 & 0.480 & 370 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Calinski-Harabasz Index (max)}} \\
    & KMeans & 0.087 & 0.132 & 0.185 & 0.154 & 4 \\
    & Agglo

In [10]:
# Bristol / ViT-Finetuned, read from the Euclidean results pickle.
generate("tab:clustering-metrics-bristol-euclidean", is_cosine=False, dataset="Bristol", model="ViT-Finetuned")

\begin{table}[H]
    \centering
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llccccc}
    \toprule
    \multicolumn{2}{l}{Approaches} & \multicolumn{4}{c}{Metrics} \\
     \cmidrule(lr){3-6}
    & & ARI & Homogeneity & Completeness & V-Measure & Clusters Found \\
    \midrule
    \multicolumn{7}{l}{\textbf{Silhouette Score (max)}} \\
    & KMeans & 0.031 & 0.981 & 0.346 & 0.511 & 235 \\
    & AgglomerativeClustering & 0.041 & 0.966 & 0.354 & 0.518 & 190 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Dunn Index (max)}} \\
    & KMeans & 0.002 & 1.000 & 0.317 & 0.482 & 360 \\
    & AgglomerativeClustering & 0.001 & 1.000 & 0.316 & 0.481 & 365 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Davies-Bouldin Index (min)}} \\
    & KMeans & 0.000 & 1.000 & 0.315 & 0.480 & 370 \\
    & AgglomerativeClustering & 0.000 & 1.000 & 0.315 & 0.480 & 370 \\
    \midrule
    \multicolumn{7}{l}{\textbf{Calinski-Harabasz Index (max)}} \\
    & KMeans & 0.071 & 0.102 & 0.178 & 0.129 & 3 \\
    & Agg