In [None]:
import numbers

import pandas as pd

# Load the clustering results that the cells below filter by "dataset", "model" and "algorithm".
# NOTE(review): unpickling can execute arbitrary code, so only load this file if it comes from a
# trusted source. The path is absolute and machine-specific; consider a configurable data directory.
metrics_df = pd.read_pickle("/workspaces/gorillatracker/sep26_clustering_results.pkl")

In [None]:
def format_value(value):
    """Render a single table cell as text.

    Args:
        value: Scalar cell value (number, string, or missing value).

    Returns:
        str: ``"nan"`` for missing values (anything ``pd.isna`` recognises, or
        the literal string ``"nan"``), a three-decimal fixed-point string for
        real numbers, and ``str(value)`` for everything else.
    """
    if pd.isna(value) or value == "nan":
        return "nan"
    # numbers.Real also matches NumPy scalars such as np.float32 / np.int64.
    # Those are not subclasses of the built-in int / float, so an
    # isinstance(value, (int, float)) check lets them fall through to str()
    # and they end up in the table unrounded.
    if isinstance(value, numbers.Real):
        return f"{float(value):.3f}"
    return str(value)


def generate_single_embedding_space_graphic(df, dataset, model):
    """Summarise, per algorithm, the run each internal validity index would select.

    The frame is restricted to rows matching ``dataset`` and ``model``. For
    every algorithm in that subset, four selection criteria are applied
    (maximum silhouette coefficient, maximum Dunn index, minimum
    Davies-Bouldin index, maximum Calinski-Harabasz index). For each criterion
    the selected row's adjusted Rand score, homogeneity, completeness,
    V-measure and cluster count are reported.

    Args:
        df: DataFrame with the columns ``dataset``, ``model``, ``algorithm``,
            the four index columns named above, and the reported score
            columns. It is not modified.
        dataset: Value of the ``dataset`` column to keep.
        model: Value of the ``model`` column to keep.

    Returns:
        pd.DataFrame: One row per (criterion, algorithm) pair with the columns
        ``Approach``, ``Algorithm``, ``Rand Score``, ``Homogeneity``,
        ``Completeness``, ``V-Measure`` and ``n_clusters``. Rows are grouped by
        algorithm, in criterion order. A criterion whose index column holds no
        valid value for an algorithm is skipped. The result is empty if
        nothing matches ``dataset`` / ``model``.
    """
    # (label shown in the table, index column, how the best run is picked)
    selection_criteria = [
        ("Silhouette Score (max)", "silhouette_coefficient", pd.Series.idxmax),
        ("Dunn Index (max)", "dunn_index", pd.Series.idxmax),
        ("Davies-Bouldin Index (min)", "davies_bouldin_index", pd.Series.idxmin),
        ("Calinski-Harabasz Index (max)", "calinski_harabasz_index", pd.Series.idxmax),
    ]
    reported_columns = [
        "adjusted_rand_score",
        "homogeneity_score",
        "completeness_score",
        "v_measure_score",
        "n_clusters",
    ]

    # Boolean indexing already yields a new frame, so the caller's df is never
    # touched. reset_index gives every row a unique label; with duplicated
    # labels (e.g. results concatenated from several runs) `.loc[label]` would
    # return several rows instead of the single selected one.
    selected_runs = df[(df["dataset"] == dataset) & (df["model"] == model)].reset_index(drop=True)

    metrics_summary = []
    for algorithm in selected_runs["algorithm"].unique():
        algorithm_df = selected_runs[selected_runs["algorithm"] == algorithm]

        for label, index_column, pick_best in selection_criteria:
            scores = algorithm_df[index_column]
            # idxmax / idxmin cannot select anything when every value is
            # missing (or the group is empty); skip instead of crashing.
            if not scores.notna().any():
                continue
            best_row = algorithm_df.loc[pick_best(scores)]
            metrics_summary.append([label, algorithm, *(best_row[column] for column in reported_columns)])

    # Convert to DataFrame for easy display
    return pd.DataFrame(
        metrics_summary,
        columns=[
            "Approach",
            "Algorithm",
            "Rand Score",
            "Homogeneity",
            "Completeness",
            "V-Measure",
            "n_clusters",
        ],
    )


def _latex_metric_row(algorithm, row):
    """Format one summary row as a tabular line: empty first cell, algorithm name, four scores, cluster count."""
    cells = [
        format_value(row["Rand Score"]),
        format_value(row["Homogeneity"]),
        format_value(row["Completeness"]),
        format_value(row["V-Measure"]),
        str(row["n_clusters"]),
    ]
    return f"    & {algorithm} & {' & '.join(cells)} \\\\"


def generate_latex_table(df, dataset, model, ref="tab:clustering-metrics"):
    """Build the LaTeX source of a table comparing clustering approaches.

    Rows are grouped under a bold heading per distinct ``Approach`` value, with
    a ``\\midrule`` between groups. Rows whose ``Algorithm`` is ``"HDBSCAN"``
    are taken out of those groups; if any exist, only the first one is emitted,
    in a final group headed "Any".

    Args:
        df: Summary frame with the columns ``Approach``, ``Algorithm``,
            ``Rand Score``, ``Homogeneity``, ``Completeness``, ``V-Measure``
            and ``n_clusters``.
        dataset: Dataset name, inserted verbatim into the caption.
        model: Model name, inserted verbatim into the caption.
        ref: LaTeX label assigned to the table.

    Returns:
        str: The complete ``table`` environment, one LaTeX line per text line.
    """
    latex_code = [
        r"\begin{table}[H]",
        r"    \centering",
        r"    \resizebox{\textwidth}{!}{%",
        r"    \begin{tabular}{llccccc}",
        r"    \toprule",
        r"    \multicolumn{2}{l}{Approaches} & \multicolumn{4}{c}{Metrics} \\",
        r"     \cmidrule(lr){3-6}",
        r"    & & ARI & Homogeneity & Completeness & V-Measure & Clusters Found \\",
        r"    \midrule",
    ]

    # Filter out "HDBSCAN" rows to append later
    hdbscan_rows = df[df["Algorithm"] == "HDBSCAN"]
    df = df[df["Algorithm"] != "HDBSCAN"]

    # Computed once: the loop only needs it to iterate and to spot the last group.
    approaches = df["Approach"].unique()
    for approach in approaches:
        # Backslashes are doubled because "\m" is not a valid escape sequence
        # in a non-raw (f-)string; the emitted text is still "\multicolumn".
        latex_code.append(f"    \\multicolumn{{7}}{{l}}{{\\textbf{{{approach}}}}} \\\\")
        group = df[df["Approach"] == approach]
        latex_code.extend(_latex_metric_row(row["Algorithm"], row) for _, row in group.iterrows())
        # Separate groups with a rule, but not after the last one.
        if approach != approaches[-1]:
            latex_code.append(r"    \midrule")

    # Add the "HDBSCAN" row at the end with multicolumn "Any"
    if not hdbscan_rows.empty:
        latex_code.append(r"    \midrule")
        latex_code.append(r"    \multicolumn{7}{l}{\textbf{Any}} \\")
        # Only the first HDBSCAN row is shown.
        for _, row in hdbscan_rows[:1].iterrows():
            latex_code.append(_latex_metric_row("HDBSCAN", row))

    latex_code.extend(
        [
            r"    \bottomrule",
            r"    \end{tabular}%",
            r"    }",
            r"    \caption{Comparison of Clustering Approaches and Algorithms across Various Metrics. Dataset is "
            + dataset
            + " and Model is "
            + model
            + "}",
            r"    \label{" + ref + "}",
            r"\end{table}",
        ]
    )

    return "\n".join(latex_code)


# Print the LaTeX table for the "ViT-Finetuned" model on the "SPAC+min3" dataset.
# Relies on metrics_df loaded in the first cell.
dataset = "SPAC+min3"
model = "ViT-Finetuned"
metrics_summary_df = generate_single_embedding_space_graphic(metrics_df, dataset, model)
print(generate_latex_table(metrics_summary_df, dataset, model, "tab:clustering-metrics-spac"))

In [None]:
# Print the LaTeX table for the "ViT-Finetuned" model on the "Bristol" dataset.
# Rebinds dataset, model and metrics_summary_df; relies on metrics_df and the functions from earlier cells.
dataset = "Bristol"
model = "ViT-Finetuned"
metrics_summary_df = generate_single_embedding_space_graphic(metrics_df, dataset, model)
print(generate_latex_table(metrics_summary_df, dataset, model, "tab:clustering-metrics-bristol"))