In [5]:
import pandas as pd


def extract_data_best_algorithms_can_do(df):
    """Summarise the best and the "true-k" clustering result per configuration.

    Rows of ``df`` are grouped by ``dataset``, ``algorithm`` and ``model``.
    For every group two rows are selected:

    * the row with the highest ``adjusted_rand_score``;
    * the row whose ``n_clusters`` is closest to the group's
      ``n_true_clusters`` (first match wins on ties).

    Args:
        df: DataFrame with the columns ``dataset``, ``algorithm``, ``model``,
            ``adjusted_rand_score``, ``n_clusters`` and ``n_true_clusters``.

    Returns:
        A DataFrame with one row per group and the columns ``Dataset``,
        ``Algorithm``, ``Model``, ``True Clusters``, ``ARI`` (best score),
        ``Clusters`` (``n_clusters`` of the best-scoring row) and ``True ARI``
        (score of the row closest to the true cluster count).
    """
    report_data = []

    for (dataset, algorithm, model), group in df.groupby(["dataset", "algorithm", "model"]):
        # idxmax/idxmin return index *labels*. If the incoming frame carries
        # duplicate labels (e.g. after concatenating several result frames),
        # ``.loc[label]`` would return a DataFrame instead of a single row.
        # Re-labelling 0..n-1 guarantees exactly one row per lookup while
        # keeping the original row order (and therefore tie-breaking).
        group = group.reset_index(drop=True)

        # Row with the top ARI (adjusted_rand_score)
        top_ari_row = group.loc[group["adjusted_rand_score"].idxmax()]

        # True number of clusters; taken from the first row on the assumption
        # that it is constant within a dataset-algorithm-model combination.
        true_clusters = group["n_true_clusters"].iloc[0]

        # Row whose n_clusters is closest to true_clusters
        closest_clusters_row = group.loc[(group["n_clusters"] - true_clusters).abs().idxmin()]

        report_data.append(
            {
                "Dataset": dataset,
                "Algorithm": algorithm,
                "Model": model,
                "True Clusters": true_clusters,
                "ARI": top_ari_row["adjusted_rand_score"],
                "Clusters": top_ari_row["n_clusters"],
                "True ARI": closest_clusters_row["adjusted_rand_score"],
            }
        )

    return pd.DataFrame(report_data)


def format_value(value):
    """Format a number for the table: at most 3 decimals, no trailing zeros.

    Values whose magnitude is below 1e-10 are rendered as ``"0"``, and so are
    negative values that round to zero at three decimals (never ``"-0"``).
    The sign of every other negative value is preserved.

    Args:
        value: Numeric value to format.

    Returns:
        The formatted string, e.g. ``0.17`` -> ``"0.17"``, ``-0.5`` -> ``"-0.5"``.
    """
    if abs(value) < 1e-10:  # Consider values very close to zero as zero
        return "0"
    text = f"{value:.3f}".rstrip("0").rstrip(".")
    # Only the exact string "-0" (a tiny negative that rounded to zero) is
    # normalised. A blanket replace("-0", "0") would also strip the sign from
    # legitimate negatives such as "-0.5".
    return "0" if text == "-0" else text


def latex_table_best_algorithms_can_do(df, is_cosine=False):
    """Render the per-dataset clustering report as a LaTeX ``table`` environment.

    Args:
        df: Report frame with the columns ``Dataset``, ``Model``, ``Algorithm``,
            ``True Clusters``, ``ARI``, ``Clusters`` and ``True ARI``.
        is_cosine: Selects the caption/label suffix: cosine when truthy,
            euclidean otherwise.

    Returns:
        The LaTeX source as a single newline-joined string. Each dataset gets a
        bold heading row followed by one row per model (only models from the
        fixed model order below, in that order), with a ``\\midrule`` between
        consecutive datasets.
    """
    algorithm_columns = ("KMeans", "AgglomerativeClustering", "HDBSCAN")
    preferred_models = ("EfN-Pretrained", "EfN-Finetuned", "ViT-Pretrained", "ViT-Finetuned")
    pivot_values = ["True Clusters", "ARI", "Clusters", "True ARI"]

    # Table preamble and the two header rows.
    lines = [
        r"\begin{table}[H]",
        r"    \centering",
        r"    \resizebox{\textwidth}{!}{%",
        r"    \begin{tabular}{l|c|ccc|ccc|ccc}",
        r"    \toprule",
        r"    & True & \multicolumn{3}{c|}{KMeans} & \multicolumn{3}{c|}{AgglomerativeClustering} & \multicolumn{3}{c}{HDBSCAN} \\",
        r"    Embedding Space & Clusters & True ARI & ARI & Clusters & True ARI & ARI & Clusters & True ARI & ARI & Clusters \\",
        r"    \midrule",
    ]

    dataset_names = df["Dataset"].unique()
    last_position = len(dataset_names) - 1

    for position, dataset_name in enumerate(dataset_names):
        per_dataset = df[df["Dataset"] == dataset_name]
        lines.append(f"    \\multicolumn{{11}}{{l}}{{\\textbf{{{dataset_name}}}}} \\\\")

        # One row per model, one column per (metric, algorithm) pair.
        wide = per_dataset.pivot(index="Model", columns="Algorithm", values=pivot_values)

        for model_name in preferred_models:
            if model_name not in wide.index:
                continue
            metrics = wide.loc[model_name]
            # True cluster count is read from the first algorithm column; it is
            # expected to be identical across algorithms.
            cells = [model_name, str(int(metrics["True Clusters"].iloc[0]))]
            for algorithm_name in algorithm_columns:
                cells.append(format_value(metrics["True ARI"][algorithm_name]))
                cells.append(format_value(metrics["ARI"][algorithm_name]))
                cells.append(str(int(metrics["Clusters"][algorithm_name])))
            lines.append("    " + " & ".join(cells) + r" \\")

        # Separate datasets with a rule, but not after the final one.
        if position != last_position:
            lines.append(r"    \midrule")

    if is_cosine:
        caption_suffix = r" (Cosine Similarity)"
        label_suffix = "-cosine"
    else:
        caption_suffix = r" (Euclidean Distance)"
        label_suffix = "-euclidean"

    lines += [
        r"    \bottomrule",
        r"    \end{tabular}%",
        r"    }",
        r"    \caption{Clustering Performance Metrics for Different Models and Algorithms" + caption_suffix + "}",
        r"    \label{tab:clustering-algorithms" + label_suffix + r"}",
        r"\end{table}",
    ]

    return "\n".join(lines)


def generate(is_cosine, results_dir="/workspaces/gorillatracker"):
    """Load the pickled clustering metrics and print the LaTeX report table.

    Args:
        is_cosine: When truthy, read ``sep29_clustering_results_cosine.pkl``
            and label the table as cosine; otherwise read
            ``sep29_clustering_results_euclidean.pkl``.
        results_dir: Directory containing the result pickles. Defaults to the
            original hard-coded location so existing calls behave the same.

    Returns:
        None. The LaTeX source is written to stdout.
    """
    metric_suffix = "_cosine" if is_cosine else "_euclidean"
    # NOTE: unpickling can execute arbitrary code; only point this at result
    # files you produced yourself or otherwise trust.
    metrics_df = pd.read_pickle(f"{results_dir}/sep29_clustering_results{metric_suffix}.pkl")
    report_df = extract_data_best_algorithms_can_do(metrics_df)
    print(latex_table_best_algorithms_can_do(report_df, is_cosine=is_cosine))

In [6]:
# Print the LaTeX table built from the euclidean results pickle.
generate(is_cosine=False)

\begin{table}[H]
    \centering
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{l|c|ccc|ccc|ccc}
    \toprule
    & True & \multicolumn{3}{c|}{KMeans} & \multicolumn{3}{c|}{AgglomerativeClustering} & \multicolumn{3}{c}{HDBSCAN} \\
    Embedding Space & Clusters & True ARI & ARI & Clusters & True ARI & ARI & Clusters & True ARI & ARI & Clusters \\
    \midrule
    \multicolumn{11}{l}{\textbf{Bristol}} \\
    EfN-Pretrained & 7 & 0.094 & 0.112 & 12 & 0.128 & 0.134 & 19 & 0.071 & 0.071 & 2 \\
    EfN-Finetuned & 7 & 0.092 & 0.098 & 3 & 0.119 & 0.124 & 8 & 0.046 & 0.046 & 5 \\
    ViT-Pretrained & 7 & 0.105 & 0.174 & 9 & 0.17 & 0.19 & 25 & 0.079 & 0.079 & 2 \\
    ViT-Finetuned & 7 & 0.184 & 0.185 & 11 & 0.146 & 0.216 & 20 & 0.01 & 0.024 & 2 \\
    \midrule
    \multicolumn{11}{l}{\textbf{Bristol+min25+max25}} \\
    EfN-Pretrained & 7 & 0.175 & 0.182 & 8 & 0.159 & 0.187 & 12 & 0.077 & 0.077 & 2 \\
    EfN-Finetuned & 7 & 0.124 & 0.124 & 7 & 0.118 & 0.137 & 17 & 0.026 & 0.048 & 2 \\
  

In [7]:
# Print the LaTeX table built from the cosine results pickle.
generate(is_cosine=True)

\begin{table}[H]
    \centering
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{l|c|ccc|ccc|ccc}
    \toprule
    & True & \multicolumn{3}{c|}{KMeans} & \multicolumn{3}{c|}{AgglomerativeClustering} & \multicolumn{3}{c}{HDBSCAN} \\
    Embedding Space & Clusters & True ARI & ARI & Clusters & True ARI & ARI & Clusters & True ARI & ARI & Clusters \\
    \midrule
    \multicolumn{11}{l}{\textbf{Bristol}} \\
    EfN-Pretrained & 7 & 0.072 & 0.099 & 5 & 0.002 & 0.132 & 105 & 0.061 & 0.061 & 2 \\
    EfN-Finetuned & 7 & 0.105 & 0.126 & 3 & 0.025 & 0.086 & 150 & 0.017 & 0.017 & 6 \\
    ViT-Pretrained & 7 & 0.066 & 0.085 & 24 & 0.007 & 0.126 & 105 & 0.079 & 0.079 & 2 \\
    ViT-Finetuned & 7 & 0.12 & 0.128 & 12 & 0.04 & 0.146 & 74 & 0.011 & 0.063 & 2 \\
    \midrule
    \multicolumn{11}{l}{\textbf{Bristol+min25+max25}} \\
    EfN-Pretrained & 7 & 0.11 & 0.125 & 17 & 0.013 & 0.171 & 69 & 0.07 & 0.07 & 3 \\
    EfN-Finetuned & 7 & 0.102 & 0.102 & 7 & 0.029 & 0.11 & 65 & 0.017 & 0.021 & 3 \\
