In [None]:
import pandas as pd

# Load the clustering results from a pickle file at a hard-coded absolute path.
# The functions below read the columns 'dataset', 'algorithm', 'model',
# 'adjusted_rand_score', 'n_clusters' and 'n_true_clusters' from this object.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
metrics_df = pd.read_pickle("/workspaces/gorillatracker/sep26_clustering_results.pkl")

In [None]:
def extract_data_best_algorithms_can_do(df):
    """Summarize the best clustering run per (dataset, algorithm, model).

    For every group two rows are selected:

    * the row with the highest ``adjusted_rand_score``; its score is reported
      as ``ARI`` and its ``n_clusters`` as ``Clusters``;
    * the row whose ``n_clusters`` is closest to the group's
      ``n_true_clusters``; its score is reported as ``True ARI``.

    Ties are resolved in favour of the first matching row of the group.

    Args:
        df: DataFrame with the columns ``dataset``, ``algorithm``, ``model``,
            ``adjusted_rand_score``, ``n_clusters`` and ``n_true_clusters``.
            Its index may contain duplicate labels.

    Returns:
        DataFrame with one row per group and the columns ``Dataset``,
        ``Algorithm``, ``Model``, ``True Clusters``, ``ARI``, ``Clusters``
        and ``True ARI``. The columns are present even if ``df`` has no rows.
    """
    columns = [
        'Dataset',
        'Algorithm',
        'Model',
        'True Clusters',
        'ARI',
        'Clusters',
        'True ARI',
    ]
    report_data = []

    for (dataset, algorithm, model), group in df.groupby(['dataset', 'algorithm', 'model']):
        # Give the group a unique 0..n-1 index so the label returned by
        # idxmax/idxmin always selects exactly one row, even when the input
        # index contains duplicate labels.
        group = group.reset_index(drop=True)

        # Row with the top ARI (adjusted_rand_score).
        top_ari_row = group.loc[group['adjusted_rand_score'].idxmax()]

        # True number of clusters (assumed constant within a group, so the
        # first row's value is used).
        true_clusters = group['n_true_clusters'].iloc[0]

        # Row whose n_clusters is closest to the true number of clusters.
        closest_clusters_row = group.loc[(group['n_clusters'] - true_clusters).abs().idxmin()]

        report_data.append({
            'Dataset': dataset,
            'Algorithm': algorithm,
            'Model': model,
            'True Clusters': true_clusters,
            'ARI': top_ari_row['adjusted_rand_score'],
            'Clusters': top_ari_row['n_clusters'],
            'True ARI': closest_clusters_row['adjusted_rand_score'],
        })

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(report_data, columns=columns)

def format_value(value):
    """Format a number for the table with at most three decimals.

    Trailing zeros and a dangling decimal point are removed, so ``0.500``
    becomes ``"0.5"`` and ``1.000`` becomes ``"1"``. Values that are zero, or
    that round to zero at three decimals, are rendered as ``"0"`` (never
    ``"-0"``). The sign of genuinely negative values is preserved.

    Args:
        value: The number to format.

    Returns:
        The compact string representation of ``value``.
    """
    if abs(value) < 1e-10:  # Consider values very close to zero as zero
        return "0"
    text = f"{value:.3f}".rstrip('0').rstrip('.')
    # Only a negative zero is normalised; a blanket replace of '-0' would
    # also strip the minus sign from values such as -0.5.
    return "0" if text == "-0" else text

def latex_table_best_algorithms_can_do(df):
    """Render the per-model clustering summary as a LaTeX table.

    The table has one section per dataset (in order of first appearance in
    ``df``) and, inside each section, one line per model (in the order of the
    pivoted index). Every line lists the true number of clusters followed by
    ``True ARI``, ``ARI`` and ``Clusters`` for KMeans,
    AgglomerativeClustering and HDBSCAN, matching the fixed table header.

    A (model, algorithm) combination that is absent from ``df`` is rendered
    as three ``--`` placeholder cells instead of raising.

    NOTE: dataset and model names are inserted verbatim; LaTeX special
    characters in them (e.g. ``_``) are not escaped.

    Args:
        df: DataFrame with the columns ``Dataset``, ``Algorithm``, ``Model``,
            ``True Clusters``, ``ARI``, ``Clusters`` and ``True ARI``, with at
            most one row per (Dataset, Model, Algorithm).

    Returns:
        The LaTeX source of the table as a single newline-joined string.
    """
    # Column groups of the header below, in the order they are printed.
    algorithms = ['KMeans', 'AgglomerativeClustering', 'HDBSCAN']
    missing = "--"  # placeholder for cells without data

    latex_code = [
        r"\begin{table}[H]",
        r"    \centering",
        r"    \resizebox{\textwidth}{!}{%",
        r"    \begin{tabular}{l|c|ccc|ccc|ccc}",
        r"    \toprule",
        r"    & True & \multicolumn{3}{c|}{KMeans} & \multicolumn{3}{c|}{AgglomerativeClustering} & \multicolumn{3}{c}{HDBSCAN} \\",
        r"    Embedding Space & Clusters & True ARI & ARI & Clusters & True ARI & ARI & Clusters & True ARI & ARI & Clusters \\",
        r"    \midrule",
    ]

    datasets = df['Dataset'].unique()
    for i, dataset in enumerate(datasets):
        group = df[df['Dataset'] == dataset]
        latex_code.append(f"    \\multicolumn{{11}}{{l}}{{\\textbf{{{dataset}}}}} \\\\")

        # One row per model, columns keyed by (metric, algorithm). Missing
        # (model, algorithm) combinations show up as NaN.
        pivot = group.pivot(index='Model', columns='Algorithm',
                            values=['True Clusters', 'ARI', 'Clusters', 'True ARI'])

        for model in pivot.index:
            row = pivot.loc[model]

            # All algorithms share the same true cluster count, so take the
            # first one that is actually present.
            known_true_clusters = row['True Clusters'].dropna()
            if known_true_clusters.empty:
                true_clusters_cell = missing
            else:
                true_clusters_cell = str(int(known_true_clusters.iloc[0]))
            values = [str(model), true_clusters_cell]

            # Algorithms left after dropna are the ones with data for this model.
            clusters_by_alg = row['Clusters'].dropna()
            for alg in algorithms:
                if alg not in clusters_by_alg.index:
                    values.extend([missing] * 3)
                    continue
                values.extend([
                    format_value(row['True ARI'][alg]),
                    format_value(row['ARI'][alg]),
                    str(int(clusters_by_alg[alg])),
                ])
            latex_code.append("    " + " & ".join(values) + r" \\")

        # Add a midrule between datasets, except after the last dataset
        if i < len(datasets) - 1:
            latex_code.append(r"    \midrule")

    latex_code.extend([
        r"    \bottomrule",
        r"    \end{tabular}%",
        r"    }",
        r"    \caption{Clustering Performance Metrics for Different Models and Algorithms}",
        r"    \label{tab:clustering-algorithms}",
        r"\end{table}",
    ])

    return "\n".join(latex_code)

# Example usage: metrics_df is the input DataFrame loaded at the top of this
# file. It is first reduced to one summary row per (dataset, algorithm,
# model); the LaTeX source of the resulting table is then printed to stdout.
report_df = extract_data_best_algorithms_can_do(metrics_df)
print(latex_table_best_algorithms_can_do(report_df))