In [1]:
import pandas as pd
import os
from pathlib import Path

# ROOT is the grandparent of the current working directory, so it depends on
# where the kernel was started (presumably the notebook's own folder, two
# levels below the repository root -- confirm before running from elsewhere).
ROOT = Path.cwd().resolve().parent.parent


In [4]:
# Load the effectiveness results; the path is resolved relative to ROOT.
df = pd.read_csv(ROOT.joinpath("Experiments/Results/effectiveness/effectiveness.csv"))

In [5]:
# Plain-text (Unicode) label for each scenario key found in the `scenario` column.
scenarios = dict(
    si_si="(S1, I1) ↔ (S2, I2)",
    s_s="(S1, ∅) ↔ (S2, ∅)",
    i_i="(∅, I1) ↔ (∅,I2)",
)

def format_latex(mean, std):
    """Format a mean/std pair as an inline LaTeX math cell.

    Both numbers are rendered with two decimals and joined by ``\\pm``,
    e.g. ``format_latex(0.5, 0.1)`` gives ``$0.50\\pm0.10$``.
    """
    return "${:.2f}\\pm{:.2f}$".format(mean, std)

# Step 1: mean and std of each metric within every
# (scenario, embedding_model, classifier, repeat) group.
per_repeat_groups = df.groupby(['scenario', 'embedding_model', 'classifier', 'repeat'])
agg1 = per_repeat_groups[['precision', 'recall', 'f1']].agg(['mean', 'std']).reset_index()

# Flatten the two-level column index: ('f1', 'mean') -> 'f1_mean',
# while the group keys such as ('scenario', '') become plain 'scenario'.
agg1.columns = ['_'.join(levels).strip('_') for levels in agg1.columns]

# Step 2: average the per-repeat means and the per-repeat stds over the repeats.
summary_columns = [
    'f1_mean', 'f1_std',
    'precision_mean', 'precision_std',
    'recall_mean', 'recall_std',
]
per_model_groups = agg1.groupby(['scenario', 'embedding_model', 'classifier'])
final1 = per_model_groups[summary_columns].mean().reset_index()

final1

Unnamed: 0,scenario,embedding_model,classifier,f1_mean,f1_std,precision_mean,precision_std,recall_mean,recall_std
0,i_i,albert,CatBoost,0.734278,0.205759,0.651486,0.258652,0.929602,0.113202
1,i_i,albert,KNN,0.841094,0.150953,0.925972,0.101577,0.791772,0.199076
2,i_i,albert,LogisticRegression,0.237989,0.152780,0.146279,0.109980,0.928690,0.091703
3,i_i,albert,MLP,0.721189,0.172680,0.848281,0.186766,0.651122,0.172990
4,i_i,albert,RandomForest,0.685286,0.236778,0.680370,0.293441,0.802051,0.243192
...,...,...,...,...,...,...,...,...,...
103,si_si,roberta,KNN,0.829120,0.273970,0.875347,0.237759,0.824722,0.291900
104,si_si,roberta,LogisticRegression,0.522148,0.192405,0.386509,0.172399,0.934537,0.176206
105,si_si,roberta,MLP,0.832682,0.297385,0.867321,0.260139,0.834075,0.316983
106,si_si,roberta,RandomForest,0.833682,0.342143,0.885152,0.309250,0.819739,0.354692


In [6]:
# LaTeX label for each scenario key; iteration order is the order the tables are printed in.
scenarios = dict(
    si_si="(S1, I1) $\\leftrightarrow$ (S2, I2)",
    i_i="($\\varnothing$, I1) $\\leftrightarrow$ ($\\varnothing$, I2)",
    s_s="(S1, $\\varnothing$) $\\leftrightarrow$ (S2, $\\varnothing$)",
)
# Classifiers shown in each table, in row order.
classifiers_to_keep = [
    'RandomForest',
    'LogisticRegression',
    'KNN',
    'MLP',
    'CatBoost',
    'XGBoost',
]

# Metric label -> (mean column, std column) of `final1`.
# The dict order is the order of the three rows inside each \multirow group.
METRIC_COLUMNS = {
    "Precision": ("precision_mean", "precision_std"),
    "Recall": ("recall_mean", "recall_std"),
    "F1": ("f1_mean", "f1_std"),
}

# Placeholder for a cell without data (same marker as the unused corner cell).
MISSING_CELL = "--"


def _single_cell(rows, mean_col, std_col):
    """Format the first row of `rows` as a LaTeX cell, or `MISSING_CELL` if `rows` is empty."""
    if rows.empty:
        # A missing (classifier, embedding) combination used to raise an
        # IndexError on `.values[0]`; show an explicit gap in the table instead.
        return MISSING_CELL
    return format_latex(rows[mean_col].iloc[0], rows[std_col].iloc[0])


def _average_cell(rows, mean_col, std_col):
    """Format the column-wise averages of `rows` as a LaTeX cell, or `MISSING_CELL` if `rows` is empty."""
    if rows.empty:
        return MISSING_CELL
    return format_latex(rows[mean_col].mean(), rows[std_col].mean())


def _metric_rows(row_label, embedding_frames, embedding_cell, summary_frame=None):
    """Return the three LaTeX rows (one per metric) of one \\multirow group.

    Parameters
    ----------
    row_label : str
        Text placed in the \\multirow cell of the first row.
    embedding_frames : list of DataFrame
        One frame per embedding column, in column order.
    embedding_cell : callable
        ``(frame, mean_col, std_col) -> str`` used for the embedding columns.
    summary_frame : DataFrame or None
        Frame averaged for the last column; ``None`` leaves that cell as `MISSING_CELL`.
    """
    rows = ""
    for i, (metric, (mcol, scol)) in enumerate(METRIC_COLUMNS.items()):
        # Only the first row of the group carries the \multirow label.
        lead = f"\\multirow{{3}}{{*}}{{{row_label}}} & {metric}" if i == 0 else "  & " + metric
        cells = [embedding_cell(frame, mcol, scol) for frame in embedding_frames]
        if summary_frame is None:
            cells.append(MISSING_CELL)
        else:
            cells.append(_average_cell(summary_frame, mcol, scol))
        rows += lead + " & " + " & ".join(cells) + " \\\\\n"
    return rows


def build_metrics_table(grouped, abr, scenario, classifiers):
    """Build the LaTeX table of precision/recall/F1 for one scenario.

    Parameters
    ----------
    grouped : DataFrame
        Rows of `final1` for a single scenario; needs the columns
        ``embedding_model``, ``classifier`` and those listed in `METRIC_COLUMNS`.
    abr : str
        Scenario key, used in the table ``\\label``.
    scenario : str
        LaTeX description of the scenario, used in the caption.
    classifiers : list of str
        Classifiers to print, in row order.

    Returns
    -------
    str
        The complete ``table`` environment. The emitted commands require the
        LaTeX packages booktabs, multirow and graphicx.

    Notes
    -----
    A (classifier, embedding) combination absent from `grouped` is rendered
    as ``--`` instead of raising. The "Embedding Mean" rows average over every
    classifier present in `grouped`, not only those listed in `classifiers`.
    """
    embeddings = grouped['embedding_model'].unique()
    # Drops a "_meta_CPU_corrected.csv" suffix from the column headers when
    # present; labels without it are left unchanged.
    embeddings_bis = [e.replace("_meta_CPU_corrected.csv", "") for e in embeddings]

    # ----- LaTeX header -----
    header = (
        "\\begin{table}[!ht]\n"
        "\\footnotesize\n"
        "\\centering\n"
        f"\\caption{{Precision, Recall, and F1-score (mean $\\pm$ std) per classifier and embedding model for scenario {scenario}.}}\n"
        f"\\label{{tab:prec_rec_f1_{abr}}}\n"
        "\\resizebox{\\textwidth}{!}{%\n"
        "\\begin{tabular}{l c " + "c" * len(embeddings_bis) + " c}\n"
        "\\toprule\n"
        "\\textbf{Classifier} & \\textbf{Metric} & " + " & ".join(embeddings_bis) + " & \\textbf{Classifier Mean} \\\\\n"
        "\\midrule\n"
    )

    # ----- Body: one three-row group per classifier -----
    body = ""
    for clf in classifiers:
        sub = grouped[grouped['classifier'] == clf]
        per_embedding = [sub[sub['embedding_model'] == emb] for emb in embeddings]
        # Last column: the classifier's average over all embeddings.
        body += _metric_rows(clf, per_embedding, _single_cell, summary_frame=sub)
        body += "\\midrule\n"

    # ----- Per-embedding averages over the classifiers -----
    per_embedding = [grouped[grouped['embedding_model'] == emb] for emb in embeddings]
    body += _metric_rows("\\textbf{Embedding Mean}", per_embedding, _average_cell)

    # ----- End of table (the lone "}" closes \resizebox) -----
    footer = "\\bottomrule\n\\end{tabular}\n}\n\\end{table}\n"

    return header + body + footer


for abr, scenario in scenarios.items():

    grouped = final1[final1.scenario == abr]
    latex_code = build_metrics_table(grouped, abr, scenario, classifiers_to_keep)

    print("\n")
    print(latex_code)




\begin{table}[!ht]
\footnotesize
\centering
\caption{Precision, Recall, and F1-score (mean $\pm$ std) per classifier and embedding model for scenario (S1, I1) $\leftrightarrow$ (S2, I2).}
\label{tab:prec_rec_f1_si_si}
\resizebox{\textwidth}{!}{%
\begin{tabular}{l c cccccc c}
\toprule
\textbf{Classifier} & \textbf{Metric} & albert & bart & bert & distilbert & minilm & roberta & \textbf{Classifier Mean} \\
\midrule
\multirow{3}{*}{RandomForest} & Precision & $0.89\pm0.29$ & $0.87\pm0.32$ & $0.95\pm0.17$ & $0.95\pm0.19$ & $0.84\pm0.34$ & $0.89\pm0.31$ & $0.90\pm0.27$ \\
  & Recall & $0.74\pm0.35$ & $0.64\pm0.36$ & $0.77\pm0.30$ & $0.83\pm0.26$ & $0.69\pm0.39$ & $0.82\pm0.35$ & $0.75\pm0.34$ \\
  & F1 & $0.78\pm0.33$ & $0.70\pm0.34$ & $0.83\pm0.26$ & $0.87\pm0.24$ & $0.73\pm0.37$ & $0.83\pm0.34$ & $0.79\pm0.31$ \\
\midrule
\multirow{3}{*}{LogisticRegression} & Precision & $0.28\pm0.15$ & $0.28\pm0.19$ & $0.32\pm0.19$ & $0.43\pm0.22$ & $0.29\pm0.18$ & $0.39\pm0.17$ & $0.33\pm0.19$ \\
  & 