In [None]:
# Standard library
import glob
import os
import sys

# Third-party
import nltk
import scienceplots  # imported for its side effect: makes the "science" styles available
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Put the parent directory on sys.path BEFORE importing the project package.
# NOTE(review): presumably `spamdetection` lives one directory up from this
# notebook; if it is pip-installed instead, this block is harmless.
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local import; must come after the sys.path tweak above.
from spamdetection.transforms import init_nltk  # noqa: E402


plt.rcParams["font.size"] = 13

init_nltk()
plt.style.use(["science"])

In [None]:
pd.options.display.float_format = '{:,.2f}'.format

DATASETS = ["ling", "sms", "spamassassin", "enron"]
METRICS = ['f1', 'precision', 'recall']
ML_MODELS = ['NB', 'LR', 'KNN', 'SVM', 'XGBoost', 'LightGBM']
LLM_MODELS = ['RoBERTa', 'SetFit-mpnet', 'FLAN-T5-base']


def load_mean_metrics(pattern):
    """Average f1/precision/recall per model over every CSV matching `pattern`.

    Each CSV is read with its first column as the index (the model name);
    rows containing any NaN are dropped before averaging.

    Raises:
        FileNotFoundError: if the glob pattern matches no file.
    """
    # Sorted so the load order does not depend on the filesystem.
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError(f"No result files match {pattern!r}")
    runs = pd.concat(
        [pd.read_csv(file, index_col=0) for file in files], axis=0).dropna()
    return runs[METRICS].groupby(runs.index).mean()


# One (f1, precision, recall) column triple per dataset, side by side.
# The column labels are therefore repeated once per dataset.
df_llm_all = pd.concat(
    [
        load_mean_metrics(
            f"../../outputs/csv/llm_{dataset}_test_0.8_train_seed_*.csv")
        for dataset in DATASETS
    ],
    axis=1,
)
df_llm = df_llm_all[df_llm_all.index.isin(LLM_MODELS)]

df_ml = pd.concat(
    [
        load_mean_metrics(
            f"../../outputs/csv/ml_{dataset}_test_0.8_train_seed*"
        ).reindex(ML_MODELS)
        for dataset in DATASETS
    ],
    axis=1,
)

# Baselines first, then the language models, in a fixed presentation order.
df = pd.concat([df_ml, df_llm], axis=0).reindex(ML_MODELS + LLM_MODELS)
df

In [None]:
# `df` holds one "f1" column per dataset (the per-dataset frames were
# concatenated side by side), so this selects all of them and averages
# across datasets for each model.
df.loc[:, "f1"].mean(axis=1)


In [None]:
def get_few_f1(family, nb_samples, metric, aggregation):
    """Aggregate one metric per model over all result CSVs of a family.

    Reads every file matching
    ``../../outputs/csv/{family}_*_test_{nb_samples}_*`` (first column used as
    the index), groups the rows by index label and aggregates `metric`.

    Args:
        family: "llm" or "ml"; selects both the file prefix and the row order.
        nb_samples: training-set size as it appears in the file names
            (e.g. 4 or 0.8). Its string form becomes the output column name.
        metric: name of the CSV column to aggregate.
        aggregation: "mean" or "std".

    Returns:
        A single-column DataFrame indexed by the family's fixed model order.
        Models absent from the CSVs appear as NaN rows.

    Raises:
        ValueError: if `family` or `aggregation` is not one of the supported
            values (previously this silently returned None).
        FileNotFoundError: if no file matches the pattern.
    """
    model_order = {
        "llm": ['RoBERTa', 'SetFit-mpnet', 'FLAN-T5-base'],
        "ml": ['NB', 'LR', 'KNN', 'SVM', 'XGBoost', 'LightGBM'],
    }
    if family not in model_order:
        raise ValueError(
            f"Unknown family {family!r}; expected one of {sorted(model_order)}")
    if aggregation not in ("mean", "std"):
        raise ValueError(
            f"Unknown aggregation {aggregation!r}; expected 'mean' or 'std'")

    pattern = f"../../outputs/csv/{family}_*_test_{nb_samples}_*"
    # Sorted so the load order does not depend on the filesystem.
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError(f"No result files match {pattern!r}")
    df = pd.concat([pd.read_csv(file, index_col=0) for file in files], axis=0)

    grouped = df[[metric]].groupby(df.index)
    aggregated = grouped.mean() if aggregation == "mean" else grouped.std()
    return aggregated.reindex(model_order[family]).rename(
        columns={metric: str(nb_samples)})


def plot_errorbar(family, metric, name):
    """Plot the mean of `metric` against training-set size and save the figure.

    One line is drawn per model of `family`, plus one baseline line computed
    from the "ml" family: the per-size maximum ("Best baseline") when
    `metric` is "f1", otherwise the per-size mean ("Avg. baseline").
    The figure is written to ``../../outputs/pdf`` and ``../../outputs/png``
    and then shown.

    Note: no `yerr` is passed to `errorbar`, so no error bars are drawn; the
    standard deviations are computed and returned only.

    Args:
        family: model family passed to `get_few_f1` ("llm" or "ml").
        metric: CSV column to plot ("f1", "training_time", ...).
        name: human-readable metric name used in the y-axis label.

    Returns:
        (df_mean, df_std): mean per model and size (including the baseline
        row), and standard deviation per model and size. For families other
        than "ml" the "ml" rows are prepended to df_std.
    """
    sample_sizes = [4, 8, 16, 32, 64, 128, 256, 0.8]

    def _collect(fam, aggregation):
        # One column per training-set size, one row per model.
        return pd.concat(
            [get_few_f1(fam, nb_samples, metric, aggregation)
             for nb_samples in sample_sizes],
            axis=1)

    df_mean = _collect(family, "mean")
    df_std = _collect(family, "std")
    baseline_mean = _collect("ml", "mean")
    baseline_std = _collect("ml", "std")

    if metric == "f1":
        df_mean.loc['Best baseline'] = baseline_mean.max(axis=0)
    else:
        df_mean.loc['Avg. baseline'] = baseline_mean.mean(axis=0)

    # Avoid stacking the "ml" rows on top of themselves.
    if family != "ml":
        df_std = pd.concat([baseline_std, df_std], axis=0)

    x = ['4', '8', '16', '32', '64', '128', '256', 'Full']
    fmts = ['-x', '-.o', '--*', ':d', '-+', '-.X', '--|', ':D', '-1']
    # Legend labels follow the rows actually plotted, so they cannot drift
    # from the data; only the two LLM names are shortened for display.
    display_names = {'SetFit-mpnet': 'SetFit', 'FLAN-T5-base': 'Spam-T5'}
    labels = [display_names.get(row, row) for row in df_mean.index]
    # Set on the global rcParams before entering the style context, so this
    # value is still in effect after the function returns.
    plt.rcParams['font.size'] = 12

    with plt.style.context(['science', 'high-vis']):
        if metric in ("training_time", "inference_time"):
            figsize = (4, 4)
        else:
            figsize = (4, 3.3)
        fig, ax = plt.subplots(figsize=figsize)

        for i, (label, values) in enumerate(zip(labels, df_mean.values)):
            # Cycle through the formats if there are more rows than formats.
            ax.errorbar(x, values, fmt=fmts[i % len(fmts)], label=label)

        ax.set_xticks(x, labels=x)
        ax.set_xlabel("Number of training samples")
        ax.set_ylabel(f"Average {name}")
        if metric == "f1":
            ax.set_ylim(0, 1)
            ax.legend(ncol=1)
        else:
            ax.set_yscale("log")
            # Legend goes below the axes for the non-F1 plots.
            ax.legend(labels, bbox_to_anchor=(0.5, -0.45),
                      loc="lower center", ncol=2)

        fig.tight_layout()
        plt.savefig(
            f"../../outputs/pdf/errorbars_{family}_{metric}.pdf", format="pdf")
        plt.savefig(
            f"../../outputs/png/errorbars_{family}_{metric}.png", format="png", dpi=300)
        plt.show()

    return df_mean, df_std


# Draw one figure per metric for the language models. Each pass overwrites
# df_mean / df_std, so only the last metric's frames remain afterwards.
for metric_key, axis_label in [
    ("f1", "F1 score"),
    ("training_time", "training time (s)"),
    ("inference_time", "inference time (s)"),
]:
    df_mean, df_std = plot_errorbar("llm", metric_key, axis_label)

In [None]:
pd.options.display.float_format = '{:,.3f}'.format

# Training-set sizes as they appear in the result file names.
few_shot_sizes = [4, 8, 16, 32, 64, 128, 256, 0.8]

# Mean F1 per model (rows) and training-set size (columns), per family.
df_mean = pd.concat(
    [get_few_f1("llm", size, "f1", "mean") for size in few_shot_sizes],
    axis=1,
)
df_mean2 = pd.concat(
    [get_few_f1("ml", size, "f1", "mean") for size in few_shot_sizes],
    axis=1,
)

# Baseline rows first, language-model rows after.
df = pd.concat([df_mean2, df_mean], axis=0)
df

In [None]:
pd.options.display.float_format = "{:,.4f}".format

# Show both statistics in one frame. Only the last expression of a cell is
# displayed, so two bare expressions would silently drop the mean.
pd.DataFrame({"mean": df.mean(axis=1), "std": df.std(axis=1)})