In [1]:
from itertools import product
import json
import os

from tqdm.auto import tqdm
import pandas as pd

# Root folder listed to discover experiment sub-directories; each one is expected
# to hold eval_results.txt and training_progress_scores.csv.
OUTPUT_DIR = "../output/a100"
# Root folder listed to discover LLM experiment sub-directories; each one is
# expected to hold result.json and requests_results.jsonl.
LLM_OUTPUT_DIR = "../output/llm"

In [2]:
# Build one row per experiment folder under OUTPUT_DIR: hyper-parameters parsed
# from the folder name, the best dev-set row of the training log, and the
# test-set metrics from eval_results.txt.
experiments = list()

for experiment_name in tqdm(os.listdir(OUTPUT_DIR)):
    if experiment_name == "output":
        continue

    experiment_path = os.path.join(OUTPUT_DIR, experiment_name)

    # Folder names encode the parameters positionally, separated by "_":
    # data, dataset, learning rate, batch size, dropout, version and, optionally,
    # search_max_size, filter_domain, filter_social_media.
    raw_params = experiment_name.split("_")

    # "processed_extra_..." is split into two tokens; merge them back so the
    # remaining positions line up with the other folder names.
    if "extra" in raw_params[1]:
        raw_params[0] = "processed_extra"
        raw_params.remove("extra")
    elif "processed" in experiment_path and "COVID19" in experiment_path and "v5" in experiment_path:
        continue

    # NOTE(review): eval() is applied to text taken from folder names (and, below,
    # from eval_results.txt). This is only safe while those sources are trusted.
    params = {
        "data": raw_params[0],
        # "-" stands in for "." in the dataset name and in the dropout value.
        "dataset": raw_params[1].replace("-", "."),
        "learning_rate": raw_params[2],
        "train_batch_size": eval(raw_params[3]),
        "dropout": eval(raw_params[4].replace("-", ".")),
        "version": raw_params[5]   
    }

    if len(raw_params) > 6:
        params["search_max_size"] = eval(raw_params[6])
        params["filter_domain"] = eval(raw_params[7])
        params["filter_social_media"] = eval(raw_params[8])

    # Turn every "-" that is not preceded by "e" into a decimal point, so the
    # sign of an exponent (as in "e-05") is left untouched.
    for idx, char in enumerate(params["learning_rate"]):
        if char == "-" and params["learning_rate"][idx-1] != "e":
            params["learning_rate"] = \
                params["learning_rate"][:idx] + "." + params["learning_rate"][idx+1:]

    params["learning_rate"] = eval(params["learning_rate"])
    params["experiment_name"] = experiment_name

    params = pd.Series(params)

    # eval_results.txt holds one "name = value" pair per line.
    with open(os.path.join(experiment_path, "eval_results.txt")) as f:
        eval_metrics = f.read()
        eval_metrics = eval_metrics.strip().split("\n")

    eval_metrics = [metric.split(" = ") for metric in eval_metrics]
    eval_metrics = {metric[0]: eval(metric[1]) for metric in eval_metrics}
    eval_metrics = pd.Series(eval_metrics)
    # Prefix the metrics with "test_" so they do not collide with the dev metrics below.
    eval_metrics.rename(
        {
            "mcc": "test_mcc",
            "accuracy": "test_acc",
            "f1_score": "test_f1",
            "tp": "test_tp",
            "tn": "test_tn",
            "fp": "test_fp",
            "fn": "test_fn",
            "auroc": "test_auroc",
            "auprc": "test_auprc",
            "eval_loss": "test_loss"
        },
        inplace=True
    )


    # The training log has one row per evaluation; its metrics get the "dev_" prefix.
    train_metrics = pd.read_csv(os.path.join(experiment_path, "training_progress_scores.csv"))
    train_metrics.rename(columns={
            "mcc": "dev_mcc",
            "accuracy": "dev_acc",
            "f1_score": "dev_f1",
            "tp": "dev_tp",
            "tn": "dev_tn",
            "fp": "dev_fp",
            "fn": "dev_fn",
            "auroc": "dev_auroc",
            "auprc": "dev_auprc",
            "global_step": "train_step",
            "eval_loss": "dev_loss"
        },
        inplace=True
    )

    # Keep the rows that reach the highest dev F1 and, among those, the last one.
    train_metrics = train_metrics[train_metrics["dev_f1"] == train_metrics["dev_f1"].max()]
    train_metrics = train_metrics.iloc[-1]
    train_metrics[["dev_tp", "dev_tn", "dev_fp", "dev_fn"]] \
         = train_metrics[["dev_tp", "dev_tn", "dev_fp", "dev_fn"]].astype(int)
    

    # One flat Series per experiment: parameters, dev metrics, test metrics.
    experiment = pd.concat(
        [params, train_metrics, eval_metrics]
    )

    experiments.append(experiment)

# The list of Series becomes one DataFrame with one row per experiment.
experiments = pd.DataFrame(experiments)


  0%|          | 0/613 [00:00<?, ?it/s]

In [3]:
# Select, for every (dataset, data, filter_social_media) group of enriched runs,
# the run with the best dev F1 and report its test F1 / accuracy in percent.
extra = experiments[experiments["data"] == "processed_extra"]

# Cast before negating: the column presumably has object dtype (runs without the
# enrichment parameters have no value here), and `~` on Python bools is integer
# bitwise inversion, which is deprecated since Python 3.12.
domain_filter_enabled = extra["filter_domain"].astype(bool)
extra = extra[(extra["search_max_size"] == 1) & ~domain_filter_enabled].drop(columns="search_max_size")

extra_group_keys = ["dataset", "data", "filter_social_media"]

# Best dev F1 reached inside each group.
dev_max_extra = extra.groupby(extra_group_keys)["dev_f1"].max()

# Keep the runs that reach their group's best dev F1; when several runs tie,
# only the first one is kept.
reaches_group_best = extra.apply(
    lambda row: row["dev_f1"] == dev_max_extra.loc[
        (row["dataset"], row["data"], row["filter_social_media"])
    ],
    axis=1,
)
best_dev_extra = extra[reaches_group_best].drop_duplicates(subset=extra_group_keys)

# Test metrics of the selected runs, as percentages rounded to one decimal.
train_max_extra = best_dev_extra.groupby(extra_group_keys)[["test_f1", "test_acc"]].max()
train_max_extra = (train_max_extra * 100).round(1)
train_max_extra

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,test_f1,test_acc
dataset,data,filter_social_media,Unnamed: 3_level_1,Unnamed: 4_level_1
COVID19.BR,processed_extra,False,82.1,82.4
COVID19.BR,processed_extra,True,77.9,78.3
Fake.br,processed_extra,False,99.2,99.2
Fake.br,processed_extra,True,98.7,98.8


In [4]:
# Same selection as for the enriched runs, applied to the runs without enrichment:
# per (dataset, data) group, take the run with the best dev F1 and report its
# test F1 / accuracy in percent.
is_plain_run = experiments["data"] != "processed_extra"
no_extra = experiments[is_plain_run]

dev_max_group = no_extra.groupby(["dataset", "data"])["dev_f1"].max()


def _reaches_group_best(run):
    """Return whether the run's dev F1 equals the best dev F1 of its (dataset, data) group."""
    group_best = dev_max_group.loc[(run["dataset"], run["data"])]
    return run["dev_f1"] == group_best


# Ties inside a group are resolved by keeping the first matching run.
best_dev = no_extra[no_extra.apply(_reaches_group_best, axis=1)].drop_duplicates(
    subset=["data", "dataset"]
)

best_test_scores = best_dev.groupby(["dataset", "data"])[["test_f1", "test_acc"]].max()
train_max_group = (best_test_scores * 100).round(1)
train_max_group

Unnamed: 0_level_0,Unnamed: 1_level_0,test_f1,test_acc
dataset,data,Unnamed: 2_level_1,Unnamed: 3_level_1
COVID19.BR,processed,81.1,81.4
COVID19.BR,raw,81.9,82.1
Fake.br,processed,98.9,98.9
Fake.br,raw,99.6,99.6


In [None]:
# Stack the enriched and non-enriched summaries into a single table.
# ignore_index avoids the duplicated 0..n labels the two reset_index() frames
# would otherwise carry over.
results = pd.concat(
    [train_max_extra.reset_index(), train_max_group.reset_index()],
    ignore_index=True,
)

# Rows coming from train_max_group have no filter flag (NaN after the concat);
# comparing with True treats them as "not filtered" without needing a fillna.
uses_social_media_filter = results["filter_social_media"].eq(True)
results["data"] = results["data"].mask(uses_social_media_filter, "filter_social_media")

# Human-readable label of each processing variant.
results["data"] = results["data"].map(
    {
        "raw": "1. Original",
        "processed": "2. Validated",
        "processed_extra": "3.a) Validated e full enrichment",
        "filter_social_media": "3.b) Validated e filtered enrichment"
    }
)

results = results.drop(columns="filter_social_media").rename(columns={"data": "processing"})

results

Unnamed: 0,Conjunto de dados,Processamento,F1 macro,Acurácia
0,COVID19.BR,3.a) Validado e Enquecido completo,82.1,82.4
1,COVID19.BR,3.b) Validado e Enquecido filtrado,77.9,78.3
2,Fake.br,3.a) Validado e Enquecido completo,99.2,99.2
3,Fake.br,3.b) Validado e Enquecido filtrado,98.7,98.8
0,COVID19.BR,2. Validado,81.1,81.4
1,COVID19.BR,1. Original,81.9,82.1
2,Fake.br,2. Validado,98.9,98.9
3,Fake.br,1. Original,99.6,99.6


In [6]:
# Selected runs of both selections (without and with enrichment) in one frame.
# NOTE: this rebinds `experiments`, which until here held the table of all runs.
experiments = pd.concat([best_dev, best_dev_extra])

analysis_columns = ["text_no_url", "label", "google_search_results"]

# Test split of each dataset, keyed by dataset name.
df_analysis = dict()
for dataset_name in ["Fake.br", "COVID19.BR"]:
    dataset_frame = pd.read_parquet(f"../data/parquet/split/{dataset_name}.parquet")
    is_test_row = dataset_frame["new_split"] == "test"
    # Explicit copy: columns are added to these frames afterwards, which should
    # not be done on a slice of the loaded frame.
    df_analysis[dataset_name] = dataset_frame.loc[is_test_row, analysis_columns].copy()

In [7]:
# Attach, for every selected run, its per-example `pred_correct` column to the
# analysis frame of the run's dataset.
for _, experiment in experiments.iterrows():
    # Runs on COVID19.BR are skipped.
    if experiment["dataset"] == "COVID19.BR":
        continue

    # Read from OUTPUT_DIR, the same root the experiment folders were listed from.
    analysis = pd.read_table(
        os.path.join(OUTPUT_DIR, experiment["experiment_name"], "test.tsv"), index_col=0
    )
    # String labels, presumably to match the index of the analysis frames — TODO confirm.
    analysis.index = [str(idx) for idx in analysis.index]

    # Column name: the data variant, suffixed when the social-media filter was on.
    name = experiment["data"]

    if experiment["filter_social_media"] == True:
        name += "_filter_social_media"

    # Assignment aligns on the index labels.
    df_analysis[experiment["dataset"]][name] = analysis["pred_correct"]

In [8]:
# Collect one record per LLM experiment folder under LLM_OUTPUT_DIR (parameters
# parsed from the folder name plus test metrics from result.json) and, per
# dataset, a table with one correctness column per experiment.
llm_experiments = list()
analysis_llm = dict()

for experiment_name in tqdm(os.listdir(LLM_OUTPUT_DIR)):
    experiment_path = os.path.join(LLM_OUTPUT_DIR, experiment_name)

    # Folders without a result.json are ignored.
    if not os.path.exists(os.path.join(experiment_path, "result.json")):
        continue

    # Folder names encode the parameters positionally, separated by "_":
    # llm, data, dataset, few_shot, version, then optional "link" / filter tokens.
    raw_params = experiment_name.split("_")

    # Restore the model identifiers whose "/" (and, for vertex_ai, "_") was
    # flattened in the folder name.
    if "gemini-gemini" in raw_params[0]:
        raw_params[0] = raw_params[0].replace("gemini-gemini", "gemini/gemini")
    elif raw_params[:2] == ['vertex', 'ai-gemini-2-0-flash-001']:
        raw_params[0] = 'vertex_ai/gemini-2-0-flash-001'
        raw_params.remove('ai-gemini-2-0-flash-001')

    # "processed_extra" was split into two tokens; merge them back so the
    # remaining positions line up.
    if "extra" in raw_params[2]:
        raw_params[1] = "processed_extra"
        raw_params.remove("extra")

    # An optional "link" token at position 4 is recorded as a flag and removed.
    link = raw_params[4] == "link"

    if link:
        raw_params.remove("link")

    # Everything after position 4 is re-joined into a single search-filter name.
    if len(raw_params) > 5:
        raw_params[5] = "_".join(raw_params[5:])
        raw_params = raw_params[:6]

    params = {
        "llm": raw_params[0],
        "data": raw_params[1],
        "dataset": raw_params[2].replace("-", "."),
        "few_shot": raw_params[3],
        "version": raw_params[4],
        "search_link": link,
        "search_filter": raw_params[5] if len(raw_params) > 5 else None   
    }

    # Keep only: v4 for enriched data, the gemini-1-5-flash model, and search
    # filters other than "filter_social_media_2" / "filter_domains".
    if params["version"] != "v4" and params["data"] == "processed_extra":
        continue

    if params["llm"] != "gemini/gemini-1-5-flash":
        continue

    if params["search_filter"] in ["filter_social_media_2", "filter_domains"]:
        continue

    with open(os.path.join(experiment_path, "result.json")) as f:
        report = json.load(f)

    preds = pd.read_json(os.path.join(experiment_path, "requests_results.jsonl"), lines=True)

    params["test_acc"] = report["accuracy"]
    params["test_f1"] = report["classification_report"]["macro avg"]["f1-score"]
    # NOTE(review): this mapping assumes cell [0][0] of the stored matrix counts
    # true positives. Confirm against the code that writes result.json, since the
    # row/column order is not visible here.
    params["test_tp"] = report["confusion_matrix"][0][0]
    params["test_tn"] = report["confusion_matrix"][1][1]
    params["test_fp"] = report["confusion_matrix"][0][1]
    params["test_fn"] = report["confusion_matrix"][1][0]

    llm_experiments.append(params)

    # The first experiment seen for a dataset seeds that dataset's per-example
    # table; later experiments only add their correctness column to it.
    if params["dataset"] not in analysis_llm:
        analysis_llm[params["dataset"]] = preds

        # Unpack the text fields and the label from the nested dataset_row dict.
        analysis_llm[params["dataset"]]["text_a"] = analysis_llm[params["dataset"]]["dataset_row"].apply(lambda d: d.get("text_a", d.get("text")))


        if "text_b" in analysis_llm[params["dataset"]]["dataset_row"][0].keys():
            analysis_llm[params["dataset"]]["text_b"] = analysis_llm[params["dataset"]]["dataset_row"].apply(lambda d: d["text_b"])

        analysis_llm[params["dataset"]]["labels"] = analysis_llm[params["dataset"]]["dataset_row"].apply(lambda d: d["labels"])

        # Column name: the data variant, suffixed with the search filter if any.
        name = params["data"]
        
        if params["search_filter"]:
            name += f"_{params['search_filter']}"

        analysis_llm[params["dataset"]].rename(
            columns={"is_correct": name}, inplace=True)

        #analysis_llm[params["dataset"]] = analysis_llm[params["dataset"]].set_index("dataset_index")

        # Drop the request/response bookkeeping columns.
        analysis_llm[params["dataset"]].drop(
            columns=[
                "response_text","order", "dataset_row_tag_label", 
                "response_tag_label", "config", "response_raw", "response_label", 
                "model", "dataset_row"
            ],
            inplace=True
        )
    else:
        # text_b is added late when the seeding experiment did not have it.
        if "text_b" not in analysis_llm[params["dataset"]].columns and "text_b" in preds["dataset_row"][0]:
            analysis_llm[params["dataset"]]["text_b"] = preds["dataset_row"].apply(lambda d: d["text_b"])
        
        name = params["data"]
        if params["search_filter"]:
            name += f"_{params['search_filter']}"

        # Assignment aligns on the row index of the two frames.
        analysis_llm[params["dataset"]][name] = preds["is_correct"]

# Turn the collected records into a table with one row per LLM experiment.
llm_experiments = pd.DataFrame(llm_experiments)

# A run counts as filtered whenever a search filter was parsed from its folder name.
llm_experiments["filter_social_media"] = llm_experiments["search_filter"].map(pd.notna)

# Parameters that are no longer needed once the filter flag exists.
dropped_columns = ["search_link", "version", "few_shot", "llm", "search_filter"]
llm_experiments = llm_experiments.drop(columns=dropped_columns)


def _processing_key(run):
    """Return "filter_social_media" for filtered runs, otherwise the run's data variant."""
    if run["filter_social_media"] == True:
        return "filter_social_media"
    return run["data"]


llm_experiments["data"] = llm_experiments.apply(_processing_key, axis=1)

  0%|          | 0/34 [00:00<?, ?it/s]

In [None]:
# Human-readable label of each processing variant. The labels are kept identical
# to the ones used for the `results` table so both tables share the same
# `processing` keys.
llm_experiments["data"] = llm_experiments["data"].map(
    {
        "raw": "1. Original",
        "processed": "2. Validated",
        "processed_extra": "3.a) Validated e full enrichment",
        "filter_social_media": "3.b) Validated e filtered enrichment"
    }
)

# Drop the helper flag, expose the label as `processing`, and index the table by
# (dataset, processing) in sorted order.
llm_experiments = (
    llm_experiments
    .drop(columns="filter_social_media")
    .rename(columns={"data": "processing"})
    .set_index(["dataset", "processing"])
    .sort_index()
)

In [None]:
# Display the LLM results table, indexed by (dataset, processing).
llm_experiments