# 02 - Compare Classifications Made by Different Models on Fact-Checking Task

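This notebook performs a comparative analysis of the classifications produced on the fact-checking task. For the fine-tuned transformer model selected in the Constants section, it loads the most recent classification results for each input variant (the original text and each claim-normalization variant), combines them into a single table, and compares the predictions against each other and against the ground truth labels, reporting the examples whose prediction became correct or incorrect relative to the prediction on the original text.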

### Imports

In [22]:
# Native
import os
import json
import logging

# Third Party
import pandas as pd

### Setup

In [23]:
# Logging setup that tolerates the cell being executed more than once.
root_logger = logging.getLogger()

if root_logger.handlers:
    # Handlers are already installed (e.g. the cell was run before): keep them
    # and only raise everything to INFO so no duplicate handler is added.
    root_logger.setLevel(logging.INFO)
    for handler in root_logger.handlers:
        handler.setLevel(logging.INFO)
    # NOTE(review): the root logger has no parent to propagate to, so this
    # flag is presumably a no-op; it is kept to preserve existing behaviour.
    root_logger.propagate = False
else:
    # First run: install the default handler with the notebook's log format.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )

### Constants

In [None]:
# --- Execution -------------------------------------------------------------
# Wall-clock time at which this cell ran, formatted for use in file names.
TIMESTAMP = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")

# --- Dataset ---------------------------------------------------------------
# Supported values: "faketweetbr", "fakebr"
DATASET_NAME = "faketweetbr"

# --- Model -----------------------------------------------------------------
# Supported values: "FacebookAI/xlm-roberta-large",
#                   "neuralmind/bert-large-portuguese-cased"
MODEL_NAME = "neuralmind/bert-large-portuguese-cased"

# --- Paths -----------------------------------------------------------------
# Results live under the dataset folder, in a sub-folder named after the model
# identifier with its organisation prefix (everything up to the last "/") removed.
RESULTS_PATH = "../data/{dataset}/classification_results/{model}/".format(
    dataset=DATASET_NAME,
    model=MODEL_NAME.rsplit("/", 1)[-1],
)
CLASSIFICATIONS_PATH = os.path.join(RESULTS_PATH, "classifications")

### Combine Classification Results from Different Models for Comparison

In [25]:
# Name of the combined results file
combined_results_file = "combined_classification_results.csv"

# List all result files
result_files = [f for f in os.listdir(CLASSIFICATIONS_PATH) if f.endswith(".csv")]
logging.info(f"Found {len(result_files)} result files in {CLASSIFICATIONS_PATH}")


def _result_group_key(file_name):
    """Return the (task type, model variant) pair that identifies a result "type".

    The task type is the first "_"-separated field of the file name. For the
    "original" task there is no variant; for every other task the variant is
    the second and third fields joined with "_".
    """
    parts = file_name.split("_")
    task_type = parts[0]
    if task_type == "original":
        return (task_type, None)
    return (task_type, parts[1] + "_" + parts[2])


def _evaluation_timestamp(file_name):
    """Parse the timestamp that follows "_test-set-eval_" in the file name."""
    return pd.to_datetime(
        file_name.split("_test-set-eval_")[-1].replace(".csv", ""),
        format="%Y-%m-%d_%H-%M-%S",
    )


# Keep only the latest file of each type.
# Both parts of the key are taken from the same file. The previous comparison
# mixed one field of the candidate file with one field of the outer-loop file,
# so the third field was never really compared. Task types are also matched
# exactly rather than by prefix. A single pass over the listing replaces the
# previous scan-per-file approach; dict insertion order keeps the selected
# files in order of first appearance of their type.
latest_by_type = {}

for file_name in result_files:
    group_key = _result_group_key(file_name)
    current_latest = latest_by_type.get(group_key)

    # Strict ">" keeps the first file seen when two timestamps are equal
    if current_latest is None or (
        _evaluation_timestamp(file_name) > _evaluation_timestamp(current_latest)
    ):
        latest_by_type[group_key] = file_name

latest_files = list(latest_by_type.values())

# Use only the latest files for combination
result_files = latest_files
logging.info(f"Selected {len(result_files)} latest result files for combination: {result_files}")

# Create DataFrame to hold all results
all_results_df = pd.DataFrame(
    columns=["custom_id", "original_label", "original_text"]
)

for file_name in result_files:
    # Load individual results file
    results_df = pd.read_csv(os.path.join(CLASSIFICATIONS_PATH, file_name))

    # Determine task type and model variant from file name
    name_parts = file_name.split("_")
    task_type = name_parts[0].replace("-", "_")
    model_variant = name_parts[1] if task_type != "original" else None
    column_prefix = f"{task_type}_{model_variant}" if model_variant else task_type

    if all_results_df.empty:
        # The first file defines the row order (custom_id) and the labels
        all_results_df["custom_id"] = results_df["custom_id"]
        all_results_df["original_label"] = results_df["original_label"]
    elif not results_df["custom_id"].equals(all_results_df["custom_id"]):
        # Columns below are assigned by row position, so a file whose rows are
        # ordered differently (or that has a different number of rows) would
        # silently attach predictions to the wrong custom_id.
        if results_df["custom_id"].is_unique:
            logging.warning(
                f"custom_id order in {file_name} differs from the combined table; "
                "realigning rows by custom_id"
            )
            # Reorder this file's rows to follow the combined table's ids;
            # ids missing from this file become NaN rows.
            results_df = (
                results_df.set_index("custom_id")
                .reindex(all_results_df["custom_id"].to_numpy())
                .reset_index(drop=True)
            )
        else:
            # Duplicate ids cannot be realigned unambiguously: keep the
            # positional behaviour but make the risk visible.
            logging.warning(
                f"custom_id values in {file_name} differ from the combined table "
                "and are not unique; rows are combined by position and may be misaligned"
            )

    # For the "original" task, also add the original text
    if task_type == "original":
        all_results_df["original_text"] = results_df["text"]

    # Add predicted labels to the combined DataFrame
    all_results_df[f"{column_prefix}_prediction"] = results_df["predicted_label"]

logging.info(f"Combined results DataFrame shape: {all_results_df.shape}")

# Arrange columns for readability: identifiers and the baseline prediction
# first, every remaining column in alphabetical order, the raw text last.
leading_columns = ["custom_id", "original_label", "original_prediction"]
trailing_columns = ["original_text"]
pinned_columns = set(leading_columns) | set(trailing_columns)

middle_columns = sorted(
    column for column in all_results_df.columns if column not in pinned_columns
)
all_results_df = all_results_df[leading_columns + middle_columns + trailing_columns]

# Write the combined table next to the per-model results
combined_results_path = os.path.join(RESULTS_PATH, combined_results_file)
all_results_df.to_csv(combined_results_path, index=False)
logging.info(f"Saved combined results to {combined_results_path}")

2025-12-03 11:43:00,865 - INFO - Found 9 result files in ../data/faketweetbr/classification_results/xlm-roberta-large/classifications
2025-12-03 11:43:00,885 - INFO - Selected 3 latest result files for combination: ['claim-normalization_gpt-5-nano_2025-10-13_10-24-48_test-set-eval_2025-11-23_10-41-24.csv', 'claim-normalization_gpt-5_2025-10-13_10-23-02_test-set-eval_2025-11-23_10-56-43.csv', 'original_test-set-eval_2025-11-23_03-50-32.csv']
2025-12-03 11:43:00,917 - INFO - Combined results DataFrame shape: (45, 6)
2025-12-03 11:43:00,925 - INFO - Saved combined results to ../data/faketweetbr/classification_results/xlm-roberta-large/combined_classification_results.csv


### Analyse Comparisons and Generate Report

In [26]:
# The report starts with the metadata identifying this run
report = {
    "dataset_name": DATASET_NAME,
    "model_name": MODEL_NAME.split("/")[-1],
}

# Every "*_prediction" column except the baseline ("original...") is compared
prediction_columns = [
    column
    for column in all_results_df.columns
    if column.endswith("_prediction") and not column.startswith("original")
]
logging.info(f"Prediction columns found: {prediction_columns}")

# Pre-create an empty bucket for both possible outcomes of each compared column
for column in prediction_columns:
    report[f"correct_after_{column}"] = {"total": 0, "occurrences": []}
    report[f"incorrect_after_{column}"] = {"total": 0, "occurrences": []}

# Walk the combined table and record rows whose correctness changed with
# respect to the baseline prediction
for _, row in all_results_df.iterrows():
    expected_label = row["original_label"]
    baseline_was_wrong = row["original_prediction"] != expected_label

    for column in prediction_columns:
        if baseline_was_wrong:
            # Baseline missed the label: count the row if this column gets it right
            outcome_changed = row[column] == expected_label
            bucket_name = f"correct_after_{column}"
        else:
            # Baseline was right: count the row if this column gets it wrong
            outcome_changed = row[column] != expected_label
            bucket_name = f"incorrect_after_{column}"

        if outcome_changed:
            bucket = report[bucket_name]
            bucket["total"] += 1
            bucket["occurrences"].append(row.to_dict())

# Drop outcome buckets that ended up empty (metadata entries are never touched)
empty_buckets = [
    name
    for name, value in report.items()
    if name.startswith(("correct_after_", "incorrect_after_")) and value["total"] == 0
]
for name in empty_buckets:
    del report[name]

# Save report to a JSON file
report_path = os.path.join(RESULTS_PATH, "comparison_report.json")

with open(report_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes accented characters as-is (the file is UTF-8)
    # instead of \uXXXX escapes, keeping the stored texts readable; the
    # parsed JSON content is unchanged.
    json.dump(report, f, indent=4, ensure_ascii=False)

logging.info(f"Saved comparison report to {report_path}")

2025-12-03 11:43:00,956 - INFO - Prediction columns found: ['claim_normalization_gpt-5-nano_prediction', 'claim_normalization_gpt-5_prediction']
2025-12-03 11:43:00,972 - INFO - Saved comparison report to ../data/faketweetbr/classification_results/xlm-roberta-large/comparison_report.json
