# 01 - Fine Tune Transformer for Fact-Checking

This notebook is responsible for performing the fact-checking task on the claims that were extracted and normalized in the previous notebook. It loads the datasets generated previously and fine-tunes a transformer for fact-checking the claims as true or false.

### Imports

In [1]:
# Native
import os
import json
import shutil
import logging

# Third-party
import torch
import sklearn
import evaluate
import numpy as np
import pandas as pd
from tqdm import tqdm
from emoji import demojize
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
		Trainer,
    EarlyStoppingCallback,
)

  from .autonotebook import tqdm as notebook_tqdm


### Setup

In [2]:
# Configure logging (safe for notebook re-runs)
root_logger = logging.getLogger()

if not root_logger.handlers:
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
else:
    # Avoid duplicate handlers when re-running notebook cells: just set levels
    root_logger.setLevel(logging.INFO)
    for h in root_logger.handlers:
        h.setLevel(logging.INFO)
    # Optionally disable propagation to avoid duplicate output from external loggers
    root_logger.propagate = False

### Constants

In [None]:
# Execution Constants
TIMESTAMP = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")

# Dataset Constants
DATASET_NAME = "fakebr" # ["faketweetbr", "fakebr"]
DATASET_TASK = "original"  # ["original", "claim_normalization"]
DATASET_PROCESS_ID = ""

# Model Constants
MODEL_NAME = "neuralmind/bert-large-portuguese-cased"  # ["FacebookAI/xlm-roberta-large", "neuralmind/bert-large-portuguese-cased"]
SAVE_MODEL = True # Whether to save the fine-tuned model or not. This is necessary for loading the best model after fine-tuning. BEWARE: it may consume a lot of disk space!

# Paths Constants
DATA_PATH = f"../data/{DATASET_NAME}/{DATASET_TASK}/{DATASET_PROCESS_ID + "/" if DATASET_PROCESS_ID else ""}" # Last path corresponds to the task that original data (i.e., original, claim_normalization).
OUTPUT_PATH = f"../data/{DATASET_NAME}/fine-tuning/{MODEL_NAME.split('/')[-1]}/{DATASET_TASK}/{DATASET_PROCESS_ID + "/" if DATASET_PROCESS_ID else ""}{TIMESTAMP}"
MODEL_PATH = f"{OUTPUT_PATH}/model/"
METRICS_PATH = f"{OUTPUT_PATH}/metrics/"
RESULTS_PATH = f"../data/{DATASET_NAME}/classification_results/{MODEL_NAME.split('/')[-1]}/"

### Verify GPU Availability and Info

In [4]:
# Log GPU info
if torch.cuda.is_available():
    logging.info(
        f"Torch CUDA version: {torch.version.cuda}; GPU: {torch.cuda.get_device_name(0)}"
    )
else:
    logging.info("No GPU found, training on CPU")

2025-11-24 16:29:32,807 - INFO - Torch CUDA version: 12.4; GPU: NVIDIA GeForce RTX 4060 Ti


### Load and Setup Tokenizer

In [5]:
# Load and Setup Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, do_lower_case=False, normalization=True
)
tokenizer.demoizer = tokenizer.demojizer = lambda x: demojize(x, language="pt")

# Preprocessing Function
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

### Load Dataset

In [6]:
# Map Label Function
def map_label(example):
    lab = example["label"]

    if isinstance(lab, str):
        example["label"] = label2id.get(lab, None)  # None -> will be filtered if needed

    return example

# Filter Function
def filter_missing_labels(example):
    return example["label"] is not None

# Define dataset files
train_file = DATA_PATH + 'train.csv'
validation_file = DATA_PATH +  'validation.csv'
test_file = DATA_PATH +  'test.csv'

# Define label mappings
label2id = {"true": 0, "fake": 1}
id2label = {v: k for k, v in label2id.items()}

# Load dataset
dataset = load_dataset('csv', data_files={'train': train_file, 'validation': validation_file, 'test': test_file})

# Rename columns
dataset = dataset.rename_column("classificacao", "label")

# Apply label mapping
dataset = dataset.map(map_label, batched=False)

# Tokenize dataset
remove_cols = [c for c in dataset["train"].column_names if c not in ("custom_id", "text", "label")]
tokenized = dataset.map(preprocess_function, batched=True, remove_columns=remove_cols)

# Filter out examples with missing labels
tokenized = tokenized.filter(filter_missing_labels)

Map: 100%|██████████| 719/719 [00:00<00:00, 2485.80 examples/s]
Filter: 100%|██████████| 719/719 [00:00<00:00, 4511.27 examples/s]


### Load Model

In [7]:
# Load Model
model = AutoModelForSequenceClassification.from_pretrained(
  MODEL_NAME, 
  problem_type="single_label_classification",
  num_labels=2,
	label2id=label2id,
	id2label=id2label,
).to('cuda' if torch.cuda.is_available() else 'cpu')

# Check if model is using GPU or CPU
logging.info(f"Model device: {next(model.parameters()).device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-24 16:29:39,632 - INFO - Model device: cuda:0


### Define Metrics Computation Function

In [8]:
# Metrics Computation Function
def compute_metrics(eval_pred):
    """Compute metrics for the evaluation"""
    # Unpack predictions and labels
    preds, labels = eval_pred

    # Get predictions
    predictions = np.argmax(preds, axis=-1)

    # Load metrics
    clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

    # Compute and return metrics
    return clf_metrics.compute(predictions=predictions, references=labels)

### Define Training Arguments

In [9]:
# Define EarlyStoppingCallback with patience of 2 epochs
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,  # Stop training if no improvement for 2 epochs
)

# Training Arguments
training_args = TrainingArguments(
    output_dir=f"{MODEL_PATH}/checkpoints" if SAVE_MODEL else None,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    num_train_epochs=10,
    logging_strategy="epoch",
    weight_decay=0.01,
    eval_strategy="epoch",
    do_eval=True,
    save_strategy="epoch" if SAVE_MODEL else "no",
    save_total_limit=3,
    load_best_model_at_end=SAVE_MODEL,
    metric_for_best_model="eval_loss",
    fp16=False, # Enable if using NVIDIA GPUs with Tensor Cores
    bf16=True,  # Enable automatic mixed precision (Ada Lovelace Architecture).
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

  trainer = Trainer(


### Train Model

In [10]:
# Train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.27,0.151916,0.961003,0.959184,0.967647,0.950867
2,0.1104,0.184785,0.949861,0.945783,0.987421,0.907514
3,0.054,0.122322,0.973538,0.972424,0.976676,0.968208
4,0.028,0.151675,0.967967,0.966811,0.965418,0.968208
5,0.0122,0.141273,0.97493,0.973913,0.976744,0.971098


TrainOutput(global_step=1200, training_loss=0.09493752777576446, metrics={'train_runtime': 17507.3892, 'train_samples_per_second': 3.283, 'train_steps_per_second': 0.137, 'total_flos': 2.678370738204672e+16, 'train_loss': 0.09493752777576446, 'epoch': 5.0})

### Save Best Model Metrics and Predictions on Test Set

In [11]:
# Check if paths exists
os.makedirs(METRICS_PATH, exist_ok=True)
os.makedirs(os.path.join(RESULTS_PATH, "classifications"), exist_ok=True)

# Classify Test Set and Save Results
test_results = trainer.predict(tokenized["test"])

# Prepare results DataFrame
test_preds = np.argmax(test_results.predictions, axis=-1)
test_labels = test_results.label_ids

results_df = pd.DataFrame(
    {
        "custom_id": tokenized["test"]["custom_id"],
        "text": tokenized["test"]["text"],
        "original_label": [id2label[label] for label in test_labels],
        "predicted_label": [id2label[pred] for pred in test_preds],
    }
)

# Save results to CSV
results_csv_path = os.path.join(
    RESULTS_PATH,
    f"classifications/{DATASET_TASK.replace("_", "-")}{("_" + DATASET_PROCESS_ID) if DATASET_PROCESS_ID else ""}_test-set-eval_{TIMESTAMP}.csv",
)
results_df.to_csv(results_csv_path, index=False)
logging.info(f"Saved test set evaluation results to {results_csv_path}.")

# Save Metrics
with open(
		os.path.join(
				METRICS_PATH,
				f"best_model_metrics.json"
		),
		"w",
) as f:
		json.dump(test_results.metrics, f, indent=4)

2025-11-24 21:21:46,542 - INFO - Saved test set evaluation results to ../data/fakebr/classification_results/bert-large-portuguese-cased/classifications/original_test-set-eval_2025-11-24_16-29-32.csv.


### Delete Saved Model

In [13]:
# Delete model directory if it exists
if os.path.exists(MODEL_PATH):
		shutil.rmtree(MODEL_PATH)
		logging.info(f"Model saving disabled. Deleted model directory at {MODEL_PATH}.")

2025-11-24 21:21:57,126 - INFO - Model saving disabled. Deleted model directory at ../data/fakebr/fine-tuning/bert-large-portuguese-cased/original/2025-11-24_16-29-32/model/.


### Clean GPU VRAM

In [14]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()