# Benchmark de Modelos BERT para Detecção de Fake News

Este notebook implementa um benchmark completo de modelos BERT (fine-tuning) para classificar notícias como verdadeiras ou falsas, utilizando múltiplos datasets e variações de BERT em português.

## Objetivos:
1. Testar múltiplos modelos BERT pré-treinados em português
2. Avaliar em m√∫ltiplos datasets (FakeBR e FakeRecogna)
3. Comparar desempenho entre diferentes arquiteturas BERT
4. Salvar resultados em formato compatível com o benchmark de LLMs

## Modelos BERT a testar:
- **BERTimbau Base**: `neuralmind/bert-base-portuguese-cased`
- **BERTimbau Large**: `neuralmind/bert-large-portuguese-cased`
- **BERT Multilingual**: `bert-base-multilingual-cased` (opcional)

⚠️ **Atenção**: Este notebook pode levar várias horas para executar, especialmente para o modelo Large!

In [None]:
# Notebook bootstrap: make the project's ``src`` package importable and
# configure logging before any project module is imported.
import sys
from pathlib import Path
from loguru import logger

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is executed from a folder that sits
# directly under the project root — confirm when running from elsewhere.
project_root = Path().resolve().parent
# Prepend so the project's modules win over same-named installed packages.
sys.path.insert(0, str(project_root / "src"))

# Logger setup: drop the existing handlers, then log INFO and above to stderr
# and additionally to a log file that rotates at 10 MB.
logger.remove()
logger.add(sys.stderr, level="INFO")
logger.add(project_root / "reports/logs/03_train_bert.log", rotation="10 MB")

logger.info(f"Project root: {project_root}")

# autoreload
%load_ext autoreload
%autoreload 2

import os
import json
import time
from datetime import datetime
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    TrainerCallback,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# Módulos do projeto
from config import DATASETS, PATHS, EXPERIMENT_CONFIG
from data.data_loader import load_dataset, prepare_test_set
from models.metrics import calculate_metrics

class SleepCallback(TrainerCallback):
    """Trainer callback that pauses after every training step.

    The pause throttles the training loop; the original author's stated
    purpose is to reduce heating of the machine.

    Args:
        sleep_time: Number of seconds to block after each step.
    """

    def __init__(self, sleep_time=1.0):
        self.sleep_time = sleep_time

    def on_step_end(self, args, state, control, **kwargs):
        """Block for ``self.sleep_time`` seconds once a step has finished."""
        pause_seconds = self.sleep_time
        time.sleep(pause_seconds)

# Device selection: prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
logger.info(f"Usando dispositivo: {device}")

## 1. Configuração do Benchmark BERT

Definir os modelos BERT a testar e os datasets.

In [None]:
# Hugging Face model ids of the BERT checkpoints to fine-tune.
BERT_MODELS = [
    "neuralmind/bert-base-portuguese-cased",   # BERTimbau Base
    "neuralmind/bert-large-portuguese-cased",  # BERTimbau Large
    "bert-base-multilingual-cased",            # BERT Multilingual
]

# Every dataset registered in the project configuration is benchmarked.
DATASETS_TO_TEST = list(DATASETS)

# Training hyper-parameters shared by all experiments. The key order is kept
# stable because this mapping is stored verbatim in each result file.
TRAINING_CONFIG = dict(
    num_epochs=3,
    batch_size=8,
    eval_batch_size=16,
    max_length=512,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    seed=EXPERIMENT_CONFIG["random_seed"],
    test_size=0.2,
)

# Size of the held-out test sample (None means: use the whole dataset and
# split it by ``test_size`` instead).
SAMPLE_SIZE = EXPERIMENT_CONFIG["test_sample_size"]

# One experiment is run per (model, dataset) pair.
logger.info(f"Modelos BERT: {len(BERT_MODELS)}")
logger.info(f"Datasets: {len(DATASETS_TO_TEST)}")
logger.info(f"Total de experimentos: {len(BERT_MODELS) * len(DATASETS_TO_TEST)}")

## 2. Classe Dataset e Funções Auxiliares

In [None]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes news texts on access for BERT classification.

    Args:
        texts: Indexable collection of raw texts; each item is converted with
            ``str()`` before tokenization.
        labels: Indexable collection of integer class ids aligned with ``texts``.
        tokenizer: Hugging Face-style tokenizer. It is invoked through its
            ``__call__`` interface.
        max_len: Every sample is truncated or padded to exactly this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return the tensors the model consumes.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (both flattened to
            1-D) and ``labels`` (scalar ``torch.long`` tensor).
        """
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # Transformers in favour of ``__call__``, which accepts the same
        # keyword arguments and returns the same encoding for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # ``return_tensors='pt'`` yields a leading batch axis of size 1;
        # flatten it away so the default collate can stack samples.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


def prepare_datasets(df, tokenizer, test_size=0.2, seed=42, max_len=512):
    """Split ``df`` into stratified training and validation datasets.

    Args:
        df: DataFrame with a ``text`` and a ``label`` column.
        tokenizer: Tokenizer handed to each ``FakeNewsDataset``.
        test_size: Fraction of rows assigned to the validation split.
        seed: Random state of the split.
        max_len: Token length handed to each ``FakeNewsDataset``.

    Returns:
        Tuple ``(train_dataset, val_dataset)``.
    """
    def _as_dataset(texts, labels):
        return FakeNewsDataset(texts, labels, tokenizer, max_len=max_len)

    all_texts = df['text'].tolist()
    all_labels = df['label'].tolist()

    # Stratify on the label column so both splits keep the class proportions.
    split = train_test_split(
        all_texts,
        all_labels,
        test_size=test_size,
        random_state=seed,
        stratify=df['label'],
    )
    fit_texts, held_texts, fit_labels, held_labels = split

    return _as_dataset(fit_texts, fit_labels), _as_dataset(held_texts, held_labels)


def compute_metrics(pred):
    """Compute evaluation metrics for the ``Trainer`` during training.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``; the predicted
            class is the argmax over the last axis of ``predictions``.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall`` taken from
        the project's ``calculate_metrics`` helper.
    """
    predicted_classes = pred.predictions.argmax(-1)
    scores = calculate_metrics(pred.label_ids.tolist(), predicted_classes.tolist())

    # (name reported to the Trainer, key produced by calculate_metrics)
    reported = (
        ('accuracy', 'accuracy'),
        ('f1', 'f1_score'),
        ('precision', 'precision'),
        ('recall', 'recall'),
    )
    return {out_name: scores[src_key] for out_name, src_key in reported}

In [None]:
## 3. Função de Execução de Experimento

## 4. Executar Benchmark

In [None]:
def run_bert_experiment(model_name, dataset_name, sample_size, seed, config):
    """Fine-tune one BERT checkpoint on one dataset and score it on a held-out test set.

    Steps: load the dataset, hold out a test set, load tokenizer and model,
    split the remaining rows into train/validation, train with the Hugging
    Face ``Trainer`` (resuming from an existing checkpoint in the output
    directory when there is one) and predict on the test set.

    Args:
        model_name: Model id passed to ``from_pretrained``.
        dataset_name: Name passed to the project's ``load_dataset``.
        sample_size: Desired size of the test set. When falsy, or not smaller
            than the dataset, a stratified ``config["test_size"]`` split is
            used instead.
        seed: Random seed used for sampling, splitting and training.
        config: Dict providing ``test_size``, ``max_length``, ``num_epochs``,
            ``batch_size``, ``eval_batch_size``, ``warmup_steps``,
            ``weight_decay`` and ``learning_rate``.

    Returns:
        On success, a dict with ``model``, ``strategy``, ``dataset``,
        ``sample_size``, ``timestamp``, ``metrics`` and ``training_config``.
        On failure, a dict with ``model``, ``strategy``, ``dataset`` and
        ``error``; exceptions derived from ``Exception`` are logged, not raised.
    """
    logger.info(f"Experimento: {model_name} | {dataset_name}")

    # Bound up front so the ``finally`` clause can always release them, even
    # when the failure happens before they are created.
    model = None
    tokenizer = None
    trainer = None

    try:
        # 1. Load the data
        logger.info("[1/5] Carregando dataset...")
        df = load_dataset(dataset_name)

        # Build the test and training sets
        if sample_size and sample_size < len(df):
            # Carve out the test set first, remembering the original index
            # labels so those rows can be removed from the training pool.
            samples_per_class = sample_size // 2
            label_one_rows = df[df["label"] == 1]
            label_zero_rows = df[df["label"] == 0]
            if samples_per_class <= len(label_one_rows) and samples_per_class <= len(label_zero_rows):
                # Balanced sampling: the same number of rows from each label,
                # then shuffled together.
                df_fake = label_one_rows.sample(n=samples_per_class, random_state=seed)
                df_true = label_zero_rows.sample(n=samples_per_class, random_state=seed)
                test_df = pd.concat([df_fake, df_true]).sample(frac=1, random_state=seed)
            else:
                # Plain random sampling when one class is too small
                test_df = df.sample(n=min(sample_size, len(df)), random_state=seed)
            test_indices = test_df.index.tolist()

            # Everything that is not in the test set is available for training
            train_df = df.drop(test_indices).reset_index(drop=True)
            test_df = test_df.reset_index(drop=True)
        else:
            # Use the whole dataset, split into train/test
            train_df, test_df = train_test_split(
                df,
                test_size=config["test_size"],
                random_state=seed,
                stratify=df['label']
            )
            test_df = test_df.reset_index(drop=True)
            train_df = train_df.reset_index(drop=True)

        logger.info(f"      Treino: {len(train_df)} amostras")
        logger.info(f"      Teste: {len(test_df)} amostras")

        # 2. Load tokenizer and model
        logger.info("[2/5] Carregando modelo e tokenizer...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        except Exception as auto_err:
            # ``Exception`` (not a bare ``except``) so Ctrl-C / SystemExit
            # still abort; the reason for the fallback is logged, not hidden.
            logger.warning(f"      AutoTokenizer falhou ({auto_err}); usando BertTokenizer")
            tokenizer = BertTokenizer.from_pretrained(model_name)

        try:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                num_labels=2
            )
        except Exception as auto_err:
            logger.warning(
                f"      AutoModelForSequenceClassification falhou ({auto_err}); "
                "usando BertForSequenceClassification"
            )
            model = BertForSequenceClassification.from_pretrained(
                model_name,
                num_labels=2
            )

        model = model.to(device)
        logger.info(f"      Modelo carregado: {model_name}")

        # 3. Build the train/validation datasets
        logger.info("[3/5] Preparando datasets...")
        train_dataset, val_dataset = prepare_datasets(
            train_df,
            tokenizer,
            test_size=0.2,  # 20% of the training rows are used for validation
            seed=seed,
            max_len=config["max_length"]
        )

        # 4. Configure training
        logger.info("[4/5] Configurando treinamento...")
        output_dir = f"./results/bert_{model_name.replace('/', '_')}_{dataset_name}"

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=config["num_epochs"],
            per_device_train_batch_size=config["batch_size"],
            per_device_eval_batch_size=config["eval_batch_size"],
            warmup_steps=config["warmup_steps"],
            weight_decay=config["weight_decay"],
            learning_rate=config["learning_rate"],
            logging_dir=f'{output_dir}/logs',
            logging_steps=50,
            save_strategy="steps",
            save_steps=100,
            save_total_limit=2,
            load_best_model_at_end=True,
            eval_strategy="steps",
            eval_steps=100,
            seed=seed,
        )

        # 5. Train the model
        logger.info("[5/5] Treinando modelo...")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[SleepCallback(sleep_time=0.5)]
        )

        # Look for a checkpoint to resume from. The newest checkpoint is the
        # one with the highest step number in its directory name; filesystem
        # timestamps are not used because they change when the results folder
        # is copied or synced.
        last_checkpoint = None
        checkpoint_prefix = "checkpoint-"
        if os.path.isdir(training_args.output_dir):
            checkpoint_steps = {
                int(entry[len(checkpoint_prefix):]): os.path.join(training_args.output_dir, entry)
                for entry in os.listdir(training_args.output_dir)
                if entry.startswith(checkpoint_prefix)
                and entry[len(checkpoint_prefix):].isdecimal()
            }
            if checkpoint_steps:
                last_checkpoint = checkpoint_steps[max(checkpoint_steps)]
                logger.info(f"      Retomando do checkpoint: {last_checkpoint}")

        # Train. Note that the measured wall-clock time includes the pause
        # that SleepCallback inserts after every step.
        train_start = time.time()
        trainer.train(resume_from_checkpoint=last_checkpoint)
        train_time = time.time() - train_start

        # 6. Evaluate on the test set
        logger.info("      Avaliando no conjunto de teste...")
        test_dataset = FakeNewsDataset(
            test_df['text'].tolist(),
            test_df['label'].tolist(),
            tokenizer,
            max_len=config["max_length"]
        )

        eval_start = time.time()
        predictions = trainer.predict(test_dataset)
        eval_time = time.time() - eval_start

        preds = predictions.predictions.argmax(-1)
        true_labels = test_df['label'].tolist()

        # Compute the metrics and attach the timing information
        metrics = calculate_metrics(true_labels, preds.tolist())
        metrics["avg_inference_time"] = eval_time / len(test_df)
        metrics["total_inference_time"] = eval_time
        metrics["training_time"] = train_time

        # Result record
        results = {
            "model": model_name,
            "strategy": "fine_tuned",  # BERT is fine-tuned, not prompted
            "dataset": dataset_name,
            "sample_size": len(test_df),
            "timestamp": datetime.now().isoformat(),
            "metrics": metrics,
            "training_config": config,
        }

        logger.info(f"‚úÖ Conclu√≠do! F1-Score: {metrics['f1_score']:.4f}")

        return results

    except Exception as e:
        logger.error(f"‚ùå ERRO: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return {
            "model": model_name,
            "strategy": "fine_tuned",
            "dataset": dataset_name,
            "error": str(e)
        }

    finally:
        # Free memory on every exit path (success, handled error, interrupt)
        # so a failed experiment does not leave its model cached on the GPU
        # while the next one is being loaded.
        model = None
        trainer = None
        tokenizer = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
# Run every (model, dataset) experiment, reusing successful results that are
# already stored on disk so an interrupted benchmark can be continued.
all_results = []
results_dir = Path(PATHS['results_dir'])
results_dir.mkdir(parents=True, exist_ok=True)

total_experiments = len(BERT_MODELS) * len(DATASETS_TO_TEST)
experiment_count = 0

logger.info(f"Iniciando benchmark BERT com {total_experiments} experimentos...")

for model_name in BERT_MODELS:
    for dataset_name in DATASETS_TO_TEST:
        experiment_count += 1

        # "/" in the model id would create sub-directories, so it is replaced.
        filename = f"{model_name.replace('/', '_')}_fine_tuned_{dataset_name}.json"
        filepath = results_dir / filename

        # Check whether a successful result already exists
        if filepath.exists():
            try:
                # Explicit UTF-8: the files are written with
                # ``ensure_ascii=False``, so relying on the platform default
                # encoding could fail or garble non-ASCII text.
                with open(filepath, "r", encoding="utf-8") as f:
                    existing_results = json.load(f)
            except (OSError, ValueError):
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                existing_results = None

            if not isinstance(existing_results, dict):
                # Unreadable file or unexpected payload: run the experiment again.
                logger.info(f"[Experimento {experiment_count}/{total_experiments}] Re-executando (arquivo corrompido): {model_name} | {dataset_name}")
            elif "error" in existing_results:
                logger.info(f"[Experimento {experiment_count}/{total_experiments}] Re-executando (erro anterior): {model_name} | {dataset_name}")
            else:
                logger.info(f"[Experimento {experiment_count}/{total_experiments}] Pular: {model_name} | {dataset_name} (J√° conclu√≠do)")
                all_results.append(existing_results)
                continue

        logger.info(f"[Experimento {experiment_count}/{total_experiments}] Executando: {model_name} | {dataset_name}")

        results = run_bert_experiment(
            model_name=model_name,
            dataset_name=dataset_name,
            sample_size=SAMPLE_SIZE,
            seed=TRAINING_CONFIG["seed"],
            config=TRAINING_CONFIG
        )

        all_results.append(results)

        # Persist each result immediately so partial progress survives a crash
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

logger.info("‚úÖ Benchmark BERT conclu√≠do!")

In [None]:
## 5. Resumo dos Resultados

In [None]:
# Summary: report where the result files are and tabulate every experiment.
logger.info(f"‚úÖ Todos os resultados foram salvos em: {results_dir}")
logger.info(f"   Total de arquivos: {len(list(results_dir.glob('*fine_tuned*.json')))}")

# Columns copied from (or blanked for) each experiment's metrics.
score_columns = ("accuracy", "precision", "recall", "f1_score")

# One row per experiment; failed runs keep their error text and empty scores.
results_data = []
for entry in all_results:
    if "error" in entry:
        results_data.append({
            "model": entry["model"],
            "dataset": entry["dataset"],
            **dict.fromkeys(score_columns),
            "error": entry["error"],
        })
        continue

    scores = entry["metrics"]
    results_data.append({
        "model": entry["model"],
        "dataset": entry["dataset"],
        **{column: scores[column] for column in score_columns},
        "training_time": scores.get("training_time"),
        "avg_inference_time": scores.get("avg_inference_time"),
    })

if results_data:
    banner = "=" * 80
    df_results = pd.DataFrame(results_data)
    print("\n" + banner)
    print("RESUMO DOS RESULTADOS")
    print(banner)
    print(df_results.to_string(index=False))

    # Best result: only reported when at least one experiment has an F1 score.
    if df_results["f1_score"].notna().any():
        best = df_results.loc[df_results["f1_score"].idxmax()]
        print("\n" + banner)
        print("üèÜ MELHOR RESULTADO:")
        print(f"   Modelo: {best['model']}")
        print(f"   Dataset: {best['dataset']}")
        print(f"   F1-Score: {best['f1_score']:.4f}")
        print(banner)

## 6. Conclusão

✅ **Benchmark BERT concluído!**

Os resultados foram salvos em arquivos JSON no diret√≥rio `reports/` no formato:
`{model_name}_fine_tuned_{dataset_name}.json`

Os resultados são compatíveis com o formato do benchmark de LLMs e podem ser analisados junto com os outros resultados usando o notebook `04_benchmark_analysis.ipynb`.