# 03 - Benchmark Completo

Este notebook executa o benchmark completo comparando múltiplos modelos LLM.

## Objetivos
1. Testar todos os modelos configurados
2. Comparar todas as estratégias de prompting
3. Avaliar em múltiplos datasets
4. Gerar relatório consolidado

⚠️ **Atenção**: Este notebook pode levar várias horas para executar!

In [1]:
# Configurar path do projeto
import sys
from pathlib import Path
from loguru import logger

# The project root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a direct subdirectory of the project).
# Its `src` folder is prepended to sys.path so project modules can be imported.
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root.joinpath("src")))

# Drop loguru's existing handlers, then log INFO and above to stderr and
# everything to a notebook-specific file that rotates at 10 MB.
logger.remove()
logger.add(sys.stderr, level="INFO")
logger.add(
    project_root.joinpath("reports/logs/03_full_benchmark.log"),
    rotation="10 MB",
)

logger.info("Project root: {}".format(project_root))

[32m2026-01-22 17:26:45.194[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mProject root: /Users/joaoroldi/Projects/tcc2[0m


In [2]:
# Imports
import json
import time
from datetime import datetime
import pandas as pd
import torch
from tqdm.notebook import tqdm

# Módulos do projeto
from config import MODELS, PROMPTING_STRATEGIES, DATASETS, EXPERIMENT_CONFIG, PATHS
from data.data_loader import load_dataset, prepare_test_set
from models.model_handler import ModelHandler
from models.prompts import PromptBuilder
from models.metrics import calculate_metrics, generate_report

# Report which hardware accelerators PyTorch can see (CUDA GPU or Apple MPS).
logger.info(f"CUDA disponível: {torch.cuda.is_available()}")
logger.info(f"MPS (Mac) disponível: {torch.backends.mps.is_available()}")

if torch.cuda.is_available():
    # Device 0 only: log its name and total memory (bytes converted to GB).
    logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
    logger.info(f"VRAM Total: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
elif torch.backends.mps.is_available():
    logger.info("Usando Apple Silicon GPU (MPS)")

[32m2026-01-22 17:26:47.052[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [1mCUDA dispon√≠vel: False[0m
[32m2026-01-22 17:26:47.061[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mMPS (Mac) dispon√≠vel: True[0m
[32m2026-01-22 17:26:47.061[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m24[0m - [1mUsando Apple Silicon GPU (MPS)[0m


## 1. Configuração do Benchmark

In [3]:
# Benchmark settings
SAMPLE_SIZE = 1000  # Number of samples drawn from each dataset
SEED = 42

# Models to evaluate (can be reduced for a quick test run)
MODELS_TO_TEST = MODELS[:2]  # First 2 models only, for a quick test
# MODELS_TO_TEST = MODELS  # Uncomment to benchmark every configured model

# Prompting strategies to evaluate
STRATEGIES_TO_TEST = PROMPTING_STRATEGIES

# Datasets to evaluate
DATASETS_TO_TEST = list(DATASETS.keys())

# One experiment is run per (model, strategy, dataset) combination.
logger.info(f"Modelos: {len(MODELS_TO_TEST)}")
logger.info(f"Estratégias: {len(STRATEGIES_TO_TEST)}")
logger.info(f"Datasets: {len(DATASETS_TO_TEST)}")
logger.info(f"Total de experimentos: {len(MODELS_TO_TEST) * len(STRATEGIES_TO_TEST) * len(DATASETS_TO_TEST)}")

[32m2026-01-22 17:26:47.070[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mModelos: 2[0m
[32m2026-01-22 17:26:47.070[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mEstrat√©gias: 3[0m
[32m2026-01-22 17:26:47.071[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [1mDatasets: 2[0m
[32m2026-01-22 17:26:47.071[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mTotal de experimentos: 12[0m


## 2. Função de Execução de Experimento

In [4]:
def extract_prediction(response: str) -> int:
    """Map a free-text model response to a class label.

    Keywords are matched as whole tokens (case-insensitive, optional plural
    "s" on word keywords), so a digit inside a larger number ("2021", "100%")
    or a keyword embedded in another word ("irreal") no longer triggers a match.

    Args:
        response: Raw text generated by the model.

    Returns:
        1 if a "fake" keyword is found, 0 if a "true" keyword is found,
        -1 if neither is found or the response is not a string.
    """
    import re

    if not isinstance(response, str):
        return -1

    response_lower = response.lower().strip()

    fake_keywords = ["falsa", "fake", "falso", "mentira", "desinformação", "1"]
    true_keywords = ["verdadeira", "verdadeiro", "real", "verídica", "0"]

    def mentions(keywords):
        # Digits must match exactly; word keywords may carry a plural "s".
        alternatives = "|".join(
            re.escape(kw) if kw.isdigit() else re.escape(kw) + "s?"
            for kw in keywords
        )
        # Lookarounds instead of \b so the boundary is Unicode-letter aware
        # on both sides of accented keywords.
        pattern = rf"(?<!\w)(?:{alternatives})(?!\w)"
        return re.search(pattern, response_lower) is not None

    # Fake keywords take precedence when both classes are mentioned.
    # NOTE(review): negated phrasing ("não é falsa") is still read as fake.
    if mentions(fake_keywords):
        return 1

    if mentions(true_keywords):
        return 0

    return -1


def run_experiment(model_name, strategy, dataset_name, sample_size, seed):
    """Run one benchmark experiment for a (model, strategy, dataset) combination.

    Loads the dataset and model, builds one prompt per test row, times each
    generation call, converts responses to labels with `extract_prediction`
    and computes metrics. The model is always unloaded and the accelerator
    cache cleared before returning, whether the run succeeded or failed.

    Args:
        model_name: Identifier passed to `ModelHandler`.
        strategy: Prompting strategy passed to `PromptBuilder`; "few_shot"
            additionally samples in-context examples from the dataset.
        dataset_name: Name passed to `load_dataset`.
        sample_size: Sample size forwarded to `prepare_test_set`.
        seed: Random seed for test-set preparation and few-shot sampling.

    Returns:
        On success, a dict with "model", "strategy", "dataset", "sample_size",
        "timestamp" and "metrics" (the output of `calculate_metrics` plus
        "avg_inference_time", "total_inference_time" and "vram_usage_gb").
        On failure, a dict with "model", "strategy", "dataset" and "error".
    """
    logger.info(f"Experimento: {model_name} | {strategy} | {dataset_name}")

    model_handler = None
    try:
        # 1. Load data
        logger.info("[1/4] Carregando dataset...")
        df = load_dataset(dataset_name)
        test_df = prepare_test_set(df, sample_size=sample_size, seed=seed)
        logger.info(f"      Amostras: {len(test_df)}")
        if len(test_df) == 0:
            # Fail before loading the model instead of dividing by zero later.
            raise ValueError(f"Conjunto de teste vazio para o dataset '{dataset_name}'")

        # 2. Load model
        logger.info("[2/4] Carregando modelo...")
        model_handler = ModelHandler(model_name)
        vram_usage = model_handler.get_vram_usage()
        logger.info(f"      VRAM: {vram_usage:.2f} GB")

        # 3. Prepare prompts
        logger.info("[3/4] Preparando prompts...")
        prompt_builder = PromptBuilder(strategy)

        if strategy == "few_shot":
            # Draw examples only from rows that are not in the test set.
            # NOTE(review): assumes prepare_test_set keeps the original index
            # labels of df; if it resets the index this would leak test rows.
            examples_df = df[~df.index.isin(test_df.index)].sample(
                n=EXPERIMENT_CONFIG["few_shot_examples"],
                random_state=seed
            )
            prompt_builder.set_examples(examples_df)

        # 4. Run inference
        logger.info("[4/4] Executando inferência...")
        predictions = []
        inference_times = []

        for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="      "):
            prompt = prompt_builder.build_prompt(row["text"])

            # perf_counter is monotonic, so intervals are not affected by clock changes.
            start_time = time.perf_counter()
            response = model_handler.generate(prompt)
            elapsed = time.perf_counter() - start_time

            predictions.append(extract_prediction(response))
            inference_times.append(elapsed)

        # Surface responses that could not be mapped to a label (-1).
        unparsed = predictions.count(-1)
        if unparsed:
            logger.warning(f"      Respostas sem rótulo reconhecido: {unparsed}/{len(predictions)}")

        # Compute metrics
        true_labels = test_df["label"].tolist()
        metrics = calculate_metrics(true_labels, predictions)

        total_time = sum(inference_times)
        metrics["avg_inference_time"] = total_time / len(inference_times)
        metrics["total_inference_time"] = total_time
        metrics["vram_usage_gb"] = vram_usage

        results = {
            "model": model_name,
            "strategy": strategy,
            "dataset": dataset_name,
            "sample_size": sample_size,
            "timestamp": datetime.now().isoformat(),
            "metrics": metrics,
        }

        logger.info(f"✅ Concluído! F1-Score: {metrics['f1_score']:.4f}")

        return results

    except Exception as e:
        logger.error(f"❌ ERRO: {e}")
        return {
            "model": model_name,
            "strategy": strategy,
            "dataset": dataset_name,
            "error": str(e)
        }

    finally:
        # Always release the model, including on errors and interrupts, so a
        # failed experiment does not keep it in memory for the next one.
        if model_handler is not None:
            try:
                model_handler.unload()
            except Exception as cleanup_error:
                # A cleanup failure must not discard results already computed.
                logger.warning(f"Falha ao liberar o modelo: {cleanup_error}")

        # Clear the cache of whichever accelerator backend is in use.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif torch.backends.mps.is_available():
            mps_module = getattr(torch, "mps", None)
            if mps_module is not None and hasattr(mps_module, "empty_cache"):
                mps_module.empty_cache()

## 3. Executar Benchmark

In [None]:
# Run every (model, strategy, dataset) combination. Each result is saved to its
# own JSON file right after the experiment, and a previously saved successful
# result is reused instead of re-running, so the cell can be resumed.
all_results = []
results_dir = Path(PATHS['results_dir'])
results_dir.mkdir(parents=True, exist_ok=True)

total_experiments = len(MODELS_TO_TEST) * len(STRATEGIES_TO_TEST) * len(DATASETS_TO_TEST)
experiment_count = 0

logger.info(f"Iniciando benchmark com {total_experiments} experimentos...")

for model_name in MODELS_TO_TEST:
    for strategy in STRATEGIES_TO_TEST:
        for dataset_name in DATASETS_TO_TEST:
            experiment_count += 1

            # "/" in the model identifier is replaced so it can be used in a file name.
            filename = f"{model_name.replace('/', '_')}_{strategy}_{dataset_name}.json"
            filepath = results_dir / filename

            # Reuse a saved result unless it is unreadable or records an error.
            if filepath.exists():
                existing_results = None
                try:
                    with open(filepath, "r", encoding="utf-8") as f:
                        loaded = json.load(f)
                    if not isinstance(loaded, dict):
                        raise ValueError("conteúdo JSON inesperado")
                    existing_results = loaded
                except (OSError, ValueError) as e:
                    # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
                    logger.warning(f"[Experimento {experiment_count}/{total_experiments}] Re-executando (arquivo corrompido): {model_name} | {strategy} | {dataset_name} - {e}")

                if existing_results is not None:
                    if "error" not in existing_results:
                        logger.info(f"[Experimento {experiment_count}/{total_experiments}] Pular: {model_name} | {strategy} | {dataset_name} (Já concluído)")
                        all_results.append(existing_results)
                        continue
                    logger.info(f"[Experimento {experiment_count}/{total_experiments}] Re-executando (erro anterior): {model_name} | {strategy} | {dataset_name}")

            logger.info(f"[Experimento {experiment_count}/{total_experiments}] Executando: {model_name} | {strategy} | {dataset_name}")

            results = run_experiment(
                model_name=model_name,
                strategy=strategy,
                dataset_name=dataset_name,
                sample_size=SAMPLE_SIZE,
                seed=SEED
            )

            all_results.append(results)

            # Save the partial result immediately (error results included, so
            # the next run knows to retry them).
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=2)

logger.info("✅ Benchmark concluído!")

[32m2026-01-22 17:26:47.088[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mIniciando benchmark com 12 experimentos...[0m
[32m2026-01-22 17:26:47.089[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1m[Experimento 1/12] Pular: Qwen/Qwen2-1.5B-Instruct | zero_shot | fakebr (J√° conclu√≠do)[0m
[32m2026-01-22 17:26:47.089[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1m[Experimento 2/12] Pular: Qwen/Qwen2-1.5B-Instruct | zero_shot | fakerecogna (J√° conclu√≠do)[0m
[32m2026-01-22 17:26:47.090[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1m[Experimento 3/12] Pular: Qwen/Qwen2-1.5B-Instruct | few_shot | fakebr (J√° conclu√≠do)[0m
[32m2026-01-22 17:26:47.090[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1m[Experimento 4/12] Pular: Qwen/Qwen2-1.5B-Instruct | few_shot | fakerecogna (J√° conclu√≠do)[0m
[32m2026-01-22 17:26:47.091[0m |

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.
Device set to use mps
[32m2026-01-22 17:27:04.356[0m | [1mINFO    [0m | [36mmodels.model_handler[0m:[36m_load_model[0m:[36m148[0m - [1mModelo carregado com sucesso![0m
[32m2026-01-22 17:27:04.360[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m34[0m - [1m      VRAM: 0.00 GB[0m
[32m2026-01-22 17:27:04.360[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m37[0m - [1m[3/4] Preparando prompts...[0m
[32m2026-01-22 17:27:04.360[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m48[0m - [1m[4/4] Executando infer√™ncia...[0m


      :   0%|          | 0/1000 [00:00<?, ?it/s]

## 4. Gerar Relatório Consolidado

In [None]:
# Build the consolidated report from every collected experiment result.
# NOTE(review): generate_report is a project helper; what it writes under
# results_dir is not visible here — confirm against models.metrics.
results_dir = Path(PATHS["results_dir"])
df_report = generate_report(all_results, results_dir)

# Leave the report as the cell's last expression so the notebook renders it.
df_report

## 5. Análise dos Resultados

In [None]:
# Best configuration by F1-score.
# idxmax cannot pick a row from an empty or all-NaN column (e.g. when every
# experiment failed), so that case is reported instead of raising.
if df_report.empty or df_report['f1_score'].isna().all():
    logger.warning("Nenhum resultado válido disponível para determinar o melhor resultado.")
else:
    best_idx = df_report['f1_score'].idxmax()
    best = df_report.loc[best_idx]

    logger.info("🏆 MELHOR RESULTADO")
    logger.info(f"Modelo: {best['model']}")
    logger.info(f"Estratégia: {best['strategy']}")
    logger.info(f"Dataset: {best['dataset']}")
    logger.info(f"F1-Score: {best['f1_score']:.4f}")
    logger.info(f"Acurácia: {best['accuracy']:.4f}")
    logger.info(f"Tempo médio: {best['avg_inference_time']:.2f}s")

In [None]:
# Average each metric over all report rows that share a prompting strategy,
# then display the strategies ordered from highest to lowest mean F1-score.
strategy_comparison = (
    df_report
    .groupby("strategy")[["accuracy", "precision", "recall", "f1_score"]]
    .mean()
)
strategy_comparison.sort_values(by="f1_score", ascending=False)

In [None]:
# Average each metric over all report rows that share a model, then display
# the models ordered from highest to lowest mean F1-score.
model_comparison = (
    df_report
    .groupby("model")[["accuracy", "precision", "recall", "f1_score"]]
    .mean()
)
model_comparison.sort_values(by="f1_score", ascending=False)

## 6. Visualizações

Para gerar gráficos detalhados, execute o notebook `04_results_visualization.ipynb`.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of the mean F1-score per model, sorted ascending so the
# best model ends up at the top of the chart.
fig, ax = plt.subplots(figsize=(12, 6))
df_report.groupby('model')['f1_score'].mean().sort_values().plot(kind='barh', ax=ax, color='steelblue')
ax.set_title('F1-Score Médio por Modelo', fontsize=14, fontweight='bold')
ax.set_xlabel('F1-Score')
ax.set_ylabel('Modelo')
plt.tight_layout()
plt.show()

## Conclusões

Documente suas conclus√µes aqui:
- Qual modelo teve melhor desempenho?
- Qual estratégia de prompting foi mais eficaz?
- Houve diferenças entre os datasets?
- Qual o trade-off entre desempenho e tempo de inferência?