# 03 - Benchmark Completo

Este notebook executa o benchmark completo comparando m√∫ltiplos modelos LLM.

## Objetivos
1. Testar todos os modelos configurados
2. Comparar todas as estrat√©gias de prompting
3. Avaliar em m√∫ltiplos datasets
4. Salvar resultados em arquivos JSON

‚ö†Ô∏è **Aten√ß√£o**: Este notebook pode levar v√°rias horas para executar!

üìä **An√°lise**: Ap√≥s a execu√ß√£o, use o notebook `04_benchmark_analysis.ipynb` para analisar os resultados.

In [1]:
# Configurar path do projeto
import sys
from pathlib import Path
from loguru import logger

project_root = Path().resolve().parent
sys.path.insert(0, str(project_root / "src"))

# Configurar logger
logger.remove()
logger.add(sys.stderr, level="INFO")
logger.add(project_root / "reports/logs/03_full_benchmark.log", rotation="10 MB")

logger.info(f"Project root: {project_root}")

[32m2026-01-25 21:47:49.554[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mProject root: /Users/joaoroldi/Projects/tcc2[0m


In [2]:
# Imports
import json
import time
from datetime import datetime
import pandas as pd
from tqdm.notebook import tqdm

# M√≥dulos do projeto
from config import MODELS, PROMPTING_STRATEGIES, DATASETS, EXPERIMENT_CONFIG, PATHS
from data.data_loader import load_dataset, prepare_test_set
from models.model_handler import ModelHandlerOllama
from models.prompts import PromptBuilder
from models.metrics import calculate_metrics, generate_report

# Verificar Ollama
try:
    import ollama
    ollama_models = ollama.list()
    # Ollama retorna ListResponse com .models (n√£o dict com 'models')
    models_list = ollama_models.models if hasattr(ollama_models, 'models') else []
    logger.info(f"‚úÖ Ollama conectado. Modelos dispon√≠veis: {len(models_list)}")
    for model in models_list:
        # Cada modelo √© um objeto com atributo .model (n√£o dict com 'name')
        model_name = model.model if hasattr(model, 'model') else str(model)
        logger.info(f"   - {model_name}")
except Exception as e:
    logger.error(f"‚ùå Erro ao conectar com Ollama: {e}")
    logger.error("   Certifique-se de que Ollama est√° instalado e rodando.")

[32m2026-01-25 21:47:52.469[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1m‚úÖ Ollama conectado. Modelos dispon√≠veis: 6[0m
[32m2026-01-25 21:47:52.470[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1m   - llama3:8b-instruct-q8_0[0m
[32m2026-01-25 21:47:52.470[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1m   - qwen2:7b-instruct[0m
[32m2026-01-25 21:47:52.470[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1m   - hf.co/TheBloke/sabia-7B-GGUF:latest[0m
[32m2026-01-25 21:47:52.471[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1m   - brunoconterato/Gemma-3-Gaia-PT-BR-4b-it:f16[0m
[32m2026-01-25 21:47:52.472[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1m   - qwen2:1.5b-instruct[0m
[32m2026-01-25 21:47:52.473[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1m   - ll

## 1. Configura√ß√£o do Benchmark

In [3]:
# Configura√ß√µes
SAMPLE_SIZE = 1000  # N√∫mero de amostras por dataset
SEED = 42

# Modelos a testar (pode reduzir para teste r√°pido)
MODELS_TO_TEST = MODELS  # Descomentar para testar todos

# Estrat√©gias a testar
STRATEGIES_TO_TEST = PROMPTING_STRATEGIES

# Datasets a testar
DATASETS_TO_TEST = list(DATASETS.keys())

logger.info(f"Modelos: {len(MODELS_TO_TEST)}")
logger.info(f"Estrat√©gias: {len(STRATEGIES_TO_TEST)}")
logger.info(f"Datasets: {len(DATASETS_TO_TEST)}")
logger.info(f"Total de experimentos: {len(MODELS_TO_TEST) * len(STRATEGIES_TO_TEST) * len(DATASETS_TO_TEST)}")

[32m2026-01-25 21:47:52.482[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mModelos: 6[0m
[32m2026-01-25 21:47:52.482[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mEstrat√©gias: 3[0m
[32m2026-01-25 21:47:52.483[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mDatasets: 2[0m
[32m2026-01-25 21:47:52.483[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [1mTotal de experimentos: 36[0m


## 2. Fun√ß√£o de Execu√ß√£o de Experimento

In [4]:
def extract_prediction(response: str) -> int:
    """Extrai predi√ß√£o da resposta do modelo."""
    response_lower = response.lower().strip()
    
    fake_keywords = ["falsa", "fake", "falso", "mentira", "desinforma√ß√£o", "1"]
    true_keywords = ["verdadeira", "verdadeiro", "real", "ver√≠dica", "0"]
    
    for keyword in fake_keywords:
        if keyword in response_lower:
            return 1
    
    for keyword in true_keywords:
        if keyword in response_lower:
            return 0
    
    return -1


def run_experiment(model_name, strategy, dataset_name, sample_size, seed):
    """Executa um √∫nico experimento."""
    logger.info(f"Experimento: {model_name} | {strategy} | {dataset_name}")
    
    try:
        # 1. Carregar dados
        logger.info("[1/4] Carregando dataset...")
        df = load_dataset(dataset_name)
        test_df = prepare_test_set(df, sample_size=sample_size, seed=seed)
        logger.info(f"      Amostras: {len(test_df)}")
        
        # 2. Carregar modelo (Ollama)
        logger.info("[2/4] Carregando modelo via Ollama...")
        model_handler = ModelHandlerOllama(model_name)
        vram_usage = model_handler.get_vram_usage()
        logger.info(f"      Modelo Ollama: {model_handler.ollama_model}")
        logger.info(f"      Mem√≥ria estimada: {vram_usage:.2f} GB")
        
        # 3. Preparar prompts
        logger.info("[3/4] Preparando prompts...")
        prompt_builder = PromptBuilder(strategy)
        
        if strategy == "few_shot":
            examples_df = df[~df.index.isin(test_df.index)].sample(
                n=EXPERIMENT_CONFIG["few_shot_examples"],
                random_state=seed
            )
            prompt_builder.set_examples(examples_df)
        
        # 4. Executar infer√™ncia
        logger.info("[4/4] Executando infer√™ncia...")
        predictions = []
        inference_times = []
        
        for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="      "):
            prompt = prompt_builder.build_prompt(row["text"])
            
            start_time = time.time()
            response = model_handler.generate(prompt)
            end_time = time.time()
            
            prediction = extract_prediction(response)
            predictions.append(prediction)
            inference_times.append(end_time - start_time)
        
        # Calcular m√©tricas
        true_labels = test_df["label"].tolist()
        metrics = calculate_metrics(true_labels, predictions)
        
        metrics["avg_inference_time"] = sum(inference_times) / len(inference_times)
        metrics["total_inference_time"] = sum(inference_times)
        metrics["vram_usage_gb"] = vram_usage
        
        # Resultados
        results = {
            "model": model_name,
            "strategy": strategy,
            "dataset": dataset_name,
            "sample_size": sample_size,
            "timestamp": datetime.now().isoformat(),
            "metrics": metrics,
        }
        
        # Liberar mem√≥ria (Ollama gerencia automaticamente)
        model_handler.unload()
        
        logger.info(f"‚úÖ Conclu√≠do! F1-Score: {metrics['f1_score']:.4f}")
        
        return results
        
    except Exception as e:
        logger.error(f"‚ùå ERRO: {e}")
        return {
            "model": model_name,
            "strategy": strategy,
            "dataset": dataset_name,
            "error": str(e)
        }

## 3. Executar Benchmark

In [5]:
# Executar todos os experimentos
all_results = []
results_dir = Path(PATHS['results_dir'])
results_dir.mkdir(parents=True, exist_ok=True)

total_experiments = len(MODELS_TO_TEST) * len(STRATEGIES_TO_TEST) * len(DATASETS_TO_TEST)
experiment_count = 0

logger.info(f"Iniciando benchmark com {total_experiments} experimentos...")

for model_name in MODELS_TO_TEST:
    for strategy in STRATEGIES_TO_TEST:
        for dataset_name in DATASETS_TO_TEST:
            experiment_count += 1
            
            filename = f"{model_name.replace('/', '_')}_{strategy}_{dataset_name}.json"
            filepath = results_dir / filename
            
            # Verificar se j√° existe resultado de sucesso
            if filepath.exists():
                try:
                    with open(filepath, "r") as f:
                        existing_results = json.load(f)
                        
                    if "error" not in existing_results:
                        logger.info(f"[Experimento {experiment_count}/{total_experiments}] Pular: {model_name} | {strategy} | {dataset_name} (J√° conclu√≠do)")
                        all_results.append(existing_results)
                        continue
                    else:
                        logger.info(f"[Experimento {experiment_count}/{total_experiments}] Re-executando (erro anterior): {model_name} | {strategy} | {dataset_name}")
                except (json.JSONDecodeError, Exception):
                    logger.info(f"[Experimento {experiment_count}/{total_experiments}] Re-executando (arquivo corrompido): {model_name} | {strategy} | {dataset_name}")

            logger.info(f"[Experimento {experiment_count}/{total_experiments}] Executando: {model_name} | {strategy} | {dataset_name}")
            
            results = run_experiment(
                model_name=model_name,
                strategy=strategy,
                dataset_name=dataset_name,
                sample_size=SAMPLE_SIZE,
                seed=SEED
            )
            
            all_results.append(results)
            
            # Salvar resultados parciais
            with open(filepath, "w") as f:
                json.dump(results, f, indent=2)


logger.info("‚úÖ Benchmark conclu√≠do!")

[32m2026-01-25 21:47:52.504[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mIniciando benchmark com 36 experimentos...[0m
[32m2026-01-25 21:47:52.505[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1m[Experimento 1/36] Pular: Qwen/Qwen2-1.5B-Instruct | zero_shot | fakebr (J√° conclu√≠do)[0m
[32m2026-01-25 21:47:52.506[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1m[Experimento 2/36] Pular: Qwen/Qwen2-1.5B-Instruct | zero_shot | fakerecogna (J√° conclu√≠do)[0m
[32m2026-01-25 21:47:52.507[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1m[Experimento 3/36] Pular: Qwen/Qwen2-1.5B-Instruct | few_shot | fakebr (J√° conclu√≠do)[0m
[32m2026-01-25 21:47:52.508[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1m[Experimento 4/36] Pular: Qwen/Qwen2-1.5B-Instruct | few_shot | fakerecogna (J√° conclu√≠do)[0m
[32m2026-01-25 21:47:52.509[0m |

      :   0%|          | 0/1000 [00:00<?, ?it/s]

[32m2026-01-25 22:13:24.317[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m85[0m - [1m‚úÖ Conclu√≠do! F1-Score: 0.0215[0m
[32m2026-01-25 22:13:24.359[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1m[Experimento 26/36] Re-executando (erro anterior): meta-llama/Meta-Llama-3-8B-Instruct | zero_shot | fakerecogna[0m
[32m2026-01-25 22:13:24.360[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1m[Experimento 26/36] Executando: meta-llama/Meta-Llama-3-8B-Instruct | zero_shot | fakerecogna[0m
[32m2026-01-25 22:13:24.360[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m21[0m - [1mExperimento: meta-llama/Meta-Llama-3-8B-Instruct | zero_shot | fakerecogna[0m
[32m2026-01-25 22:13:24.360[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m25[0m - [1m[1/4] Carregando dataset...[0m
[32m2026-01-25 22:13:24.361[0m | [1mINFO    [0m | [36mdata.data_loader[0m

      :   0%|          | 0/1000 [00:00<?, ?it/s]

[32m2026-01-25 22:25:19.930[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m85[0m - [1m‚úÖ Conclu√≠do! F1-Score: 0.6999[0m
[32m2026-01-25 22:25:19.957[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1m[Experimento 27/36] Re-executando (erro anterior): meta-llama/Meta-Llama-3-8B-Instruct | few_shot | fakebr[0m
[32m2026-01-25 22:25:19.958[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1m[Experimento 27/36] Executando: meta-llama/Meta-Llama-3-8B-Instruct | few_shot | fakebr[0m
[32m2026-01-25 22:25:19.958[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m21[0m - [1mExperimento: meta-llama/Meta-Llama-3-8B-Instruct | few_shot | fakebr[0m
[32m2026-01-25 22:25:19.958[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m25[0m - [1m[1/4] Carregando dataset...[0m
[32m2026-01-25 22:25:19.959[0m | [1mINFO    [0m | [36mdata.data_loader[0m:[36mload_dataset

      :   0%|          | 0/1000 [00:00<?, ?it/s]

[32m2026-01-25 22:50:50.634[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m85[0m - [1m‚úÖ Conclu√≠do! F1-Score: 0.0540[0m
[32m2026-01-25 22:50:50.650[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1m[Experimento 28/36] Re-executando (erro anterior): meta-llama/Meta-Llama-3-8B-Instruct | few_shot | fakerecogna[0m
[32m2026-01-25 22:50:50.651[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1m[Experimento 28/36] Executando: meta-llama/Meta-Llama-3-8B-Instruct | few_shot | fakerecogna[0m
[32m2026-01-25 22:50:50.651[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m21[0m - [1mExperimento: meta-llama/Meta-Llama-3-8B-Instruct | few_shot | fakerecogna[0m
[32m2026-01-25 22:50:50.652[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m25[0m - [1m[1/4] Carregando dataset...[0m
[32m2026-01-25 22:50:50.652[0m | [1mINFO    [0m | [36mdata.data_loader[0m:[

      :   0%|          | 0/1000 [00:00<?, ?it/s]

[32m2026-01-25 23:02:47.679[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m85[0m - [1m‚úÖ Conclu√≠do! F1-Score: 0.8697[0m
[32m2026-01-25 23:02:47.705[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1m[Experimento 29/36] Re-executando (erro anterior): meta-llama/Meta-Llama-3-8B-Instruct | chain_of_thought | fakebr[0m
[32m2026-01-25 23:02:47.706[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1m[Experimento 29/36] Executando: meta-llama/Meta-Llama-3-8B-Instruct | chain_of_thought | fakebr[0m
[32m2026-01-25 23:02:47.706[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m21[0m - [1mExperimento: meta-llama/Meta-Llama-3-8B-Instruct | chain_of_thought | fakebr[0m
[32m2026-01-25 23:02:47.707[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m25[0m - [1m[1/4] Carregando dataset...[0m
[32m2026-01-25 23:02:47.707[0m | [1mINFO    [0m | [36mdata.data_load

      :   0%|          | 0/1000 [00:00<?, ?it/s]

[32m2026-01-26 00:30:18.233[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m85[0m - [1m‚úÖ Conclu√≠do! F1-Score: 0.2342[0m
[32m2026-01-26 00:30:18.250[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1m[Experimento 30/36] Re-executando (erro anterior): meta-llama/Meta-Llama-3-8B-Instruct | chain_of_thought | fakerecogna[0m
[32m2026-01-26 00:30:18.250[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1m[Experimento 30/36] Executando: meta-llama/Meta-Llama-3-8B-Instruct | chain_of_thought | fakerecogna[0m
[32m2026-01-26 00:30:18.251[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m21[0m - [1mExperimento: meta-llama/Meta-Llama-3-8B-Instruct | chain_of_thought | fakerecogna[0m
[32m2026-01-26 00:30:18.252[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m25[0m - [1m[1/4] Carregando dataset...[0m
[32m2026-01-26 00:30:18.252[0m | [1mINFO    [0m | [36

      :   0%|          | 0/1000 [00:00<?, ?it/s]

[32m2026-01-26 01:43:36.418[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m85[0m - [1m‚úÖ Conclu√≠do! F1-Score: 0.7623[0m
[32m2026-01-26 01:43:36.443[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1m[Experimento 31/36] Re-executando (erro anterior): lucianosb/boto-9B-it | zero_shot | fakebr[0m
[32m2026-01-26 01:43:36.443[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1m[Experimento 31/36] Executando: lucianosb/boto-9B-it | zero_shot | fakebr[0m
[32m2026-01-26 01:43:36.444[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m21[0m - [1mExperimento: lucianosb/boto-9B-it | zero_shot | fakebr[0m
[32m2026-01-26 01:43:36.444[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_experiment[0m:[36m25[0m - [1m[1/4] Carregando dataset...[0m
[32m2026-01-26 01:43:36.445[0m | [1mINFO    [0m | [36mdata.data_loader[0m:[36mload_dataset[0m:[36m32[0m - [1mCarregando dataset:

## 4. Conclus√£o

‚úÖ **Benchmark conclu√≠do!**

Os resultados foram salvos em arquivos JSON no diret√≥rio `reports/results/`.

Para analisar os resultados, execute o notebook `04_benchmark_analysis.ipynb`.

In [6]:
# Resumo dos resultados salvos
logger.info(f"‚úÖ Todos os resultados foram salvos em: {results_dir}")
logger.info(f"   Total de arquivos: {len(list(results_dir.glob('*.json')))}")
logger.info("   Execute o notebook 04_benchmark_analysis.ipynb para analisar os resultados.")

[32m2026-01-26 01:43:47.552[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1m‚úÖ Todos os resultados foram salvos em: /Users/joaoroldi/Projects/tcc2/reports[0m
[32m2026-01-26 01:43:47.555[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1m   Total de arquivos: 36[0m
[32m2026-01-26 01:43:47.556[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m   Execute o notebook 04_benchmark_analysis.ipynb para analisar os resultados.[0m
