In [None]:

# === IMPORTS E SETUP ===
import os
import sys
import json
import torch
import numpy as np
import pandas as pd

# Montar Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Adicionar path para imports
sys.path.append('/content/drive/MyDrive/enem_tcc_resultados')

# Imports das fun√ß√µes utilit√°rias
from utils import load_enem_dataset, calcular_resultados, apply_protocol, PROTOCOL_LABELS, PROTOCOL_ORDER, generate_latex_table
from model_utils import (
    run_grid_search_all_competencies,
    train_final_models_all_competencies,
    evaluate_final_models,
    get_device
)
from config import setup_colab_paths, MODEL_TEMPLATES, TOKENIZER_NAMES, COMPETENCIES, TEST_YEARS, TRAIN_YEARS

# Setup paths
DRIVE_BASE_PATH = setup_colab_paths()
print(f"‚úì Google Drive montado. Resultados ser√£o salvos em: {DRIVE_BASE_PATH}")

# Configura√ß√µes do modelo (original - usa base model)
BASE_MODEL_NAME = "bert-base-multilingual-cased"  # mBERT base
TOKENIZER_NAME = None
MAX_LEN = 512
device = get_device()
print(f"Using device: {device}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úì Google Drive montado. Resultados ser√£o salvos em: /content/drive/MyDrive/enem_tcc_resultados


In [None]:
# === CARREGAR DATASET ===
df_train, df_test = load_enem_dataset(
    dataset_name="laisnuto/self-collected-ENEM-dataset",
    split="train",
    anos_teste=TEST_YEARS
)


In [None]:
# === HYPERPARAMETER SEARCH ===
hyperparameter_space = {
    'learning_rate': [1e-5],
    'batch_size': [16, 32],
    'epochs': [8, 12, 16]
}

print("Starting hyperparameter search...")
print(f"Search space: {hyperparameter_space}")

# Create directory for saving results
SAVE_DIR = os.path.join(DRIVE_BASE_PATH, "fine_tuning_modelos_originais", "mbert_original_finetuned_by_comp")
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"‚úì Diret√≥rio de salvamento: {SAVE_DIR}")

checkpoint_file = os.path.join(SAVE_DIR, "hyperparameter_search_checkpoint.json")
final_results_file = os.path.join(SAVE_DIR, "hyperparameter_search_results_mbert_original.json")

# Run grid search for all competencies (using base model)
best_hyperparams, best_qwk_scores = run_grid_search_all_competencies(
    df_train=df_train,
    hyperparameter_space=hyperparameter_space,
    model_name_template=None,
    tokenizer_name=None,
    base_model_name=BASE_MODEL_NAME,
    use_base_model=True,
    competencies=COMPETENCIES,
    max_len=MAX_LEN,
    device=device,
    checkpoint_file=checkpoint_file,
    gradient_clipping=1.0
)

# Save final results
results = {
    'best_hyperparams': best_hyperparams,
    'best_qwk_scores': best_qwk_scores
}
with open(final_results_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n‚úì Final results saved to {final_results_file}")

# Print summary
print("\n=== HYPERPARAMETER SEARCH RESULTS ===")
for comp in COMPETENCIES:
    comp_key = f'C{comp}'
    if comp_key in best_hyperparams:
        print(f"{comp_key}: {best_hyperparams[comp_key]} -> QWK: {best_qwk_scores[comp_key]:.3f}")
    else:
        print(f"{comp_key}: NOT COMPLETED")


In [None]:
# === TRAIN FINAL MODELS WITH BEST HYPERPARAMETERS ===
# Load best hyperparameters
final_results_file = os.path.join(SAVE_DIR, "hyperparameter_search_results_mbert_original.json")
try:
    with open(final_results_file, 'r') as f:
        search_results = json.load(f)
    best_hyperparams = search_results['best_hyperparams']
    print(f"‚úì Loaded hyperparameter search results from {final_results_file}")
except FileNotFoundError:
    print(f"‚ö†Ô∏è No hyperparameter search results found. Using defaults.")
    best_hyperparams = {f'C{c}': {'learning_rate': 2e-5, 'batch_size': 16, 'epochs': 5} for c in COMPETENCIES}

# Train final models (using base model)
tokenizers_final, models_final = train_final_models_all_competencies(
    df_train=df_train,
    best_hyperparams=best_hyperparams,
    model_name_template=None,
    tokenizer_name=None,
    base_model_name=BASE_MODEL_NAME,
    use_base_model=True,
    competencies=COMPETENCIES,
    save_dir=SAVE_DIR,
    model_name_prefix="mbert_original",
    max_len=MAX_LEN,
    device=device,
    gradient_clipping=1.0
)

print("\n=== Final models training completed ===")
print("Models saved in:", SAVE_DIR)


In [None]:
# === FINAL TEST EVALUATION ===
out_final = evaluate_final_models(
    df_test=df_test,
    tokenizers_final=tokenizers_final,
    models_final=models_final,
    competencies=COMPETENCIES,
    max_len=MAX_LEN,
    device=device,
    save_csv_path=os.path.join(SAVE_DIR, "predicoes_mbert_original_final_hyperopt.csv")
)

# Evaluate on test set
print("\n=== FINAL TEST SET EVALUATION ===")
test_results = {}
for c in COMPETENCIES:
    comp_key = f"C{c}"
    pred_col = f"pred_{comp_key}"
    
    pares = out_final[[comp_key, pred_col]].dropna()
    if pares.empty:
        print(f"‚ö†Ô∏è No valid data for {comp_key}")
        continue
    
    y_real = pares[comp_key].astype(int).tolist()
    y_pred = pares[pred_col].astype(int).tolist()
    
    resultado = calcular_resultados(y_real, y_pred, is_final=True)
    test_results[comp_key] = resultado
    
    print(f"\nüîé Test Results - {comp_key}")
    print(f"  Samples: {len(y_real)}")
    print(f"  QWK     : {resultado['QWK']:.3f}")
    print(f"  F1 Macro: {resultado['F1-Macro']:.3f}")
    print(f"  F1 Wghtd: {resultado['F1-Weighted']:.3f}")
    print(f"  ACC     : {resultado['ACC']:.3f}")
    print(f"  RMSE    : {resultado['RMSE']:.2f}")

# Summary
print("\n=== FINAL TEST SET SUMMARY ===")
qwk_scores = [test_results[f"C{c}"]["QWK"] for c in COMPETENCIES if f"C{c}" in test_results]
if qwk_scores:
    print(f"Average QWK across competencies: {np.mean(qwk_scores):.3f}")
    print(f"Best QWK: {max(qwk_scores):.3f}")
    print(f"Worst QWK: {min(qwk_scores):.3f}")


In [None]:
# === EVALUATE ALL PROTOCOLS ===
print("=== AVALIANDO TODOS OS PROTOCOLOS COM MODELOS FINAIS ===")
avaliacoes_por_modelo = {"mbert-original-final": {}}

for protocol_key in PROTOCOL_ORDER:
    print(f"\n=== Esquema: {PROTOCOL_LABELS[protocol_key]} ===")
    avaliacoes = {}
    
    for c in COMPETENCIES:
        comp_key = f"C{c}"
        pred_col = f"pred_{comp_key}"
        
        pares = out_final[[comp_key, pred_col]].dropna()
        if pares.empty:
            continue
        
        y_real = pares[comp_key].astype(int).tolist()
        y_pred = pares[pred_col].astype(int).tolist()
        
        # Apply protocol
        y_r, y_p = apply_protocol(y_real, y_pred, protocol_key)
        
        if not y_r:
            continue
        
        resultado = calcular_resultados(y_r, y_p, is_final=False)
        avaliacoes[comp_key] = resultado
        
        print(f"\nüîé Avalia√ß√£o - {comp_key}")
        print(f"  QWK               : {resultado['QWK']:.3f}")
        print(f"  F1 Macro          : {resultado['F1-Macro']:.3f}")
        print(f"  F1 Weighted       : {resultado['F1-Weighted']:.3f}")
    
    avaliacoes_por_modelo["mbert-original-final"][protocol_key] = avaliacoes

# Generate LaTeX tables
print("\n=== TABELAS LaTeX ===")
metrics = [("QWK", "QWK"), ("F1-Macro", "F1 Macro"), ("F1-Weighted", "F1 Weighted")]

for met_key, met_title in metrics:
    tex = generate_latex_table(
        avaliacoes_por_modelo=avaliacoes_por_modelo,
        model_key="mbert-original-final",
        metric_key=met_key,
        metric_title=f"{met_title} por compet√™ncia para o modelo mBERT original com hyperparameter optimization",
        competencias=COMPETENCIES
    )
    print(f"\n=== Tabela LaTeX ‚Äî {met_title} ‚Äî mbert-original-final ===\n")
    print(tex)


In [15]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, root_mean_squared_error, f1_score
import numpy as np

def arredondar_notas(notas):
    referencia = [0, 40, 80, 120, 160, 200]
    novas_notas = []
    for n in notas:
        mais_prox = 1000
        arredondado = -1
        for r in referencia:
            if abs(n - r) < mais_prox:
                arredondado = r
                mais_prox = abs(n - r)
        novas_notas.append(arredondado)
    return novas_notas

def calcular_div(notas1, notas2):
    #calcula a divergencia horizontal: duas notas s√£o divergentes se a diferen√ßa entre elas √© maior que 80
    div = 0
    for n1, n2 in zip(notas1,notas2):
        if abs(n1 - n2) > 80:
            div += 1
    return 100*div/len(notas1)

def calcular_agregado(dic_perf):
    acc = dic_perf['ACC']*100
    rmse = (200 - dic_perf['RMSE'])/2
    qwk = dic_perf['QWK']*100
    div = 100 - dic_perf['DIV']
    #print(acc, rmse, qwk, div)
    return (acc + rmse + qwk + div)/4

def calcular_resultados(y, y_hat, is_final=False):
    if is_final:
        ALL_LABELS = list(range(0, 1001, 40))
    else:
        ALL_LABELS = list(range(0, 201, 40))

    ACC = accuracy_score(y, y_hat)
    RMSE = root_mean_squared_error(y, y_hat)
    QWK = cohen_kappa_score(y, y_hat, weights='quadratic', labels=ALL_LABELS)
    DIV = calcular_div(y, y_hat)
    macro_f1 = f1_score(y, y_hat, average="macro", zero_division=0)
    weighted_f1 = f1_score(y, y_hat, average="weighted", zero_division=0)
    dic = {
        'ACC': ACC,
        'RMSE': RMSE,
        'QWK': QWK,
        'DIV': DIV,
        'F1-Macro': macro_f1,
        'F1-Weighted': weighted_f1,
        'y': y,
        'y_hat': y_hat,
        'Agregado': calcular_agregado({'ACC': ACC, 'RMSE': RMSE, 'QWK': QWK, 'DIV': DIV}),
    }

    return dic


In [16]:
import ast
import pandas as pd
from datasets import load_dataset

print("Carregando o dataset...")
dataset = load_dataset("laisnuto/self-collected-ENEM-dataset", split="train")
df = dataset.to_pandas()
df.head()
print(f"Tamanho do dataset: {df.shape[0]} linhas e {df.shape[1]} colunas")

TEXT_COL = "texto"
YEAR_COL = "ano"


def _to_list(x):
    if isinstance(x, list):
        return x
    try:
        return ast.literal_eval(x)
    except Exception:
        return [None]*5

notas_expandidas = df["notas"].apply(_to_list)
df[["C1","C2","C3","C4","C5"]] = pd.DataFrame(notas_expandidas.tolist(), index=df.index)


for c in ["C1","C2","C3","C4","C5"]:
    df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")
df[YEAR_COL] = pd.to_numeric(df[YEAR_COL], errors="coerce").astype("Int64")


assert TEXT_COL in df.columns and YEAR_COL in df.columns, "Colunas de texto/ano n√£o encontradas."
for c in ["C1","C2","C3","C4","C5"]:
    assert c in df.columns, f"Coluna {c} n√£o foi criada corretamente."

# Check year distribution
print("Distribui√ß√£o por ano:")
year_counts = df[YEAR_COL].value_counts().sort_index()
print(year_counts)
print(f"Total: {year_counts.sum()}")

# === MANUAL SPLIT - Definir anos de teste manualmente ===
# Anos de teste: 2016, 2018, 2022 (total: 1+11+15+16 = 43 amostras)
# Anos de treino: 2019, 2020, 2021, 2023, 2024 (total: 31+29+29+25 = 114 amostras)
anos_teste = [2016, 2018, 2022, 2023]
anos_treino = [2019, 2020, 2021, 2024]


print("Anos no treino:", sorted(anos_treino))
print("Anos no teste :", sorted(anos_teste))

# Verificar distribui√ß√£o
n_treino = int(df[YEAR_COL].isin(anos_treino).sum())
n_teste = int(df[YEAR_COL].isin(anos_teste).sum())
print(f"Tamanho treino/teste: {n_treino} / {n_teste}")
print(f"Test percentage: {n_teste/len(df)*100:.1f}%")

# Criar splits
df_train = df[df[YEAR_COL].isin(anos_treino)].reset_index(drop=True)
df_test  = df[df[YEAR_COL].isin(anos_teste)].reset_index(drop=True)

print(f"Train size: {len(df_train)}, Test size: {len(df_test)}")
print(f"Test percentage: {len(df_test)/len(df)*100:.1f}%")



Carregando o dataset...
Tamanho do dataset: 157 linhas e 4 colunas
Distribui√ß√£o por ano:
ano
2016     1
2018    15
2019    31
2020    29
2021    29
2022    16
2023    11
2024    25
Name: count, dtype: Int64
Total: 157
Anos no treino: [2019, 2020, 2021, 2024]
Anos no teste : [2016, 2018, 2022, 2023]
Tamanho treino/teste: 114 / 43
Test percentage: 27.4%
Train size: 114, Test size: 43
Test percentage: 27.4%


In [17]:
# === HYPERPARAMETER SEARCH WITH CROSS-VALIDATION ===
from sklearn.model_selection import KFold
import itertools
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import json
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import cohen_kappa_score
import numpy as np

# Define base model (original, not fine-tuned)
BASE_MODEL_NAME = "google-bert/bert-base-multilingual-cased"  # Original mBERT model
MAX_LEN = 512
TEXT_COL = "texto"

# GPU setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Score mapping (same as in training cell)
SCORES = [0, 40, 80, 120, 160, 200]
score_to_class = {s:i for i,s in enumerate(SCORES)}
class_to_score = {i:s for i,s in enumerate(SCORES)}

# Helper functions
def round_to_nearest_40(x):
    # usado s√≥ para TREINO: converte nota (m√∫ltiplos de 20) para a classe mais pr√≥xima (m√∫ltiplos de 40)
    x = int(x)
    return SCORES[int(np.argmin([abs(x - s) for s in SCORES]))]

def to_class(y_score):  # 0..200 -> classe 0..5
    return score_to_class[round_to_nearest_40(y_score)]

# Dataset class
class EnemCompDataset(Dataset):
    def __init__(self, df, comp_col, tokenizer, for_train=True, max_len=512):
        self.texts = df[TEXT_COL].astype(str).tolist()
        self.tokenizer = tokenizer
        self.for_train = for_train
        self.max_len = max_len
        if for_train:
            self.labels = [to_class(v) for v in df[comp_col].tolist()]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k,v in enc.items()}
        if self.for_train:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def train_model_cv(df_train, comp_idx, hyperparams, cv_folds=None):
    """
    Train model with cross-validation for hyperparameter search
    Each fold is a different year (leave-one-year-out validation)
    Returns average QWK across folds
    """
    comp_col = f"C{comp_idx}"

    # Load tokenizer and base model (original, not fine-tuned)
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
    # Create model with classification head from scratch
    model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL_NAME, num_labels=6).to(device)

    # Get unique years in training data
    unique_years = sorted(df_train[YEAR_COL].unique())
    print(f"  Available years for CV: {unique_years}")

    # Use all years as folds (leave-one-year-out)
    if cv_folds is None:
        cv_folds = len(unique_years)

    qwk_scores = []

    for fold, val_year in enumerate(unique_years[:cv_folds]):
        print(f"  Fold {fold+1}/{cv_folds} - Validation year: {val_year}")

        # Create fold datasets: train on all years except val_year
        df_fold_train = df_train[df_train[YEAR_COL] != val_year].reset_index(drop=True)
        df_fold_val = df_train[df_train[YEAR_COL] == val_year].reset_index(drop=True)

        print(f"    Train size: {len(df_fold_train)}, Val size: {len(df_fold_val)}")

        # Skip if validation set is too small
        if len(df_fold_val) < 2:
            print(f"    Skipping fold {fold+1} - validation set too small ({len(df_fold_val)} samples)")
            continue

        # Create datasets
        train_ds_fold = EnemCompDataset(df_fold_train, comp_col, tokenizer, for_train=True, max_len=MAX_LEN)
        val_ds_fold = EnemCompDataset(df_fold_val, comp_col, tokenizer, for_train=True, max_len=MAX_LEN)

        # Data loaders
        train_loader = DataLoader(train_ds_fold, batch_size=hyperparams['batch_size'], shuffle=True)
        val_loader = DataLoader(val_ds_fold, batch_size=hyperparams['batch_size'], shuffle=False)

        # Initialize model for this fold (from base model)
        model_fold = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL_NAME, num_labels=6).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(model_fold.parameters(), lr=hyperparams['learning_rate'])

        # Training
        model_fold.train()
        for epoch in range(hyperparams['epochs']):
            for batch in train_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                optimizer.zero_grad()

                outputs = model_fold(**batch)
                loss = criterion(outputs.logits, batch["labels"])
                loss.backward()

                # Gradient clipping to prevent exploding gradients (important for base models)
                torch.nn.utils.clip_grad_norm_(model_fold.parameters(), max_norm=1.0)

                optimizer.step()

        # Validation
        model_fold.eval()
        val_preds = []
        val_labels = []

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model_fold(**batch)
                preds = torch.argmax(outputs.logits, dim=-1)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(batch["labels"].cpu().numpy())

        # Convert predictions to scores
        val_preds_scores = [class_to_score[p] for p in val_preds]
        val_labels_scores = [class_to_score[l] for l in val_labels]

        # Debug: Check if model is predicting only one class
        unique_preds = len(set(val_preds))
        if unique_preds == 1:
            print(f"    ‚ö†Ô∏è Warning: Model predicting only class {val_preds[0]} (score: {val_preds_scores[0]})")

        # Calculate QWK
        from sklearn.metrics import cohen_kappa_score
        ALL_LABELS = list(range(0, 201, 20))
        qwk = cohen_kappa_score(val_labels_scores, val_preds_scores, weights='quadratic', labels=ALL_LABELS)
        qwk_scores.append(qwk)

        print(f"    QWK: {qwk:.3f} | Unique predictions: {unique_preds}/6 classes")

    if not qwk_scores:
        print("  Warning: No valid folds completed")
        return 0.0

    avg_qwk = np.mean(qwk_scores)
    print(f"  Average QWK: {avg_qwk:.3f}")
    return avg_qwk

# Hyperparameter search space
hyperparameter_space = {
    'learning_rate': [1e-5],
    'batch_size': [ 16, 32],
    'epochs': [8, 12, 16]
}

print("Starting hyperparameter search...")
print(f"Search space: {hyperparameter_space}")

# Create directory for saving results in Google Drive
SAVE_DIR = os.path.join(DRIVE_BASE_PATH, "fine_tuning_modelos_originais", "mbert_original_finetuned_by_comp")
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"‚úì Diret√≥rio de salvamento: {SAVE_DIR}")

# Store results - initialize from checkpoint if exists
checkpoint_file = os.path.join(SAVE_DIR, "hyperparameter_search_checkpoint.json")
best_hyperparams = {}
best_qwk_scores = {}

# Load checkpoint if exists
if os.path.exists(checkpoint_file):
    try:
        with open(checkpoint_file, 'r') as f:
            checkpoint_data = json.load(f)
            best_hyperparams = checkpoint_data.get('best_hyperparams', {})
            best_qwk_scores = checkpoint_data.get('best_qwk_scores', {})
            print(f"Loaded checkpoint from {checkpoint_file}")
            print(f"Found saved results for: {list(best_hyperparams.keys())}")
    except Exception as e:
        print(f"Error loading checkpoint: {e}")
        best_hyperparams = {}
        best_qwk_scores = {}

def save_checkpoint():
    """Save current progress to checkpoint file"""
    checkpoint_data = {
        'best_hyperparams': best_hyperparams,
        'best_qwk_scores': best_qwk_scores
    }
    with open(checkpoint_file, 'w') as f:
        json.dump(checkpoint_data, f, indent=2)
    print(f"‚úì Checkpoint saved to {checkpoint_file}")


Using device: cuda
Starting hyperparameter search...
Search space: {'learning_rate': [1e-05], 'batch_size': [16, 32], 'epochs': [8, 12, 16]}
‚úì Diret√≥rio de salvamento: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp
Loaded checkpoint from /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp/hyperparameter_search_checkpoint.json
Found saved results for: ['C1', 'C2', 'C3', 'C4', 'C5']


In [18]:
# === GRID SEARCH - COMPET√äNCIA C1 ===
comp_idx = 1

# Skip if already completed
if f'C{comp_idx}' in best_hyperparams:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[f'C{comp_idx}']}")
    print(f"  Best QWK: {best_qwk_scores[f'C{comp_idx}']:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    best_qwk = -1
    best_params = None

    # Generate all combinations
    param_combinations = list(itertools.product(*hyperparameter_space.values()))
    param_names = list(hyperparameter_space.keys())

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)

            if qwk > best_qwk:
                best_qwk = qwk
                best_params = hyperparams.copy()

        except Exception as e:
            print(f"  Error: {e}")
            continue

    best_hyperparams[f'C{comp_idx}'] = best_params
    best_qwk_scores[f'C{comp_idx}'] = best_qwk

    print(f"\nBest hyperparameters for C{comp_idx}:")
    print(f"  Params: {best_params}")
    print(f"  QWK: {best_qwk:.3f}")

    # Save checkpoint after each competency
    save_checkpoint()


C1 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16}
  Best QWK: 0.491


In [19]:
# === GRID SEARCH - COMPET√äNCIA C2 ===
comp_idx = 2

# Skip if already completed
if f'C{comp_idx}' in best_hyperparams:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[f'C{comp_idx}']}")
    print(f"  Best QWK: {best_qwk_scores[f'C{comp_idx}']:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    best_qwk = -1
    best_params = None

    # Generate all combinations
    param_combinations = list(itertools.product(*hyperparameter_space.values()))
    param_names = list(hyperparameter_space.keys())

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)

            if qwk > best_qwk:
                best_qwk = qwk
                best_params = hyperparams.copy()

        except Exception as e:
            print(f"  Error: {e}")
            continue

    best_hyperparams[f'C{comp_idx}'] = best_params
    best_qwk_scores[f'C{comp_idx}'] = best_qwk

    print(f"\nBest hyperparameters for C{comp_idx}:")
    print(f"  Params: {best_params}")
    print(f"  QWK: {best_qwk:.3f}")

    # Save checkpoint after each competency
    save_checkpoint()


C2 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16}
  Best QWK: 0.458


In [20]:
# === GRID SEARCH - COMPET√äNCIA C3 ===
comp_idx = 3

# Skip if already completed
if f'C{comp_idx}' in best_hyperparams:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[f'C{comp_idx}']}")
    print(f"  Best QWK: {best_qwk_scores[f'C{comp_idx}']:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    best_qwk = -1
    best_params = None

    # Generate all combinations
    param_combinations = list(itertools.product(*hyperparameter_space.values()))
    param_names = list(hyperparameter_space.keys())

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)

            if qwk > best_qwk:
                best_qwk = qwk
                best_params = hyperparams.copy()

        except Exception as e:
            print(f"  Error: {e}")
            continue

    best_hyperparams[f'C{comp_idx}'] = best_params
    best_qwk_scores[f'C{comp_idx}'] = best_qwk

    print(f"\nBest hyperparameters for C{comp_idx}:")
    print(f"  Params: {best_params}")
    print(f"  QWK: {best_qwk:.3f}")

    # Save checkpoint after each competency
    save_checkpoint()


C3 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}
  Best QWK: 0.244


In [21]:
# === GRID SEARCH - COMPET√äNCIA C4 ===
comp_idx = 4

# Skip if already completed
if f'C{comp_idx}' in best_hyperparams:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[f'C{comp_idx}']}")
    print(f"  Best QWK: {best_qwk_scores[f'C{comp_idx}']:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    best_qwk = -1
    best_params = None

    # Generate all combinations
    param_combinations = list(itertools.product(*hyperparameter_space.values()))
    param_names = list(hyperparameter_space.keys())

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)

            if qwk > best_qwk:
                best_qwk = qwk
                best_params = hyperparams.copy()

        except Exception as e:
            print(f"  Error: {e}")
            continue

    best_hyperparams[f'C{comp_idx}'] = best_params
    best_qwk_scores[f'C{comp_idx}'] = best_qwk

    print(f"\nBest hyperparameters for C{comp_idx}:")
    print(f"  Params: {best_params}")
    print(f"  QWK: {best_qwk:.3f}")

    # Save checkpoint after each competency
    save_checkpoint()


C4 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16}
  Best QWK: 0.387


In [22]:
# === GRID SEARCH - COMPET√äNCIA C5 ===
comp_idx = 5


# Skip if already completed
if f'C{comp_idx}' in best_hyperparams:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[f'C{comp_idx}']}")
    print(f"  Best QWK: {best_qwk_scores[f'C{comp_idx}']:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    best_qwk = -1
    best_params = None

    # Generate all combinations
    param_combinations = list(itertools.product(*hyperparameter_space.values()))
    param_names = list(hyperparameter_space.keys())

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)

            if qwk > best_qwk:
                best_qwk = qwk
                best_params = hyperparams.copy()

        except Exception as e:
            print(f"  Error: {e}")
            continue

    best_hyperparams[f'C{comp_idx}'] = best_params
    best_qwk_scores[f'C{comp_idx}'] = best_qwk

    print(f"\nBest hyperparameters for C{comp_idx}:")
    print(f"  Params: {best_params}")
    print(f"  QWK: {best_qwk:.3f}")

    # Save checkpoint after each competency
    save_checkpoint()


C5 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16}
  Best QWK: 0.237


In [23]:
# === CONSOLIDAR TODOS OS RESULTADOS E SALVAR JSON FINAL ===
print("\n=== HYPERPARAMETER SEARCH RESULTS ===")
for comp in [1, 2, 3, 4, 5]:
    comp_key = f'C{comp}'
    if comp_key in best_hyperparams:
        print(f"{comp_key}: {best_hyperparams[comp_key]} -> QWK: {best_qwk_scores[comp_key]:.3f}")
    else:
        print(f"{comp_key}: NOT COMPLETED")

# Save final results
results = {
    'best_hyperparams': best_hyperparams,
    'best_qwk_scores': best_qwk_scores
}

final_results_file = os.path.join(SAVE_DIR, "hyperparameter_search_results_mbert_original.json")
with open(final_results_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n‚úì Final results saved to {final_results_file}")
print(f"‚úì Checkpoint saved to {checkpoint_file}")



=== HYPERPARAMETER SEARCH RESULTS ===
C1: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16} -> QWK: 0.491
C2: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16} -> QWK: 0.458
C3: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12} -> QWK: 0.244
C4: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16} -> QWK: 0.387
C5: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16} -> QWK: 0.237

‚úì Final results saved to /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp/hyperparameter_search_results_mbert_original.json
‚úì Checkpoint saved to /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp/hyperparameter_search_checkpoint.json


In [None]:
# === TRAIN FINAL MODELS WITH BEST HYPERPARAMETERS ===
import json
from torch.cuda.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import time

# Define save directory - Google Drive
SAVE_DIR = os.path.join(DRIVE_BASE_PATH, "fine_tuning_modelos_originais", "mbert_original_finetuned_by_comp")
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"‚úì Diret√≥rio de salvamento: {SAVE_DIR}")

# Load best hyperparameters from search (from the saved JSON file)
final_results_file = os.path.join(SAVE_DIR, "hyperparameter_search_results_mbert_original.json")
try:
    with open(final_results_file, 'r') as f:
        search_results = json.load(f)
    best_hyperparams = search_results['best_hyperparams']
    print(f"‚úì Loaded hyperparameter search results from {final_results_file}")
    print(f"  Found hyperparameters for: {list(best_hyperparams.keys())}")
except FileNotFoundError:
    print(f"‚ö†Ô∏è No hyperparameter search results found at {final_results_file}")
    print("  Using default parameters.")
    best_hyperparams = {}
    for comp in [1, 2, 3, 4, 5]:
        best_hyperparams[f'C{comp}'] = {'learning_rate': 2e-5, 'batch_size': 16, 'epochs': 5}

def treinar_modelo_final_comp(df_train, comp_idx, hyperparams):
    """
    Train final model with best hyperparameters found in search
    """
    comp_col = f"C{comp_idx}"

    print(f"[C{comp_idx}] Carregando modelo base: {BASE_MODEL_NAME}")
    # Load base model (original, not fine-tuned)
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
    # Create model with classification head from scratch
    model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL_NAME, num_labels=6).to(device)

    train_ds = EnemCompDataset(df_train, comp_col, tokenizer, for_train=True, max_len=MAX_LEN)

    loader_kwargs = dict(
        batch_size=hyperparams['batch_size'],
        shuffle=True,
        num_workers=2,
        pin_memory=(device.type == "cuda")
    )
    train_loader = DataLoader(train_ds, **loader_kwargs)

    print(f"[C{comp_idx}] Tamanho treino: {len(train_ds)} | Batches: {len(train_loader)}")
    print(f"[C{comp_idx}] Hyperparameters: {hyperparams}")

    if len(train_loader) == 0:
        raise RuntimeError(f"[C{comp_idx}] DataLoader de treino est√° vazio. Verifique o split e colunas.")

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=hyperparams['learning_rate'])
    total_steps = max(1, hyperparams['epochs'] * len(train_loader))
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=max(1, int(0.1 * total_steps)),
        num_training_steps=total_steps
    )

    scaler = GradScaler(enabled=(device.type == "cuda"))

    save_path = os.path.join(SAVE_DIR, f"mbert_original_C{comp_idx}_finetuned_com_redacoes_oficiais")

    for ep in range(1, hyperparams['epochs']+1):
        t0 = time.time()
        model.train()
        running = 0.0

        for batch in tqdm(train_loader, desc=f"[C{comp_idx}] Epoch {ep}/{hyperparams['epochs']} (final)", leave=False):
            batch = {k: v.to(device, non_blocking=True) for k,v in batch.items()}
            optimizer.zero_grad(set_to_none=True)

            with autocast(enabled=(device.type == "cuda")):
                logits = model(**batch).logits
                loss = criterion(logits, batch["labels"])

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            running += loss.item()

        train_loss = running / max(1, len(train_loader))
        print(f"[C{comp_idx}] epoch {ep}/{hyperparams['epochs']} - train loss: {train_loss:.4f} | tempo: {time.time()-t0:.1f}s")

        # Save checkpoint after each epoch
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)

    # Final save
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"[C{comp_idx}] ‚úì Modelo final salvo em: {save_path}")

    return tokenizer, model, save_path

# === Train final models for C1..C5 with best hyperparameters ===
print("Training final models with best hyperparameters...")
tokenizers_final = {}
models_final = {}

for comp_idx in [1, 2, 3, 4, 5]:
    print(f"\n=== Training Final Model ‚Äî Compet√™ncia C{comp_idx} ===")
    hyperparams = best_hyperparams.get(f'C{comp_idx}', {'learning_rate': 2e-5, 'batch_size': 16, 'epochs': 5})
    tok, mdl, _ = treinar_modelo_final_comp(df_train, comp_idx, hyperparams)
    tokenizers_final[comp_idx] = tok
    models_final[comp_idx] = mdl

print("\n=== Final models training completed ===")
print("Models saved in:", SAVE_DIR)


‚úì Diret√≥rio de salvamento: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp
‚úì Loaded hyperparameter search results from /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp/hyperparameter_search_results_mbert_original.json
  Found hyperparameters for: ['C1', 'C2', 'C3', 'C4', 'C5']
Training final models with best hyperparameters...

=== Training Final Model ‚Äî Compet√™ncia C1 ===
[C1] Carregando modelo base: google-bert/bert-base-multilingual-cased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[C1] Tamanho treino: 114 | Batches: 8
[C1] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C1] Epoch 1/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C1] epoch 1/16 - train loss: 1.8484 | tempo: 3.7s


[C1] Epoch 2/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 2/16 - train loss: 1.6241 | tempo: 4.1s


[C1] Epoch 3/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5a5c7aaac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5a5c7aaac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[C1] epoch 3/16 - train loss: 1.1938 | tempo: 4.8s


[C1] Epoch 4/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 4/16 - train loss: 0.8256 | tempo: 3.8s


[C1] Epoch 5/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 5/16 - train loss: 0.7986 | tempo: 3.7s


[C1] Epoch 6/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 6/16 - train loss: 0.7529 | tempo: 3.8s


[C1] Epoch 7/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 7/16 - train loss: 0.6592 | tempo: 4.0s


[C1] Epoch 8/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 8/16 - train loss: 0.5986 | tempo: 3.8s


[C1] Epoch 9/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 9/16 - train loss: 0.6161 | tempo: 4.0s


[C1] Epoch 10/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 10/16 - train loss: 0.5405 | tempo: 3.8s


[C1] Epoch 11/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 11/16 - train loss: 0.6011 | tempo: 3.7s


[C1] Epoch 12/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 12/16 - train loss: 0.5434 | tempo: 3.8s


[C1] Epoch 13/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 13/16 - train loss: 0.4631 | tempo: 3.8s


[C1] Epoch 14/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 14/16 - train loss: 0.5188 | tempo: 3.8s


[C1] Epoch 15/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 15/16 - train loss: 0.4412 | tempo: 3.8s


[C1] Epoch 16/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5a5c7aaac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5a5c7aaac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[C1] epoch 16/16 - train loss: 0.4845 | tempo: 5.3s
[C1] ‚úì Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp/mbert_original_C1_finetuned_com_redacoes_oficiais

=== Training Final Model ‚Äî Compet√™ncia C2 ===
[C2] Carregando modelo base: google-bert/bert-base-multilingual-cased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[C2] Tamanho treino: 114 | Batches: 8
[C2] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C2] Epoch 1/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C2] epoch 1/16 - train loss: 1.8453 | tempo: 3.7s


[C2] Epoch 2/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 2/16 - train loss: 1.7747 | tempo: 4.4s


[C2] Epoch 3/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 3/16 - train loss: 1.6644 | tempo: 4.1s


[C2] Epoch 4/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 4/16 - train loss: 1.3945 | tempo: 3.8s


[C2] Epoch 5/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 5/16 - train loss: 1.1908 | tempo: 3.8s


[C2] Epoch 6/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 6/16 - train loss: 1.1317 | tempo: 3.9s


[C2] Epoch 7/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 7/16 - train loss: 0.9715 | tempo: 3.7s


[C2] Epoch 8/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 8/16 - train loss: 0.9999 | tempo: 4.1s


[C2] Epoch 9/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 9/16 - train loss: 0.9395 | tempo: 4.2s


[C2] Epoch 10/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 10/16 - train loss: 0.8870 | tempo: 3.9s


[C2] Epoch 11/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 11/16 - train loss: 0.8263 | tempo: 3.8s


[C2] Epoch 12/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 12/16 - train loss: 0.7731 | tempo: 3.8s


[C2] Epoch 13/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 13/16 - train loss: 0.8162 | tempo: 3.8s


[C2] Epoch 14/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 14/16 - train loss: 0.8446 | tempo: 4.0s


[C2] Epoch 15/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 15/16 - train loss: 0.7260 | tempo: 3.8s


[C2] Epoch 16/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 16/16 - train loss: 0.7788 | tempo: 3.7s
[C2] ‚úì Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp/mbert_original_C2_finetuned_com_redacoes_oficiais

=== Training Final Model ‚Äî Compet√™ncia C3 ===
[C3] Carregando modelo base: google-bert/bert-base-multilingual-cased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[C3] Tamanho treino: 114 | Batches: 8
[C3] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C3] Epoch 1/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C3] epoch 1/12 - train loss: 1.7157 | tempo: 3.8s


[C3] Epoch 2/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C3] epoch 2/12 - train loss: 1.6452 | tempo: 4.0s


[C3] Epoch 3/12 (final):   0%|          | 0/8 [00:15<?, ?it/s]

[C3] epoch 3/12 - train loss: 1.4431 | tempo: 19.0s


[C3] Epoch 4/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C3] epoch 4/12 - train loss: 1.3555 | tempo: 4.1s


[C3] Epoch 5/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C3] epoch 5/12 - train loss: 1.2258 | tempo: 3.8s


[C3] Epoch 6/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C3] epoch 6/12 - train loss: 1.1283 | tempo: 4.2s


[C3] Epoch 7/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C3] epoch 7/12 - train loss: 1.0712 | tempo: 4.0s


[C3] Epoch 8/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C3] epoch 8/12 - train loss: 1.0390 | tempo: 3.9s


[C3] Epoch 9/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C3] epoch 9/12 - train loss: 0.9942 | tempo: 4.0s


[C3] Epoch 10/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C3] epoch 10/12 - train loss: 0.9908 | tempo: 3.8s


[C3] Epoch 11/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C3] epoch 11/12 - train loss: 1.0018 | tempo: 3.8s


[C3] Epoch 12/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C3] epoch 12/12 - train loss: 0.9934 | tempo: 3.8s
[C3] ‚úì Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp/mbert_original_C3_finetuned_com_redacoes_oficiais

=== Training Final Model ‚Äî Compet√™ncia C4 ===
[C4] Carregando modelo base: google-bert/bert-base-multilingual-cased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[C4] Tamanho treino: 114 | Batches: 8
[C4] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C4] Epoch 1/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C4] epoch 1/16 - train loss: 1.7639 | tempo: 4.2s


[C4] Epoch 2/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 2/16 - train loss: 1.6247 | tempo: 4.5s


[C4] Epoch 3/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 3/16 - train loss: 1.3628 | tempo: 4.0s


[C4] Epoch 4/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 4/16 - train loss: 1.1518 | tempo: 3.8s


[C4] Epoch 5/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 5/16 - train loss: 1.1010 | tempo: 3.8s


[C4] Epoch 6/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 6/16 - train loss: 1.0739 | tempo: 3.8s


[C4] Epoch 7/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 7/16 - train loss: 1.0168 | tempo: 3.9s


[C4] Epoch 8/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 8/16 - train loss: 0.9678 | tempo: 3.8s


[C4] Epoch 9/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 9/16 - train loss: 0.9094 | tempo: 4.2s


[C4] Epoch 10/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 10/16 - train loss: 0.9006 | tempo: 4.1s


[C4] Epoch 11/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 11/16 - train loss: 0.8513 | tempo: 3.9s


[C4] Epoch 12/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 12/16 - train loss: 0.8483 | tempo: 3.8s


[C4] Epoch 13/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5a5c7aaac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5a5c7aaac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[C4] epoch 13/16 - train loss: 0.8197 | tempo: 5.8s


[C4] Epoch 14/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 14/16 - train loss: 0.8108 | tempo: 4.3s


[C4] Epoch 15/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 15/16 - train loss: 0.7977 | tempo: 4.1s


[C4] Epoch 16/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 16/16 - train loss: 0.7885 | tempo: 3.8s
[C4] ‚úì Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp/mbert_original_C4_finetuned_com_redacoes_oficiais

=== Training Final Model ‚Äî Compet√™ncia C5 ===
[C5] Carregando modelo base: google-bert/bert-base-multilingual-cased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[C5] Tamanho treino: 114 | Batches: 8
[C5] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C5] Epoch 1/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C5] epoch 1/16 - train loss: 1.7871 | tempo: 4.1s


[C5] Epoch 2/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5a5c7aaac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5a5c7aaac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[C5] epoch 2/16 - train loss: 1.6241 | tempo: 4.4s


[C5] Epoch 3/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 3/16 - train loss: 1.4199 | tempo: 4.0s


[C5] Epoch 4/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 4/16 - train loss: 1.4675 | tempo: 3.9s


[C5] Epoch 5/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 5/16 - train loss: 1.2581 | tempo: 3.9s


[C5] Epoch 6/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 6/16 - train loss: 1.2193 | tempo: 3.8s


[C5] Epoch 7/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 7/16 - train loss: 1.2510 | tempo: 3.7s


[C5] Epoch 8/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 8/16 - train loss: 1.1424 | tempo: 3.8s


[C5] Epoch 9/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 9/16 - train loss: 1.2097 | tempo: 3.8s


[C5] Epoch 10/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 10/16 - train loss: 1.2520 | tempo: 3.8s


[C5] Epoch 11/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5a5c7aaac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5a5c7aaac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[C5] epoch 11/16 - train loss: 1.0635 | tempo: 6.6s


[C5] Epoch 12/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 12/16 - train loss: 1.0785 | tempo: 4.4s


[C5] Epoch 13/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 13/16 - train loss: 1.0786 | tempo: 3.9s


[C5] Epoch 14/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 14/16 - train loss: 1.0024 | tempo: 4.1s


[C5] Epoch 15/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 15/16 - train loss: 1.0298 | tempo: 4.2s


[C5] Epoch 16/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 16/16 - train loss: 0.9859 | tempo: 3.9s
[C5] ‚úì Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp/mbert_original_C5_finetuned_com_redacoes_oficiais

=== Final models training completed ===
Models saved in: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp


In [None]:
# === FINAL TEST EVALUATION ON HELD-OUT TEST SET ===
import pandas as pd
from torch.utils.data import DataLoader

# Ensure SAVE_DIR is defined (same as in training cell) - Google Drive
if 'SAVE_DIR' not in globals():
    SAVE_DIR = os.path.join(DRIVE_BASE_PATH, "fine_tuning_modelos_originais", "mbert_original_finetuned_by_comp")
    os.makedirs(SAVE_DIR, exist_ok=True)

def prever_scores_final(df_split, tokenizer, model, comp_idx):
    """Predict scores using final trained model"""
    comp_col = f"C{comp_idx}"
    ds = EnemCompDataset(df_split, comp_col, tokenizer=tokenizer, for_train=False, max_len=MAX_LEN)
    loader = DataLoader(ds, batch_size=16, shuffle=False)
    model.eval()
    preds_cls = []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy().tolist()
            preds_cls.extend(preds)
    # 0..5 -> 0..200
    return [class_to_score[c] for c in preds_cls]

print("=== FINAL TEST EVALUATION ===")
print(f"Test set size: {len(df_test)}")
print(f"Test set years: {sorted(df_test[YEAR_COL].unique())}")

# Prepare test results
df_test_final = df_test.reset_index(drop=True).copy()
df_test_final["id"] = df_test_final.index

# Base CSV structure
cols_base = {"id": df_test_final["id"].values}
if YEAR_COL in df_test_final.columns:
    cols_base[YEAR_COL] = df_test_final[YEAR_COL].values
out_final = pd.DataFrame(cols_base)

# Ground truth scores
for c in [1, 2, 3, 4, 5]:
    out_final[f"C{c}"] = pd.to_numeric(df_test_final[f"C{c}"], errors="coerce").astype("Int64")

# Predictions using final models
print("\nMaking predictions with final models...")
for c in [1, 2, 3, 4, 5]:
    print(f"Predicting C{c}...")
    mask_valid = df_test_final[f"C{c}"].notna()
    preds = pd.Series([pd.NA] * len(df_test_final), dtype="Int64")

    if mask_valid.any():
        y_pred = prever_scores_final(df_test_final.loc[mask_valid], tokenizers_final[c], models_final[c], c)
        preds.loc[mask_valid] = pd.Series(y_pred, index=df_test_final.index[mask_valid], dtype="Int64")

    out_final[f"pred_C{c}"] = preds

# Save final predictions
SAVE_CSV_PATH_FINAL = os.path.join(SAVE_DIR, "predicoes_mbert_original_final_hyperopt.csv")
out_final.to_csv(SAVE_CSV_PATH_FINAL, index=False)
print(f"‚úì Final predictions saved to: {SAVE_CSV_PATH_FINAL}")

# === EVALUATION ON TEST SET ===
print("\n=== FINAL TEST SET EVALUATION ===")
print("Evaluating on held-out test set (never used for training or hyperparameter tuning)")

test_results = {}
competencies = [1, 2, 3, 4, 5]

for c in competencies:
    comp_key = f"C{c}"
    pred_col = f"pred_{comp_key}"

    # Get valid pairs
    pares = out_final[[comp_key, pred_col]].dropna()
    if pares.empty:
        print(f"‚ö†Ô∏è No valid data for {comp_key}")
        continue

    y_real = pares[comp_key].astype(int).tolist()
    y_pred = pares[pred_col].astype(int).tolist()

    # Calculate metrics
    resultado = calcular_resultados(y_real, y_pred, is_final=True)
    test_results[comp_key] = resultado

    print(f"\nüîé Test Results - {comp_key}")
    print(f"  Samples: {len(y_real)}")
    print(f"  QWK     : {resultado['QWK']:.3f}")
    print(f"  F1 Macro: {resultado['F1-Macro']:.3f}")
    print(f"  F1 Wghtd: {resultado['F1-Weighted']:.3f}")
    print(f"  ACC     : {resultado['ACC']:.3f}")
    print(f"  RMSE    : {resultado['RMSE']:.2f}")

# Summary
print("\n=== FINAL TEST SET SUMMARY ===")
qwk_scores = [test_results[f"C{c}"]["QWK"] for c in competencies if f"C{c}" in test_results]
if qwk_scores:
    print(f"Average QWK across competencies: {np.mean(qwk_scores):.3f}")
    print(f"Best QWK: {max(qwk_scores):.3f}")
    print(f"Worst QWK: {min(qwk_scores):.3f}")

# Save detailed results
results_summary = {
    'test_set_size': len(df_test),
    'test_set_years': sorted(df_test[YEAR_COL].unique().tolist()),
    'competency_results': test_results,
    'average_qwk': np.mean(qwk_scores) if qwk_scores else None
}

with open('final_test_results_mbert.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print(f"\nDetailed results saved to: final_test_results_mbert.json")
print("=== EVALUATION COMPLETED ===")


=== FINAL TEST EVALUATION ===
Test set size: 43
Test set years: [np.int64(2016), np.int64(2018), np.int64(2022), np.int64(2023)]

Making predictions with final models...
Predicting C1...
Predicting C2...
Predicting C3...
Predicting C4...
Predicting C5...
‚úì Final predictions saved to: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_originais/mbert_original_finetuned_by_comp/predicoes_mbert_original_final_hyperopt.csv

=== FINAL TEST SET EVALUATION ===
Evaluating on held-out test set (never used for training or hyperparameter tuning)

üîé Test Results - C1
  Samples: 43
  QWK     : 0.467
  F1 Macro: 0.293
  F1 Wghtd: 0.375
  ACC     : 0.488
  RMSE    : 19.29

üîé Test Results - C2
  Samples: 43
  QWK     : 0.508
  F1 Macro: 0.303
  F1 Wghtd: 0.683
  ACC     : 0.721
  RMSE    : 35.70

üîé Test Results - C3
  Samples: 43
  QWK     : 0.083
  F1 Macro: 0.108
  F1 Wghtd: 0.136
  ACC     : 0.233
  RMSE    : 32.71

üîé Test Results - C4
  Samples: 43
  QWK     : 0.733
  F1 

In [None]:
# === TABELAS LaTeX PARA RESULTADOS FINAIS COM HYPERPARAMETER OPTIMIZATION ===
import pandas as pd

# Protocolos de avalia√ß√£o
protocol_labels = {
    "no_change": "Sem ajuste de escala",
    "dup_bounds": "Corre√ß√£o dupla (baixo/cima)",
    "truth_floor40": "Arred. verdade p/ baixo (40)",
    "truth_ceil40": "Arred. verdade p/ cima (40)",
    "only_true_mult40": "Apenas verdade m√∫ltipla de 40",
}
protocol_order = ["no_change", "dup_bounds", "truth_floor40", "truth_ceil40", "only_true_mult40"]

# Fun√ß√µes auxiliares para os protocolos
def ajustar_para_correcao_dupla(y_true, y_pred):
    """
    Cen√°rio 'dup_bounds':
      - Se a verdade j√° √© m√∫ltipla de 40 -> duplica (r,r) e (p,p).
      - Caso contr√°rio -> cria (baixo, cima) para a verdade e duplica p.
    """
    y_true_adj, y_pred_adj = [], []
    for r, p in zip(y_true, y_pred):
        if pd.isna(r) or pd.isna(p):
            continue
        r = int(r); p = int(p)
        if r % 40 == 0:
            y_true_adj.extend([r, r])
            y_pred_adj.extend([p, p])
        else:
            baixo = (r // 40) * 40
            cima  = baixo + 40
            y_true_adj.extend([baixo, cima])
            y_pred_adj.extend([p, p])
    return y_true_adj, y_pred_adj

def arredonda_verdade(y_true, modo):
    """
    Arredonda as notas verdade para m√∫ltiplos de 40.
    modo: 'floor' | 'ceil' | 'none'
    """
    y_true = pd.Series(y_true).dropna().astype(int)
    if modo == 'floor':
        return (np.floor(y_true / 40) * 40).astype(int).tolist()
    elif modo == 'ceil':
        return (np.ceil(y_true / 40) * 40).astype(int).tolist()
    elif modo == 'none':
        return y_true.tolist()
    else:
        raise ValueError("modo inv√°lido")

def filtra_verdades_multiplas_40(y_true, y_pred):
    y_true = pd.Series(y_true).dropna().astype(int)
    y_pred = pd.Series(y_pred).dropna().astype(int)
    mask = (y_true % 40 == 0)
    y_true_f = y_true[mask].tolist()
    y_pred_f = y_pred[mask].tolist()
    return y_true_f, y_pred_f

# Avaliar todos os protocolos usando os resultados finais
print("=== AVALIANDO TODOS OS PROTOCOLOS COM MODELOS FINAIS ===")
avaliacoes_por_modelo = {"mbert-final": {}}
resumo_qwk = []

print("\nüìä Avaliando modelo: mbert-final (com hyperparameter optimization)")

for esquema_key, esquema_desc in protocol_labels.items():
    print(f"\n=== Esquema: {esquema_desc} ===")
    avaliacoes = {}
    qwk_vals = []

    for c in [1, 2, 3, 4, 5]:
        comp_key = f"C{c}"
        pred_col = f"pred_{comp_key}"

        # usa apenas pares v√°lidos (sem NaN)
        pares = out_final[[comp_key, pred_col]].dropna()
        if pares.empty:
            print(f"‚ö†Ô∏è Nenhum dado v√°lido para {comp_key} ({esquema_desc})")
            continue

        y_real = pares[comp_key].astype(int)
        y_pred = pares[pred_col].astype(int)

        # alinhamento conforme esquema
        if esquema_key == "no_change":
            y_r = y_real.tolist()
            y_p = y_pred.tolist()

        elif esquema_key == "dup_bounds":
            y_r, y_p = ajustar_para_correcao_dupla(y_real.tolist(), y_pred.tolist())

        elif esquema_key == "truth_floor40":
            y_r = arredonda_verdade(y_real.tolist(), "floor")
            y_p = y_pred.tolist()

        elif esquema_key == "truth_ceil40":
            y_r = arredonda_verdade(y_real.tolist(), "ceil")
            y_p = y_pred.tolist()

        elif esquema_key == "only_true_mult40":
            y_r, y_p = filtra_verdades_multiplas_40(y_real.tolist(), y_pred.tolist())

        else:
            raise ValueError("Esquema desconhecido.")

        if not y_r:
            print(f"‚ö†Ô∏è Nenhum dado v√°lido para {comp_key} ({esquema_desc})")
            continue

        # calcula m√©tricas (sua fun√ß√£o j√° existente)
        resultado = calcular_resultados(y_r, y_p, is_final=False)
        avaliacoes[comp_key] = resultado
        qwk_vals.append(resultado["QWK"])

        # impress√£o por compet√™ncia
        print(f"\nüîé Avalia√ß√£o - {comp_key}")
        print(f"  QWK               : {resultado['QWK']:.3f}")
        print(f"  F1 Macro          : {resultado['F1-Macro']:.3f}")
        print(f"  F1 Weighted       : {resultado['F1-Weighted']:.3f}")

    avaliacoes_por_modelo["mbert-final"][esquema_key] = avaliacoes

    if qwk_vals:
        resumo_qwk.append({
            "Modelo": "mbert-final",
            "Esquema": esquema_desc,
            "QWK_m√©dio": float(np.mean(qwk_vals)),
        })

# ---------- ranking por QWK (entre protocolos) ----------
if resumo_qwk:
    rank = pd.DataFrame(resumo_qwk).sort_values(by=["QWK_m√©dio"], ascending=False)
    print("\nüèÜ Ranking por QWK m√©dio (entre protocolos):")
    print(rank.to_string(index=False))
else:
    print("\n‚ö†Ô∏è N√£o foi poss√≠vel compor o ranking (sem QWKs calculados).")

# === Tabelas LaTeX (uma por m√©trica) para o modelo bertugues-final ===
metrics = [
    ("QWK", "QWK"),
    ("F1-Macro", "F1 Macro"),
    ("F1-Weighted", "F1 Weighted"),
]

MODEL_KEY = "mbert-final"  # modelo com hyperparameter optimization

for met_key, met_title in metrics:
    # monta DataFrame [linhas=protocolos leg√≠veis, colunas=C1..C5]
    df_tab = pd.DataFrame(
        index=[protocol_labels[k] for k in protocol_order],
        columns=[f"C{i}" for i in [1, 2, 3, 4, 5]],
        dtype=float
    )

    protocolos = avaliacoes_por_modelo.get(MODEL_KEY, {})
    for sk in protocol_order:
        if sk not in protocolos:
            continue
        compdict = protocolos[sk]  # dict: "C1" -> m√©tricas
        for c in [1, 2, 3, 4, 5]:
            ck = f"C{c}"
            if ck in compdict and met_key in compdict[ck]:
                df_tab.loc[protocol_labels[sk], ck] = compdict[ck][met_key]

    df_print = df_tab.round(3)

    caption = f"{met_title} por compet√™ncia para o modelo mBERT com hyperparameter optimization nos diferentes protocolos de avalia√ß√£o"
    label   = f"tab:{met_key.replace('-','').replace(' ','').lower()}_mbert_final"

    print(f"\n=== Tabela LaTeX ‚Äî {met_title} ‚Äî mbert-final ===\n")
    tex = df_print.to_latex(index=True,
                            caption=caption,
                            label=label,
                            na_rep="--",
                            float_format="%.3f",
                            escape=True,
                            bold_rows=False,
                            multicolumn=True,
                            multicolumn_format='c',
                            column_format='lccccc',   # alinhar: 1 coluna da linha + 5 compet√™ncias
                            longtable=False)

    print(tex)


=== AVALIANDO TODOS OS PROTOCOLOS COM MODELOS FINAIS ===

üìä Avaliando modelo: mbert-final (com hyperparameter optimization)

=== Esquema: Sem ajuste de escala ===

üîé Avalia√ß√£o - C1
  QWK               : 0.467
  F1 Macro          : 0.293
  F1 Weighted       : 0.375

üîé Avalia√ß√£o - C2
  QWK               : 0.508
  F1 Macro          : 0.303
  F1 Weighted       : 0.683

üîé Avalia√ß√£o - C3
  QWK               : 0.083
  F1 Macro          : 0.108
  F1 Weighted       : 0.136

üîé Avalia√ß√£o - C4
  QWK               : 0.733
  F1 Macro          : 0.228
  F1 Weighted       : 0.346

üîé Avalia√ß√£o - C5
  QWK               : 0.230
  F1 Macro          : 0.163
  F1 Weighted       : 0.335

=== Esquema: Corre√ß√£o dupla (baixo/cima) ===

üîé Avalia√ß√£o - C1
  QWK               : 0.384
  F1 Macro          : 0.442
  F1 Weighted       : 0.631

üîé Avalia√ß√£o - C2
  QWK               : 0.522
  F1 Macro          : 0.447
  F1 Weighted       : 0.748

üîé Avalia√ß√£o - C3
  QWK         