In [None]:

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import time
import os


# === MOUNT GOOGLE DRIVE ===
# Colab-only step: makes the user's Drive available under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Base path on Google Drive where every result of this notebook is saved.
# Change this path to whichever folder you want to use in your Drive.
DRIVE_BASE_PATH = "/content/drive/MyDrive/enem_tcc_resultados"
os.makedirs(DRIVE_BASE_PATH, exist_ok=True)
print(f"✓ Google Drive montado. Resultados serão salvos em: {DRIVE_BASE_PATH}")


Mounted at /content/drive
✓ Google Drive montado. Resultados serão salvos em: /content/drive/MyDrive/enem_tcc_resultados


In [None]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, root_mean_squared_error, f1_score
import numpy as np

def arredondar_notas(notas):
    """Snap each grade to the closest official level (0, 40, ..., 200).

    Args:
        notas: iterable of numeric grades.

    Returns:
        A list with one entry per input grade holding the nearest reference
        level. When a grade is equally distant from two levels the lower one
        is kept. Grades outside the 0..200 range are clamped to the nearest
        end of the scale. A grade whose distance cannot be compared (NaN or
        infinite) yields the sentinel -1.
    """
    referencia = [0, 40, 80, 120, 160, 200]
    novas_notas = []
    for n in notas:
        # Start from +inf rather than a fixed bound so that any finite grade,
        # however far from the scale, still finds its nearest level.
        mais_prox = float("inf")
        arredondado = -1
        for r in referencia:
            distancia = abs(n - r)
            # Strict "<" keeps the first (lowest) level on ties.
            if distancia < mais_prox:
                arredondado = r
                mais_prox = distancia
        novas_notas.append(arredondado)
    return novas_notas

def calcular_div(notas1, notas2):
    """Percentage of grade pairs that diverge horizontally.

    Two grades diverge when their absolute difference is greater than 80.
    Pairs are formed positionally with zip, and the count is expressed as a
    percentage of len(notas1).
    """
    divergentes = sum(1 for a, b in zip(notas1, notas2) if abs(a - b) > 80)
    return 100 * divergentes / len(notas1)

def calcular_agregado(dic_perf):
    """Combine ACC, RMSE, QWK and DIV into one score (higher is better).

    Each metric read from dic_perf is rescaled before averaging:
    ACC and QWK are multiplied by 100, RMSE becomes (200 - RMSE) / 2 and
    DIV becomes 100 - DIV. The result is the plain mean of the four values.
    """
    componentes = (
        dic_perf['ACC'] * 100,
        (200 - dic_perf['RMSE']) / 2,
        dic_perf['QWK'] * 100,
        100 - dic_perf['DIV'],
    )
    acc, rmse, qwk, div = componentes
    return (acc + rmse + qwk + div) / 4

def calcular_resultados(y, y_hat, is_final=False):
    """Compute the evaluation metrics for a set of predictions.

    Args:
        y: true grades.
        y_hat: predicted grades.
        is_final: when True the QWK label set spans 0..1000 in steps of 20;
            otherwise it spans 0..200 in steps of 20.

    Returns:
        A dict with 'ACC', 'RMSE', 'QWK', 'DIV', 'F1-Macro', 'F1-Weighted',
        the inputs under 'y' and 'y_hat', and the combined score 'Agregado'.
    """
    nota_maxima = 1000 if is_final else 200
    ALL_LABELS = list(range(0, nota_maxima + 1, 20))

    # The four metrics that feed the aggregate score.
    base = {
        'ACC': accuracy_score(y, y_hat),
        'RMSE': root_mean_squared_error(y, y_hat),
        'QWK': cohen_kappa_score(y, y_hat, weights='quadratic', labels=ALL_LABELS),
        'DIV': calcular_div(y, y_hat),
    }

    resultado = dict(base)
    resultado['F1-Macro'] = f1_score(y, y_hat, average="macro", zero_division=0)
    resultado['F1-Weighted'] = f1_score(y, y_hat, average="weighted", zero_division=0)
    resultado['y'] = y
    resultado['y_hat'] = y_hat
    resultado['Agregado'] = calcular_agregado(base)
    return resultado


In [None]:
import ast
import pandas as pd
from datasets import load_dataset

# Download the essay dataset from the Hugging Face Hub and convert it to pandas.
print("Carregando o dataset...")
dataset = load_dataset("laisnuto/self-collected-ENEM-dataset", split="train")
df = dataset.to_pandas()
# NOTE(review): this is not the last expression of the cell, so nothing is displayed here.
df.head()
print(f"Tamanho do dataset: {df.shape[0]} linhas e {df.shape[1]} colunas")

# Column names used throughout the notebook: essay text and exam year.
TEXT_COL = "texto"
YEAR_COL = "ano"


def _to_list(x):
    if isinstance(x, list):
        return x
    try:
        return ast.literal_eval(x)
    except Exception:
        return [None]*5

# Expand the "notas" column (one list of five grades per essay) into C1..C5.
notas_expandidas = df["notas"].apply(_to_list)
df[["C1","C2","C3","C4","C5"]] = pd.DataFrame(notas_expandidas.tolist(), index=df.index)


# Nullable Int64 keeps rows whose grade/year failed to parse as <NA> instead of raising.
for c in ["C1","C2","C3","C4","C5"]:
    df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")
df[YEAR_COL] = pd.to_numeric(df[YEAR_COL], errors="coerce").astype("Int64")


# Sanity checks: the columns every later cell relies on must exist.
assert TEXT_COL in df.columns and YEAR_COL in df.columns, "Colunas de texto/ano não encontradas."
for c in ["C1","C2","C3","C4","C5"]:
    assert c in df.columns, f"Coluna {c} não foi criada corretamente."

# Check year distribution
print("Distribuição por ano:")
year_counts = df[YEAR_COL].value_counts().sort_index()
print(year_counts)
print(f"Total: {year_counts.sum()}")

# === MANUAL SPLIT - test years are chosen by hand ===
# Test years: 2016, 2018, 2022, 2023 (total: 1+15+16+11 = 43 samples)
# Train years: 2019, 2020, 2021, 2024 (total: 31+29+29+25 = 114 samples)
anos_teste = [2016, 2018, 2022, 2023]
anos_treino = [2019, 2020, 2021, 2024]


print("Anos no treino:", sorted(anos_treino))
print("Anos no teste :", sorted(anos_teste))

# Check the size of each side of the split
n_treino = int(df[YEAR_COL].isin(anos_treino).sum())
n_teste = int(df[YEAR_COL].isin(anos_teste).sum())
print(f"Tamanho treino/teste: {n_treino} / {n_teste}")
print(f"Test percentage: {n_teste/len(df)*100:.1f}%")

# Build the splits; rows whose year is in neither list end up in neither split
df_train = df[df[YEAR_COL].isin(anos_treino)].reset_index(drop=True)
df_test  = df[df[YEAR_COL].isin(anos_teste)].reset_index(drop=True)

print(f"Train size: {len(df_train)}, Test size: {len(df_test)}")
print(f"Test percentage: {len(df_test)/len(df)*100:.1f}%")



Carregando o dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


enem_dataset.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/157 [00:00<?, ? examples/s]

Tamanho do dataset: 157 linhas e 4 colunas
Distribuição por ano:
ano
2016     1
2018    15
2019    31
2020    29
2021    29
2022    16
2023    11
2024    25
Name: count, dtype: Int64
Total: 157
Anos no treino: [2019, 2020, 2021, 2024]
Anos no teste : [2016, 2018, 2022, 2023]
Tamanho treino/teste: 114 / 43
Test percentage: 27.4%
Train size: 114, Test size: 43
Test percentage: 27.4%


In [None]:
# === HYPERPARAMETER SEARCH WITH CROSS-VALIDATION ===
from sklearn.model_selection import KFold
import itertools
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import json
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import cohen_kappa_score
import numpy as np

# Define model template and device (same as in training cell)
# The "{}" placeholder is filled with the competency number (1..5).
MODEL_NAME_TEMPLATE = "kamel-usp/jbcs2025_BERTugues-base-portuguese-cased-encoder_classification-C{}-essay_only"
# Maximum number of tokens per essay; longer inputs are truncated by the dataset class.
MAX_LEN = 512
TEXT_COL = "texto"

# GPU setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Score mapping (same as in training cell)
# Six grade levels <-> class indices 0..5, in both directions.
SCORES = [0, 40, 80, 120, 160, 200]
score_to_class = {s:i for i,s in enumerate(SCORES)}
class_to_score = {i:s for i,s in enumerate(SCORES)}

# Helper functions
def round_to_nearest_40(x):
    """Return the entry of SCORES closest to the grade x.

    Used only for TRAINING: converts a grade given in multiples of 20 to the
    nearest class level (multiples of 40). On ties the lower level wins.
    """
    nota = int(x)
    # min() keeps the first of equally distant levels, i.e. the lower one.
    return min(SCORES, key=lambda nivel: abs(nota - nivel))

def to_class(y_score):
    """Map a 0..200 grade to its class index 0..5."""
    nivel = round_to_nearest_40(y_score)
    return score_to_class[nivel]

# Dataset class
class EnemCompDataset(Dataset):
    """Torch dataset of essays for a single competency.

    Texts are read from the TEXT_COL column of the given frame. When
    for_train is True the grades in comp_col are converted to class indices
    and returned with each sample under "labels". Tokenization happens
    lazily, one essay per __getitem__ call, padded/truncated to max_len.
    """

    def __init__(self, df, comp_col, tokenizer, for_train=True, max_len=512):
        self.texts = df[TEXT_COL].astype(str).tolist()
        self.tokenizer = tokenizer
        self.for_train = for_train
        self.max_len = max_len
        if for_train:
            self.labels = list(map(to_class, df[comp_col].tolist()))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; drop that leading dimension.
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        if not self.for_train:
            return sample
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

def train_model_cv(df_train, comp_idx, hyperparams, cv_folds=None):
    """
    Score one hyperparameter setting with leave-one-year-out cross-validation.

    Each fold holds out every essay of one year (read from YEAR_COL) for
    validation and fine-tunes a freshly loaded pretrained model on the
    remaining years.

    Args:
        df_train: training frame containing TEXT_COL, YEAR_COL and the
            competency column f"C{comp_idx}".
        comp_idx: competency number, used for the column name and to fill
            MODEL_NAME_TEMPLATE.
        hyperparams: dict with the keys 'batch_size', 'learning_rate' and
            'epochs'.
        cv_folds: number of years (in ascending order) to use as validation
            folds; None uses every year.

    Returns:
        The mean quadratic weighted kappa over the completed folds, or 0.0
        when no fold could be evaluated. Folds whose validation set has
        fewer than two essays are skipped.
    """
    comp_col = f"C{comp_idx}"
    model_name = MODEL_NAME_TEMPLATE.format(comp_idx)

    # Only the tokenizer is shared across folds. The model itself is loaded
    # inside the loop so every fold starts from the same pretrained weights.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Get unique years in training data
    unique_years = sorted(df_train[YEAR_COL].unique())
    print(f"  Available years for CV: {unique_years}")

    # Use all years as folds (leave-one-year-out)
    if cv_folds is None:
        cv_folds = len(unique_years)

    # Label set passed to the kappa computation (loop-invariant).
    all_labels = list(range(0, 201, 20))
    qwk_scores = []

    for fold, val_year in enumerate(unique_years[:cv_folds]):
        print(f"  Fold {fold+1}/{cv_folds} - Validation year: {val_year}")

        # Create fold datasets: train on all years except val_year
        df_fold_train = df_train[df_train[YEAR_COL] != val_year].reset_index(drop=True)
        df_fold_val = df_train[df_train[YEAR_COL] == val_year].reset_index(drop=True)

        print(f"    Train size: {len(df_fold_train)}, Val size: {len(df_fold_val)}")

        # Skip if validation set is too small
        if len(df_fold_val) < 2:
            print(f"    Skipping fold {fold+1} - validation set too small ({len(df_fold_val)} samples)")
            continue

        # Both datasets carry labels: the validation labels are needed for QWK.
        train_ds_fold = EnemCompDataset(df_fold_train, comp_col, tokenizer, for_train=True, max_len=MAX_LEN)
        val_ds_fold = EnemCompDataset(df_fold_val, comp_col, tokenizer, for_train=True, max_len=MAX_LEN)

        train_loader = DataLoader(train_ds_fold, batch_size=hyperparams['batch_size'], shuffle=True)
        val_loader = DataLoader(val_ds_fold, batch_size=hyperparams['batch_size'], shuffle=False)

        # Initialize model for this fold
        model_fold = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(model_fold.parameters(), lr=hyperparams['learning_rate'])

        # Training
        model_fold.train()
        for epoch in range(hyperparams['epochs']):
            for batch in train_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                optimizer.zero_grad()

                outputs = model_fold(**batch)
                loss = criterion(outputs.logits, batch["labels"])
                loss.backward()
                optimizer.step()

        # Validation
        model_fold.eval()
        val_preds = []
        val_labels = []

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model_fold(**batch)
                preds = torch.argmax(outputs.logits, dim=-1)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(batch["labels"].cpu().numpy())

        # Release this fold's model and optimizer state before the next fold
        # loads another copy, so device memory does not hold two models at once.
        del model_fold, optimizer
        if device.type == "cuda":
            torch.cuda.empty_cache()

        # Convert class indices back to grades
        val_preds_scores = [class_to_score[p] for p in val_preds]
        val_labels_scores = [class_to_score[l] for l in val_labels]

        # Calculate QWK
        qwk = cohen_kappa_score(val_labels_scores, val_preds_scores, weights='quadratic', labels=all_labels)
        qwk_scores.append(qwk)

        print(f"    QWK: {qwk:.3f}")

    if not qwk_scores:
        print("  Warning: No valid folds completed")
        return 0.0

    avg_qwk = np.mean(qwk_scores)
    print(f"  Average QWK: {avg_qwk:.3f}")
    return avg_qwk

# Hyperparameter search space
# Every combination of these values is tried by the per-competency grid-search cells.
hyperparameter_space = {
    'learning_rate': [1e-5],
    'batch_size': [ 16, 32],
    'epochs': [8, 12, 16]
}

print("Starting hyperparameter search...")
print(f"Search space: {hyperparameter_space}")

# Create directory for saving results in Google Drive
SAVE_DIR = os.path.join(DRIVE_BASE_PATH, "fine_tuning_modelos_jbsc", "bertugues_finetuned_by_comp")
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"✓ Diretório de salvamento: {SAVE_DIR}")

# Store results - initialize from checkpoint if exists
# Both dicts are keyed by competency name ("C1".."C5").
checkpoint_file = os.path.join(SAVE_DIR, "hyperparameter_search_checkpoint.json")
best_hyperparams = {}
best_qwk_scores = {}

# Load checkpoint if exists
if os.path.exists(checkpoint_file):
    try:
        with open(checkpoint_file, 'r') as f:
            checkpoint_data = json.load(f)
            best_hyperparams = checkpoint_data.get('best_hyperparams', {})
            best_qwk_scores = checkpoint_data.get('best_qwk_scores', {})
            print(f"Loaded checkpoint from {checkpoint_file}")
            print(f"Found saved results for: {list(best_hyperparams.keys())}")
    except Exception as e:
        # An unreadable checkpoint is reported and the search starts from scratch.
        print(f"Error loading checkpoint: {e}")
        best_hyperparams = {}
        best_qwk_scores = {}

def save_checkpoint():
    """Write the current search progress to the checkpoint JSON file.

    Reads the module-level best_hyperparams and best_qwk_scores dicts and
    overwrites checkpoint_file with both.
    """
    snapshot = dict(
        best_hyperparams=best_hyperparams,
        best_qwk_scores=best_qwk_scores,
    )
    with open(checkpoint_file, 'w') as destino:
        json.dump(snapshot, destino, indent=2)
    print(f"✓ Checkpoint saved to {checkpoint_file}")


Using device: cuda
Starting hyperparameter search...
Search space: {'learning_rate': [1e-05], 'batch_size': [16, 32], 'epochs': [8, 12, 16]}
✓ Diretório de salvamento: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp
Loaded checkpoint from /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp/hyperparameter_search_checkpoint.json
Found saved results for: ['C1', 'C2', 'C3', 'C4', 'C5']


In [None]:
# === GRID SEARCH - COMPETENCY C1 ===
comp_idx = 1

# Skip only when a usable result is stored. A run in which every combination
# failed may have checkpointed None for this competency; treat that as
# "not done" so the search is retried instead of being skipped forever.
if best_hyperparams.get(f'C{comp_idx}') is not None:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[f'C{comp_idx}']}")
    print(f"  Best QWK: {best_qwk_scores[f'C{comp_idx}']:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    # QWK lies in [-1, 1]; start below that range so any finite score is accepted.
    best_qwk = float("-inf")
    best_params = None

    # Generate all combinations
    param_names = list(hyperparameter_space.keys())
    param_combinations = list(itertools.product(*hyperparameter_space.values()))

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)
        except Exception as e:
            # Best effort: one failing combination must not abort the whole search.
            print(f"  Error: {e}")
            continue

        if qwk > best_qwk:
            best_qwk = qwk
            best_params = hyperparams.copy()

    if best_params is None:
        # Nothing usable was found: do not record or checkpoint a None entry,
        # otherwise later cells would skip the search and crash on the params.
        print(f"\nNo hyperparameter combination succeeded for C{comp_idx}; nothing was saved.")
    else:
        best_hyperparams[f'C{comp_idx}'] = best_params
        best_qwk_scores[f'C{comp_idx}'] = float(best_qwk)

        print(f"\nBest hyperparameters for C{comp_idx}:")
        print(f"  Params: {best_params}")
        print(f"  QWK: {best_qwk:.3f}")

        # Save checkpoint after each competency
        save_checkpoint()


C1 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}
  Best QWK: 0.407


In [None]:
# === GRID SEARCH - COMPETENCY C2 ===
comp_idx = 2

# Skip only when a usable result is stored. A run in which every combination
# failed may have checkpointed None for this competency; treat that as
# "not done" so the search is retried instead of being skipped forever.
if best_hyperparams.get(f'C{comp_idx}') is not None:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[f'C{comp_idx}']}")
    print(f"  Best QWK: {best_qwk_scores[f'C{comp_idx}']:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    # QWK lies in [-1, 1]; start below that range so any finite score is accepted.
    best_qwk = float("-inf")
    best_params = None

    # Generate all combinations
    param_names = list(hyperparameter_space.keys())
    param_combinations = list(itertools.product(*hyperparameter_space.values()))

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)
        except Exception as e:
            # Best effort: one failing combination must not abort the whole search.
            print(f"  Error: {e}")
            continue

        if qwk > best_qwk:
            best_qwk = qwk
            best_params = hyperparams.copy()

    if best_params is None:
        # Nothing usable was found: do not record or checkpoint a None entry,
        # otherwise later cells would skip the search and crash on the params.
        print(f"\nNo hyperparameter combination succeeded for C{comp_idx}; nothing was saved.")
    else:
        best_hyperparams[f'C{comp_idx}'] = best_params
        best_qwk_scores[f'C{comp_idx}'] = float(best_qwk)

        print(f"\nBest hyperparameters for C{comp_idx}:")
        print(f"  Params: {best_params}")
        print(f"  QWK: {best_qwk:.3f}")

        # Save checkpoint after each competency
        save_checkpoint()


C2 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}
  Best QWK: 0.570


In [None]:
# === GRID SEARCH - COMPETENCY C3 ===
comp_idx = 3

# Skip only when a usable result is stored. A run in which every combination
# failed may have checkpointed None for this competency; treat that as
# "not done" so the search is retried instead of being skipped forever.
if best_hyperparams.get(f'C{comp_idx}') is not None:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[f'C{comp_idx}']}")
    print(f"  Best QWK: {best_qwk_scores[f'C{comp_idx}']:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    # QWK lies in [-1, 1]; start below that range so any finite score is accepted.
    best_qwk = float("-inf")
    best_params = None

    # Generate all combinations
    param_names = list(hyperparameter_space.keys())
    param_combinations = list(itertools.product(*hyperparameter_space.values()))

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)
        except Exception as e:
            # Best effort: one failing combination must not abort the whole search.
            print(f"  Error: {e}")
            continue

        if qwk > best_qwk:
            best_qwk = qwk
            best_params = hyperparams.copy()

    if best_params is None:
        # Nothing usable was found: do not record or checkpoint a None entry,
        # otherwise later cells would skip the search and crash on the params.
        print(f"\nNo hyperparameter combination succeeded for C{comp_idx}; nothing was saved.")
    else:
        best_hyperparams[f'C{comp_idx}'] = best_params
        best_qwk_scores[f'C{comp_idx}'] = float(best_qwk)

        print(f"\nBest hyperparameters for C{comp_idx}:")
        print(f"  Params: {best_params}")
        print(f"  QWK: {best_qwk:.3f}")

        # Save checkpoint after each competency
        save_checkpoint()


C3 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 8}
  Best QWK: 0.459


In [None]:
# === GRID SEARCH - COMPETENCY C4 ===
comp_idx = 4

# Skip only when a usable result is stored. A run in which every combination
# failed may have checkpointed None for this competency; treat that as
# "not done" so the search is retried instead of being skipped forever.
if best_hyperparams.get(f'C{comp_idx}') is not None:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[f'C{comp_idx}']}")
    print(f"  Best QWK: {best_qwk_scores[f'C{comp_idx}']:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    # QWK lies in [-1, 1]; start below that range so any finite score is accepted.
    best_qwk = float("-inf")
    best_params = None

    # Generate all combinations
    param_names = list(hyperparameter_space.keys())
    param_combinations = list(itertools.product(*hyperparameter_space.values()))

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)
        except Exception as e:
            # Best effort: one failing combination must not abort the whole search.
            print(f"  Error: {e}")
            continue

        if qwk > best_qwk:
            best_qwk = qwk
            best_params = hyperparams.copy()

    if best_params is None:
        # Nothing usable was found: do not record or checkpoint a None entry,
        # otherwise later cells would skip the search and crash on the params.
        print(f"\nNo hyperparameter combination succeeded for C{comp_idx}; nothing was saved.")
    else:
        best_hyperparams[f'C{comp_idx}'] = best_params
        best_qwk_scores[f'C{comp_idx}'] = float(best_qwk)

        print(f"\nBest hyperparameters for C{comp_idx}:")
        print(f"  Params: {best_params}")
        print(f"  QWK: {best_qwk:.3f}")

        # Save checkpoint after each competency
        save_checkpoint()


C4 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 8}
  Best QWK: 0.416


In [None]:
# === GRID SEARCH - COMPETENCY C5 ===
comp_idx = 5

# Skip only when a usable result is stored. A run in which every combination
# failed may have checkpointed None for this competency; treat that as
# "not done" so the search is retried instead of being skipped forever.
if best_hyperparams.get(f'C{comp_idx}') is not None:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[f'C{comp_idx}']}")
    print(f"  Best QWK: {best_qwk_scores[f'C{comp_idx}']:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    # QWK lies in [-1, 1]; start below that range so any finite score is accepted.
    best_qwk = float("-inf")
    best_params = None

    # Generate all combinations
    param_names = list(hyperparameter_space.keys())
    param_combinations = list(itertools.product(*hyperparameter_space.values()))

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)
        except Exception as e:
            # Best effort: one failing combination must not abort the whole search.
            print(f"  Error: {e}")
            continue

        if qwk > best_qwk:
            best_qwk = qwk
            best_params = hyperparams.copy()

    if best_params is None:
        # Nothing usable was found: do not record or checkpoint a None entry,
        # otherwise later cells would skip the search and crash on the params.
        print(f"\nNo hyperparameter combination succeeded for C{comp_idx}; nothing was saved.")
    else:
        best_hyperparams[f'C{comp_idx}'] = best_params
        best_qwk_scores[f'C{comp_idx}'] = float(best_qwk)

        print(f"\nBest hyperparameters for C{comp_idx}:")
        print(f"  Params: {best_params}")
        print(f"  QWK: {best_qwk:.3f}")

        # Save checkpoint after each competency
        save_checkpoint()


C5 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 8}
  Best QWK: 0.484


In [None]:
# === CONSOLIDATE ALL RESULTS AND SAVE THE FINAL JSON ===
print("\n=== HYPERPARAMETER SEARCH RESULTS ===")
for comp in [1, 2, 3, 4, 5]:
    comp_key = f'C{comp}'
    if comp_key in best_hyperparams:
        print(f"{comp_key}: {best_hyperparams[comp_key]} -> QWK: {best_qwk_scores[comp_key]:.3f}")
    else:
        print(f"{comp_key}: NOT COMPLETED")

# Save final results
# Same two dicts as the checkpoint, written to a separate results file that
# the final-training cell reads back.
results = {
    'best_hyperparams': best_hyperparams,
    'best_qwk_scores': best_qwk_scores
}

final_results_file = os.path.join(SAVE_DIR, "hyperparameter_search_results_bertugues.json")
with open(final_results_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n✓ Final results saved to {final_results_file}")
# NOTE(review): this line only reports the checkpoint path; this cell does not write the checkpoint itself.
print(f"✓ Checkpoint saved to {checkpoint_file}")



=== HYPERPARAMETER SEARCH RESULTS ===
C1: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12} -> QWK: 0.407
C2: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12} -> QWK: 0.570
C3: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 8} -> QWK: 0.459
C4: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 8} -> QWK: 0.416
C5: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 8} -> QWK: 0.484

✓ Final results saved to /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp/hyperparameter_search_results_bertugues.json
✓ Checkpoint saved to /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp/hyperparameter_search_checkpoint.json


In [None]:
# === TRAIN FINAL MODELS WITH BEST HYPERPARAMETERS ===
import json
from torch.cuda.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import time

# Define save directory - Google Drive
SAVE_DIR = os.path.join(DRIVE_BASE_PATH, "fine_tuning_modelos_jbsc", "bertugues_finetuned_by_comp")
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"✓ Diretório de salvamento: {SAVE_DIR}")

# Load best hyperparameters from search (from the saved JSON file)
final_results_file = os.path.join(SAVE_DIR, "hyperparameter_search_results_bertugues.json")
try:
    with open(final_results_file, 'r') as f:
        search_results = json.load(f)
    best_hyperparams = search_results['best_hyperparams']
    print(f"✓ Loaded hyperparameter search results from {final_results_file}")
    print(f"  Found hyperparameters for: {list(best_hyperparams.keys())}")
except FileNotFoundError:
    # Only a missing file triggers the defaults; a malformed file still raises.
    print(f"⚠️ No hyperparameter search results found at {final_results_file}")
    print("  Using default parameters.")
    best_hyperparams = {}
    for comp in [1, 2, 3, 4, 5]:
        best_hyperparams[f'C{comp}'] = {'learning_rate': 2e-5, 'batch_size': 16, 'epochs': 5}

def treinar_modelo_final_comp(df_train, comp_idx, hyperparams):
    """
    Fine-tune the competency model on the whole training split and save it.

    Args:
        df_train: training frame containing TEXT_COL and the competency
            column f"C{comp_idx}".
        comp_idx: competency number, used for the column name, to fill
            MODEL_NAME_TEMPLATE and in the name of the saved model folder.
        hyperparams: dict with the keys 'batch_size', 'learning_rate' and
            'epochs'.

    Returns:
        A (tokenizer, model, save_path) tuple; save_path is the folder under
        SAVE_DIR holding the saved model and tokenizer.

    Raises:
        RuntimeError: if the training DataLoader has no batches.
    """
    comp_col = f"C{comp_idx}"
    model_name = MODEL_NAME_TEMPLATE.format(comp_idx)

    print(f"[C{comp_idx}] Carregando {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)

    train_ds = EnemCompDataset(df_train, comp_col, tokenizer, for_train=True, max_len=MAX_LEN)

    # Mixed precision (and pinned host memory) only make sense on a GPU.
    use_amp = device.type == "cuda"

    # Load in the main process: forked DataLoader workers printed
    # "can only test a child process" shutdown tracebacks during training.
    train_loader = DataLoader(
        train_ds,
        batch_size=hyperparams['batch_size'],
        shuffle=True,
        num_workers=0,
        pin_memory=use_amp,
    )

    print(f"[C{comp_idx}] Tamanho treino: {len(train_ds)} | Batches: {len(train_loader)}")
    print(f"[C{comp_idx}] Hyperparameters: {hyperparams}")

    if len(train_loader) == 0:
        raise RuntimeError(f"[C{comp_idx}] DataLoader de treino está vazio. Verifique o split e colunas.")

    n_epochs = hyperparams['epochs']
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=hyperparams['learning_rate'])
    total_steps = max(1, n_epochs * len(train_loader))
    # Linear decay with the first 10% of the steps used as warmup.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=max(1, int(0.1 * total_steps)),
        num_training_steps=total_steps
    )

    # torch.amp replaces the deprecated torch.cuda.amp entry points.
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

    # Descriptive name for the saved model
    save_path = os.path.join(SAVE_DIR, f"bertugues_jbsc_C{comp_idx}_finetuned_com_redacoes_oficiais")

    for ep in range(1, n_epochs+1):
        t0 = time.time()
        model.train()
        running = 0.0

        for batch in tqdm(train_loader, desc=f"[C{comp_idx}] Epoch {ep}/{hyperparams['epochs']} (final)", leave=False):
            batch = {k: v.to(device, non_blocking=True) for k,v in batch.items()}
            optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                logits = model(**batch).logits
                loss = criterion(logits, batch["labels"])

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            running += loss.item()

        train_loss = running / max(1, len(train_loader))
        print(f"[C{comp_idx}] epoch {ep}/{hyperparams['epochs']} - train loss: {train_loss:.4f} | tempo: {time.time()-t0:.1f}s")

        # Checkpoint after each epoch; the last epoch is covered by the final
        # save below, so the same weights are not written to Drive twice.
        if ep < n_epochs:
            model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)

    # Final save
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"[C{comp_idx}] ✓ Modelo final salvo em: {save_path}")

    return tokenizer, model, save_path

# === Train final models for C1..C5 with best hyperparameters ===
print("Training final models with best hyperparameters...")
tokenizers_final = {}
models_final = {}

# Fallback used when the search produced nothing usable for a competency.
default_hyperparams = {'learning_rate': 2e-5, 'batch_size': 16, 'epochs': 5}

for comp_idx in [1, 2, 3, 4, 5]:
    print(f"\n=== Training Final Model — Competência C{comp_idx} ===")
    # `or` also covers an entry that exists but was stored as null (None) by
    # a failed search; dict.get(key, default) alone would return that None.
    hyperparams = best_hyperparams.get(f'C{comp_idx}') or dict(default_hyperparams)
    tok, mdl, _ = treinar_modelo_final_comp(df_train, comp_idx, hyperparams)
    tokenizers_final[comp_idx] = tok
    models_final[comp_idx] = mdl

print("\n=== Final models training completed ===")
print("Models saved in:", SAVE_DIR)


✓ Diretório de salvamento: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp
✓ Loaded hyperparameter search results from /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp/hyperparameter_search_results_bertugues.json
  Found hyperparameters for: ['C1', 'C2', 'C3', 'C4', 'C5']
Training final models with best hyperparameters...

=== Training Final Model — Competência C1 ===
[C1] Carregando kamel-usp/jbcs2025_BERTugues-base-portuguese-cased-encoder_classification-C1-essay_only


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/803 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

[C1] Tamanho treino: 114 | Batches: 8
[C1] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C1] Epoch 1/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C1] epoch 1/12 - train loss: 0.8873 | tempo: 4.7s


[C1] Epoch 2/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 2/12 - train loss: 0.5630 | tempo: 3.1s


[C1] Epoch 3/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 3/12 - train loss: 0.5218 | tempo: 3.2s


[C1] Epoch 4/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 4/12 - train loss: 0.4644 | tempo: 3.1s


[C1] Epoch 5/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 5/12 - train loss: 0.3990 | tempo: 3.1s


[C1] Epoch 6/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 6/12 - train loss: 0.5598 | tempo: 3.1s


[C1] Epoch 7/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 7/12 - train loss: 0.3183 | tempo: 3.1s


[C1] Epoch 8/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 8/12 - train loss: 0.3142 | tempo: 3.2s


[C1] Epoch 9/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 9/12 - train loss: 0.2656 | tempo: 3.3s


[C1] Epoch 10/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 10/12 - train loss: 0.2548 | tempo: 3.2s


[C1] Epoch 11/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 11/12 - train loss: 0.2558 | tempo: 3.5s


[C1] Epoch 12/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7d88d43b2ac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
Exception ignored in:     <function _MultiProcessingDataLoaderIter.__del__ at 0x7d88d43b2ac0>self._shutdown_workers()

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    if w.is_alive():
        self._shutdown_workers() 
  ^^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
^    ^if w.is_alive():
  ^^  ^ ^^ ^ ^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^^    ^assert self._parent_pid == os.getpid(), 'can only test a child process'^
^ ^ ^ ^  
   File "/usr/lib/

[C1] epoch 12/12 - train loss: 0.2416 | tempo: 4.4s
[C1] ✓ Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp/bertugues_jbsc_C1_finetuned_com_redacoes_oficiais

=== Training Final Model — Competência C2 ===
[C2] Carregando kamel-usp/jbcs2025_BERTugues-base-portuguese-cased-encoder_classification-C2-essay_only


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/803 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

[C2] Tamanho treino: 114 | Batches: 8
[C2] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C2] Epoch 1/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C2] epoch 1/12 - train loss: 1.1934 | tempo: 3.7s


[C2] Epoch 2/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 2/12 - train loss: 1.1908 | tempo: 3.4s


[C2] Epoch 3/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 3/12 - train loss: 1.1142 | tempo: 3.3s


[C2] Epoch 4/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 4/12 - train loss: 0.9747 | tempo: 3.3s


[C2] Epoch 5/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 5/12 - train loss: 0.9317 | tempo: 3.3s


[C2] Epoch 6/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 6/12 - train loss: 0.8640 | tempo: 3.2s


[C2] Epoch 7/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 7/12 - train loss: 0.7676 | tempo: 3.2s


[C2] Epoch 8/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 8/12 - train loss: 0.8395 | tempo: 3.3s


[C2] Epoch 9/12 (final):   0%|          | 0/8 [00:40<?, ?it/s]

[C2] epoch 9/12 - train loss: 0.7337 | tempo: 48.2s


[C2] Epoch 10/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 10/12 - train loss: 0.8370 | tempo: 3.3s


[C2] Epoch 11/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 11/12 - train loss: 0.7193 | tempo: 3.3s


[C2] Epoch 12/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 12/12 - train loss: 0.7193 | tempo: 3.3s
[C2] ✓ Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp/bertugues_jbsc_C2_finetuned_com_redacoes_oficiais

=== Training Final Model — Competência C3 ===
[C3] Carregando kamel-usp/jbcs2025_BERTugues-base-portuguese-cased-encoder_classification-C3-essay_only


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/803 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

[C3] Tamanho treino: 114 | Batches: 4
[C3] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 8}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C3] Epoch 1/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C3] epoch 1/8 - train loss: 1.1570 | tempo: 3.4s


[C3] Epoch 2/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 2/8 - train loss: 1.0606 | tempo: 3.8s


[C3] Epoch 3/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 3/8 - train loss: 0.9294 | tempo: 3.2s


[C3] Epoch 4/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 4/8 - train loss: 0.8361 | tempo: 3.2s


[C3] Epoch 5/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 5/8 - train loss: 0.8204 | tempo: 3.2s


[C3] Epoch 6/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 6/8 - train loss: 0.7202 | tempo: 3.3s


[C3] Epoch 7/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 7/8 - train loss: 0.7267 | tempo: 3.3s


[C3] Epoch 8/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 8/8 - train loss: 0.7389 | tempo: 3.3s
[C3] ✓ Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp/bertugues_jbsc_C3_finetuned_com_redacoes_oficiais

=== Training Final Model — Competência C4 ===
[C4] Carregando kamel-usp/jbcs2025_BERTugues-base-portuguese-cased-encoder_classification-C4-essay_only


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/803 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

[C4] Tamanho treino: 114 | Batches: 4
[C4] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 8}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C4] Epoch 1/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C4] epoch 1/8 - train loss: 1.1834 | tempo: 3.5s


[C4] Epoch 2/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C4] epoch 2/8 - train loss: 0.8815 | tempo: 3.3s


[C4] Epoch 3/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C4] epoch 3/8 - train loss: 0.8651 | tempo: 3.3s


[C4] Epoch 4/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C4] epoch 4/8 - train loss: 0.7398 | tempo: 3.3s


[C4] Epoch 5/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C4] epoch 5/8 - train loss: 0.6953 | tempo: 3.3s


[C4] Epoch 6/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C4] epoch 6/8 - train loss: 0.6428 | tempo: 3.7s


[C4] Epoch 7/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7d88d43b2ac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7d88d43b2ac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[C4] epoch 7/8 - train loss: 0.6018 | tempo: 5.5s


[C4] Epoch 8/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C4] epoch 8/8 - train loss: 0.5907 | tempo: 3.4s
[C4] ✓ Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp/bertugues_jbsc_C4_finetuned_com_redacoes_oficiais

=== Training Final Model — Competência C5 ===
[C5] Carregando kamel-usp/jbcs2025_BERTugues-base-portuguese-cased-encoder_classification-C5-essay_only


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/803 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

[C5] Tamanho treino: 114 | Batches: 4
[C5] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 8}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C5] Epoch 1/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C5] epoch 1/8 - train loss: 1.1751 | tempo: 3.4s


[C5] Epoch 2/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C5] epoch 2/8 - train loss: 1.1677 | tempo: 3.3s


[C5] Epoch 3/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C5] epoch 3/8 - train loss: 1.0313 | tempo: 3.3s


[C5] Epoch 4/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C5] epoch 4/8 - train loss: 0.8212 | tempo: 3.4s


[C5] Epoch 5/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C5] epoch 5/8 - train loss: 0.7935 | tempo: 3.4s


[C5] Epoch 6/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C5] epoch 6/8 - train loss: 0.7227 | tempo: 3.4s


[C5] Epoch 7/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C5] epoch 7/8 - train loss: 0.7617 | tempo: 3.4s


[C5] Epoch 8/8 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C5] epoch 8/8 - train loss: 0.6744 | tempo: 3.4s
[C5] ✓ Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp/bertugues_jbsc_C5_finetuned_com_redacoes_oficiais

=== Final models training completed ===
Models saved in: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp


In [None]:
# === FINAL TEST EVALUATION ON HELD-OUT TEST SET ===
import pandas as pd
from torch.utils.data import DataLoader

# Fall back to the Google Drive output folder when SAVE_DIR has not been
# defined yet; an already-defined SAVE_DIR is left untouched.
try:
    SAVE_DIR
except NameError:
    SAVE_DIR = os.path.join(DRIVE_BASE_PATH, "fine_tuning_modelos_jbsc", "bertugues_finetuned_by_comp")
    os.makedirs(SAVE_DIR, exist_ok=True)

def prever_scores_final(df_split, tokenizer, model, comp_idx):
    """Predict scores for one competency with a final trained model.

    Args:
        df_split: rows to score; handed to ``EnemCompDataset`` unchanged.
        tokenizer: tokenizer forwarded to ``EnemCompDataset``.
        model: model called as ``model(**batch)``; its ``.logits`` are used.
        comp_idx: competency number, used to build the ``C{comp_idx}`` column name.

    Returns:
        A list with one entry per predicted row, obtained by mapping the
        argmax class index through ``class_to_score``.
    """
    dataset = EnemCompDataset(
        df_split,
        f"C{comp_idx}",
        tokenizer=tokenizer,
        for_train=False,
        max_len=MAX_LEN,
    )
    model.eval()

    classes_previstas = []
    with torch.no_grad():
        # shuffle=False keeps predictions in the same order as df_split rows.
        for lote in DataLoader(dataset, batch_size=16, shuffle=False):
            entradas = {nome: tensor.to(device) for nome, tensor in lote.items()}
            saida = model(**entradas)
            classes_previstas += saida.logits.argmax(dim=-1).cpu().numpy().tolist()

    # Class index -> score (original note: 0..5 -> 0..200).
    return [class_to_score[classe] for classe in classes_previstas]

print("=== FINAL TEST EVALUATION ===")
print(f"Test set size: {len(df_test)}")
print(f"Test set years: {sorted(df_test[YEAR_COL].unique())}")

# Work on a re-indexed copy so the positional index can double as a row id.
df_test_final = df_test.reset_index(drop=True).copy()
df_test_final["id"] = df_test_final.index

# The output frame starts with the id column, plus the year column when present.
cols_base = {
    col: df_test_final[col].values
    for col in ["id", YEAR_COL]
    if col in df_test_final.columns
}
out_final = pd.DataFrame(cols_base)

# Ground-truth scores as nullable integers (non-numeric entries become <NA>).
for c in range(1, 6):
    notas_reais = pd.to_numeric(df_test_final[f"C{c}"], errors="coerce")
    out_final[f"C{c}"] = notas_reais.astype("Int64")

# Predictions from the final models; rows without a ground-truth value stay <NA>.
print("\nMaking predictions with final models...")
for c in range(1, 6):
    print(f"Predicting C{c}...")
    mask_valid = df_test_final[f"C{c}"].notna()
    preds = pd.Series(pd.NA, index=df_test_final.index, dtype="Int64")

    if mask_valid.any():
        y_pred = prever_scores_final(df_test_final[mask_valid], tokenizers_final[c], models_final[c], c)
        # Index the predictions by the rows they belong to so the assignment aligns.
        preds.loc[mask_valid] = pd.Series(y_pred, index=preds.index[mask_valid], dtype="Int64")

    out_final[f"pred_C{c}"] = preds

# Persist ground truth and predictions side by side.
SAVE_CSV_PATH_FINAL = os.path.join(SAVE_DIR, "predicoes_bertugues_final_hyperopt.csv")
out_final.to_csv(SAVE_CSV_PATH_FINAL, index=False)
print(f"✓ Final predictions saved to: {SAVE_CSV_PATH_FINAL}")

# === EVALUATION ON TEST SET ===
print("\n=== FINAL TEST SET EVALUATION ===")
print("Evaluating on held-out test set (never used for training or hyperparameter tuning)")

test_results = {}
competencies = [1, 2, 3, 4, 5]

# (printed label, key in the result dict, format spec), in print order.
metricas_relatorio = (
    ("QWK     ", "QWK", ".3f"),
    ("F1 Macro", "F1-Macro", ".3f"),
    ("F1 Wghtd", "F1-Weighted", ".3f"),
    ("ACC     ", "ACC", ".3f"),
    ("RMSE    ", "RMSE", ".2f"),
)

for c in competencies:
    comp_key = f"C{c}"
    pred_col = f"pred_{comp_key}"

    # Only rows that have both a ground-truth score and a prediction are scored.
    pares = out_final[[comp_key, pred_col]].dropna()
    if pares.empty:
        print(f"⚠️ No valid data for {comp_key}")
        continue

    y_real = pares[comp_key].astype(int).tolist()
    y_pred = pares[pred_col].astype(int).tolist()

    resultado = calcular_resultados(y_real, y_pred, is_final=True)
    test_results[comp_key] = resultado

    print(f"\n🔎 Test Results - {comp_key}")
    print(f"  Samples: {len(y_real)}")
    for rotulo, chave, formato in metricas_relatorio:
        print(f"  {rotulo}: {resultado[chave]:{formato}}")

# Summary of QWK across the competencies that produced results
import json  # not among the imports of the notebook's first cell; imported here so this cell runs on its own

print("\n=== FINAL TEST SET SUMMARY ===")
qwk_scores = [test_results[f"C{c}"]["QWK"] for c in competencies if f"C{c}" in test_results]
if qwk_scores:
    print(f"Average QWK across competencies: {np.mean(qwk_scores):.3f}")
    print(f"Best QWK: {max(qwk_scores):.3f}")
    print(f"Worst QWK: {min(qwk_scores):.3f}")


def _json_default(obj):
    """Fallback for json.dump: turn NumPy scalars/arrays into plain Python values."""
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Save detailed results
results_summary = {
    'test_set_size': len(df_test),
    'test_set_years': sorted(df_test[YEAR_COL].unique().tolist()),
    'competency_results': test_results,
    'average_qwk': float(np.mean(qwk_scores)) if qwk_scores else None
}

# Write next to the predictions CSV (SAVE_DIR) instead of the working directory,
# so the detailed results are stored with the other run artifacts.
RESULTS_JSON_PATH = os.path.join(SAVE_DIR, 'final_test_results.json')
with open(RESULTS_JSON_PATH, 'w', encoding='utf-8') as f:
    json.dump(results_summary, f, indent=2, default=_json_default)

print(f"\nDetailed results saved to: {RESULTS_JSON_PATH}")
print("=== EVALUATION COMPLETED ===")


=== FINAL TEST EVALUATION ===
Test set size: 43
Test set years: [np.int64(2016), np.int64(2018), np.int64(2022), np.int64(2023)]

Making predictions with final models...
Predicting C1...
Predicting C2...
Predicting C3...
Predicting C4...
Predicting C5...
✓ Final predictions saved to: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/bertugues_finetuned_by_comp/predicoes_bertugues_final_hyperopt.csv

=== FINAL TEST SET EVALUATION ===
Evaluating on held-out test set (never used for training or hyperparameter tuning)

🔎 Test Results - C1
  Samples: 43
  QWK     : 0.467
  F1 Macro: 0.275
  F1 Wghtd: 0.372
  ACC     : 0.488
  RMSE    : 19.29

🔎 Test Results - C2
  Samples: 43
  QWK     : 0.449
  F1 Macro: 0.222
  F1 Wghtd: 0.625
  ACC     : 0.674
  RMSE    : 39.65

🔎 Test Results - C3
  Samples: 43
  QWK     : 0.286
  F1 Macro: 0.163
  F1 Wghtd: 0.195
  ACC     : 0.279
  RMSE    : 37.97

🔎 Test Results - C4
  Samples: 43
  QWK     : 0.685
  F1 Macro: 0.250
  F1 Wghtd: 0.37

In [None]:
# === TABELAS LaTeX PARA RESULTADOS FINAIS COM HYPERPARAMETER OPTIMIZATION ===
import pandas as pd

# Evaluation protocols: internal key -> row label (labels are emitted as written).
protocol_labels = dict(
    no_change="Sem ajuste de escala",
    dup_bounds="Correção dupla (baixo/cima)",
    truth_floor40="Arred. verdade p/ baixo (40)",
    truth_ceil40="Arred. verdade p/ cima (40)",
    only_true_mult40="Apenas verdade múltipla de 40",
)
# Presentation order follows the insertion order of the labels above.
protocol_order = list(protocol_labels)

# Funções auxiliares para os protocolos
def ajustar_para_correcao_dupla(y_true, y_pred):
    """
    Build the paired lists for the 'dup_bounds' scenario.

    Every valid (truth, prediction) pair contributes two entries:
      - truth already a multiple of 40 -> (r, r) against (p, p);
      - otherwise -> (lower multiple, upper multiple) against (p, p).
    Pairs in which either value is missing are skipped.

    Returns:
        (y_true_adj, y_pred_adj): two lists of ints of equal length.
    """
    verdades, predicoes = [], []
    for nota_real, nota_pred in zip(y_true, y_pred):
        if pd.isna(nota_real) or pd.isna(nota_pred):
            continue
        nota_real, nota_pred = int(nota_real), int(nota_pred)
        degraus, resto = divmod(nota_real, 40)
        inferior = degraus * 40
        # An exact multiple of 40 is its own upper bound.
        superior = inferior if resto == 0 else inferior + 40
        verdades += [inferior, superior]
        predicoes += [nota_pred, nota_pred]
    return verdades, predicoes

def arredonda_verdade(y_true, modo):
    """
    Round the ground-truth scores to multiples of 40.

    Missing values are dropped and the remaining scores are cast to int
    before rounding.

    Args:
        y_true: iterable of ground-truth scores.
        modo: 'floor' (round down), 'ceil' (round up) or 'none' (no rounding).

    Returns:
        A list of ints.

    Raises:
        ValueError: if ``modo`` is not one of the accepted values.
    """
    notas = pd.Series(y_true).dropna().astype(int)

    if modo == 'none':
        return notas.tolist()

    if modo == 'floor':
        arredondar = np.floor
    elif modo == 'ceil':
        arredondar = np.ceil
    else:
        raise ValueError("modo inválido")

    multiplos = arredondar(notas / 40) * 40
    return multiplos.astype(int).tolist()

def filtra_verdades_multiplas_40(y_true, y_pred):
    """
    Keep only the (truth, prediction) pairs whose truth is a multiple of 40.

    Pairs are filtered jointly: if either the truth or the prediction is
    missing, the whole pair is discarded. Dropping missing values from each
    list independently would shift one list relative to the other and pair
    truths with the wrong predictions (or fail on misaligned indexes).

    Args:
        y_true: iterable of ground-truth scores.
        y_pred: iterable of predicted scores, paired positionally with y_true.

    Returns:
        (y_true_f, y_pred_f): two lists of ints of equal length.
    """
    y_true_f, y_pred_f = [], []
    for r, p in zip(y_true, y_pred):
        if pd.isna(r) or pd.isna(p):
            continue
        r = int(r)
        p = int(p)
        if r % 40 == 0:
            y_true_f.append(r)
            y_pred_f.append(p)
    return y_true_f, y_pred_f

# Evaluate every protocol using the final-model predictions
print("=== AVALIANDO TODOS OS PROTOCOLOS COM MODELOS FINAIS ===")
avaliacoes_por_modelo = {"bertugues-final": {}}
resumo_qwk = []

print("\n📊 Avaliando modelo: bertugues-final (com hyperparameter optimization)")

# How each protocol turns the raw (truth, prediction) lists into the lists that get scored.
alinhadores = {
    "no_change": lambda reais, previstas: (reais, previstas),
    "dup_bounds": ajustar_para_correcao_dupla,
    "truth_floor40": lambda reais, previstas: (arredonda_verdade(reais, "floor"), previstas),
    "truth_ceil40": lambda reais, previstas: (arredonda_verdade(reais, "ceil"), previstas),
    "only_true_mult40": filtra_verdades_multiplas_40,
}

for esquema_key, esquema_desc in protocol_labels.items():
    print(f"\n=== Esquema: {esquema_desc} ===")
    avaliacoes = {}
    qwk_vals = []

    for c in range(1, 6):
        comp_key = f"C{c}"
        pred_col = f"pred_{comp_key}"

        # Only rows with both a ground-truth score and a prediction are used.
        pares = out_final[[comp_key, pred_col]].dropna()
        if pares.empty:
            print(f"⚠️ Nenhum dado válido para {comp_key} ({esquema_desc})")
            continue

        reais = pares[comp_key].astype(int).tolist()
        previstas = pares[pred_col].astype(int).tolist()

        # Align the two lists according to the protocol.
        if esquema_key not in alinhadores:
            raise ValueError("Esquema desconhecido.")
        y_r, y_p = alinhadores[esquema_key](reais, previstas)

        if not y_r:
            print(f"⚠️ Nenhum dado válido para {comp_key} ({esquema_desc})")
            continue

        # Compute the metrics with the notebook's existing helper.
        resultado = calcular_resultados(y_r, y_p, is_final=False)
        avaliacoes[comp_key] = resultado
        qwk_vals.append(resultado["QWK"])

        # Per-competency report.
        print(f"\n🔎 Avaliação - {comp_key}")
        for rotulo, chave in (("QWK", "QWK"), ("F1 Macro", "F1-Macro"), ("F1 Weighted", "F1-Weighted")):
            print(f"  {rotulo:<18}: {resultado[chave]:.3f}")

    avaliacoes_por_modelo["bertugues-final"][esquema_key] = avaliacoes

    if qwk_vals:
        resumo_qwk.append({
            "Modelo": "bertugues-final",
            "Esquema": esquema_desc,
            "QWK_médio": float(np.mean(qwk_vals)),
        })

# ---------- ranking by mean QWK (across protocols) ----------
if not resumo_qwk:
    print("\n⚠️ Não foi possível compor o ranking (sem QWKs calculados).")
else:
    rank = pd.DataFrame(resumo_qwk).sort_values(by=["QWK_médio"], ascending=False)
    print("\n🏆 Ranking por QWK médio (entre protocolos):")
    print(rank.to_string(index=False))

# === LaTeX tables (one per metric) for the bertugues-final model ===
metrics = [
    ("QWK", "QWK"),
    ("F1-Macro", "F1 Macro"),
    ("F1-Weighted", "F1 Weighted"),
]

MODEL_KEY = "bertugues-final"  # model trained with hyperparameter optimization

for met_key, met_title in metrics:
    protocolos = avaliacoes_por_modelo.get(MODEL_KEY, {})
    colunas = [f"C{i}" for i in range(1, 6)]

    # One row per protocol, one column per competency; missing entries stay NaN.
    celulas = [
        [
            protocolos[sk][ck][met_key]
            if sk in protocolos and ck in protocolos[sk] and met_key in protocolos[sk][ck]
            else float("nan")
            for ck in colunas
        ]
        for sk in protocol_order
    ]
    df_tab = pd.DataFrame(
        celulas,
        index=[protocol_labels[sk] for sk in protocol_order],
        columns=colunas,
        dtype=float,
    )
    df_print = df_tab.round(3)

    caption = f"{met_title} por competência para o modelo BERTugues com hyperparameter optimization nos diferentes protocolos de avaliação"
    # Label slug: metric key without hyphens or spaces, lower-cased.
    slug = "".join(ch for ch in met_key if ch not in "- ").lower()
    label = f"tab:{slug}_bertugues_final"

    print(f"\n=== Tabela LaTeX — {met_title} — bertugues-final ===\n")
    opcoes_latex = dict(
        index=True,
        caption=caption,
        label=label,
        na_rep="--",
        float_format="%.3f",
        escape=True,
        bold_rows=False,
        multicolumn=True,
        multicolumn_format='c',
        column_format='lccccc',  # 1 row-label column + 5 competency columns
        longtable=False,
    )
    tex = df_print.to_latex(**opcoes_latex)

    print(tex)


=== AVALIANDO TODOS OS PROTOCOLOS COM MODELOS FINAIS ===

📊 Avaliando modelo: bertugues-final (com hyperparameter optimization)

=== Esquema: Sem ajuste de escala ===

🔎 Avaliação - C1
  QWK               : 0.467
  F1 Macro          : 0.275
  F1 Weighted       : 0.372

🔎 Avaliação - C2
  QWK               : 0.449
  F1 Macro          : 0.222
  F1 Weighted       : 0.625

🔎 Avaliação - C3
  QWK               : 0.286
  F1 Macro          : 0.163
  F1 Weighted       : 0.195

🔎 Avaliação - C4
  QWK               : 0.685
  F1 Macro          : 0.250
  F1 Weighted       : 0.376

🔎 Avaliação - C5
  QWK               : 0.489
  F1 Macro          : 0.215
  F1 Weighted       : 0.373

=== Esquema: Correção dupla (baixo/cima) ===

🔎 Avaliação - C1
  QWK               : 0.384
  F1 Macro          : 0.442
  F1 Weighted       : 0.631

🔎 Avaliação - C2
  QWK               : 0.442
  F1 Macro          : 0.358
  F1 Weighted       : 0.680

🔎 Avaliação - C3
  QWK               : 0.263
  F1 Macro          : 0.292