In [None]:

# === IMPORTS E SETUP ===
import itertools
import json
import os
import sys

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Mount Google Drive so the project folder is reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# Put the project folder on the import path for the project-local imports
# that follow (presumably utils / model_utils / config live there - confirm).
# Guarded so that re-running this cell does not keep appending duplicates.
PROJECT_DIR = '/content/drive/MyDrive/enem_tcc_resultados'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

# Imports das funções utilitárias
from utils import load_enem_dataset, calcular_resultados, apply_protocol, PROTOCOL_LABELS, PROTOCOL_ORDER, generate_latex_table
from model_utils import (
    run_grid_search_all_competencies,
    train_final_models_all_competencies,
    evaluate_final_models,
    get_device
)
from config import setup_colab_paths, MODEL_TEMPLATES, TOKENIZER_NAMES, COMPETENCIES, TEST_YEARS, TRAIN_YEARS

# Resolve the base results directory and report where artifacts will go.
DRIVE_BASE_PATH = setup_colab_paths()
print("‚úì Google Drive montado. Resultados ser√£o salvos em: {}".format(DRIVE_BASE_PATH))

# Model configuration: checkpoint-name template and tokenizer for the
# "mbert" entry, maximum sequence length, and the compute device.
_templates_by_model = MODEL_TEMPLATES["jbsc_finetuned_by_comp"]
MODEL_NAME_TEMPLATE = _templates_by_model["mbert"]
TOKENIZER_NAME = TOKENIZER_NAMES.get("mbert")  # None when no entry exists
MAX_LEN = 512

device = get_device()
print("Using device: {}".format(device))


Mounted at /content/drive
‚úì Google Drive montado. Resultados ser√£o salvos em: /content/drive/MyDrive/enem_tcc_resultados


In [None]:
# === LOAD DATASET ===
# Build the train/test DataFrames; TEST_YEARS is forwarded as `anos_teste`
# (presumably the years held out for testing - confirm in utils).
dataset_options = {
    "dataset_name": "laisnuto/self-collected-ENEM-dataset",
    "split": "train",
    "anos_teste": TEST_YEARS,
}
df_train, df_test = load_enem_dataset(**dataset_options)


Carregando o dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


enem_dataset.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/157 [00:00<?, ? examples/s]

Tamanho do dataset: 157 linhas e 4 colunas
Distribui√ß√£o por ano:
ano
2016     1
2018    15
2019    31
2020    29
2021    29
2022    16
2023    11
2024    25
Name: count, dtype: Int64
Total: 157
Anos no treino: [2019, 2020, 2021, 2024]
Anos no teste : [2016, 2018, 2022, 2023]
Tamanho treino/teste: 114 / 43
Test percentage: 27.4%
Train size: 114, Test size: 43
Test percentage: 27.4%


In [None]:
# === HYPERPARAMETER SEARCH ===
# Candidate values handed to the grid-search helper.
hyperparameter_space = dict(
    learning_rate=[1e-5],
    batch_size=[16, 32],
    epochs=[8, 12, 16],
)

print("Starting hyperparameter search...")
print("Search space: {}".format(hyperparameter_space))

# Output folder for this model family; created if it does not exist yet.
SAVE_DIR = os.path.join(DRIVE_BASE_PATH, "fine_tuning_modelos_jbsc", "mbert_finetuned_by_comp")
os.makedirs(SAVE_DIR, exist_ok=True)
print("‚úì Diret√≥rio de salvamento: {}".format(SAVE_DIR))

# Both JSON artifacts live side by side inside SAVE_DIR.
checkpoint_file, final_results_file = (
    os.path.join(SAVE_DIR, file_name)
    for file_name in (
        "hyperparameter_search_checkpoint.json",
        "hyperparameter_search_results_mbert.json",
    )
)

# Run the search for every competency; the checkpoint path is passed along
# so the helper can use it.
search_kwargs = dict(
    df_train=df_train,
    hyperparameter_space=hyperparameter_space,
    model_name_template=MODEL_NAME_TEMPLATE,
    tokenizer_name=TOKENIZER_NAME,
    competencies=COMPETENCIES,
    max_len=MAX_LEN,
    device=device,
    checkpoint_file=checkpoint_file,
)
best_hyperparams, best_qwk_scores = run_grid_search_all_competencies(**search_kwargs)

# Persist the winning configuration and score of each competency.
results = dict(best_hyperparams=best_hyperparams, best_qwk_scores=best_qwk_scores)
with open(final_results_file, 'w') as results_fh:
    json.dump(results, results_fh, indent=2)

print(f"\n‚úì Final results saved to {final_results_file}")

# One summary line per competency; missing ones are reported explicitly.
print("\n=== HYPERPARAMETER SEARCH RESULTS ===")
for comp in COMPETENCIES:
    comp_key = f'C{comp}'
    if comp_key not in best_hyperparams:
        print(f"{comp_key}: NOT COMPLETED")
        continue
    print(f"{comp_key}: {best_hyperparams[comp_key]} -> QWK: {best_qwk_scores[comp_key]:.3f}")

In [None]:
# === TRAIN FINAL MODELS WITH BEST HYPERPARAMETERS ===
# Read the search results back from disk; if the file is missing, fall back
# to one fixed default configuration per competency.
final_results_file = os.path.join(SAVE_DIR, "hyperparameter_search_results_mbert.json")
try:
    with open(final_results_file, 'r') as results_fh:
        search_results = json.load(results_fh)
except FileNotFoundError:
    print(f"‚ö†Ô∏è No hyperparameter search results found. Using defaults.")
    best_hyperparams = dict(
        (f'C{c}', dict(learning_rate=2e-5, batch_size=16, epochs=5))
        for c in COMPETENCIES
    )
else:
    best_hyperparams = search_results['best_hyperparams']
    print(f"‚úì Loaded hyperparameter search results from {final_results_file}")

# Train one final model per competency with the selected hyperparameters.
training_kwargs = dict(
    df_train=df_train,
    best_hyperparams=best_hyperparams,
    model_name_template=MODEL_NAME_TEMPLATE,
    tokenizer_name=TOKENIZER_NAME,
    competencies=COMPETENCIES,
    save_dir=SAVE_DIR,
    model_name_prefix="mbert_jbsc",
    max_len=MAX_LEN,
    device=device,
)
tokenizers_final, models_final = train_final_models_all_competencies(**training_kwargs)

print("\n=== Final models training completed ===")
print("Models saved in:", SAVE_DIR)


Using device: cuda
Starting hyperparameter search...
Search space: {'learning_rate': [1e-05], 'batch_size': [16, 32], 'epochs': [8, 12, 16]}
‚úì Diret√≥rio de salvamento: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp
Loaded checkpoint from /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp/hyperparameter_search_checkpoint.json
Found saved results for: ['C1', 'C2']


In [None]:
# === FINAL TEST EVALUATION ===
# Run the final models over the test split; the helper is also given a CSV
# path for the predictions.
predictions_csv = os.path.join(SAVE_DIR, "predicoes_mbert_final_hyperopt.csv")
out_final = evaluate_final_models(
    df_test=df_test,
    tokenizers_final=tokenizers_final,
    models_final=models_final,
    competencies=COMPETENCIES,
    max_len=MAX_LEN,
    device=device,
    save_csv_path=predictions_csv,
)

# Per-competency metrics on the rows that have both a label and a prediction.
print("\n=== FINAL TEST SET EVALUATION ===")
test_results = {}
for c in COMPETENCIES:
    comp_key = f"C{c}"
    pred_col = f"pred_{comp_key}"

    scored = out_final[[comp_key, pred_col]].dropna()
    if scored.empty:
        print(f"‚ö†Ô∏è No valid data for {comp_key}")
        continue

    gold = scored[comp_key].astype(int).tolist()
    predicted = scored[pred_col].astype(int).tolist()

    resultado = calcular_resultados(gold, predicted, is_final=True)
    test_results[comp_key] = resultado

    print(f"\nüîé Test Results - {comp_key}")
    print(f"  Samples: {len(gold)}")
    # (line prefix, key in the result dict, format spec)
    for prefix, metric, spec in (
        ("  QWK     : ", "QWK", ".3f"),
        ("  F1 Macro: ", "F1-Macro", ".3f"),
        ("  F1 Wghtd: ", "F1-Weighted", ".3f"),
        ("  ACC     : ", "ACC", ".3f"),
        ("  RMSE    : ", "RMSE", ".2f"),
    ):
        print(prefix + format(resultado[metric], spec))

# Aggregate QWK over the competencies that produced a result.
print("\n=== FINAL TEST SET SUMMARY ===")
qwk_scores = [
    test_results[key]["QWK"]
    for key in (f"C{c}" for c in COMPETENCIES)
    if key in test_results
]
if qwk_scores:
    print(f"Average QWK across competencies: {np.mean(qwk_scores):.3f}")
    print(f"Best QWK: {max(qwk_scores):.3f}")
    print(f"Worst QWK: {min(qwk_scores):.3f}")


C1 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 8}
  Best QWK: 0.402


In [None]:
# === EVALUATE ALL PROTOCOLS ===
# Re-score the final predictions under each protocol in PROTOCOL_ORDER and
# collect the metrics as {model: {protocol: {competency: metrics}}}.
print("=== AVALIANDO TODOS OS PROTOCOLOS COM MODELOS FINAIS ===")
avaliacoes_por_modelo = {"mbert-final": {}}

for protocol_key in PROTOCOL_ORDER:
    print(f"\n=== Esquema: {PROTOCOL_LABELS[protocol_key]} ===")
    avaliacoes = {}

    for c in COMPETENCIES:
        comp_key = f"C{c}"
        pred_col = f"pred_{comp_key}"

        # Only rows with both a label and a prediction are scored.
        scored = out_final[[comp_key, pred_col]].dropna()
        if scored.empty:
            continue

        gold = scored[comp_key].astype(int).tolist()
        predicted = scored[pred_col].astype(int).tolist()

        # Transform labels/predictions according to the current protocol;
        # skip the competency when nothing is returned.
        y_r, y_p = apply_protocol(gold, predicted, protocol_key)
        if not y_r:
            continue

        resultado = calcular_resultados(y_r, y_p, is_final=False)
        avaliacoes[comp_key] = resultado

        print(f"\nüîé Avalia√ß√£o - {comp_key}")
        print(f"  QWK               : {resultado['QWK']:.3f}")
        print(f"  F1 Macro          : {resultado['F1-Macro']:.3f}")
        print(f"  F1 Weighted       : {resultado['F1-Weighted']:.3f}")

    avaliacoes_por_modelo["mbert-final"][protocol_key] = avaliacoes

# Emit one LaTeX table per metric.
print("\n=== TABELAS LaTeX ===")
metrics = [("QWK", "QWK"), ("F1-Macro", "F1 Macro"), ("F1-Weighted", "F1 Weighted")]

for met_key, met_title in metrics:
    table_kwargs = dict(
        avaliacoes_por_modelo=avaliacoes_por_modelo,
        model_key="mbert-final",
        metric_key=met_key,
        metric_title=f"{met_title} por compet√™ncia para o modelo mBERT com hyperparameter optimization",
        competencias=COMPETENCIES,
    )
    tex = generate_latex_table(**table_kwargs)
    print(f"\n=== Tabela LaTeX ‚Äî {met_title} ‚Äî mbert-final ===\n")
    print(tex)


In [None]:
# === GRID SEARCH - COMPETENCY C2 ===
# Tries every combination in `hyperparameter_space` for this competency and
# records the one with the highest cross-validated QWK.
# NOTE(review): `train_model_cv` and `save_checkpoint` are not defined in the
# visible cells - confirm they exist before running on a fresh kernel.
comp_idx = 2
comp_key = f'C{comp_idx}'

# Skip if already completed (e.g. restored from a checkpoint)
if comp_key in best_hyperparams:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[comp_key]}")
    print(f"  Best QWK: {best_qwk_scores[comp_key]:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    # Start below every attainable score so any successful run can win
    # (a fixed -1 sentinel would reject a run that scores exactly -1).
    best_qwk = float("-inf")
    best_params = None

    # Generate all combinations
    param_names = list(hyperparameter_space.keys())
    param_combinations = list(itertools.product(*hyperparameter_space.values()))

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)
        except Exception as e:
            # Best effort: one failing combination must not abort the search.
            print(f"  Error: {e}")
            continue

        if qwk > best_qwk:
            best_qwk = qwk
            best_params = hyperparams.copy()

    if best_params is None:
        # No combination produced a usable score. Recording None here would
        # mark the competency as "completed" in the checkpoint and break the
        # final training step, so leave it unrecorded and retry later.
        print(f"\nNo successful hyperparameter combination for C{comp_idx}; nothing recorded.")
    else:
        best_hyperparams[comp_key] = best_params
        best_qwk_scores[comp_key] = best_qwk

        print(f"\nBest hyperparameters for C{comp_idx}:")
        print(f"  Params: {best_params}")
        print(f"  QWK: {best_qwk:.3f}")

        # Save checkpoint after each competency
        save_checkpoint()


C2 already completed. Skipping.
  Best params: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}
  Best QWK: 0.471


In [None]:
# === GRID SEARCH - COMPETENCY C3 ===
# Tries every combination in `hyperparameter_space` for this competency and
# records the one with the highest cross-validated QWK.
# NOTE(review): `train_model_cv` and `save_checkpoint` are not defined in the
# visible cells - confirm they exist before running on a fresh kernel.
comp_idx = 3
comp_key = f'C{comp_idx}'

# Skip if already completed (e.g. restored from a checkpoint)
if comp_key in best_hyperparams:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[comp_key]}")
    print(f"  Best QWK: {best_qwk_scores[comp_key]:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    # Start below every attainable score so any successful run can win
    # (a fixed -1 sentinel would reject a run that scores exactly -1).
    best_qwk = float("-inf")
    best_params = None

    # Generate all combinations
    param_names = list(hyperparameter_space.keys())
    param_combinations = list(itertools.product(*hyperparameter_space.values()))

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)
        except Exception as e:
            # Best effort: one failing combination must not abort the search.
            print(f"  Error: {e}")
            continue

        if qwk > best_qwk:
            best_qwk = qwk
            best_params = hyperparams.copy()

    if best_params is None:
        # No combination produced a usable score. Recording None here would
        # mark the competency as "completed" in the checkpoint and break the
        # final training step, so leave it unrecorded and retry later.
        print(f"\nNo successful hyperparameter combination for C{comp_idx}; nothing recorded.")
    else:
        best_hyperparams[comp_key] = best_params
        best_qwk_scores[comp_key] = best_qwk

        print(f"\nBest hyperparameters for C{comp_idx}:")
        print(f"  Params: {best_params}")
        print(f"  QWK: {best_qwk:.3f}")

        # Save checkpoint after each competency
        save_checkpoint()



=== Searching hyperparameters for C3 ===

Trying combination 1/6: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 8}


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/995 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

  Available years for CV: [np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2024)]
  Fold 1/4 - Validation year: 2019
    Train size: 83, Val size: 31
    QWK: 0.380
  Fold 2/4 - Validation year: 2020
    Train size: 85, Val size: 29
    QWK: 0.409
  Fold 3/4 - Validation year: 2021
    Train size: 85, Val size: 29
    QWK: 0.346
  Fold 4/4 - Validation year: 2024
    Train size: 89, Val size: 25
    QWK: 0.217
  Average QWK: 0.338

Trying combination 2/6: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}
  Available years for CV: [np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2024)]
  Fold 1/4 - Validation year: 2019
    Train size: 83, Val size: 31
    QWK: 0.387
  Fold 2/4 - Validation year: 2020
    Train size: 85, Val size: 29
    QWK: 0.469
  Fold 3/4 - Validation year: 2021
    Train size: 85, Val size: 29
    QWK: 0.271
  Fold 4/4 - Validation year: 2024
    Train size: 89, Val size: 25
    QWK: -0.122
  Average QWK: 0.251

Trying combination 3/6: {'le

In [None]:
# === GRID SEARCH - COMPETENCY C4 ===
# Tries every combination in `hyperparameter_space` for this competency and
# records the one with the highest cross-validated QWK.
# NOTE(review): `train_model_cv` and `save_checkpoint` are not defined in the
# visible cells - confirm they exist before running on a fresh kernel.
comp_idx = 4
comp_key = f'C{comp_idx}'

# Skip if already completed (e.g. restored from a checkpoint)
if comp_key in best_hyperparams:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[comp_key]}")
    print(f"  Best QWK: {best_qwk_scores[comp_key]:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    # Start below every attainable score so any successful run can win
    # (a fixed -1 sentinel would reject a run that scores exactly -1).
    best_qwk = float("-inf")
    best_params = None

    # Generate all combinations
    param_names = list(hyperparameter_space.keys())
    param_combinations = list(itertools.product(*hyperparameter_space.values()))

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)
        except Exception as e:
            # Best effort: one failing combination must not abort the search.
            print(f"  Error: {e}")
            continue

        if qwk > best_qwk:
            best_qwk = qwk
            best_params = hyperparams.copy()

    if best_params is None:
        # No combination produced a usable score. Recording None here would
        # mark the competency as "completed" in the checkpoint and break the
        # final training step, so leave it unrecorded and retry later.
        print(f"\nNo successful hyperparameter combination for C{comp_idx}; nothing recorded.")
    else:
        best_hyperparams[comp_key] = best_params
        best_qwk_scores[comp_key] = best_qwk

        print(f"\nBest hyperparameters for C{comp_idx}:")
        print(f"  Params: {best_params}")
        print(f"  QWK: {best_qwk:.3f}")

        # Save checkpoint after each competency
        save_checkpoint()



=== Searching hyperparameters for C4 ===

Trying combination 1/6: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 8}


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/995 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

  Available years for CV: [np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2024)]
  Fold 1/4 - Validation year: 2019
    Train size: 83, Val size: 31
    QWK: 0.647
  Fold 2/4 - Validation year: 2020
    Train size: 85, Val size: 29
    QWK: 0.377
  Fold 3/4 - Validation year: 2021
    Train size: 85, Val size: 29
    QWK: 0.252
  Fold 4/4 - Validation year: 2024
    Train size: 89, Val size: 25
    QWK: 0.524
  Average QWK: 0.450

Trying combination 2/6: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}
  Available years for CV: [np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2024)]
  Fold 1/4 - Validation year: 2019
    Train size: 83, Val size: 31
    QWK: 0.628
  Fold 2/4 - Validation year: 2020
    Train size: 85, Val size: 29
    QWK: 0.562
  Fold 3/4 - Validation year: 2021
    Train size: 85, Val size: 29
    QWK: 0.119
  Fold 4/4 - Validation year: 2024
    Train size: 89, Val size: 25
    QWK: 0.479
  Average QWK: 0.447

Trying combination 3/6: {'lea

In [None]:
# === GRID SEARCH - COMPETENCY C5 ===
# Tries every combination in `hyperparameter_space` for this competency and
# records the one with the highest cross-validated QWK.
# NOTE(review): `train_model_cv` and `save_checkpoint` are not defined in the
# visible cells - confirm they exist before running on a fresh kernel.
comp_idx = 5
comp_key = f'C{comp_idx}'

# Skip if already completed (e.g. restored from a checkpoint)
if comp_key in best_hyperparams:
    print(f"C{comp_idx} already completed. Skipping.")
    print(f"  Best params: {best_hyperparams[comp_key]}")
    print(f"  Best QWK: {best_qwk_scores[comp_key]:.3f}")
else:
    print(f"\n=== Searching hyperparameters for C{comp_idx} ===")

    # Start below every attainable score so any successful run can win
    # (a fixed -1 sentinel would reject a run that scores exactly -1).
    best_qwk = float("-inf")
    best_params = None

    # Generate all combinations
    param_names = list(hyperparameter_space.keys())
    param_combinations = list(itertools.product(*hyperparameter_space.values()))

    for i, params in enumerate(param_combinations):
        hyperparams = dict(zip(param_names, params))
        print(f"\nTrying combination {i+1}/{len(param_combinations)}: {hyperparams}")

        try:
            qwk = train_model_cv(df_train, comp_idx, hyperparams)
        except Exception as e:
            # Best effort: one failing combination must not abort the search.
            print(f"  Error: {e}")
            continue

        if qwk > best_qwk:
            best_qwk = qwk
            best_params = hyperparams.copy()

    if best_params is None:
        # No combination produced a usable score. Recording None here would
        # mark the competency as "completed" in the checkpoint and break the
        # final training step, so leave it unrecorded and retry later.
        print(f"\nNo successful hyperparameter combination for C{comp_idx}; nothing recorded.")
    else:
        best_hyperparams[comp_key] = best_params
        best_qwk_scores[comp_key] = best_qwk

        print(f"\nBest hyperparameters for C{comp_idx}:")
        print(f"  Params: {best_params}")
        print(f"  QWK: {best_qwk:.3f}")

        # Save checkpoint after each competency
        save_checkpoint()



=== Searching hyperparameters for C5 ===

Trying combination 1/6: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 8}


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/995 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

  Available years for CV: [np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2024)]
  Fold 1/4 - Validation year: 2019
    Train size: 83, Val size: 31
    QWK: 0.000
  Fold 2/4 - Validation year: 2020
    Train size: 85, Val size: 29
    QWK: 0.000
  Fold 3/4 - Validation year: 2021
    Train size: 85, Val size: 29
    QWK: 0.000
  Fold 4/4 - Validation year: 2024
    Train size: 89, Val size: 25
    QWK: 0.088
  Average QWK: 0.022

Trying combination 2/6: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}
  Available years for CV: [np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2024)]
  Fold 1/4 - Validation year: 2019
    Train size: 83, Val size: 31
    QWK: 0.000
  Fold 2/4 - Validation year: 2020
    Train size: 85, Val size: 29
    QWK: 0.000
  Fold 3/4 - Validation year: 2021
    Train size: 85, Val size: 29
    QWK: -0.140
  Fold 4/4 - Validation year: 2024
    Train size: 89, Val size: 25
    QWK: -0.056
  Average QWK: -0.049

Trying combination 3/6: {'

In [None]:

# Summarize the search and write the final results JSON.
print("\n=== HYPERPARAMETER SEARCH RESULTS ===")
# Iterate over the configured competencies (as the other summary loop in this
# notebook does) instead of a hard-coded [1, 2, 3, 4, 5], so the report
# cannot drift from the configuration.
for comp in COMPETENCIES:
    comp_key = f'C{comp}'
    if comp_key in best_hyperparams:
        print(f"{comp_key}: {best_hyperparams[comp_key]} -> QWK: {best_qwk_scores[comp_key]:.3f}")
    else:
        print(f"{comp_key}: NOT COMPLETED")

# Save final results
results = {
    'best_hyperparams': best_hyperparams,
    'best_qwk_scores': best_qwk_scores
}

final_results_file = os.path.join(SAVE_DIR, "hyperparameter_search_results_mbert.json")
with open(final_results_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n‚úì Final results saved to {final_results_file}")
# NOTE(review): this cell only reports the checkpoint path; the checkpoint
# itself is written by save_checkpoint() in the per-competency cells.
print(f"‚úì Checkpoint saved to {checkpoint_file}")



=== HYPERPARAMETER SEARCH RESULTS ===
C1: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 8} -> QWK: 0.402
C2: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12} -> QWK: 0.471
C3: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 12} -> QWK: 0.339
C4: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16} -> QWK: 0.522
C5: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16} -> QWK: 0.234

‚úì Final results saved to /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp/hyperparameter_search_results_mbert.json
‚úì Checkpoint saved to /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp/hyperparameter_search_checkpoint.json


In [None]:
# === TRAIN FINAL MODELS WITH BEST HYPERPARAMETERS ===
import json
from torch.cuda.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import time

# Destination folder for this model family; created if missing.
SAVE_DIR = os.path.join(DRIVE_BASE_PATH, "fine_tuning_modelos_jbsc", "mbert_finetuned_by_comp")
os.makedirs(SAVE_DIR, exist_ok=True)
print("‚úì Diret√≥rio de salvamento: {}".format(SAVE_DIR))

# Read the search results saved as JSON; when the file does not exist, fall
# back to one fixed default configuration for C1..C5.
final_results_file = os.path.join(SAVE_DIR, "hyperparameter_search_results_mbert.json")
try:
    with open(final_results_file, 'r') as results_fh:
        search_results = json.load(results_fh)
except FileNotFoundError:
    print(f"‚ö†Ô∏è No hyperparameter search results found at {final_results_file}")
    print("  Using default parameters.")
    best_hyperparams = {}
    for comp in range(1, 6):
        best_hyperparams[f'C{comp}'] = dict(learning_rate=2e-5, batch_size=16, epochs=5)
else:
    best_hyperparams = search_results['best_hyperparams']
    print(f"‚úì Loaded hyperparameter search results from {final_results_file}")
    print(f"  Found hyperparameters for: {list(best_hyperparams.keys())}")

def treinar_modelo_final_comp(df_train, comp_idx, hyperparams):
    """
    Fine-tune the final classifier for one competency and save it to SAVE_DIR.

    Loads the pretrained checkpoint named by
    ``MODEL_NAME_TEMPLATE.format(comp_idx)``, trains it on the dataset built
    from ``df_train`` with cross-entropy loss, AdamW and a linear
    warmup/decay schedule, and writes model + tokenizer to disk after every
    epoch. No validation is performed inside this function.

    Args:
        df_train: Training DataFrame, handed to ``EnemCompDataset`` together
            with the label column name ``f"C{comp_idx}"``.
        comp_idx: Competency number; selects the checkpoint, the label column
            and the output folder name.
        hyperparams: Mapping with the keys ``'learning_rate'``,
            ``'batch_size'`` and ``'epochs'``.

    Returns:
        Tuple ``(tokenizer, model, save_path)``.

    Raises:
        RuntimeError: If the training DataLoader yields no batches.
    """
    comp_col = f"C{comp_idx}"
    model_name = MODEL_NAME_TEMPLATE.format(comp_idx)
    n_epochs = hyperparams['epochs']
    # Mixed precision is enabled only on CUDA; elsewhere autocast and the
    # gradient scaler are constructed disabled.
    use_amp = (device.type == "cuda")

    print(f"[C{comp_idx}] Carregando {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Six output classes (presumably the six ENEM score levels - confirm).
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)

    # NOTE(review): EnemCompDataset is not defined or imported in the visible
    # cells; it must be provided before this function can run.
    train_ds = EnemCompDataset(df_train, comp_col, tokenizer, for_train=True, max_len=MAX_LEN)

    # Load in the main process: the recorded runs show repeated
    # "can only test a child process" assertion tracebacks coming from
    # DataLoader worker shutdown, and num_workers=0 uses no worker processes.
    train_loader = DataLoader(
        train_ds,
        batch_size=hyperparams['batch_size'],
        shuffle=True,
        num_workers=0,
        pin_memory=use_amp,
    )

    print(f"[C{comp_idx}] Tamanho treino: {len(train_ds)} | Batches: {len(train_loader)}")
    print(f"[C{comp_idx}] Hyperparameters: {hyperparams}")

    if len(train_loader) == 0:
        raise RuntimeError(f"[C{comp_idx}] DataLoader de treino est√° vazio. Verifique o split e colunas.")

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=hyperparams['learning_rate'])
    # One scheduler step per batch; the first 10% of the steps are warmup.
    total_steps = max(1, n_epochs * len(train_loader))
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=max(1, int(0.1 * total_steps)),
        num_training_steps=total_steps
    )

    # torch.amp replaces the deprecated torch.cuda.amp entry points.
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

    save_path = os.path.join(SAVE_DIR, f"mbert_jbsc_C{comp_idx}_finetuned_com_redacoes_oficiais")

    def _persist():
        # Write the current weights and the tokenizer to save_path.
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)

    for ep in range(1, n_epochs + 1):
        t0 = time.time()
        model.train()
        running = 0.0

        for batch in tqdm(train_loader, desc=f"[C{comp_idx}] Epoch {ep}/{n_epochs} (final)", leave=False):
            batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
            optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast("cuda", enabled=use_amp):
                # The whole batch (including "labels") goes to the model, but
                # only the logits are used; the loss comes from `criterion`.
                logits = model(**batch).logits
                loss = criterion(logits, batch["labels"])

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            running += loss.item()

        train_loss = running / max(1, len(train_loader))
        print(f"[C{comp_idx}] epoch {ep}/{n_epochs} - train loss: {train_loss:.4f} | tempo: {time.time()-t0:.1f}s")

        # Checkpoint after each epoch; the write after the last epoch is
        # also the final save.
        _persist()

    if n_epochs < 1:
        # No epoch ran, so nothing has been written yet: still persist the
        # loaded model so save_path exists, as the original final save did.
        _persist()
    print(f"[C{comp_idx}] ‚úì Modelo final salvo em: {save_path}")

    return tokenizer, model, save_path

# === Train final models for C1..C5 with best hyperparameters ===
# Keeps the tokenizer and model of each competency in dicts keyed by the
# competency number.
print("Training final models with best hyperparameters...")
tokenizers_final, models_final = {}, {}

for comp_idx in range(1, 6):
    print(f"\n=== Training Final Model ‚Äî Compet√™ncia C{comp_idx} ===")
    hp_key = f'C{comp_idx}'
    if hp_key in best_hyperparams:
        hyperparams = best_hyperparams[hp_key]
    else:
        # Fallback configuration when the search has no entry for this one.
        hyperparams = dict(learning_rate=2e-5, batch_size=16, epochs=5)
    tok, mdl, _ = treinar_modelo_final_comp(df_train, comp_idx, hyperparams)
    tokenizers_final[comp_idx], models_final[comp_idx] = tok, mdl

print("\n=== Final models training completed ===")
print("Models saved in:", SAVE_DIR)


‚úì Diret√≥rio de salvamento: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp
‚úì Loaded hyperparameter search results from /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp/hyperparameter_search_results_mbert.json
  Found hyperparameters for: ['C1', 'C2', 'C3', 'C4', 'C5']
Training final models with best hyperparameters...

=== Training Final Model ‚Äî Compet√™ncia C1 ===
[C1] Carregando kamel-usp/jbcs2025_bert-base-multilingual-cased-encoder_classification-C1-essay_only


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/995 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

[C1] Tamanho treino: 114 | Batches: 8
[C1] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 8}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C1] Epoch 1/8 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C1] epoch 1/8 - train loss: 0.7883 | tempo: 4.1s


[C1] Epoch 2/8 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 2/8 - train loss: 0.5620 | tempo: 3.6s


[C1] Epoch 3/8 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 3/8 - train loss: 0.5402 | tempo: 3.7s


[C1] Epoch 4/8 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 4/8 - train loss: 0.4825 | tempo: 3.7s


[C1] Epoch 5/8 (final):   0%|          | 0/8 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78994ffb2ac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()Exception ignored in: 
<function _MultiProcessingDataLoaderIter.__del__ at 0x78994ffb2ac0>  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers

    Traceback (most recent call last):
if w.is_alive():  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__

       self._shutdown_workers()
   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
       if w.is_alive():^^
 ^^ ^ ^ ^^^^^^  
   File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^^    ^^^assert self._parent_pid == os.getpid(), 'can only test a child process'^
^  ^ ^ ^^  ^ 
   File "/usr/li

[C1] epoch 5/8 - train loss: 0.5683 | tempo: 4.6s


[C1] Epoch 6/8 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 6/8 - train loss: 0.4810 | tempo: 3.9s


[C1] Epoch 7/8 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 7/8 - train loss: 0.3909 | tempo: 3.9s


[C1] Epoch 8/8 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C1] epoch 8/8 - train loss: 0.4058 | tempo: 4.0s
[C1] ‚úì Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp/mbert_jbsc_C1_finetuned_com_redacoes_oficiais

=== Training Final Model ‚Äî Compet√™ncia C2 ===
[C2] Carregando kamel-usp/jbcs2025_bert-base-multilingual-cased-encoder_classification-C2-essay_only


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:20, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/995 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

[C2] Tamanho treino: 114 | Batches: 8
[C2] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 12}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C2] Epoch 1/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C2] epoch 1/12 - train loss: 1.0729 | tempo: 3.9s


[C2] Epoch 2/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 2/12 - train loss: 0.9427 | tempo: 3.7s


[C2] Epoch 3/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 3/12 - train loss: 0.9991 | tempo: 4.0s


[C2] Epoch 4/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 4/12 - train loss: 0.9696 | tempo: 4.1s


[C2] Epoch 5/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 5/12 - train loss: 0.9033 | tempo: 3.8s


[C2] Epoch 6/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 6/12 - train loss: 0.7830 | tempo: 3.7s


[C2] Epoch 7/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 7/12 - train loss: 0.8192 | tempo: 3.7s


[C2] Epoch 8/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 8/12 - train loss: 0.7553 | tempo: 3.8s


[C2] Epoch 9/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 9/12 - train loss: 0.7484 | tempo: 3.7s


[C2] Epoch 10/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 10/12 - train loss: 0.6896 | tempo: 3.8s


[C2] Epoch 11/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 11/12 - train loss: 0.6353 | tempo: 3.9s


[C2] Epoch 12/12 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C2] epoch 12/12 - train loss: 0.6751 | tempo: 3.9s
[C2] ‚úì Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp/mbert_jbsc_C2_finetuned_com_redacoes_oficiais

=== Training Final Model ‚Äî Compet√™ncia C3 ===
[C3] Carregando kamel-usp/jbcs2025_bert-base-multilingual-cased-encoder_classification-C3-essay_only
[C3] Tamanho treino: 114 | Batches: 4
[C3] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 12}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C3] Epoch 1/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C3] epoch 1/12 - train loss: 1.2643 | tempo: 3.7s


[C3] Epoch 2/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78994ffb2ac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78994ffb2ac0>
^Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
^    ^self._shutdown_workers()
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
^^^    ^if w.is_alive():^

    File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
      assert self._parent_pid == os.getpid(), 'can only test a child process'  
  ^ ^ ^ ^ ^ ^  ^ ^ ^^ ^^^^^
^  File "

[C3] epoch 2/12 - train loss: 1.1139 | tempo: 4.2s


[C3] Epoch 3/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 3/12 - train loss: 1.0628 | tempo: 4.0s


[C3] Epoch 4/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 4/12 - train loss: 1.0113 | tempo: 3.5s


[C3] Epoch 5/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 5/12 - train loss: 0.9766 | tempo: 3.6s


[C3] Epoch 6/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 6/12 - train loss: 0.9127 | tempo: 3.6s


[C3] Epoch 7/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 7/12 - train loss: 0.8831 | tempo: 4.0s


[C3] Epoch 8/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78994ffb2ac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78994ffb2ac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[C3] epoch 8/12 - train loss: 0.8781 | tempo: 5.8s


[C3] Epoch 9/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 9/12 - train loss: 0.8448 | tempo: 3.6s


[C3] Epoch 10/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 10/12 - train loss: 0.7991 | tempo: 4.1s


[C3] Epoch 11/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 11/12 - train loss: 0.8160 | tempo: 3.7s


[C3] Epoch 12/12 (final):   0%|          | 0/4 [00:00<?, ?it/s]

[C3] epoch 12/12 - train loss: 0.7948 | tempo: 3.7s
[C3] ‚úì Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp/mbert_jbsc_C3_finetuned_com_redacoes_oficiais

=== Training Final Model ‚Äî Compet√™ncia C4 ===
[C4] Carregando kamel-usp/jbcs2025_bert-base-multilingual-cased-encoder_classification-C4-essay_only
[C4] Tamanho treino: 114 | Batches: 8
[C4] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C4] Epoch 1/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C4] epoch 1/16 - train loss: 1.0265 | tempo: 3.7s


[C4] Epoch 2/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78994ffb2ac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():Exception ignored in: 
<function _MultiProcessingDataLoaderIter.__del__ at 0x78994ffb2ac0> 
 Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
     self._shutdown_workers()
    File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
      ^if w.is_alive():
^ ^ ^ ^ ^   ^^^^^^^^^^^^^^
^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^^    assert self._parent_pid == os.getpid(), 'can only test a child process'^

   File "/usr/lib/pytho

[C4] epoch 2/16 - train loss: 0.9393 | tempo: 4.1s


[C4] Epoch 3/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 3/16 - train loss: 0.9037 | tempo: 3.8s


[C4] Epoch 4/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 4/16 - train loss: 0.8879 | tempo: 3.8s


[C4] Epoch 5/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 5/16 - train loss: 0.7961 | tempo: 3.8s


[C4] Epoch 6/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 6/16 - train loss: 0.8002 | tempo: 3.8s


[C4] Epoch 7/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 7/16 - train loss: 0.7825 | tempo: 3.7s


[C4] Epoch 8/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 8/16 - train loss: 0.7446 | tempo: 3.7s


[C4] Epoch 9/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 9/16 - train loss: 0.7123 | tempo: 3.8s


[C4] Epoch 10/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 10/16 - train loss: 0.6944 | tempo: 4.0s


[C4] Epoch 11/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 11/16 - train loss: 0.6988 | tempo: 4.0s


[C4] Epoch 12/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 12/16 - train loss: 0.6664 | tempo: 3.8s


[C4] Epoch 13/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 13/16 - train loss: 0.6482 | tempo: 3.8s


[C4] Epoch 14/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 14/16 - train loss: 0.5965 | tempo: 4.1s


[C4] Epoch 15/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 15/16 - train loss: 0.6276 | tempo: 4.1s


[C4] Epoch 16/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C4] epoch 16/16 - train loss: 0.6026 | tempo: 3.9s
[C4] ‚úì Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp/mbert_jbsc_C4_finetuned_com_redacoes_oficiais

=== Training Final Model ‚Äî Compet√™ncia C5 ===
[C5] Carregando kamel-usp/jbcs2025_bert-base-multilingual-cased-encoder_classification-C5-essay_only
[C5] Tamanho treino: 114 | Batches: 8
[C5] Hyperparameters: {'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 16}


  scaler = GradScaler(enabled=(device.type == "cuda"))


[C5] Epoch 1/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

  with autocast(enabled=(device.type == "cuda")):


[C5] epoch 1/16 - train loss: 1.8342 | tempo: 3.9s


[C5] Epoch 2/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 2/16 - train loss: 1.7079 | tempo: 3.9s


[C5] Epoch 3/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 3/16 - train loss: 1.5576 | tempo: 4.2s


[C5] Epoch 4/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 4/16 - train loss: 1.4037 | tempo: 4.0s


[C5] Epoch 5/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 5/16 - train loss: 1.4798 | tempo: 3.8s


[C5] Epoch 6/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 6/16 - train loss: 1.3692 | tempo: 3.9s


[C5] Epoch 7/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 7/16 - train loss: 1.4079 | tempo: 3.9s


[C5] Epoch 8/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78994ffb2ac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78994ffb2ac0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[C5] epoch 8/16 - train loss: 1.2437 | tempo: 5.6s


[C5] Epoch 9/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 9/16 - train loss: 1.2198 | tempo: 3.8s


[C5] Epoch 10/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 10/16 - train loss: 1.2283 | tempo: 4.1s


[C5] Epoch 11/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 11/16 - train loss: 1.2071 | tempo: 3.9s


[C5] Epoch 12/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 12/16 - train loss: 1.1882 | tempo: 3.9s


[C5] Epoch 13/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 13/16 - train loss: 1.2216 | tempo: 3.7s


[C5] Epoch 14/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 14/16 - train loss: 1.1550 | tempo: 4.1s


[C5] Epoch 15/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 15/16 - train loss: 1.2294 | tempo: 3.9s


[C5] Epoch 16/16 (final):   0%|          | 0/8 [00:00<?, ?it/s]

[C5] epoch 16/16 - train loss: 1.1561 | tempo: 4.1s
[C5] ‚úì Modelo final salvo em: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp/mbert_jbsc_C5_finetuned_com_redacoes_oficiais

=== Final models training completed ===
Models saved in: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp


In [None]:
# === FINAL TEST EVALUATION ON HELD-OUT TEST SET ===
import pandas as pd
from torch.utils.data import DataLoader

# Make sure SAVE_DIR exists in the notebook namespace. When an earlier cell
# has not defined it in this session, fall back to the Google Drive output
# folder and create it if needed.
try:
    SAVE_DIR
except NameError:
    SAVE_DIR = os.path.join(DRIVE_BASE_PATH, "fine_tuning_modelos_jbsc", "mbert_finetuned_by_comp")
    os.makedirs(SAVE_DIR, exist_ok=True)

def prever_scores_final(df_split, tokenizer, model, comp_idx):
    """Predict the scores of one competency for every row of ``df_split``.

    The rows are wrapped in an ``EnemCompDataset`` built with
    ``for_train=False``, fed to ``model`` in order (no shuffling, batches of
    16) without gradient tracking, and the arg-max class of each row's logits
    is mapped to a score through the notebook-level ``class_to_score`` table.

    Args:
        df_split: Rows to score.
            NOTE(review): presumably a DataFrame holding the columns
            ``EnemCompDataset`` expects - confirm against its definition.
        tokenizer: Tokenizer forwarded to ``EnemCompDataset``.
        model: Model whose forward call returns an object exposing ``.logits``.
        comp_idx: Competency number; the dataset column used is ``f"C{comp_idx}"``.

    Returns:
        list: One score per row, in the same order as ``df_split``.
    """
    dataset = EnemCompDataset(
        df_split,
        f"C{comp_idx}",
        tokenizer=tokenizer,
        for_train=False,
        max_len=MAX_LEN,
    )
    batches = DataLoader(dataset, batch_size=16, shuffle=False)

    model.eval()
    predicted_classes = []
    with torch.no_grad():
        for batch in batches:
            # Move every tensor of the batch to the notebook-level device.
            inputs = {name: tensor.to(device) for name, tensor in batch.items()}
            class_ids = model(**inputs).logits.argmax(dim=-1)
            predicted_classes += class_ids.cpu().numpy().tolist()

    # Class index -> score (the original comment describes this as 0..5 -> 0..200).
    return [class_to_score[class_id] for class_id in predicted_classes]

print("=== FINAL TEST EVALUATION ===")
print(f"Test set size: {len(df_test)}")
# .tolist() turns the NumPy scalars into plain Python values, so the years print
# as [2016, 2018, ...] instead of [np.int64(2016), ...] under NumPy >= 2.
print(f"Test set years: {sorted(df_test[YEAR_COL].unique().tolist())}")

# Prepare test results: work on a re-indexed copy so the 0..n-1 index can be
# used both as the row id and to align the predictions below.
df_test_final = df_test.reset_index(drop=True).copy()
df_test_final["id"] = df_test_final.index

# Base CSV structure: row id, plus the year column when the dataset has one.
cols_base = {"id": df_test_final["id"].values}
if YEAR_COL in df_test_final.columns:
    cols_base[YEAR_COL] = df_test_final[YEAR_COL].values
out_final = pd.DataFrame(cols_base)

# Ground truth scores (entries that cannot be parsed as numbers become <NA>
# in the nullable Int64 column).
for c in [1, 2, 3, 4, 5]:
    out_final[f"C{c}"] = pd.to_numeric(df_test_final[f"C{c}"], errors="coerce").astype("Int64")

# Predictions using final models
print("\nMaking predictions with final models...")
for c in [1, 2, 3, 4, 5]:
    print(f"Predicting C{c}...")
    # Only rows that have a ground-truth value for this competency are
    # predicted; every other row keeps <NA>.
    mask_valid = df_test_final[f"C{c}"].notna()
    preds = pd.Series([pd.NA] * len(df_test_final), dtype="Int64")

    if mask_valid.any():
        y_pred = prever_scores_final(df_test_final.loc[mask_valid], tokenizers_final[c], models_final[c], c)
        # Re-attach the row labels so each prediction lands on its own row.
        preds.loc[mask_valid] = pd.Series(y_pred, index=df_test_final.index[mask_valid], dtype="Int64")

    out_final[f"pred_C{c}"] = preds

# Save final predictions
SAVE_CSV_PATH_FINAL = os.path.join(SAVE_DIR, "predicoes_mbert_final_hyperopt.csv")
out_final.to_csv(SAVE_CSV_PATH_FINAL, index=False)
print(f"‚úì Final predictions saved to: {SAVE_CSV_PATH_FINAL}")

# === EVALUATION ON TEST SET ===
# Computes the metrics of each competency from the (truth, prediction) columns
# of out_final and prints a short QWK summary.
print("\n=== FINAL TEST SET EVALUATION ===")
print("Evaluating on held-out test set (never used for training or hyperparameter tuning)")

# comp_key ("C1".."C5") -> metrics returned by calcular_resultados
test_results = {}
competencies = [1, 2, 3, 4, 5]

for c in competencies:
    comp_key = f"C{c}"
    pred_col = f"pred_{comp_key}"

    # Get valid pairs: rows where both the truth and the prediction are present
    pares = out_final[[comp_key, pred_col]].dropna()
    if pares.empty:
        print(f"‚ö†Ô∏è No valid data for {comp_key}")
        continue

    y_real = pares[comp_key].astype(int).tolist()
    y_pred = pares[pred_col].astype(int).tolist()

    # Calculate metrics
    # NOTE(review): calcular_resultados comes from utils and is not visible here;
    # the keys read below (QWK, F1-Macro, F1-Weighted, ACC, RMSE) are assumed to
    # be present when is_final=True - confirm.
    resultado = calcular_resultados(y_real, y_pred, is_final=True)
    test_results[comp_key] = resultado

    print(f"\nüîé Test Results - {comp_key}")
    print(f"  Samples: {len(y_real)}")
    print(f"  QWK     : {resultado['QWK']:.3f}")
    print(f"  F1 Macro: {resultado['F1-Macro']:.3f}")
    print(f"  F1 Wghtd: {resultado['F1-Weighted']:.3f}")
    print(f"  ACC     : {resultado['ACC']:.3f}")
    print(f"  RMSE    : {resultado['RMSE']:.2f}")

# Summary: competencies skipped above (no valid pairs) are left out of the average
print("\n=== FINAL TEST SET SUMMARY ===")
qwk_scores = [test_results[f"C{c}"]["QWK"] for c in competencies if f"C{c}" in test_results]
if qwk_scores:
    print(f"Average QWK across competencies: {np.mean(qwk_scores):.3f}")
    print(f"Best QWK: {max(qwk_scores):.3f}")
    print(f"Worst QWK: {min(qwk_scores):.3f}")

# Save detailed results
results_summary = {
    'test_set_size': len(df_test),
    'test_set_years': sorted(df_test[YEAR_COL].unique().tolist()),
    'competency_results': test_results,
    # float() keeps a plain Python number in the JSON payload
    'average_qwk': float(np.mean(qwk_scores)) if qwk_scores else None
}

# Write the summary next to the predictions CSV in SAVE_DIR (Google Drive).
# A bare relative path would land in the Colab working directory, which is
# discarded when the runtime is recycled.
SAVE_JSON_PATH_FINAL = os.path.join(SAVE_DIR, 'final_test_results_mbert.json')
with open(SAVE_JSON_PATH_FINAL, 'w') as f:
    json.dump(results_summary, f, indent=2)

# Report the real location of the file rather than just its name.
print(f"\nDetailed results saved to: {SAVE_JSON_PATH_FINAL}")
print("=== EVALUATION COMPLETED ===")


=== FINAL TEST EVALUATION ===
Test set size: 43
Test set years: [np.int64(2016), np.int64(2018), np.int64(2022), np.int64(2023)]

Making predictions with final models...
Predicting C1...
Predicting C2...
Predicting C3...
Predicting C4...
Predicting C5...
‚úì Final predictions saved to: /content/drive/MyDrive/enem_tcc_resultados/fine_tuning_modelos_jbsc/mbert_finetuned_by_comp/predicoes_mbert_final_hyperopt.csv

=== FINAL TEST SET EVALUATION ===
Evaluating on held-out test set (never used for training or hyperparameter tuning)

üîé Test Results - C1
  Samples: 43
  QWK     : 0.456
  F1 Macro: 0.223
  F1 Wghtd: 0.375
  ACC     : 0.488
  RMSE    : 21.13

üîé Test Results - C2
  Samples: 43
  QWK     : 0.464
  F1 Macro: 0.216
  F1 Wghtd: 0.630
  ACC     : 0.674
  RMSE    : 38.22

üîé Test Results - C3
  Samples: 43
  QWK     : 0.519
  F1 Macro: 0.170
  F1 Wghtd: 0.209
  ACC     : 0.279
  RMSE    : 30.35

üîé Test Results - C4
  Samples: 43
  QWK     : 0.727
  F1 Macro: 0.235
  F1 Wghtd

In [None]:
# === TABELAS LaTeX PARA RESULTADOS FINAIS COM HYPERPARAMETER OPTIMIZATION ===
import pandas as pd

# Evaluation protocols: protocol key -> human-readable label.
protocol_labels = dict(
    no_change="Sem ajuste de escala",
    dup_bounds="Corre√ß√£o dupla (baixo/cima)",
    truth_floor40="Arred. verdade p/ baixo (40)",
    truth_ceil40="Arred. verdade p/ cima (40)",
    only_true_mult40="Apenas verdade m√∫ltipla de 40",
)
# Presentation order of the protocols; dicts keep insertion order, so this is
# exactly the key order written above.
protocol_order = list(protocol_labels)

# Fun√ß√µes auxiliares para os protocolos
def ajustar_para_correcao_dupla(y_true, y_pred):
    """Expand every (truth, prediction) pair into two pairs ('dup_bounds' scenario).

    For each pair:
      - if the truth is already a multiple of 40, it is duplicated as (r, r);
      - otherwise it is replaced by its two neighbouring multiples of 40,
        (lower, upper).
    The prediction is duplicated as (p, p) in both cases. Pairs in which either
    value is missing are skipped.

    Args:
        y_true: Iterable of ground-truth scores.
        y_pred: Iterable of predicted scores, paired positionally with ``y_true``.

    Returns:
        tuple[list[int], list[int]]: The expanded truths and predictions, each
        holding two entries per kept input pair.
    """
    verdades, predicoes = [], []
    for real, previsto in zip(y_true, y_pred):
        if pd.isna(real) or pd.isna(previsto):
            continue
        real, previsto = int(real), int(previsto)
        # Lower neighbouring multiple of 40; equals `real` when it is one already.
        piso = (real // 40) * 40
        teto = piso if real % 40 == 0 else piso + 40
        verdades += [piso, teto]
        predicoes += [previsto, previsto]
    return verdades, predicoes

def arredonda_verdade(y_true, modo):
    """Round ground-truth scores to multiples of 40.

    Missing values are dropped and the remaining scores are cast to int before
    rounding, so the result can be shorter than the input.

    Args:
        y_true: Iterable of ground-truth scores.
        modo: 'floor' rounds down, 'ceil' rounds up, 'none' leaves the scores
            unchanged.

    Returns:
        list[int]: The (possibly rounded) scores.

    Raises:
        ValueError: If ``modo`` is not one of the three accepted values.
    """
    notas = pd.Series(y_true).dropna().astype(int)
    if modo == 'none':
        return notas.tolist()

    arredondadores = {'floor': np.floor, 'ceil': np.ceil}
    if modo not in arredondadores:
        raise ValueError("modo inv√°lido")

    # Round in units of 40, then scale back to the score range.
    unidades = arredondadores[modo](notas / 40)
    return (unidades * 40).astype(int).tolist()

def filtra_verdades_multiplas_40(y_true, y_pred):
    """Keep only the pairs whose ground-truth score is a multiple of 40.

    Pairs are matched positionally. A pair is dropped as a whole when either
    value is missing, so truths and predictions always stay aligned. Dropping
    missing values from each sequence independently would shift one against
    the other (or make the boolean mask unalignable) as soon as a single value
    is missing.

    Args:
        y_true: Iterable of ground-truth scores.
        y_pred: Iterable of predicted scores, paired positionally with ``y_true``.

    Returns:
        tuple[list[int], list[int]]: The kept truths and their predictions, in
        input order.
    """
    y_true_f, y_pred_f = [], []
    for r, p in zip(y_true, y_pred):
        if pd.isna(r) or pd.isna(p):
            continue
        r = int(r)
        if r % 40 == 0:
            y_true_f.append(r)
            y_pred_f.append(int(p))
    return y_true_f, y_pred_f

# Evaluate every protocol using the final predictions stored in out_final.
# Results are collected as avaliacoes_por_modelo[model][protocol key][competency]
# and the mean QWK of each protocol is appended to resumo_qwk.
print("=== AVALIANDO TODOS OS PROTOCOLOS COM MODELOS FINAIS ===")
avaliacoes_por_modelo = {"mbert-final": {}}
resumo_qwk = []

print("\nüìä Avaliando modelo: mbert-final (com hyperparameter optimization)")

for esquema_key, esquema_desc in protocol_labels.items():
    print(f"\n=== Esquema: {esquema_desc} ===")
    avaliacoes = {}
    qwk_vals = []

    for c in [1, 2, 3, 4, 5]:
        comp_key = f"C{c}"
        pred_col = f"pred_{comp_key}"

        # use only valid pairs (no missing values)
        pares = out_final[[comp_key, pred_col]].dropna()
        if pares.empty:
            print(f"‚ö†Ô∏è Nenhum dado v√°lido para {comp_key} ({esquema_desc})")
            continue

        y_real = pares[comp_key].astype(int)
        y_pred = pares[pred_col].astype(int)

        # build the truth/prediction lists according to the protocol
        if esquema_key == "no_change":
            y_r = y_real.tolist()
            y_p = y_pred.tolist()

        elif esquema_key == "dup_bounds":
            y_r, y_p = ajustar_para_correcao_dupla(y_real.tolist(), y_pred.tolist())

        elif esquema_key == "truth_floor40":
            y_r = arredonda_verdade(y_real.tolist(), "floor")
            y_p = y_pred.tolist()

        elif esquema_key == "truth_ceil40":
            y_r = arredonda_verdade(y_real.tolist(), "ceil")
            y_p = y_pred.tolist()

        elif esquema_key == "only_true_mult40":
            y_r, y_p = filtra_verdades_multiplas_40(y_real.tolist(), y_pred.tolist())

        else:
            raise ValueError("Esquema desconhecido.")

        # A protocol can leave nothing to evaluate (e.g. no truth is a multiple of 40).
        if not y_r:
            print(f"‚ö†Ô∏è Nenhum dado v√°lido para {comp_key} ({esquema_desc})")
            continue

        # compute the metrics with the existing helper
        # NOTE(review): the held-out evaluation cell calls this with is_final=True
        # while this loop passes is_final=False - confirm the flag does not change
        # the metric values being compared.
        resultado = calcular_resultados(y_r, y_p, is_final=False)
        avaliacoes[comp_key] = resultado
        qwk_vals.append(resultado["QWK"])

        # per-competency printout
        print(f"\nüîé Avalia√ß√£o - {comp_key}")
        print(f"  QWK               : {resultado['QWK']:.3f}")
        print(f"  F1 Macro          : {resultado['F1-Macro']:.3f}")
        print(f"  F1 Weighted       : {resultado['F1-Weighted']:.3f}")

    avaliacoes_por_modelo["mbert-final"][esquema_key] = avaliacoes

    # Mean QWK over the competencies that could be evaluated under this protocol
    if qwk_vals:
        resumo_qwk.append({
            "Modelo": "mbert-final",
            "Esquema": esquema_desc,
            "QWK_m√©dio": float(np.mean(qwk_vals)),
        })

# ---------- ranking by QWK (across protocols) ----------
# Sort the per-protocol summaries by mean QWK, best first, and print them as
# plain text; fall back to a warning when no protocol produced a QWK.
if resumo_qwk:
    rank = pd.DataFrame(resumo_qwk).sort_values(by=["QWK_m√©dio"], ascending=False)
    print("\nüèÜ Ranking por QWK m√©dio (entre protocolos):")
    print(rank.to_string(index=False))
else:
    print("\n‚ö†Ô∏è N√£o foi poss√≠vel compor o ranking (sem QWKs calculados).")

# === LaTeX tables (one per metric) for the mbert-final model ===
# Each entry is (key in the metrics dict, title used in caption and heading).
metrics = [
    ("QWK", "QWK"),
    ("F1-Macro", "F1 Macro"),
    ("F1-Weighted", "F1 Weighted"),
]

MODEL_KEY = "mbert-final"  # model with hyperparameter optimization

for met_key, met_title in metrics:
    # build a DataFrame [rows = readable protocol labels, columns = C1..C5];
    # cells start as NaN and stay that way when a value is unavailable
    df_tab = pd.DataFrame(
        index=[protocol_labels[k] for k in protocol_order],
        columns=[f"C{i}" for i in [1, 2, 3, 4, 5]],
        dtype=float
    )

    protocolos = avaliacoes_por_modelo.get(MODEL_KEY, {})
    for sk in protocol_order:
        if sk not in protocolos:
            continue
        compdict = protocolos[sk]  # dict: "C1" -> metrics
        for c in [1, 2, 3, 4, 5]:
            ck = f"C{c}"
            if ck in compdict and met_key in compdict[ck]:
                df_tab.loc[protocol_labels[sk], ck] = compdict[ck][met_key]

    df_print = df_tab.round(3)

    caption = f"{met_title} por compet√™ncia para o modelo mBERT com hyperparameter optimization nos diferentes protocolos de avalia√ß√£o"
    # e.g. "F1-Macro" -> "tab:f1macro_mbert_final"
    label   = f"tab:{met_key.replace('-','').replace(' ','').lower()}_mbert_final"

    print(f"\n=== Tabela LaTeX ‚Äî {met_title} ‚Äî mbert-final ===\n")
    # Missing cells are rendered as "--" (na_rep) in the generated table.
    tex = df_print.to_latex(index=True,
                            caption=caption,
                            label=label,
                            na_rep="--",
                            float_format="%.3f",
                            escape=True,
                            bold_rows=False,
                            multicolumn=True,
                            multicolumn_format='c',
                            column_format='lccccc',   # alignment: 1 row-label column + 5 competencies
                            longtable=False)

    print(tex)


=== AVALIANDO TODOS OS PROTOCOLOS COM MODELOS FINAIS ===

üìä Avaliando modelo: mbert-final (com hyperparameter optimization)

=== Esquema: Sem ajuste de escala ===

üîé Avalia√ß√£o - C1
  QWK               : 0.456
  F1 Macro          : 0.223
  F1 Weighted       : 0.375

üîé Avalia√ß√£o - C2
  QWK               : 0.464
  F1 Macro          : 0.216
  F1 Weighted       : 0.630

üîé Avalia√ß√£o - C3
  QWK               : 0.519
  F1 Macro          : 0.170
  F1 Weighted       : 0.209

üîé Avalia√ß√£o - C4
  QWK               : 0.727
  F1 Macro          : 0.235
  F1 Weighted       : 0.354

üîé Avalia√ß√£o - C5
  QWK               : 0.000
  F1 Macro          : 0.074
  F1 Weighted       : 0.180

=== Esquema: Corre√ß√£o dupla (baixo/cima) ===

üîé Avalia√ß√£o - C1
  QWK               : 0.386
  F1 Macro          : 0.304
  F1 Weighted       : 0.612

üîé Avalia√ß√£o - C2
  QWK               : 0.456
  F1 Macro          : 0.353
  F1 Weighted       : 0.685

üîé Avalia√ß√£o - C3
  QWK         