# Fine-tuning a model on a token classification task

## Loading the dataset

## Preprocessing the data

In [1]:
from transformers import AutoTokenizer
import os
import optuna
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, AutoConfig
from transformers import Trainer, TrainingArguments
from sklearn.metrics import f1_score
import numpy as np
from datasets import load_from_disk, concatenate_datasets
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

# Inicializando a vari√°vel particoes
particoes = {}
model_checkpoint = "neuralmind/bert-base-portuguese-cased"
# atch_size = 8
print(particoes.keys())

label_all_tokens = True
# Carregar o tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Caminho correto para as parti√ß√µes
base_path = "/home/juliaribeiro/"

# Carregar as parti√ß√µes diretamente do caminho especificado
for i in range(10):
    partition_path = f"{base_path}data_division_{i}"
    print(f"Verificando parti√ß√£o: {partition_path}")  # Para depura√ß√£o
    
    if os.path.exists(partition_path):  # Verifica se o diret√≥rio existe
        try:
            particoes[f"particao_{i}"] = load_from_disk(partition_path)
        except Exception as e:
            print(f"Erro ao carregar a parti√ß√£o {partition_path}: {e}")
    else:
        print(f"Parti√ß√£o {partition_path} n√£o encontrada!")  # Alerta se o diret√≥rio n√£o existir

# Verificar quais parti√ß√µes foram carregadas corretamente
print("Parti√ß√µes carregadas:", list(particoes.keys()))


# Criar o mapeamento de label2id
label2id = {
    'O': 0,               # Fora de entidade
    'B-ORGANIZACAO': 1,    # In√≠cio da entidade ORGANIZACAO
    'I-ORGANIZACAO': 2,    # Interior da entidade ORGANIZACAO
    'B-JURISPRUDENCIA': 3, # In√≠cio da entidade JURISPRUDENCIA
    'I-JURISPRUDENCIA': 4, # Interior da entidade JURISPRUDENCIA
    'B-LOCAL': 5,          # In√≠cio da entidade LOCAL
    'I-LOCAL': 6,          # Interior da entidade LOCAL
    'B-LEGISLACAO': 7,     # In√≠cio da entidade LEGISLACAO
    'I-LEGISLACAO': 8,     # Interior da entidade LEGISLACAO
    'B-PESSOA': 9,         # In√≠cio da entidade PESSOA
    'I-PESSOA': 10,        # Interior da entidade PESSOA
    'B-TEMPO': 11,         # In√≠cio da entidade TEMPO
    'I-TEMPO': 12          # Interior da entidade TEMPO
}

# Exibir o mapeamento de labels
print(f"Label2ID mapping: {label2id}")

def tokenize_and_align_labels(examples):
    # Verificando se 'sentences' est√° presente nos dados
    if "sentences" not in examples:
        raise KeyError("A chave 'sentences' n√£o foi encontrada nos dados.")

    tokenized_inputs = []
    labels = []

    for sentence in examples["sentences"]:
        # Tokenizando os tokens de cada senten√ßa
        tokenized_sentence = tokenizer(sentence["tokens"], truncation=True, is_split_into_words=True, max_length=512)

        word_ids = tokenized_sentence.word_ids()  # Pega os ids dos tokens
        sentence_labels = sentence["labels"]  # Labels da senten√ßa

        previous_word_idx = None
        label_ids = []
        
        # Atribuindo as labels corretamente
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token (e.g., [CLS], [SEP])
            elif word_idx != previous_word_idx:
                # Verificando se a label existe em label2id
                label = sentence_labels[word_idx]
                if label not in label2id:
                    print(f"Label {label} n√£o encontrada em label2id!")  # Print de depura√ß√£o
                label_ids.append(label2id.get(label, -1))  # Usa -1 se a label n√£o for encontrada
            else:
                label_ids.append(label2id.get(sentence_labels[word_idx], -1) if label_all_tokens else -100)

            previous_word_idx = word_idx

        tokenized_inputs.append(tokenized_sentence)
        labels.append(label_ids)

    # Atribuindo as labels ao tokenized_inputs
    for i, tokenized_sentence in enumerate(tokenized_inputs):
        tokenized_sentence["labels"] = labels[i]

    # Retorna os dados tokenizados e alinhados com as labels
    return {"input_ids": [x["input_ids"] for x in tokenized_inputs],
            "attention_mask": [x["attention_mask"] for x in tokenized_inputs],
            "labels": labels}

# Aplicar a tokeniza√ß√£o a todas as parti√ß√µes de dados
for i in range(10):
    print(f"Tokenizando parti√ß√£o {i}...")
    particoes[f"particao_{i}"] = particoes[f"particao_{i}"].map(tokenize_and_align_labels, batched=True)

print("Tokeniza√ß√£o aplicada com sucesso!")


dict_keys([])
Verificando parti√ß√£o: /home/juliaribeiro/data_division_0
Verificando parti√ß√£o: /home/juliaribeiro/data_division_1
Verificando parti√ß√£o: /home/juliaribeiro/data_division_2
Verificando parti√ß√£o: /home/juliaribeiro/data_division_3
Verificando parti√ß√£o: /home/juliaribeiro/data_division_4
Verificando parti√ß√£o: /home/juliaribeiro/data_division_5
Verificando parti√ß√£o: /home/juliaribeiro/data_division_6
Verificando parti√ß√£o: /home/juliaribeiro/data_division_7
Verificando parti√ß√£o: /home/juliaribeiro/data_division_8
Verificando parti√ß√£o: /home/juliaribeiro/data_division_9
Parti√ß√µes carregadas: ['particao_0', 'particao_1', 'particao_2', 'particao_3', 'particao_4', 'particao_5', 'particao_6', 'particao_7', 'particao_8', 'particao_9']
Label2ID mapping: {'O': 0, 'B-ORGANIZACAO': 1, 'I-ORGANIZACAO': 2, 'B-JURISPRUDENCIA': 3, 'I-JURISPRUDENCIA': 4, 'B-LOCAL': 5, 'I-LOCAL': 6, 'B-LEGISLACAO': 7, 'I-LEGISLACAO': 8, 'B-PESSOA': 9, 'I-PESSOA': 10, 'B-TEMPO': 11, 'I-TEM

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Tokenizando parti√ß√£o 5...
Tokenizando parti√ß√£o 6...
Tokenizando parti√ß√£o 7...
Tokenizando parti√ß√£o 8...
Tokenizando parti√ß√£o 9...
Tokeniza√ß√£o aplicada com sucesso!


## Fine-tuning the model

In [2]:
def criar_modelo(num_labels, attn_dropout, hidden_dropout):
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels,
        hidden_dropout_prob=hidden_dropout,
        attention_probs_dropout_prob=attn_dropout
    )
    return model

In [3]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

def objective(trial, train_set, val_set, num_labels):
    # Amostragem de hiperpar√¢metros
    lr = trial.suggest_loguniform("learning_rate", 1e-6, 1e-4)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    epochs = trial.suggest_categorical("epochs", [2, 3, 4, 8, 16, 32, 64, 128])
    attn_dropout = trial.suggest_uniform("attention_dropout", 0.0, 0.5)
    hidden_dropout = trial.suggest_uniform("hidden_dropout", 0.0, 0.5)
    weight_decay = trial.suggest_uniform("weight_decay", 0.01, 0.05)

    training_args = TrainingArguments(
        output_dir="./optuna_results",
        evaluation_strategy="epoch",
        save_strategy="no",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        logging_dir="./optuna_logs",
        logging_steps=10,
        report_to="none",
    )

    model = criar_modelo(num_labels, attn_dropout, hidden_dropout)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator= data_collator,
        train_dataset=train_set,
        eval_dataset=val_set,
        compute_metrics=compute_metrics
    )

    trainer.train()
    val_metrics = trainer.evaluate()

    return val_metrics["eval_f1_macro"]  # Otimiza√ß√£o baseada no F1 macro


In [None]:
import os
import torch
import pandas as pd
import random
import numpy as np
from torch.utils.data import Subset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification
import optuna

# Garantir que os diret√≥rios existam
checkpoints_dir = "./checkpoints"
logs_dir = "./logs_final"
final_results_dir = "./final_results"
optuna_db = "sqlite:///optuna_study.db"

os.makedirs(checkpoints_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)
os.makedirs(final_results_dir, exist_ok=True)

torch.cuda.empty_cache()

num_particoes = 10

# Fun√ß√£o para obter uma amostra de 0.1% dos dados
def get_subset(dataset, fraction=1):
    total_size = len(dataset)
    subset_size = int(total_size * fraction)
    indices = random.sample(range(total_size), subset_size)
    return Subset(dataset, indices)

# Fun√ß√£o para computar as m√©tricas
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    pred_labels = []

    # Considerando apenas as classes com prefixo 'B-'
    for pred, label in zip(predictions, labels):
        true_labels.extend([l for l in label if l != -100 and l % 2 == 1])  # Apenas 'B-...'
        pred_labels.extend([p for p, l in zip(pred, label) if l != -100 and l % 2 == 1])  # Apenas 'B-...'

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average=None, labels=[1, 3, 5, 7, 9, 11])  # 'B-...'
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(true_labels, pred_labels, average="macro")
    accuracy = accuracy_score(true_labels, pred_labels)

    return {
        "precision_per_class": precision.tolist(),
        "recall_per_class": recall.tolist(),
        "f1_per_class": f1.tolist(),
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "f1_macro": f1_macro,
        "accuracy": accuracy
    }

# Carregar progresso se houver checkpoints
test_results = []
completed_iterations = set()

checkpoint_file = os.path.join(checkpoints_dir, "completed_iterations.txt")
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, "r") as f:
        completed_iterations = set(map(int, f.read().splitlines()))

num_labels = len(set(label for dataset in particoes.values() for sentence in dataset["sentences"] for label in sentence["labels"]))

for i in range(num_particoes):
    if i in completed_iterations:
        print(f"Itera√ß√£o {i} j√° conclu√≠da. Pulando...")
        continue

    test_set = particoes[f"particao_{i}"]
    val_set = particoes[f"particao_{(i+1) % num_particoes}"]
    
    # Pegando 10% de cada parti√ß√£o para treinamento, valida√ß√£o e teste
    test_set = get_subset(test_set, fraction=1)
    val_set = get_subset(val_set, fraction=1)
    
    train_sets = [particoes[f"particao_{j}"] for j in range(num_particoes) if j not in {i, (i+1) % num_particoes}]
    
    # Concatenando os datasets de treino e pegando 10% dos dados para treino
    train_set = torch.utils.data.ConcatDataset(train_sets)
    train_set = get_subset(train_set, fraction=1)

    # Otimiza√ß√£o de hiperpar√¢metros com Optuna
    from optuna.samplers import TPESampler
    study = optuna.create_study(direction="maximize", sampler=TPESampler(), storage=optuna_db, load_if_exists=True)
    study.optimize(lambda trial: objective(trial, train_set, val_set, num_labels), n_trials=25)  # Agora 25 trials

    best_hyperparams = study.best_params
    print(f"Melhores hiperpar√¢metros para a itera√ß√£o {i}: {best_hyperparams}")

    # Re-treina com os melhores hiperpar√¢metros e avalia no conjunto de teste
    train_set = torch.utils.data.ConcatDataset([train_set, val_set])

    # Configura√ß√£o de checkpoints
    checkpoint_path = os.path.join(checkpoints_dir, f"model_checkpoint_{i}")

    # Adicionando o argumento de checkpoints no treinamento
    final_training_args = TrainingArguments(
        output_dir=checkpoint_path,  # Checkpoint espec√≠fico para cada itera√ß√£o
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=best_hyperparams["learning_rate"],
        per_device_train_batch_size=best_hyperparams["batch_size"],
        per_device_eval_batch_size=best_hyperparams["batch_size"],
        num_train_epochs=best_hyperparams["epochs"],
        weight_decay=best_hyperparams["weight_decay"],
        logging_dir=logs_dir,
        logging_steps=10,
        report_to="none",
        overwrite_output_dir=True,
        save_total_limit=3,
    )

    final_model = criar_modelo(num_labels, best_hyperparams["attention_dropout"], best_hyperparams["hidden_dropout"])

    final_trainer = Trainer(
        model=final_model,
        args=final_training_args,
        train_dataset=train_set,
        eval_dataset=test_set,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        compute_metrics=compute_metrics
    )

    final_trainer.train()
    test_metrics = final_trainer.evaluate()
    test_results.append(test_metrics)

    # Salvar progresso da itera√ß√£o
    with open(checkpoint_file, "a") as f:
        f.write(f"{i}\n")

# C√°lculo da m√©dia das m√©tricas finais
avg_test_results = {
    metric: np.mean([result[metric] for result in test_results]) for metric in test_results[0]
}

# Criar DataFrame para a tabela desejada
df = pd.DataFrame({
    "organiza√ß√£o_precision": [p["eval_precision_per_class"][0] for p in test_results],
    "organiza√ß√£o_recall": [r["eval_recall_per_class"][0] for r in test_results],
    "organiza√ß√£o_f1": [f["eval_f1_per_class"][0] for f in test_results],
    "jurisprud√™ncia_precision": [p["eval_precision_per_class"][1] for p in test_results],
    "jurisprud√™ncia_recall": [r["eval_recall_per_class"][1] for r in test_results],
    "jurisprud√™ncia_f1": [f["eval_f1_per_class"][1] for f in test_results],
    "local_precision": [p["eval_precision_per_class"][2] for p in test_results],
    "local_recall": [r["eval_recall_per_class"][2] for r in test_results],
    "local_f1": [f["eval_f1_per_class"][2] for f in test_results],
    "legisla√ß√£o_precision": [p["eval_precision_per_class"][3] for p in test_results],
    "legisla√ß√£o_recall": [r["eval_recall_per_class"][3] for r in test_results],
    "legisla√ß√£o_f1": [f["eval_f1_per_class"][3] for f in test_results],
    "pessoa_precision": [p["eval_precision_per_class"][4] for p in test_results],
    "pessoa_recall": [r["eval_recall_per_class"][4] for r in test_results],
    "pessoa_f1": [f["eval_f1_per_class"][4] for f in test_results],
    "tempo_precision": [p["eval_precision_per_class"][5] for p in test_results],
    "tempo_recall": [r["eval_recall_per_class"][5] for r in test_results],
    "tempo_f1": [f["eval_f1_per_class"][5] for f in test_results],
    "precision_macro": [p["eval_precision_macro"] for p in test_results],
    "recall_macro": [r["eval_recall_macro"] for r in test_results],
    "f1_macro": [f["eval_f1_macro"] for f in test_results],
    "accuracy": [a["eval_accuracy"] for a in test_results],
})

print(df)



Itera√ß√£o 0 j√° conclu√≠da. Pulando...
Itera√ß√£o 1 j√° conclu√≠da. Pulando...


[I 2025-03-24 16:59:14,070] A new study created in RDB with name: no-name-49ef7512-c8b1-463a-89ff-51b078f126b7
  lr = trial.suggest_loguniform("learning_rate", 1e-6, 1e-4)
  attn_dropout = trial.suggest_uniform("attention_dropout", 0.0, 0.5)
  hidden_dropout = trial.suggest_uniform("hidden_dropout", 0.0, 0.5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.01, 0.05)
Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Per Class,Recall Per Class,F1 Per Class,Precision Macro,Recall Macro,F1 Macro,Accuracy
1,0.2661,0.293687,"[0.9798657718120806, 0.9239130434782609, 0.0, 0.0, 0.9978308026030369, 0.9902439024390244]","[0.1212624584717608, 0.12781954887218044, 0.0, 0.0, 0.5897435897435898, 0.9620853080568721]","[0.21581670362158167, 0.22457067371202113, 0.0, 0.0, 0.741337630942788, 0.9759615384615384]",0.353805,0.163719,0.196153,0.306253
2,0.2052,0.25577,"[0.988009592326139, 0.9918032786885246, 0.0, 0.9857142857142858, 0.9958847736625515, 0.9951100244498777]","[0.34219269102990035, 0.18195488721804512, 0.0, 0.3612565445026178, 0.9307692307692308, 0.9644549763033176]","[0.5083281924737816, 0.30749682337992373, 0.0, 0.5287356321839081, 0.9622266401590457, 0.9795427196149218]",0.450593,0.252784,0.298757,0.503629


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-03-24 17:03:52,057] Trial 0 finished with value: 0.2987572734374164 and parameters: {'learning_rate': 4.4054115669269505e-06, 'batch_size': 16, 'epochs': 2, 'attention_dropout': 0.41767732901546506, 'hidden_dropout': 0.1977882455075205, 'weight_decay': 0.03590547870852229}. Best is trial 0 with value: 0.2987572734374164.
  lr = trial.suggest_loguniform("learning_rate", 1e-6, 1e-4)
  attn_dropout = trial.suggest_uniform("attention_dropout", 0.0, 0.5)
  hidden_dropout = trial.suggest_uniform("hidden_dropout", 0.0, 0.5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.01, 0.05)
Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'clas

Epoch,Training Loss,Validation Loss,Precision Per Class,Recall Per Class,F1 Per Class,Precision Macro,Recall Macro,F1 Macro,Accuracy
1,0.1324,0.155711,"[0.9593810444874274, 0.9975728155339806, 1.0, 0.9932659932659933, 0.9960886571056062, 0.9904534606205251]","[0.8239202657807309, 0.6180451127819548, 0.09302325581395349, 0.7722513089005235, 0.9794871794871794, 0.9834123222748815]","[0.8865058087578195, 0.7632311977715878, 0.1702127659574468, 0.8689248895434463, 0.9877181641887525, 0.9869203329369798]",0.49473,0.355845,0.388626,0.806533
2,0.0766,0.126424,"[0.9584870848708487, 1.0, 0.9523809523809523, 0.9936507936507937, 0.9947984395318595, 0.995085995085995]","[0.8629568106312292, 0.637593984962406, 0.46511627906976744, 0.819371727748691, 0.9807692307692307, 0.9597156398104265]","[0.9082167832167832, 0.7786960514233241, 0.625, 0.8981348637015782, 0.9877340219496449, 0.97708082026538]",0.453416,0.363502,0.398066,0.839196
3,0.0924,0.131552,"[0.9749536178107606, 1.0, 0.9655172413793104, 0.9905362776025236, 0.9947984395318595, 0.9928057553956835]","[0.8729235880398671, 0.6992481203007519, 0.6511627906976745, 0.8219895287958116, 0.9807692307692307, 0.981042654028436]","[0.9211218229623137, 0.8230088495575221, 0.7777777777777778, 0.8984263233190272, 0.9877340219496449, 0.9868891537544696]",0.455278,0.385164,0.414997,0.863484


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-03-24 17:10:45,313] Trial 1 finished with value: 0.41499676533236574 and parameters: {'learning_rate': 1.2659064432887882e-05, 'batch_size': 16, 'epochs': 3, 'attention_dropout': 0.2244926071485011, 'hidden_dropout': 0.2692015813727828, 'weight_decay': 0.027029407419050945}. Best is trial 1 with value: 0.41499676533236574.
  lr = trial.suggest_loguniform("learning_rate", 1e-6, 1e-4)
  attn_dropout = trial.suggest_uniform("attention_dropout", 0.0, 0.5)
  hidden_dropout = trial.suggest_uniform("hidden_dropout", 0.0, 0.5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.01, 0.05)
Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Per Class,Recall Per Class,F1 Per Class,Precision Macro,Recall Macro,F1 Macro,Accuracy
1,0.6072,1.102389,"[0.0, 0.0, 0.0, 0.0, 1.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.06282051282051282, 0.13507109004739337]","[0.0, 0.0, 0.0, 0.0, 0.11821471652593486, 0.23799582463465555]",0.181818,0.01799,0.032383,0.029592
2,0.4189,0.79912,"[0.9655172413793104, 0.0, 0.0, 0.0, 0.9759036144578314, 0.973293768545994]","[0.046511627906976744, 0.0, 0.0, 0.0, 0.10384615384615385, 0.7772511848341233]","[0.08874801901743265, 0.0, 0.0, 0.0, 0.18771726535341832, 0.8642951251646904]",0.264974,0.084328,0.103705,0.129816
3,0.4218,0.61738,"[0.9661016949152542, 1.0, 0.0, 0.0, 0.9947368421052631, 0.9684466019417476]","[0.1420265780730897, 0.015037593984962405, 0.0, 0.0, 0.4846153846153846, 0.9454976303317536]","[0.24764663287472846, 0.02962962962962963, 0.0, 0.0, 0.6517241379310345, 0.9568345323741008]",0.357208,0.144289,0.17144,0.267448
4,0.2938,0.536402,"[0.9850746268656716, 1.0, 0.0, 1.0, 0.9953051643192489, 0.975669099756691]","[0.27408637873754155, 0.10827067669172932, 0.0, 0.002617801047120419, 0.8153846153846154, 0.9502369668246445]","[0.42884990253411304, 0.19538670284938942, 0.0, 0.005221932114882507, 0.8964059196617337, 0.9627851140456183]",0.45055,0.195509,0.226241,0.40201
5,0.3,0.486197,"[0.9837728194726166, 1.0, 0.0, 1.0, 0.9958791208791209, 0.9783132530120482]","[0.40282392026578073, 0.20150375939849624, 0.0, 0.0968586387434555, 0.9294871794871795, 0.9620853080568721]","[0.5715969357690042, 0.3354192740926158, 0.0, 0.1766109785202864, 0.9615384615384616, 0.970131421744325]",0.450724,0.235705,0.274118,0.498883
6,0.2815,0.445024,"[0.9821958456973294, 1.0, 1.0, 0.9788359788359788, 0.9960578186596584, 0.9783653846153846]","[0.5498338870431894, 0.3067669172932331, 0.031007751937984496, 0.48429319371727747, 0.9717948717948718, 0.9644549763033176]","[0.7050053248136315, 0.46950517836593786, 0.06015037593984962, 0.647985989492119, 0.9837767683322518, 0.9713603818615751]",0.539587,0.300741,0.348889,0.619765
7,0.2759,0.393707,"[0.9874371859296482, 1.0, 1.0, 0.9920948616600791, 0.9947437582128777, 0.9808153477218226]","[0.6528239202657807, 0.4120300751879699, 0.05426356589147287, 0.6570680628272252, 0.9705128205128205, 0.9691943127962085]","[0.786, 0.5835995740149095, 0.10294117647058823, 0.7905511811023622, 0.9824789097988319, 0.9749702026221693]",0.496258,0.309658,0.351712,0.693467
8,0.2075,0.375284,"[0.9872241579558653, 1.0, 1.0, 0.9926739926739927, 0.9947780678851175, 0.9855421686746988]","[0.7059800664451827, 0.42857142857142855, 0.05426356589147287, 0.7094240837696335, 0.9769230769230769, 0.9691943127962085]","[0.8232445520581114, 0.6, 0.10294117647058823, 0.8274809160305343, 0.9857697283311773, 0.977299880525687]",0.496685,0.320363,0.359728,0.721385
9,0.1527,0.357857,"[0.986827661909989, 1.0, 1.0, 0.9929577464788732, 0.9947916666666666, 0.9832134292565947]","[0.7466777408637874, 0.43609022556390975, 0.06976744186046512, 0.7382198952879581, 0.9794871794871794, 0.9715639810426541]","[0.8501182033096927, 0.6073298429319371, 0.13043478260869565, 0.8468468468468469, 0.9870801033591732, 0.9773539928486293]",0.496483,0.328484,0.366597,0.740927
10,0.1802,0.353767,"[0.9892818863879957, 1.0, 1.0, 0.9895470383275261, 0.9947916666666666, 0.9879227053140096]","[0.7666112956810631, 0.43909774436090226, 0.08527131782945736, 0.743455497382199, 0.9794871794871794, 0.9691943127962085]","[0.8638277959756668, 0.6102403343782654, 0.15714285714285714, 0.8490284005979073, 0.9870801033591732, 0.9784688995215312]",0.45858,0.306394,0.341984,0.749023


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-03-24 18:24:49,536] Trial 2 finished with value: 0.4173978470206632 and parameters: {'learning_rate': 2.349508813361874e-06, 'batch_size': 16, 'epochs': 32, 'attention_dropout': 0.45245113937464665, 'hidden_dropout': 0.37915571111399954, 'weight_decay': 0.047470083430072495}. Best is trial 2 with value: 0.4173978470206632.
  lr = trial.suggest_loguniform("learning_rate", 1e-6, 1e-4)
  attn_dropout = trial.suggest_uniform("attention_dropout", 0.0, 0.5)
  hidden_dropout = trial.suggest_uniform("hidden_dropout", 0.0, 0.5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.01, 0.05)
Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Per Class,Recall Per Class,F1 Per Class,Precision Macro,Recall Macro,F1 Macro,Accuracy
1,0.1504,0.158705,"[0.9644619940769991, 0.9953161592505855, 1.0, 0.992831541218638, 0.9986928104575163, 0.9809976247030879]","[0.8114617940199336, 0.6390977443609023, 0.09302325581395349, 0.725130890052356, 0.9794871794871794, 0.9786729857819905]","[0.8813712223725756, 0.7783882783882784, 0.1702127659574468, 0.8381240544629349, 0.9889967637540453, 0.9798339264531435]",0.494358,0.352239,0.386411,0.80067
2,0.0724,0.125943,"[0.9760076775431862, 1.0, 0.8648648648648649, 0.987220447284345, 0.9961038961038962, 0.9927184466019418]","[0.8446843853820598, 0.6481203007518797, 0.7441860465116279, 0.8089005235602095, 0.9833333333333333, 0.9691943127962085]","[0.9056099732858415, 0.7864963503649635, 0.8, 0.8892086330935252, 0.9896774193548387, 0.9808153477218226]",0.447455,0.384494,0.411678,0.845617
3,0.1148,0.14135,"[0.9847036328871893, 1.0, 0.9285714285714286, 0.9616519174041298, 0.9948320413436692, 1.0]","[0.8554817275747508, 0.7428571428571429, 0.8062015503875969, 0.8534031413612565, 0.9871794871794872, 0.9976303317535545]","[0.9155555555555556, 0.8524590163934426, 0.8630705394190872, 0.9042995839112344, 0.990990990990991, 0.9988137603795967]",0.45152,0.403289,0.425015,0.878001


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
print(f"N√∫mero de trials conclu√≠dos: {len(study.trials)}")
print(f"N√∫mero de trials com sucesso: {len([t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE])}")

print(test_results)



test_metrics = final_trainer.evaluate()
print(f"Test Metrics (Itera√ß√£o {i}): {test_metrics}")
test_results.append(test_metrics)



N√∫mero de trials conclu√≠dos: 2
N√∫mero de trials com sucesso: 2
[{'eval_loss': 0.1860225796699524, 'eval_precision_per_class': [0.8764478764478765, 0.9973614775725593, 0.9213483146067416, 0.99, 1.0, 0.9752321981424149], 'eval_recall_per_class': [0.920892494929006, 0.8289473684210527, 0.32669322709163345, 0.717391304347826, 0.9829931972789115, 0.9936908517350158], 'eval_f1_per_class': [0.8981206726013847, 0.9053892215568863, 0.4823529411764706, 0.8319327731092437, 0.9914236706689536, 0.984375], 'eval_precision_macro': 0.4431069128284302, 'eval_recall_macro': 0.36696988029257266, 'eval_f1_macro': 0.3918149445471491, 'eval_accuracy': 0.8466711499663753, 'eval_runtime': 2.3681, 'eval_samples_per_second': 327.262, 'eval_steps_per_second': 40.961, 'epoch': 8.0}, {'eval_loss': 0.06802081316709518, 'eval_precision_per_class': [0.9778933680104032, 0.9981785063752276, 1.0, 1.0, 1.0, 1.0], 'eval_recall_per_class': [0.9376558603491272, 0.908036454018227, 0.9865771812080537, 0.9145299145299145, 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
for results in class_results:
    print(results)

In [None]:
ovall_results = [list(x.values()) for x in ovall_results]
# ordem dos valores = 
# eval_loss eval_precision | eval_recall | eval_f1 | eval_accuracy | eval_runtime | eval_samples_per_second | eval_steps_per_second | epoch 16.0
ovall_results

-> Estatisticas Resultados

In [None]:
ovall_mean = list(np.mean(ovall_results, axis=0))
ovall_mean

In [None]:
ovall_std = list(np.std(ovall_results, axis=0))
ovall_std

In [None]:
ovall_var = list(np.var(ovall_results, axis=0))
ovall_var

In [None]:
all_in_one_dic_class_results = {clss: [] for clss in list(class_results[0].keys())[:-4]}
# 'precision' | recall | 'f1' | 'number'
# all_in_one_dic_class_results
for class_result in class_results:
    for clss, result in class_result.items():
        if clss not in  list(class_result.keys())[-4:]:
            all_in_one_dic_class_results[clss].append(list(result.values()))
all_in_one_dic_class_results


In [None]:
clss_mean = {clss: [] for clss in all_in_one_dic_class_results.keys()}
for clss, results in all_in_one_dic_class_results.items():
    clss_mean[clss] = list(np.mean(results, axis=0))
clss_mean

In [None]:
clss_std = {clss: [] for clss in all_in_one_dic_class_results.keys()}
for clss, results in all_in_one_dic_class_results.items():
    clss_std[clss] = list(np.std(results, axis=0))
clss_std

In [None]:
clss_var = {clss: [] for clss in all_in_one_dic_class_results.keys()}
for clss, results in all_in_one_dic_class_results.items():
    clss_var[clss] = list(np.var(results, axis=0))
clss_var

In [None]:
# predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
# predictions = np.argmax(predictions, axis=2)

# # Remove ignored index (special tokens)
# true_predictions = [
#     [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
#     for prediction, label in zip(predictions, labels)
# ]
# true_labels = [
#     [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
#     for prediction, label in zip(predictions, labels)
# ]

# results = metric.compute(predictions=true_predictions, references=true_labels)
# results

In [None]:
trainer.push_to_hub()