In [None]:
import os
import optuna
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, AutoConfig
from transformers import Trainer, TrainingArguments
from sklearn.metrics import f1_score
import numpy as np
from datasets import load_from_disk, concatenate_datasets
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

# Configuração do modelo e tarefa
task = "ner"
model_checkpoint = "neuralmind/bert-base-portuguese-cased"
batch_size = 8
label_all_tokens = True

# Carregar o tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Caminho para as partições
base_path = "../models/"

# Carregar as partições diretamente do caminho especificado
tokenized_datasets = {}
for i in range(10):
    partition_path = f"{base_path}dataset_division_{i}"
    if os.path.exists(partition_path):  # Verifica se o diretório existe
        try:
            tokenized_datasets[f"particao_{i}"] = load_from_disk(partition_path)
        except Exception as e:
            print(f"Erro ao carregar a partição {partition_path}: {e}")

# Verificar as chaves disponíveis em uma amostra dos dados
sample = tokenized_datasets["particao_0"][:2]  # Pegando 2 amostras da partição
print("Exemplo de dados:", sample)
print("Chaves disponíveis no dataset:", sample.keys())

# Acessar as labels corretamente para construir label2id
label_list = tokenized_datasets["particao_0"].features["sentences"]["labels"].feature
label_names = label_list.dtype

# Criar o mapeamento de label2id
label2id = {
    'O': 0,               # Fora de entidade
    'B-ORGANIZACAO': 1,    # Início da entidade ORGANIZACAO
    'I-ORGANIZACAO': 2,    # Interior da entidade ORGANIZACAO
    'B-JURISPRUDENCIA': 3, # Início da entidade JURISPRUDENCIA
    'I-JURISPRUDENCIA': 4, # Interior da entidade JURISPRUDENCIA
    'B-LOCAL': 5,          # Início da entidade LOCAL
    'I-LOCAL': 6,          # Interior da entidade LOCAL
    'B-LEGISLACAO': 7,     # Início da entidade LEGISLACAO
    'I-LEGISLACAO': 8,     # Interior da entidade LEGISLACAO
    'B-PESSOA': 9,         # Início da entidade PESSOA
    'I-PESSOA': 10,        # Interior da entidade PESSOA
    'B-TEMPO': 11,         # Início da entidade TEMPO
    'I-TEMPO': 12          # Interior da entidade TEMPO
}

# Exibir o mapeamento de labels
print(f"Label2ID mapping: {label2id}")

# Função para calcular métricas
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Calcular métricas de precisão, recall, F1 e acurácia
    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)
    accuracy = accuracy_score(true_labels, true_predictions)

    # Depuração: Exibindo as métricas para cada classe
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Accuracy: {accuracy}")
    
    # Calculando a média macro das métricas
    macro_precision = np.mean([precision[label] for label in label_names])
    macro_recall = np.mean([recall[label] for label in label_names])
    macro_f1 = np.mean([f1[label] for label in label_names])

    print(f"Macro Precision: {macro_precision}")
    print(f"Macro Recall: {macro_recall}")
    print(f"Macro F1: {macro_f1}")

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
        "macro_f1": macro_f1,
    }

# Função para tokenizar e alinhar as labels
def tokenize_and_align_labels(examples):
    # Verificando se 'sentences' está presente nos dados
    if "sentences" not in examples:
        raise KeyError("A chave 'sentences' não foi encontrada nos dados.")

    tokenized_inputs = []
    labels = []

    for sentence in examples["sentences"]:
        # Tokenizando os tokens de cada sentença
        tokenized_sentence = tokenizer(sentence["tokens"], truncation=True, is_split_into_words=True)

        word_ids = tokenized_sentence.word_ids()  # Pega os ids dos tokens
        sentence_labels = sentence["labels"]  # Labels da sentença

        previous_word_idx = None
        label_ids = []
        
        # Atribuindo as labels corretamente
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token (e.g., [CLS], [SEP])
            elif word_idx != previous_word_idx:
                # Verificando se a label existe em label2id
                label = sentence_labels[word_idx]
                if label not in label2id:
                    print(f"Label {label} não encontrada em label2id!")  # Print de depuração
                label_ids.append(label2id.get(label, -1))  # Usa -1 se a label não for encontrada
            else:
                label_ids.append(label2id.get(sentence_labels[word_idx], -1) if label_all_tokens else -100)

            previous_word_idx = word_idx

        tokenized_inputs.append(tokenized_sentence)
        labels.append(label_ids)

    # Atribuindo as labels ao tokenized_inputs
    for i, tokenized_sentence in enumerate(tokenized_inputs):
        tokenized_sentence["labels"] = labels[i]

    # Retorna os dados tokenizados e alinhados com as labels
    return {"input_ids": [x["input_ids"] for x in tokenized_inputs],
            "attention_mask": [x["attention_mask"] for x in tokenized_inputs],
            "labels": labels}

# Configurações de treinamento
if __name__ == "__main__":
    # Tokenizando e alinhando as labels para todas as partições
    print("Tokenizing datasets...")
    tokenized_datasets = {key: dataset.map(tokenize_and_align_labels, batched=True) for key, dataset in tokenized_datasets.items()}

    hidden_dropout_prob = 0.3506306968358118
    attention_probs_dropout_prob = 0.18770484420356393
    learning_rate = 2.339919292660243e-05
    num_train_epochs = 32

    # Configuração do modelo
    configuration_base = AutoConfig.from_pretrained(
        model_checkpoint, 
        num_labels=len(label2id), 
        label2id=label2id, 
        id2label={i: label for i, label in enumerate(label2id)}
    )
    configuration_base.hidden_dropout_prob = hidden_dropout_prob
    configuration_base.attention_probs_dropout_prob = attention_probs_dropout_prob
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, config=configuration_base)

    # Argumentos de treinamento
    model_name = model_checkpoint.split("/")[-1]
    training_args = TrainingArguments(
        f"{model_name}-finetuned-{task}",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        save_strategy='no',
        evaluation_strategy="epoch",
    )

    # Treinamento usando todas as 10 partições
    results = []
    per_class_results = []
    sets = [x for x in range(10)]  # Usando todas as 10 partições

    for test_set in sets:
        tokenized_datasets['test'] = tokenized_datasets[f'particao_{test_set}']
        tokenized_datasets['train'] = concatenate_datasets([tokenized_datasets[f'particao_{x}'] for x in sets if x != test_set])
        
        data_collator = DataCollatorForTokenClassification(tokenizer)
        model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, config=configuration_base)
        
        print(f"Training model for test set {test_set}...")
        
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],  # Passar o dataset de teste aqui
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        trainer.train()
        eval_result = trainer.evaluate(eval_dataset=tokenized_datasets['test'])
        results.append(eval_result)
        trainer.save_model(f'./bertimbal-Save-{test_set}/')

        # Pre class results
        predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
        predictions = np.argmax(predictions, axis=2)

        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        per_class_results.append(SeqEval().compute(predictions=true_predictions, references=true_labels))

    print(f'OVERALL: {results}')
    print(f'PER CLASS: {per_class_results}')


In [None]:

!pip install torch torchvision torchaudio
!pip install "transformers[torch]"



In [3]:
import os
from datasets import Dataset

# Função para processar um arquivo txt e criar o dataset
def txt_to_dataset(file_path):
    sentences = []
    tokens = []
    labels = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append({
                        'tokens': tokens,
                        'labels': labels
                    })
                    tokens = []
                    labels = []
                continue

            parts = line.split()
            if len(parts) == 2:
                token, label = parts
            elif len(parts) > 2:
                token = " ".join(parts[:-1])
                label = parts[-1]
            else:
                continue  # ignora linhas vazias ou inválidas

            tokens.append(token)
            labels.append(label)

    if tokens:
        sentences.append({
            'tokens': tokens,
            'labels': labels
        })

    return sentences


# Caminho para a pasta onde os arquivos estão localizados
base_path = "../Base de Dados/cachacaNER/"

# Processando cada arquivo de division_0.txt a division_9.txt
for i in range(1,11):
    file_path = os.path.join(base_path, f"particao_{i}.txt")
    sentences = txt_to_dataset(file_path)
    
    # Criando o dataset para o arquivo atual
    dataset = Dataset.from_dict({"sentences": sentences})
    
    # Salvando o dataset com um nome correspondente ao arquivo
    dataset.save_to_disk(f"dat_division_{i}")
    
    print(f"Dataset para particao_{i} criado com sucesso!")

from datasets import load_from_disk

# Caminho para o diretório onde os datasets estão salvos
save_dir = "../models/"  # ou qualquer outro caminho que você tenha usado

# Número da partição/divisão que você deseja carregar (por exemplo, division_0)
partition_number = 0  # Alterar para o número da partição que deseja carregar

# Caminho para o dataset da partição específica
dataset_path = os.path.join(save_dir, f"dat_particao_{partition_number}")

# Carregando o dataset da partição
dataset = load_from_disk(dataset_path)

# Exibindo as primeiras 5 entradas do dataset
print(dataset[:5])
# Verificando as features de um dos datasets para encontrar a chave correta
print ("TERMINA")
print(tokenized_datasets["particao_0"].features)





Saving the dataset (0/1 shards):   0%|          | 0/1312 [00:00<?, ? examples/s]

Dataset para particao_1 criado com sucesso!


Saving the dataset (0/1 shards):   0%|          | 0/1326 [00:00<?, ? examples/s]

Dataset para particao_2 criado com sucesso!


Saving the dataset (0/1 shards):   0%|          | 0/1344 [00:00<?, ? examples/s]

Dataset para particao_3 criado com sucesso!


Saving the dataset (0/1 shards):   0%|          | 0/1314 [00:00<?, ? examples/s]

Dataset para particao_4 criado com sucesso!


Saving the dataset (0/1 shards):   0%|          | 0/1363 [00:00<?, ? examples/s]

Dataset para particao_5 criado com sucesso!


Saving the dataset (0/1 shards):   0%|          | 0/1373 [00:00<?, ? examples/s]

Dataset para particao_6 criado com sucesso!


Saving the dataset (0/1 shards):   0%|          | 0/1422 [00:00<?, ? examples/s]

Dataset para particao_7 criado com sucesso!


Saving the dataset (0/1 shards):   0%|          | 0/1327 [00:00<?, ? examples/s]

Dataset para particao_8 criado com sucesso!


Saving the dataset (0/1 shards):   0%|          | 0/1415 [00:00<?, ? examples/s]

Dataset para particao_9 criado com sucesso!


Saving the dataset (0/1 shards):   0%|          | 0/1432 [00:00<?, ? examples/s]

Dataset para particao_10 criado com sucesso!


FileNotFoundError: Directory ../models/dat_particao_0 not found

In [None]:
ner_dataset

In [None]:
ner_dataset["train"].features[f"ner_tags"]

In [None]:
label_list = ner_dataset["train"].features[f"ner_tags"].feature.names
label_list

In [None]:
# BERT pretrained model
model_id = 'neuralmind/bert-base-portuguese-cased'

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
import transformers

assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
tokenizer("Olá, isto é uma sentença!")

In [None]:
example = ner_dataset["train"][2]
print(example["tokens"])

In [None]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

In [None]:
len(example[f"ner_tags"]), len(tokenized_input["input_ids"])

In [None]:
print(tokenized_input.word_ids())

In [None]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"ner_tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

In [None]:
label_all_tokens = True

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenized_datasets = ner_dataset.map(tokenize_and_align_labels, batched=True, load_from_cache_file=False)

In [None]:
tokenized_datasets["train"]["labels"][2]

In [None]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    model_id, num_labels=len(label_list), from_pt=True
)

In [None]:
batch_size = 8

In [None]:
from transformers import create_optimizer

num_train_epochs = 3
num_train_steps = (len(tokenized_datasets["train"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [None]:
import tensorflow as tf

model.compile(optimizer=optimizer)

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")

In [None]:
train_set = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
validation_set = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)
test_set = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
import numpy as np
from datasets import load_metric
from transformers.keras_callbacks import KerasMetricCallback

metric = load_metric("seqeval")
labels = [label_list[i] for i in example[f"ner_tags"]]
metric.compute(predictions=[labels], references=[labels])


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=validation_set
)

In [None]:
from transformers.keras_callbacks import PushToHubCallback
# from tensorflow.keras.callbacks import TensorBoard

# model_name = model_checkpoint.split("/")[-1]
# push_to_hub_model_id = f"{model_name}-finetuned-{task}"

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./tc_model_save/logs")

# push_to_hub_callback = PushToHubCallback(
#     output_dir="./tc_model_save",
#     tokenizer=tokenizer,
#     hub_model_id=push_to_hub_model_id,
# )

callbacks = [metric_callback, tensorboard_callback]

model.fit(
    
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks,
)

In [1]:
import os

db_path = "optuna_study.db"

if os.path.exists(db_path):
    os.remove(db_path)
    print("Banco de dados deletado com sucesso.")
else:
    print("O banco de dados não existe.")





Banco de dados deletado com sucesso.


In [1]:
import sqlite3

conn = sqlite3.connect("optuna_study.db")
cursor = conn.cursor()

cursor.execute("SELECT * FROM trials")
print(cursor.fetchall())

conn.close()




[(1, 0, 1, 'COMPLETE', '2025-03-17 17:45:05.971460', '2025-03-17 22:18:47.019100'), (2, 1, 1, 'COMPLETE', '2025-03-17 22:18:47.035072', '2025-03-18 00:52:06.921875'), (3, 2, 1, 'COMPLETE', '2025-03-18 00:52:06.949967', '2025-03-18 01:02:00.231359'), (4, 3, 1, 'COMPLETE', '2025-03-18 01:02:00.246875', '2025-03-18 06:39:38.600319'), (5, 4, 1, 'COMPLETE', '2025-03-18 06:39:38.621693', '2025-03-18 06:50:25.657802'), (6, 5, 1, 'COMPLETE', '2025-03-18 06:50:25.670239', '2025-03-18 07:00:35.832535'), (7, 6, 1, 'COMPLETE', '2025-03-18 07:00:35.845027', '2025-03-18 09:48:10.888379'), (8, 7, 1, 'COMPLETE', '2025-03-18 09:48:10.909911', '2025-03-18 09:53:29.625023'), (9, 8, 1, 'COMPLETE', '2025-03-18 09:53:29.637677', '2025-03-18 12:39:59.181870'), (10, 9, 1, 'COMPLETE', '2025-03-18 12:39:59.194094', '2025-03-18 12:44:38.925507'), (11, 10, 1, 'COMPLETE', '2025-03-18 12:44:38.937566', '2025-03-18 12:48:09.540221'), (12, 11, 1, 'COMPLETE', '2025-03-18 12:48:09.552365', '2025-03-18 12:50:52.469633')

In [None]:
model.predict(validation_set)

In [None]:
model.save_pretrained("./modelNER")

In [None]:
model.config.id2label.values()

In [None]:
from datasets import load_metric
import numpy as np


metric = load_metric("seqeval")


def evaluate(model, dataset, ner_labels):
  all_predictions = []
  all_labels = []
  for batch in dataset:
      logits = model.predict(batch)["logits"]
      labels = batch["labels"]
      predictions = np.argmax(logits, axis=-1)
      for prediction, label in zip(predictions, labels):
          for predicted_idx, label_idx in zip(prediction, label):
              if label_idx == -100:
                #   print(label)
                  continue
              all_predictions.append(ner_labels[predicted_idx])
              all_labels.append(ner_labels[label_idx])
              #print('\npredicted=',ner_labels[predicted_idx], '\nlabel=',ner_labels[label_idx])
  #print("all_predictions=",[all_predictions],'\nall_labels=',[all_labels])
  return metric.compute(predictions=[all_predictions], references=[all_labels])

#results = evaluate(model, tf_eval_dataset, ner_labels=list(model.config.id2label.values()))
results = evaluate(model, test_set, ner_labels=list(model.config.id2label.values()))
results