In [3]:
! pip install datasets transformers seqeval evaluate

  from pkg_resources import load_entry_point


In [4]:
import transformers

# Record which transformers release this notebook was executed with.
print(transformers.__version__)

4.41.0


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Pretrained checkpoint used to load both the tokenizer and the token-classification model.
model_checkpoint = "neuralmind/bert-base-portuguese-cased"
# Per-device batch size, reused for training and for evaluation.
batch_size = 16

## Loading the dataset

In [6]:
from datasets import load_dataset
# Load the `lfcc/portuguese_ner` dataset; the bare last expression displays its splits.
datasets = load_dataset("lfcc/portuguese_ner")
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3716
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 930
    })
})

In [7]:
# Tag names of the `ner_tags` feature; later cells index this list with tag ids.
label_list = datasets["train"].features["ner_tags"].feature.names
print(label_list)

['O', 'B-Data', 'I-Data', 'B-Local', 'I-Local', 'B-Organizacao', 'I-Organizacao', 'B-Pessoa', 'I-Pessoa', 'B-Profissao', 'I-Profissao']


## Preprocessing the data

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
# Tokenize a raw sentence; the result holds input_ids, token_type_ids and attention_mask.
text = "As aulas de NLP são interessantes!"
tokenized_input = tokenizer(text)
tokenized_input

{'input_ids': [101, 510, 6880, 125, 248, 18353, 453, 20764, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# Same sentence, but passed as a list of words (is_split_into_words=True), which is the
# format the dataset's `tokens` column uses.
tokens = ["As", "aulas", "de", "NLP", "são", "interessantes","!"]
tokenized_input = tokenizer(tokens, is_split_into_words=True)
tokenized_input

{'input_ids': [101, 510, 6880, 125, 248, 18353, 453, 20764, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
# Map the ids back to token strings to see the special tokens and the sub-word pieces.
new_tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(new_tokens)

['[CLS]', 'As', 'aulas', 'de', 'N', '##LP', 'são', 'interessantes', '!', '[SEP]']


In [12]:
# Compare the number of input words with the number of tokens produced by the tokenizer.
len(tokens), len(new_tokens)

(7, 10)

The `word_ids()` method maps special tokens to `None` and all other tokens to the
index of the word they belong to.

In [13]:
# For each token, the index of the word it came from (None for special tokens).
print(tokenized_input.word_ids())

[None, 0, 1, 2, 3, 3, 4, 5, 6, None]


In [14]:
def tokenize_and_align_labels(samples, label_all_tokens=False):
    """Tokenize pre-split sentences and align word-level NER tags with the resulting tokens.

    Args:
        samples: A batch with a "tokens" entry (one list of words per sentence) and a
            "ner_tags" entry (one list of tag ids per sentence, one id per word).
        label_all_tokens: When False (default), only the first token of each word carries
            the word's tag and the remaining tokens of that word are labelled -100.
            When True, every token of a word carries the word's tag.

    Returns:
        The tokenizer output for the batch, with an added "labels" entry holding one list
        of label ids per sentence, aligned position by position with the tokens.
    """
    tokenized_inputs = tokenizer(samples["tokens"], truncation=True, max_length=512, is_split_into_words=True)

    labels = []
    for i, label in enumerate(samples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have no word id; -100 makes the loss function ignore them.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of a word: use the word's own tag.
                label_ids.append(label[word_idx])
            else:
                # Remaining tokens of the same word: repeat the tag or mask it out,
                # depending on `label_all_tokens`.
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [15]:
# Sanity-check the function on the first five training examples.
tokenize_and_align_labels(datasets['train'][:5])

{'input_ids': [[101, 1656, 3347, 131, 5523, 6046, 18961, 122, 4216, 10780, 151, 1479, 119, 11019, 122, 120, 291, 17642, 173, 187, 11964, 18394, 117, 1838, 1479, 117, 8852, 5892, 125, 18868, 7286, 7545, 22308, 6213, 15289, 22301, 122, 3410, 113, 291, 806, 114, 13056, 171, 5463, 119, 102], [101, 1656, 3347, 131, 9884, 12548, 122, 17495, 8451, 119, 11019, 122, 120, 291, 17642, 173, 354, 9369, 12234, 17807, 213, 13292, 18394, 6538, 117, 617, 22280, 3981, 2848, 22280, 117, 8852, 5892, 125, 213, 15289, 4529, 122, 3410, 113, 291, 806, 114, 5427, 10847, 257, 16379, 22327, 119, 102], [101, 2627, 283, 125, 13268, 182, 171, 475, 5131, 714, 125, 2205, 8398, 385, 1374, 117, 7241, 173, 2939, 119, 18838, 119, 2952, 22338, 122, 475, 5131, 2303, 107, 113, 100, 114, 240, 1564, 171, 3056, 125, 4406, 171, 6775, 622, 117, 113, 100, 114, 107, 117, 229, 2567, 171, 7212, 171, 496, 117, 2020, 22220, 119, 102], [101, 10607, 119, 4654, 2611, 119, 102], [101, 2435, 22330, 118, 1433, 118, 13778, 120, 2435, 22330, 

To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the `map` method of the `datasets` object we created earlier. This will apply the function on all the elements of all the splits in `datasets`, so our training and test data will be preprocessed in one single command.

In [16]:
# Apply the preprocessing to every split; batched=True passes batches of examples to the function.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

In [17]:
# Inspect the result: the tokenizer outputs and `labels` now appear as additional columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3716
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 930
    })
})

## Fine-tuning the model

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Lookup tables between class indices and human-readable tag names, in both directions.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

# Token-classification model built on the pretrained checkpoint, with one output per tag.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
output_model_name = "my_model_"

args = TrainingArguments(
    output_dir=output_model_name,
    report_to="none",
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and save once per epoch, then reload the checkpoint with the highest F1
    # (the "f1" key is produced by `compute_metrics`).
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Optional extras, left disabled:
    # save_total_limit=1,
    # push_to_hub=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Then we will need a data collator that will batch our processed examples together while applying padding to make them all the same size (each batch will be padded to the length of its longest example).

In [20]:
from transformers import DataCollatorForTokenClassification

# Collator that assembles the processed examples into batches; it is given the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

The last thing to define for our `Trainer` is how to compute the metrics from the predictions. Here we will load the [`seqeval`](https://github.com/chakki-works/seqeval) metric via the `evaluate` library.

In [21]:
import evaluate
# Load the seqeval metric through the `evaluate` library.
metric = evaluate.load("seqeval")

Using the latest cached version of the module from /home/lfc/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Wed May 22 18:44:28 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.


In [22]:
import numpy as np

def compute_metrics(p):
    """Compute overall seqeval precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        p: A (predictions, labels) pair. `predictions` holds per-token class scores and is
            reduced with argmax over axis 2; `labels` holds the reference tag ids, with
            -100 marking positions that must be ignored.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy", taken from the
        overall scores reported by the seqeval metric.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions (label -100) and convert ids to tag names, sentence by sentence.
    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        kept = [(pred_id, label_id) for pred_id, label_id in zip(prediction, label) if label_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[label_id] for _, label_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [23]:
# Wire together the model, the training configuration, both splits, batching and metrics.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [24]:
import torch

# Check whether a CUDA device is available to PyTorch.
torch.cuda.is_available()

True

In [25]:
# Run fine-tuning; with eval_strategy="epoch" the metrics are reported once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.072903,0.934643,0.964229,0.949205,0.982617
2,No log,0.066901,0.944205,0.969611,0.956739,0.984368
3,0.128600,0.066115,0.946489,0.968661,0.957447,0.984675


<transformers.trainer_utils.EvalPrediction object at 0x7fe6e9235f70>
<transformers.trainer_utils.EvalPrediction object at 0x7fe6e9277eb0>
<transformers.trainer_utils.EvalPrediction object at 0x7fe6e94e2f70>


TrainOutput(global_step=699, training_loss=0.10154576908706425, metrics={'train_runtime': 180.8548, 'train_samples_per_second': 61.641, 'train_steps_per_second': 3.865, 'total_flos': 658641856754904.0, 'train_loss': 0.10154576908706425, 'epoch': 3.0})

The `evaluate` method allows you to evaluate again on the evaluation dataset or on another dataset:

In [None]:
# Re-run evaluation; no dataset is passed, so the Trainer's own eval dataset is used.
trainer.evaluate()

<transformers.trainer_utils.EvalPrediction object at 0x7f4177ec39d0>


{'eval_loss': 0.06827212870121002,
 'eval_precision': 0.9443929564411492,
 'eval_recall': 0.9677113010446344,
 'eval_f1': 0.9559099437148217,
 'eval_accuracy': 0.9840616516332429,
 'eval_runtime': 6.8671,
 'eval_samples_per_second': 135.429,
 'eval_steps_per_second': 8.592,
 'epoch': 3.0}

To get the precision/recall/f1 computed for each category now that we have finished training, we can apply the same function as before on the result of the `predict` method:

In [29]:
# Predict on the test split; the third returned value is not needed here.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only the positions whose reference label is not the ignored index (-100).
kept_pairs = [
    [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_predictions = [[label_list[pred_id] for pred_id, _ in row] for row in kept_pairs]
true_labels = [[label_list[gold_id] for _, gold_id in row] for row in kept_pairs]

# The full seqeval result includes one entry per entity category plus the overall scores.
results = metric.compute(predictions=true_predictions, references=true_labels)
results

<transformers.trainer_utils.EvalPrediction object at 0x7fe6e90f8880>


{'Data': {'precision': 0.976545842217484,
  'recall': 0.9828326180257511,
  'f1': 0.9796791443850267,
  'number': 466},
 'Local': {'precision': 0.9706744868035191,
  'recall': 0.9792899408284024,
  'f1': 0.9749631811487481,
  'number': 1014},
 'Organizacao': {'precision': 0.6230769230769231,
  'recall': 0.7714285714285715,
  'f1': 0.6893617021276597,
  'number': 105},
 'Pessoa': {'precision': 0.9698630136986301,
  'recall': 0.9819694868238558,
  'f1': 0.9758787043418332,
  'number': 1442},
 'Profissao': {'precision': 0.7417218543046358,
  'recall': 0.8484848484848485,
  'f1': 0.7915194346289753,
  'number': 132},
 'overall_precision': 0.9464893287967832,
 'overall_recall': 0.9686609686609686,
 'overall_f1': 0.9574468085106382,
 'overall_accuracy': 0.9846746650319643}

In [None]:
#trainer.save_model("my_best")

In [None]:
#trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/lfcc/my_model_/commit/6c17b141e09c8b6b672f533964150781dcbb6899', commit_message='End of training', commit_description='', oid='6c17b141e09c8b6b672f533964150781dcbb6899', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
from transformers import pipeline

# Token-classification pipeline built from the `lfcc/bert-portuguese-ner` checkpoint.
# Alternatives: model="./ner_model" for a saved directory, or
# model=model, tokenizer=tokenizer to use the objects fine-tuned in this notebook.
ner_pipeline = pipeline(
    "token-classification",
    model="lfcc/bert-portuguese-ner",
    aggregation_strategy="first",
)

# Sample input (kept verbatim, since it is what the model is run on)
text = """O João Paulo, médico, vive no Porto (Portugal) desde 2024.
A Soraia Marques obeteve o seu mestrado na Universidade do Minho em 26 de Maio de 2021.
Depois de obter o grau, foi trabalhar para o o tribunal de Braga como juíza."
"""

results = ner_pipeline(text)

# One line per detected entity: surface text, entity group and confidence score.
for ent in results:
    print(f"{ent['word']} -> {ent['entity_group']} ({ent['score']:.3f})")


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


João Paulo -> Pessoa (0.964)
médico -> Profissao (0.760)
Porto -> Local (0.973)
Portugal -> Local (0.964)
2024 -> Data (0.898)
Soraia Marques -> Pessoa (0.975)
Universidade do Minho -> Organizacao (0.875)
26 de Maio de 2021 -> Data (0.987)
tribunal -> Organizacao (0.557)
Braga -> Local (0.960)
juíza -> Profissao (0.791)


In [19]:
# Second, longer sample: a short biographical paragraph.
text2 = """William Carr Beresford (1768–1854), conhecido como General Beresford, foi uma figura militar de destaque durante as Guerras Napoleónicas, especialmente em território português. Oficial britânico de carreira, desempenhou um papel central como comandante das forças portuguesas aliadas ao Reino Unido, tendo sido nomeado Marechal do Exército Português em 1809. Responsável pela reorganização do Exército português segundo modelos britânicos, Beresford foi fundamental para a vitória luso-britânica na Batalha de Albuera (1811) e em outras campanhas peninsulares. A informação ficou disponível em 1999 caixas"""

results = ner_pipeline(text2)

# One line per detected entity: surface text, entity group and confidence score.
for ent in results:
    print(f"{ent['word']} -> {ent['entity_group']} ({ent['score']:.3f})")


William Carr Beresford -> Pessoa (0.981)
1768 -> Data (0.981)
1854 -> Data (0.983)
General -> Profissao (0.780)
Beresford -> Pessoa (0.617)
Oficial -> Profissao (0.693)
Reino Unido -> Local (0.417)
Marechal do Exército -> Profissao (0.778)
Português -> Organizacao (0.398)
1809 -> Data (0.966)
Beresford -> Pessoa (0.705)
Albuera -> Local (0.951)
1811 -> Data (0.967)
1999 -> Data (0.524)
