In [None]:
! pip install transformers sentencepiece datasets conllu wandb

In [None]:
import logging
import os
import ast
import sys
from dataclasses import dataclass, field
from typing import Optional
from pathlib import Path
import glob
import re

from conllu import parse
import pandas as pd
import datasets
import numpy as np
from datasets import ClassLabel, load_dataset, load_metric

import transformers
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    HfArgumentParser,
    PretrainedConfig,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
    set_seed,
    EarlyStoppingCallback
)

from transformers.trainer_utils import get_last_checkpoint

import wandb
# Log in to Weights & Biases up front, before any run is created further down.
wandb.login()

## Dataset preparation and utility functions

In [None]:
# Start from a clean slate: remove every previously downloaded treebank directory.
! rm -rf /content/UD_*

# NOTE(review): no Bosque treebank is fetched in this cell, although a
# /content/UD_Portuguese-Bosque-r2.10 path is configured further down — confirm
# where that directory is expected to come from.

# Porttinari treebank
! git clone https://github.com/huberemanuel/UD_Portuguese-Porttinari.git

# DANTE treebank
! git clone https://github.com/huberemanuel/UD_Portuguese-DANTE.git

# PetroGold treebank
! git clone https://github.com/UniversalDependencies/UD_Portuguese-PetroGold

In [None]:
# Convert every CoNLL-U split of every downloaded treebank into a CSV file
# (`train.csv`, `dev.csv`, `test.csv`) that `datasets` can load with its CSV builder.
split_pattern = re.compile(r"-(train|dev|test)\.conllu")

for dataset_dir in glob.glob("/content/UD_*"):
    for filename in glob.glob(f"{dataset_dir}/*.conllu"):
        split_match = split_pattern.search(filename)
        if split_match is None:
            # Not a train/dev/test file: skip it instead of failing on the lookup.
            logging.warning("Skipping %s: not a train/dev/test split", filename)
            continue
        set_name = split_match.group(1)
        parent_dir = Path(filename).parent

        # Read with an explicit encoding so the result does not depend on the locale.
        with open(filename, "r", encoding="utf-8") as in_f:
            sents = parse(in_f.read())

        data = {
            "id": [],
            "tokens": [],
            "tags": [],
        }
        for sent in sents:
            # Keep only entries whose id is a plain integer; entries with any
            # other kind of id are left out of the token/tag lists.
            words = [token for token in sent if isinstance(token["id"], int)]
            data["id"].append(sent.metadata["sent_id"])
            data["tokens"].append([token["form"] for token in words])
            data["tags"].append([token["upos"] for token in words])

        # One row per sentence; the list columns are stored as their string repr.
        df = pd.DataFrame(data)
        df.to_csv(parent_dir.joinpath(f"{set_name}.csv"), index=False)

In [None]:
# Directory and CoNLL-U file prefix of every supported treebank.
TREEBANKS = {
    "bosque": ("/content/UD_Portuguese-Bosque-r2.10", "pt_bosque-ud"),
    "porttinari": ("/content/UD_Portuguese-Porttinari", "pt_porttinari-ud"),
    "dante": ("/content/UD_Portuguese-DANTE", "pt_dante-ud"),
    "petrogold": ("/content/UD_Portuguese-PetroGold", "pt_petrogold-ud"),
}
SPLITS = ("train", "dev", "test")


def _csv_files(name):
    """Return the per-split CSV paths of treebank `name`."""
    root, _ = TREEBANKS[name]
    return {split: f"{root}/{split}.csv" for split in SPLITS}


def _conllu_files(name):
    """Return the per-split CoNLL-U paths of treebank `name`."""
    root, prefix = TREEBANKS[name]
    return {split: f"{root}/{prefix}-{split}.conllu" for split in SPLITS}


data_files_bosque = _csv_files("bosque")
data_files_porttinari = _csv_files("porttinari")
data_files_dante = _csv_files("dante")
data_files_petrogold = _csv_files("petrogold")

# Either a single treebank name or several names joined by "_" (e.g. "porttinari_dante").
dataset_name = "porttinari"
dataset_names = dataset_name.split("_")

unknown_names = [name for name in dataset_names if name not in TREEBANKS]
if unknown_names:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        f"Unknown dataset(s) {unknown_names}; expected names from {sorted(TREEBANKS)}"
    )

if len(dataset_names) > 1:
    # Combined corpus: concatenate the CSV and CoNLL-U files of every requested
    # treebank, split by split, into /content.
    data_files = {split: f"/content/{split}.csv" for split in SPLITS}
    conllu_files = {split: f"/content/{split}.conllu" for split in SPLITS}

    merged_frames = {}
    for split in SPLITS:
        # `pd.concat` replaces the chained `DataFrame.append` calls (removed in pandas 2.0).
        merged_frames[split] = pd.concat(
            [pd.read_csv(_csv_files(name)[split]) for name in dataset_names],
            ignore_index=True,
        )
        merged_frames[split].to_csv(data_files[split], index=False)

        # Opening with "wb" truncates a stale file from a previous run; the
        # sources are then appended in the order they were requested.
        with open(conllu_files[split], "wb") as out_f:
            for name in dataset_names:
                with open(_conllu_files(name)[split], "rb") as in_f:
                    out_f.write(in_f.read())

    train_df, dev_df, test_df = (merged_frames[split] for split in SPLITS)
else:
    data_files = _csv_files(dataset_name)
    conllu_files = _conllu_files(dataset_name)

train_file = conllu_files["train"]
dev_file = conllu_files["dev"]
test_file = conllu_files["test"]

raw_datasets = load_dataset("csv", data_files=data_files)
raw_datasets

In [None]:
# Name the W&B project after the selected dataset. The former
# `%env WANDB_PROJECT=pos_$dataset_name_test` referred to a variable called
# `dataset_name_test`, which does not exist, so `dataset_name` was never
# interpolated; the value is built explicitly instead.
os.environ["WANDB_PROJECT"] = f"pos_{dataset_name}_test"

In [None]:
def str_to_list(example):
    """Parse the stringified `tokens` and `tags` columns back into Python lists.

    The CSV round trip stores each list as its `repr`; `ast.literal_eval`
    restores it. The example is updated in place and returned.
    """
    for column in ("tokens", "tags"):
        example[column] = ast.literal_eval(example[column])
    return example
    
# Restore the list columns of every split.
for set_name in list(raw_datasets.keys()):
    dataset = raw_datasets[set_name]
    raw_datasets[set_name] = dataset.map(str_to_list)

In [None]:
# Peek at the first training sentence: its tokens and the matching tags.
first_sentence = raw_datasets["train"][0]
first_sentence["tokens"], first_sentence["tags"]

In [None]:
# Schema of the training split and the names of the input / target columns.
train_split = raw_datasets["train"]
column_names, features = train_split.column_names, train_split.features
text_column_name, label_column_name = "tokens", "tags"

def get_label_list(labels):
    """Return the distinct tags found in `labels` as a sorted list.

    `labels` is an iterable of tag sequences (one sequence per sentence).
    """
    return sorted({tag for sentence_tags in labels for tag in sentence_tags})

# Tag inventory of the training split and its tag -> integer id mapping.
label_list = get_label_list(raw_datasets["train"][label_column_name])
label_to_id = dict(zip(label_list, range(len(label_list))))

num_labels = len(label_list)
num_labels

In [None]:
# Display the tag inventory.
label_list

Utility functions

In [None]:
def create_clean_pred_file(test_file: str, dataset_name: str, set_name: str, method = "first"):
    """Write a copy of `test_file` whose UPOS column is blanked out.

    Every token with an integer id gets `"_"` as its `upos`. The copy is
    written to `{dataset_name}_{set_name}_pred_{method}.conllu` (relative to the
    working directory) and that path is returned.
    """
    with open(test_file, "r") as source:
        sentences = parse(source.read())

    for sentence in sentences:
        for word in sentence:
            if isinstance(word["id"], int):
                word["upos"] = "_"

    pred_file = f"{dataset_name}_{set_name}_pred_{method}.conllu"
    with open(pred_file, "w") as target:
        target.writelines([sentence.serialize() + "\n" for sentence in sentences])
    return pred_file

from collections import Counter
import torch
def create_output_pred_file(pred_file: str, test_preds, method: str="first"):
    """Write predicted UPOS tags into an existing CoNLL-U file, in place.

    Label ids are turned into tag names through the module-level
    `model.config.id2label`.

    Args:
        pred_file: Path of the CoNLL-U file to update; it is read, its tokens
            with integer ids get a new `upos`, and the file is rewritten.
        test_preds: Mapping from `sent_id` to a dict with the keys `"preds"`
            (per-subtoken scores; `argmax` over the last axis gives the label
            id), `"labels"` (per-subtoken label ids, `-100` marks positions to
            skip) and `"offset_mapping"` (per-subtoken offsets; a start of 0
            is treated as the beginning of a word).
        method: `"first"` keeps the prediction of each word's first subtoken.
            `"max"`, `"most_voted"` and `"last"` combine the predictions
            collected for a word once the next word starts.

    Raises:
        ValueError: if `method` is not one of the four supported values.
        AssertionError: if a sentence ends up with more predictions than tokens.
    """
    if method not in ("first", "max", "most_voted", "last"):
        raise ValueError(
            "Method `{}` is not supported; use first, max, most_voted or last.".format(method)
        )

    with open(pred_file, "r+") as in_f:
        sents = parse(in_f.read())
        for sent in sents:
            outputs = test_preds[sent.metadata["sent_id"]]
            raw_preds = outputs["preds"]
            preds = raw_preds.argmax(axis=-1)
            labels = outputs["labels"]
            offsets = outputs["offset_mapping"]

            final_preds = []
            aux = []         # labels collected for the word currently being read
            aux_scores = []  # confidence (0-100) of each label in `aux`

            if method == "first":
                for pred, label, offset in zip(preds, labels, offsets):
                    if offset[0] == 0 and label != -100:
                        final_preds.append(model.config.id2label[pred])
            else:
                for raw_pred, pred, label, offset in zip(raw_preds, preds, labels, offsets):
                    if label == -100:
                        continue
                    score = 100 * float(torch.nn.functional.softmax(torch.tensor(raw_pred), dim=0).max())

                    if offset[0] == 0 and len(aux) <= 1:
                        final_preds.append(model.config.id2label[pred])
                        aux = [final_preds[-1]]
                        aux_scores = [score]
                    elif offset[0] == 0:
                        # Processing past tokens. The comparisons used to be the
                        # bare strings "most_voted" / "last", which are always
                        # true, so "last" could never be selected.
                        if method == "max":
                            new_label = aux[np.argmax(aux_scores)]
                        elif method == "most_voted":
                            new_label = Counter(aux).most_common(1)[0][0]
                        else:  # method == "last"
                            new_label = aux[-1]
                        final_preds[-1] = new_label

                        # Processing new token
                        final_preds.append(model.config.id2label[pred])
                        aux = [final_preds[-1]]
                        aux_scores = [score]
                    elif offset[0] != 0:
                        aux.append(model.config.id2label[pred])
                        aux_scores.append(score)

            sent_tokens = 0

            for i, token in enumerate(sent):
                if isinstance(token["id"], int):
                    try:
                        token["upos"] = final_preds[sent_tokens]
                        sent_tokens += 1
                    except IndexError:
                        # Fewer predictions than tokens: report it and leave the token untouched.
                        print("Error on sentence {} - token {}-{}".format(sent.metadata["sent_id"], i, token["form"]))

            # NOTE(review): `sent_tokens` can never exceed `len(final_preds)` (it only
            # grows after a successful lookup above), so this branch looks unreachable,
            # and the labels gathered for the last word of a sentence are never
            # combined. Left unchanged — confirm the intended behaviour.
            if method != "first" and sent_tokens - len(final_preds) == 1:
                if method == "max":
                    final_preds.append( aux[np.argmax(aux_scores)] )
                elif method == "most_voted":
                    curr_label = Counter(aux).most_common(1)[0][0]
                    final_preds.append( curr_label )
                elif method == "last":
                    final_preds.append( aux[-1] )
                else:
                    raise Exception("Method `{}` not supported to deal with in the last phase :(".format(method))

            assert len(final_preds) == sent_tokens, f"Sent have {sent_tokens} tokens and pred have {len(final_preds)} tokens."

        in_f.seek(0)
        in_f.writelines([sentence.serialize() + "\n" for sentence in sents])
        # Drop leftover bytes in case the new content is shorter than the old one.
        in_f.truncate()

In [None]:
import json

def create_pred_file(set_name: str, trainer, conllu_file: str, dataset_name: str):
    """Predict one split and store both the metrics and a tagged CoNLL-U file.

    The split is taken from the module-level `tokenized_datasets`. Metrics are
    printed and saved as `{dataset_name}_{set_name}_pred_results.json`; the
    predictions are written into a blanked copy of `conllu_file` using the
    "first" strategy.

    Raises:
        ValueError: if `set_name` is not "train", "test" or "dev".
    """
    if set_name not in ("train", "test", "dev"):
        raise ValueError("set_name must be [train, test, dev]")

    split = tokenized_datasets[set_name]
    preds, labels, metrics = trainer.predict(split)

    serialized_metrics = json.dumps(metrics)
    with open(f"{dataset_name}_{set_name}_pred_results.json", "w") as out_f:
        out_f.write(serialized_metrics)
    print(metrics)

    # Index the per-sentence outputs by sentence id.
    new_preds = {
        row["id"]: {
            "preds": pred,
            "labels": label,
            "offset_mapping": row["offset_mapping"],
        }
        for row, pred, label in zip(split, preds, labels)
    }

    pred_file = create_clean_pred_file(conllu_file, dataset_name, set_name, method="first")
    create_output_pred_file(pred_file, new_preds, method="first")

Loading model config

In [None]:
# Checkpoint to fine-tune. Alternative checkpoints are kept commented out:
# model_name = "neuralmind/bert-base-portuguese-cased"
# model_name = "microsoft/mdeberta-v3-base"
# model_name = "xlm-roberta-base"
model_name = "neuralmind/bert-base-portuguese-cased"
# Size the configuration for the number of tags found in the training split.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    finetuning_task="pos",
)
config

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

# Validate the tokenizer before loading the model weights, so an unsupported
# checkpoint fails fast. A fast tokenizer is required because the label
# alignment further down relies on `word_ids()` and on offset mappings.
if not isinstance(tokenizer, PreTrainedTokenizerFast):
    raise ValueError(
        "This example script only works for models that have a fast tokenizer. Checkout the big table of models at"
        " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
        " this requirement"
    )

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    config=config,
)

In [None]:
# If the checkpoint already ships a custom label set, reuse its ordering when it
# matches the dataset's tags; otherwise warn and fall back to the dataset labels.
if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
    if sorted(model.config.label2id) == sorted(label_list):
        label_list = [model.config.id2label[i] for i in range(num_labels)]
        label_to_id = {l: i for i, l in enumerate(label_list)}
    else:
        # The message must be ONE string: a second positional argument is treated
        # by `logging` as a %-format argument and makes the call fail to format.
        logging.warning(
            "Your model seems to have been trained with labels, but they don't match the dataset: "
            f"model labels: {sorted(model.config.label2id)}, dataset labels:"
            f" {sorted(label_list)}.\nIgnoring the model labels as a result."
        )

# Set the correspondences label/ID inside the model config
model.config.label2id = {l: i for i, l in enumerate(label_list)}
model.config.id2label = {i: l for i, l in enumerate(label_list)}
model.config.id2label

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens = False): # Modified here again
    """Tokenize a batch of pre-split sentences and align the tags to the subtokens.

    Args:
        examples: Batch with a `"tokens"` list (one word list per sentence) and a
            `"tags"` list (one tag list per sentence). `examples["tokens"]` is
            overwritten with the cleaned words.
        label_all_tokens: When true, every subtoken of a word receives the word's
            label id; otherwise only the first subtoken does and the remaining
            ones get -100.

    Returns:
        The tokenizer output (including offset mappings) with an added `"labels"`
        entry holding one list of label ids per sentence.

    Uses the module-level `tokenizer` and `model.config.label2id`.
    """
    def replace_token(token: str) -> str:
        """Map the tokens listed below to "*"; return any other token unchanged."""
        # NOTE(review): most entries of this list show up as empty strings here —
        # presumably special characters that were lost along the way; confirm
        # against the original notebook. Also note that `"" ""` is implicit string
        # concatenation (a comma is probably missing).
        if token in ["", "", "" "", "", "", "", "", "", "", "", "", "", "", "", "–", "", ""]:
            return "*"
        return token
        
    # Clean the words of every sentence in the batch, in place.
    for i, sample in enumerate(examples["tokens"]):
        examples["tokens"][i] = [replace_token(tk) for tk in sample]
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        is_split_into_words=True,
        return_offsets_mapping=True,
    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        # Maps every subtoken of sentence i to the index of the word it came from.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(model.config.label2id[label[word_idx]])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(model.config.label2id[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Smoke-test the alignment on a one-sentence batch and show the result.
example = raw_datasets["train"][:1]
res = tokenize_and_align_labels(example)
display(res)

In [None]:
# Tokenize every split in batches; each example gains the tokenizer outputs and a `labels` column.
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)

## Fine-tuning a model

In [None]:
# Run 10 experiments, one per random seed (42..51).
batch_size = 32 # 16 for DeBERTa-base

# Set before any Trainer is created (this replaces the `%env` line that used to
# sit inside the loop).
os.environ["WANDB_LOG_MODEL"] = "true"

# Seed-independent objects are built once instead of on every iteration.
data_collator = DataCollatorForTokenClassification(tokenizer)
acc_metric = load_metric("accuracy")
f1_metric = load_metric("f1")


def compute_metrics(p):
    """Return token-level accuracy and macro-F1 over positions whose label is not -100.

    `p` unpacks into (predictions, labels); the predicted label id is the argmax
    of `predictions` over axis 2.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Remove ignored index (label -100)
    keep = labels != -100
    true_predictions = predictions[keep]
    true_labels = labels[keep]

    acc = acc_metric.compute(predictions=true_predictions, references=true_labels)
    f1 = f1_metric.compute(predictions=true_predictions, references=true_labels, average="macro")
    return {
        "acc": acc["accuracy"],
        "f1": f1["f1"],
    }


for s in range(10):
    seed = 42+s

    # Start every experiment from the pretrained checkpoint. Reusing the same
    # `model` object would make run N continue from the weights fine-tuned in
    # run N-1, so the ten runs would not be independent. Seeding first makes the
    # initialisation of the new classification head depend on the seed as well.
    # `model.config` is passed on so the label maps set earlier are kept.
    set_seed(seed)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        config=model.config,
    )

    args = TrainingArguments(
        f"{model_name}-finetuned-pos",
        evaluation_strategy="steps",
        eval_steps=250,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=30,
        weight_decay=0.01,
        push_to_hub=False,
        report_to="wandb",
        load_best_model_at_end=True,
        seed=seed,
        logging_steps=50
    )

    run = wandb.init(
        project=f"pos_{dataset_name}", 
        name=f"{model_name}-finetuned-pos-{seed}",
        tags=[model_name]
    )
    try:
        trainer = Trainer(
            model,
            args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["dev"],
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer.train()
        trainer.evaluate()

        # Tag the test and train splits; the files are overwritten on every seed
        # and attached to the current run as an artifact.
        create_pred_file("test", trainer, test_file, dataset_name)
        create_pred_file("train", trainer, train_file, dataset_name)

        artifact = wandb.Artifact("preds", type='dataset')

        for filename in [
            f"/content/{dataset_name}_test_pred_first.conllu",
            f"/content/{dataset_name}_train_pred_first.conllu",
        ]:
            artifact.add_file(filename)
        run.log_artifact(artifact)
    finally:
        # Close the run so the next seed is logged as a separate run, even if
        # this one failed.
        run.finish()