In [None]:
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

from peft import (
    get_peft_config, 
    PeftModel, 
    PeftConfig, 
    get_peft_model, 
    LoraConfig, 
    TaskType
)

import evaluate
import torch
import numpy as np
import optuna

In [None]:
# Input JSON-lines file and the pretrained checkpoint name; the checkpoint is used for
# both the tokenizer and the token-classification model.
data_path = '../datasets/stratified/tv.jsonl'
model_checkpoint = 'neuralmind/bert-base-portuguese-cased'

In [None]:
# Load the whole file with the 'json' loader; the records are accessed through the
# 'train' split and partitioned by their 'fold' column afterwards.
data = load_dataset('json', data_files=data_path)

In [None]:
# Split the data by its 'fold' column: folds 1-8 -> train, 9 -> validation, 10 -> test.
# Each split then drops the 'fold' column and exposes 'aspect_tags' under the name 'tags'.
splited_data = DatasetDict({
    split_name: (
        data['train']
        .filter(keep_example)
        .remove_columns(['fold'])
        .rename_column('aspect_tags', 'tags')
    )
    for split_name, keep_example in (
        ('train', lambda example: 1 <= example['fold'] <= 8),
        ('validation', lambda example: example['fold'] == 9),
        ('test', lambda example: example['fold'] == 10),
    )
})

In [None]:
# seqeval metric loaded through the `evaluate` library; compute_metrics calls it to score tag sequences.
seqeval = evaluate.load("seqeval")

In [None]:
label_list = ['O', 'B-ASP', 'I-ASP']


def compute_metrics(p):
    """Turn raw model outputs into seqeval precision/recall/F1/accuracy.

    Args:
        p: a (predictions, labels) pair. The predictions are reduced with an
            argmax over axis 2 to obtain one class id per position; positions
            whose label is -100 are left out of the evaluation.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the "overall_*" entries of the seqeval result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Tokenizer that belongs to the model checkpoint.
# NOTE(review): add_prefix_space is normally an option of byte-level BPE tokenizers —
# confirm it has any effect for this BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the word-level tags to the sub-tokens.

    Args:
        examples: a batch with a "tokens" entry (one list of words per example)
            and a "tags" entry (one list of tag ids per example).

    Returns:
        The tokenizer output with an added "labels" entry. For each example,
        the first sub-token of every word carries that word's tag; tokens with
        no source word (word id None) and the remaining sub-tokens of a word
        carry -100, the value compute_metrics filters out.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["tags"]):
        row = []
        last_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_idx):
            # Only a token that opens a new word gets a real tag.
            opens_word = word_idx is not None and word_idx != last_word
            row.append(word_tags[word_idx] if opens_word else -100)
            last_word = word_idx
        aligned_labels.append(row)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [None]:
# Apply tokenize_and_align_labels to every split in batches; this adds the tokenizer
# outputs and the aligned "labels" column.
tokenized_data = splited_data.map(tokenize_and_align_labels, batched=True)

In [None]:
# Collator handed to the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Two-way mapping between class ids and tag names, passed to the model configuration.
id2label = dict(enumerate(('O', 'B-ASP', 'I-ASP')))
label2id = {tag: idx for idx, tag in id2label.items()}

### Optimization of hyperparameters

In [None]:
def objective(trial):
    """Optuna objective: fine-tune a LoRA-adapted model and return its validation F1.

    Args:
        trial: the Optuna trial used to sample the hyperparameters (LoRA rank,
            alpha, dropout and bias mode; learning rate, batch size, weight
            decay and number of epochs).

    Returns:
        The seqeval overall F1 on the validation split. When the trial fails
        with an exception, the failure is printed, stored in the trial's
        "error" user attribute, and 0.0 is returned so the study carries on.
    """
    try:
        # Hyperparameters to be tuned by Optuna.
        r = trial.suggest_int("r", 8, 32)
        lora_alpha = trial.suggest_int("lora_alpha", 8, 32)
        lora_dropout = trial.suggest_float("lora_dropout", 0.1, 0.5)
        # The LoRA bias modes are 'none', 'all' and 'lora_only'; a misspelt
        # choice makes every trial that samples it fail.
        bias_option = trial.suggest_categorical('bias', ['none', 'all', 'lora_only'])
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
        batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
        weight_decay = trial.suggest_float("weight_decay", 0, 0.1)
        epochs = trial.suggest_int("epochs", 2, 20)

        # Every trial starts from a fresh copy of the pretrained checkpoint.
        model = AutoModelForTokenClassification.from_pretrained(
            model_checkpoint,
            num_labels=len(label_list),
            id2label=id2label,
            label2id=label2id,
        )

        # LoRA configuration built from the sampled values.
        peft_config = LoraConfig(
            task_type=TaskType.TOKEN_CLS,
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            bias=bias_option,
        )

        # Wrap the model with the adapters and print the trainable parameters.
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()

        # Evaluate and checkpoint once per epoch; the best checkpoint is
        # reloaded when training ends.
        training_args = TrainingArguments(
            output_dir="bert-portuguese-aspect-extraction",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=weight_decay,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_data["train"],
            eval_dataset=tokenized_data["validation"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        # Score the reloaded model on the validation split.
        predictions = trainer.predict(tokenized_data["validation"])
        result = compute_metrics((predictions.predictions, predictions.label_ids))

        return result['f1']

    except Exception as exc:
        # Best-effort: a failing configuration scores 0.0 so the study can
        # continue, but the reason is kept visible instead of being swallowed.
        # Catching Exception (not everything) lets KeyboardInterrupt stop the study.
        trial.set_user_attr("error", repr(exc))
        print(f"Trial {trial.number} failed: {exc!r}")
        return 0.0

In [13]:
# Optuna study configuration: maximise the value returned by `objective` over 30 trials.
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable.
STUDY_SEED = 42
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=STUDY_SEED),
)
study.optimize(objective, n_trials=30)

In [None]:
# Best hyperparameters found by the study
print(study.best_params)

### Fine-tuning

In [None]:
# Load the pretrained checkpoint with a token-classification head sized to label_list
# and labelled through the id2label / label2id mappings.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, 
    num_labels=len(label_list), 
    id2label=id2label, 
    label2id=label2id
)

In [None]:
# Best hyperparameters, hard-coded for the final fine-tuning run.
# NOTE(review): presumably copied from the output of an earlier study run — confirm
# they still match study.best_params before relying on them.
r = 22
lora_alpha = 29
lora_dropout = 0.3349449195238189
bias = 'all'
learning_rate = 0.0009978611060488306
batch_size = 8
weight_decay = 0.06270649322978346
epochs = 13

In [None]:
# LoRA configuration for token classification, built from the hard-coded hyperparameters.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, 
    r=r, 
    lora_alpha=lora_alpha, 
    lora_dropout=lora_dropout, 
    bias=bias
)

In [None]:
# Wrap the base model with the LoRA adapters and print the trainable parameters.
# NOTE: this rebinds `model`, so re-running this cell without reloading the base model
# passes an already-wrapped model to get_peft_model again.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# Training configuration for the final run: evaluate and checkpoint once per epoch and
# reload the best checkpoint when training ends.
# NOTE(review): no metric_for_best_model is set, so "best" is presumably judged by
# validation loss rather than F1 — confirm this is intended.
training_args = TrainingArguments(
    output_dir="bert-portuguese-aspect-extraction",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Trainer for the final run: trains on the train split, evaluates on the validation
# split, and reports the seqeval metrics from compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the final fine-tuning with the configuration above.
trainer.train()

In [None]:
# Score the fine-tuned model on the held-out test split with the same seqeval metrics
# used during training.
predictions = trainer.predict(tokenized_data["test"])
result = compute_metrics((predictions.predictions, predictions.label_ids))

In [None]:
# Display the test-set metrics.
result