In [None]:
!pip install datasets transformers seqeval evaluate

In [None]:
import numpy as np
import pandas as pd

from collections import defaultdict
from src import utils
from src import pre_processing
from transformers import create_optimizer
from transformers import DataCollatorForTokenClassification

In [None]:
def tokenize_and_align_labels(examples: dict):
    """Tokenize a batch of pre-split sentences and attach aligned labels.

    The function reads the module-level ``tokenizer``, so that name must be
    bound before this function is called (e.g. before ``Dataset.map``).

    Args:
        examples: Batch mapping with a ``'tokens'`` entry (one list of words
            per example) and an ``'aspect_tags'`` entry (one list of
            word-level labels per example).

    Returns:
        The tokenizer output for the batch, with an extra ``'labels'`` entry
        holding, for each example, the result of
        ``utils.align_labels_with_tokens`` applied to its word-level labels
        and its word ids.
    """
    encoded = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
    )
    # word_ids(idx) gives the word ids of the idx-th example in the batch;
    # they are what the helper uses to line the labels up with the tokens.
    encoded['labels'] = [
        utils.align_labels_with_tokens(word_tags, encoded.word_ids(idx))
        for idx, word_tags in enumerate(examples['aspect_tags'])
    ]
    return encoded

In [None]:
# Build the pre-processed dataset from 'datasets/data.json' with the project
# helper. `data_ds` is later tokenized through `data_ds.map(...)` and its
# `column_names` are dropped after tokenization.
# NOTE(review): presumably a Hugging Face `datasets.Dataset` -- confirm.
data_ds = pre_processing.pre_processing_tv_dataset('datasets/data.json')

In [None]:
# Tag mapping: the position of a tag in `label_names` is its integer id, and
# both lookup tables are derived from that single list.
label_names = ['O', 'B-ASP', 'I-ASP']
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in enumerate(label_names)}

In [None]:
# pre-trained checkpoints to compare
models = [
    'neuralmind/bert-base-portuguese-cased',
    'neuralmind/bert-large-portuguese-cased',
    'bert-base-multilingual-cased'
]

# experiment settings shared by every run
num_runs = 5    # independent fine-tuning runs per checkpoint
num_epochs = 3  # training epochs per run
columns = ['attention_mask', 'input_ids', 'labels', 'token_type_ids']

# fine tuning: results[checkpoint] collects one evaluation result per run
results = defaultdict(list)
for model_checkpoint in models:

    # The tokenizer, the tokenized dataset and the data collator depend only
    # on the checkpoint, so they are built once per checkpoint instead of
    # once per run.
    # NOTE: `tokenize_and_align_labels` reads the module-level `tokenizer`,
    # so it has to be assigned before the `map` call below.
    tokenizer = utils.build_tokenizer(model_checkpoint)

    # tokenizing and aligning
    tokenized_dataset = data_ds.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=data_ds.column_names)

    # creating data collator
    data_collator = DataCollatorForTokenClassification(
        tokenizer=tokenizer,
        return_tensors='tf')

    # repeating the fine-tuning `num_runs` times
    for _run in range(num_runs):

        # separating into training, testing and validation (done per run,
        # exactly as before)
        # NOTE(review): whether the split differs between runs depends on
        # `utils.train_test_val_split` -- confirm.
        data = utils.train_test_val_split(
            tokenized_dataset,
            test_size=0.1,
            val_size=0.1)

        tf_dataset = utils.dataset_to_tf_dataset(
            data=data,
            data_collator=data_collator,
            columns=columns,
            batch_size=8)

        # total number of optimizer steps: batches per epoch * epochs
        num_train_steps = len(tf_dataset['train']) * num_epochs

        # defining the optimizer; the second return value (the learning-rate
        # schedule) is not needed, and it is no longer bound to `_`, which
        # was also the run loop variable and is IPython's last-output name
        optimizer, _lr_schedule = create_optimizer(
            init_lr=2e-5,
            num_warmup_steps=0,
            num_train_steps=num_train_steps,
            weight_decay_rate=0.01)

        # defining the model (a fresh one for every run)
        model = utils.build_model(
            model_checkpoint=model_checkpoint,
            id2label=id2label,
            label2id=label2id,
            from_pt=True)
        # NOTE(review): no loss is passed to `compile`; presumably the model
        # falls back to its built-in loss -- confirm.
        model.compile(optimizer=optimizer)

        # training the model
        model.fit(
            tf_dataset['train'],
            validation_data=tf_dataset['validation'],
            epochs=num_epochs)

        # evaluating the model on the held-out test split
        result = utils.evaluate_model(
            model=model,
            test_data=tf_dataset['test'],
            label_names=label_names)
        results[model_checkpoint].append(result)

In [None]:
# Extracting the results: final_results[checkpoint][metric] is the list of
# that metric's values, one entry per fine-tuning run.
# The loop variable is deliberately not called `model`: a top-level loop
# variable stays bound after the loop and would overwrite the fine-tuned
# model object left by the training cell.
final_results = defaultdict(lambda: defaultdict(list))
for checkpoint, run_results in results.items():
    for run_result in run_results:
        for metric in ('overall_precision', 'overall_recall', 'overall_f1'):
            final_results[checkpoint][metric].append(run_result[metric])

In [None]:
# Average results: avg_results[checkpoint][metric] is the mean of the metric
# over all runs of that checkpoint.
# The loop variable is deliberately not called `model`, so the fine-tuned
# model object from the training cell is not overwritten by a string.
avg_results = defaultdict(lambda: defaultdict(float))
for checkpoint, metric_values in final_results.items():
    for metric in ('overall_precision', 'overall_recall', 'overall_f1'):
        avg_results[checkpoint][metric] = np.mean(metric_values[metric])

In [None]:
# Formatting for better visualization: turn the nested averages into a
# DataFrame (one column per checkpoint, one row per metric) and display it
# as the cell output.
avg_results = pd.DataFrame(avg_results)
avg_results