In [None]:
!pip install datasets transformers seqeval evaluate

In [5]:
import json
from ast import literal_eval
from collections import defaultdict

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BatchEncoding
from transformers import create_optimizer
from transformers import DataCollatorForTokenClassification

from src import utils

In [None]:
def tokenize_and_align_labels(examples: dict) -> BatchEncoding:
    """Tokenize pre-split words and align their tags with the subword tokens.

    Reads the notebook-level ``tokenizer``, which therefore has to be defined
    before this function is called.

    Args:
        examples: Batch of examples providing a ``'tokens'`` entry (the
            pre-split words of each example) and an ``'aspect_tags'`` entry
            (the tags of each example).

    Returns:
        The tokenizer output for the batch with an extra ``'labels'`` entry
        holding, for every example, the tags re-aligned by
        ``utils.align_labels_with_tokens``.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
    )
    # word_ids(i) maps each subword token of example i back to the word it
    # came from, which is what the alignment helper needs
    tokenized_inputs['labels'] = [
        utils.align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(examples['aspect_tags'])
    ]
    return tokenized_inputs

In [None]:
# reading the data
csv_path = '../datasets/processed/tv_stratified.csv'
data_ds = pd.read_csv(csv_path)

In [None]:
# changing the column format: these columns are read from the CSV as strings
# and are parsed back into Python literals
list_columns = ('tokens', 'aspect_tags')
for column_name in list_columns:
    parsed_column = data_ds[column_name].apply(literal_eval)
    data_ds[column_name] = parsed_column

In [7]:
# tag mapping: the position of a tag in label_names is its integer id
label_names = ['O', 'B-ASP', 'I-ASP']
id2label = dict(enumerate(label_names))
label2id = {tag: idx for idx, tag in id2label.items()}

In [None]:
# separating the folds: one Dataset per distinct value of the `fold` column
# NOTE: data_ds is rebound from the pandas DataFrame to a DatasetDict here, so
# this cell cannot be re-run without re-running the data loading cells first.
cols = ['tokens', 'aspect_tags']

fold_datasets = {}
for fold in data_ds.fold.unique():
    fold_frame = data_ds[data_ds.fold == fold][cols]
    fold_datasets[f'fold_{fold}'] = Dataset.from_pandas(fold_frame, preserve_index=False)

data_ds = DatasetDict(fold_datasets)

In [None]:
# initializing the tokenizer
# model_checkpoint is reused later when the model is built inside the
# cross-validation loop
model_checkpoint = 'bert-base-multilingual-cased'
# `tokenizer` is read as a global by tokenize_and_align_labels and is also
# handed to the data collator
tokenizer = utils.build_tokenizer(model_checkpoint)

In [None]:
# tokenizing and aligning: every fold is replaced by its tokenized version,
# dropping the original columns so only the tokenizer output remains
for fold_name in list(data_ds.keys()):
    fold_ds = data_ds[fold_name]
    data_ds[fold_name] = fold_ds.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=fold_ds.column_names,
    )

In [None]:
# params to data collator
batch_size = 8
# Every fold went through the same tokenization step, so any of them can
# supply the column names; taking the first one avoids hard-coding a fold id
# that depends on the values in the CSV's `fold` column.
columns = next(iter(data_ds.values())).column_names
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors='tf'
)

In [None]:
# data collator: convert every fold into a batched TensorFlow dataset
# NOTE(review): shuffle=True is applied to all folds, including whichever one
# is later used as validation data -- confirm utils.evaluate_model does not
# rely on a stable batch order.
# NOTE: the entries of data_ds are overwritten, so re-running this cell would
# call to_tf_dataset on the already-converted objects.
tf_dataset_kwargs = dict(
    columns=columns,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=batch_size,
)
for fold_name in list(data_ds):
    data_ds[fold_name] = data_ds[fold_name].to_tf_dataset(**tf_dataset_kwargs)

In [None]:
# cross-validation: each fold is held out once for validation while the
# remaining folds are concatenated into the training set
if len(data_ds) < 2:
    raise ValueError('cross-validation requires at least two folds')

num_epochs = 3
results = {}

# Iterate over the real fold names instead of assuming they are numbered
# 1..K; the names are derived from the values of the CSV's `fold` column.
for held_out in data_ds:
    validation = data_ds[held_out]

    train_parts = [split for name, split in data_ds.items() if name != held_out]
    train = train_parts[0]
    for part in train_parts[1:]:
        train = train.concatenate(part)

    # the folds were batched by to_tf_dataset, so len(train) is used as the
    # number of optimizer steps per epoch
    num_train_steps = len(train) * num_epochs
    optimizer, _ = create_optimizer(
        init_lr=2e-5,
        num_warmup_steps=0,
        num_train_steps=num_train_steps,
        weight_decay_rate=0.01,
    )

    # the model is rebuilt for every held-out fold
    # (presumably from the pretrained checkpoint -- confirm in utils.build_model)
    model = utils.build_model(
        model_checkpoint=model_checkpoint,
        id2label=id2label,
        label2id=label2id,
        from_pt=True
    )
    # no explicit loss is passed to compile()
    model.compile(optimizer=optimizer)

    model.fit(
        train,
        validation_data=validation,
        epochs=num_epochs
    )

    # 'fold_<id>' -> 'split_<id>' keeps the result keys of the original loop
    results[held_out.replace('fold_', 'split_', 1)] = utils.evaluate_model(
        model=model,
        test_data=validation,
        label_names=label_names
    )

In [4]:
# persist the per-split evaluation results as JSON
results_path = "../results/baseline_tv.json"
with open(results_path, "w") as results_file:
    json.dump(obj=results, fp=results_file)