Esse notebook tem como objetivo validar a arquitetura Transformer, somente com o encoder, para tarefas de extração de aspectos.

In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from ast import literal_eval
from collections import Counter
from datasets import Dataset
from src import utils
from src.conlleval import evaluate
from src.models import AEModelConfig, AEModel, CustomNonPaddingTokenLoss
from tensorflow.keras import layers

In [None]:
# Load the processed, stratified TV dataset from CSV into a DataFrame.
data_df = pd.read_csv('../datasets/processed/tv_stratified.csv')

In [None]:
# The CSV stores these columns as stringified Python literals
# (presumably lists of tokens / tags -- confirm against the CSV);
# parse each cell back into the object it represents.
for col in ('tokens', 'aspect_tags'):
    data_df[col] = data_df[col].map(literal_eval)

In [None]:
# Convert the DataFrame into a `datasets.Dataset`, keeping only the
# token and tag columns.
cols_to_keep = ['tokens', 'aspect_tags']
data_ds = Dataset.from_pandas(data_df[cols_to_keep])

In [None]:
# Split into train, test and validation sets (test_size=0.1, val_size=0.1).
# The result is indexed by split name below (e.g. data_ds['train']).
data_ds = utils.train_test_val_split(data_ds, test_size=0.1, val_size=0.1)

In [None]:
# Build the tag lookup table and print it for inspection.
# It is indexed by tag id when decoding predictions in calculate_metrics.
mapping = utils.make_tag_lookup_table()
print(mapping)

In [None]:
# Flatten the per-example token lists of the training split into one list.
# A nested comprehension runs in linear time, whereas sum(lists, []) copies
# the accumulated list again on every addition (quadratic).
all_tokens = [
    token
    for example_tokens in data_ds['train']['tokens']
    for token in example_tokens
]
all_tokens_array = np.array(all_tokens)

# Count how often each token occurs and report the number of distinct tokens.
counter = Counter(all_tokens_array)
print(len(counter))

In [None]:
# Number of distinct tags, taken from the tag lookup table.
num_tags = len(mapping)
# Vocabulary size handed to the model config; the lookup vocabulary itself
# is built from the (vocab_size - 2) most frequent training tokens.
vocab_size = 3_100

In [None]:
# Keep the (vocab_size - 2) most frequent training tokens, dropping their counts.
vocabulary = [entry[0] for entry in counter.most_common(vocab_size - 2)]

In [None]:
# Layer that maps token strings to integer ids using the fixed vocabulary.
lookup_layer = layers.StringLookup(vocabulary=vocabulary)

In [None]:
# Line-by-line text datasets for training and validation; each line is one
# record that utils.map_record_to_training_data turns into a training pair.
train_data = tf.data.TextLineDataset("./data/tv_train.txt")
val_data = tf.data.TextLineDataset("./data/tv_validation.txt")

In [None]:
def convert_to_ids(tokens):
    """Return the integer ids that ``lookup_layer`` assigns to ``tokens``."""
    token_ids = lookup_layer(tokens)
    return token_ids

In [None]:
batch_size = 32


def _encode_and_batch(lines):
    """Turn a dataset of raw text lines into padded batches of (ids, tags).

    Each line goes through ``utils.map_record_to_training_data``, the token
    part of the resulting pair is converted to ids, and the pairs are grouped
    into padded batches of ``batch_size`` elements.
    """
    records = lines.map(utils.map_record_to_training_data)
    encoded = records.map(lambda tokens, tags: (convert_to_ids(tokens), tags))
    return encoded.padded_batch(batch_size)


# Training and validation inputs share the exact same preprocessing.
train_dataset = _encode_and_batch(train_data)
val_dataset = _encode_and_batch(val_data)

In [None]:
# Build the model configuration (tag count, vocabulary size and the
# embedding / attention / feed-forward sizes) and instantiate the model.
ae_config = AEModelConfig(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ae_model = AEModel(ae_config)

In [None]:
# Train with the Adam optimizer and the project's custom loss
# (presumably ignores padding positions, judging by its name -- confirm).
ae_model.compile(optimizer='adam', loss=CustomNonPaddingTokenLoss())
# 30 epochs over the training batches; no validation data is passed to fit.
ae_model.fit(train_dataset, epochs=30)

In [None]:
def tokenize_and_convert_to_ids(text):
    """Split ``text`` on whitespace and convert the resulting tokens to ids."""
    return convert_to_ids(text.split())

In [None]:
def calculate_metrics(dataset):
    """Evaluate ``ae_model`` on ``dataset`` using the conlleval ``evaluate``.

    Args:
        dataset: Iterable of ``(x, y)`` batches; ``x`` is passed to
            ``ae_model.predict`` and ``y`` holds the true tag ids.

    Returns:
        Whatever ``evaluate`` returns for the collected true and predicted
        tags, so callers can use the metrics programmatically.

    Raises:
        ValueError: If ``dataset`` yields no batches.
    """
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        output = ae_model.predict(x)
        # Highest-scoring tag id along the last axis, flattened over the batch.
        predictions = np.argmax(output, axis=-1)
        predictions = np.reshape(predictions, [-1])

        true_tag_ids = np.reshape(y, [-1])

        # Keep only positions where both the true and the predicted id are
        # non-zero.
        # NOTE(review): id 0 is presumably the padding tag -- confirm. This
        # mask also discards real tokens that the model predicted as 0, which
        # hides those errors from the reported metrics.
        mask = (true_tag_ids > 0) & (predictions > 0)
        all_true_tag_ids.append(true_tag_ids[mask])
        all_predicted_tag_ids.append(predictions[mask])

    if not all_true_tag_ids:
        # np.concatenate would fail on an empty list with an unrelated message.
        raise ValueError("dataset yielded no batches; nothing to evaluate")

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    # Translate tag ids back to tag names for the evaluation helper.
    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    return evaluate(real_tags, predicted_tags)

In [None]:
# Report the evaluation metrics on the validation batches.
calculate_metrics(val_dataset)