In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from ast import literal_eval
from collections import Counter
from datasets import Dataset
from src import utils
from src.conlleval import evaluate
from src.models import AEModelConfig, AEModel, CustomNonPaddingTokenLoss
from tensorflow.keras import layers

In [2]:
# Load the processed, stratified "tv" dataset from CSV into a DataFrame.
data_df = pd.read_csv(
    filepath_or_buffer='../datasets/processed/tv_stratified.csv',
)

In [3]:
# Convert the list-valued columns from their CSV string form back into Python
# objects. Cells that are already lists are left untouched, so this cell can be
# re-run on a live kernel without `literal_eval` raising on non-string input.
# Anything else (e.g. NaN) still goes through `literal_eval` and fails loudly.
for col in ('tokens', 'aspect_tags'):
    data_df[col] = data_df[col].apply(
        lambda cell: cell if isinstance(cell, list) else literal_eval(cell)
    )

In [4]:
# Turn the DataFrame into a `datasets.Dataset`, keeping only the token and
# aspect-tag columns.
cols_to_keep = ['tokens', 'aspect_tags']
data_ds = Dataset.from_pandas(data_df.loc[:, cols_to_keep])

In [5]:
# Split into train, test and validation partitions (later cells read the
# 'train', 'validation' and 'test' keys). The two sizes are presumably
# fractions of the full dataset -- confirm in `utils.train_test_val_split`.
# NOTE(review): this rebinds `data_ds`, so re-running the cell would pass the
# already-split object back into the helper.
data_ds = utils.train_test_val_split(
    data_ds,
    val_size=0.1,
    test_size=0.1,
)

In [None]:
# Write each split to ./data/tv_<split>.txt; later cells read these files back
# with `tf.data.TextLineDataset`.
# `exist_ok=True` keeps the cell re-runnable: plain `os.mkdir` raises
# FileExistsError as soon as the directory exists from a previous run.
os.makedirs('data', exist_ok=True)
for split_name in ('train', 'validation', 'test'):
    utils.save_data_to_file(f'./data/tv_{split_name}.txt', data_ds[split_name])

In [6]:
# Lookup table from integer tag id to tag label; `calculate_metrics` uses it
# to turn tag ids back into label strings.
mapping = utils.make_tag_lookup_table()
print(mapping)

{0: '[PAD]', 1: 'O', 2: 'B-ASP', 3: 'I-ASP'}


In [7]:
# Flatten the per-sentence token lists of the training split into one list.
# A nested comprehension is linear in the number of tokens, whereas
# `sum(lists, [])` copies the growing accumulator on every step (quadratic).
all_tokens = [
    token
    for sentence_tokens in data_ds['train']['tokens']
    for token in sentence_tokens
]
all_tokens_array = np.array(all_tokens)

# Token frequencies over the training split; the printed value is the number
# of distinct tokens.
counter = Counter(all_tokens_array)
print(len(counter))

3158


In [8]:
# Upper bound on the vocabulary size handed to the model. In the recorded run
# the training split contained 3158 distinct tokens, so the rarest ones fall
# outside the vocabulary.
vocab_size = 3100
# One class per entry of the tag lookup table.
num_tags = len(mapping)

In [9]:
# Keep the most frequent training tokens. Two slots fewer than `vocab_size`
# are taken, presumably to leave room for special ids such as padding and
# out-of-vocabulary -- TODO confirm against the lookup layer configuration.
vocabulary = [
    token
    for token, _frequency in counter.most_common(vocab_size - 2)
]

In [10]:
# Keras layer that maps string tokens to integer ids based on `vocabulary`;
# `convert_to_ids` applies it to the token tensors.
lookup_layer = layers.StringLookup(vocabulary=vocabulary)

2023-01-06 09:03:55.924166: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-06 09:03:55.955377: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-06 09:03:55.955575: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-06 09:03:55.956296: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [11]:
# Stream the saved train / validation files line by line. These files are
# produced by the cell that calls `utils.save_data_to_file`, so that cell must
# have been run at least once beforehand.
train_data, val_data = (
    tf.data.TextLineDataset(path)
    for path in ("./data/tv_train.txt", "./data/tv_validation.txt")
)

In [13]:
def convert_to_ids(tokens):
    """Map string tokens to integer vocabulary ids using ``lookup_layer``.

    Args:
        tokens: String tokens accepted by the module-level ``lookup_layer``
            (a list of strings or a string tensor).

    Returns:
        The output of ``lookup_layer`` for ``tokens``.
    """
    token_ids = lookup_layer(tokens)
    return token_ids

In [14]:
batch_size = 32


def _build_padded_batches(line_dataset):
    """Parse raw text lines, encode the tokens as ids, and pad into batches.

    Each line is parsed by ``utils.map_record_to_training_data`` into a
    two-element record; the first element goes through ``convert_to_ids`` and
    the second is passed along unchanged. Records are then grouped into padded
    batches of ``batch_size``.
    """
    parsed = line_dataset.map(utils.map_record_to_training_data)
    encoded = parsed.map(lambda tokens, tags: (convert_to_ids(tokens), tags))
    return encoded.padded_batch(batch_size)


# Both splits go through exactly the same preprocessing pipeline.
train_dataset = _build_padded_batches(train_data)
val_dataset = _build_padded_batches(val_data)

In [15]:
# Model hyper-parameters: number of tag classes and vocabulary size first,
# followed by the embedding width, attention-head count and feed-forward width.
ae_config = AEModelConfig(
    num_tags,
    vocab_size,
    embed_dim=32,
    num_heads=4,
    ff_dim=64,
)
ae_model = AEModel(ae_config)

In [16]:
# Train for 30 epochs with Adam and the project's `CustomNonPaddingTokenLoss`
# (presumably a loss that ignores padding positions -- confirm in src.models).
# NOTE(review): `val_dataset` is built earlier but not passed to `fit`, so no
# validation metrics are tracked during training.
# The `fit` call stays last so its History object remains the cell output.
ae_model.compile(loss=CustomNonPaddingTokenLoss(), optimizer='adam')
ae_model.fit(train_dataset, epochs=30)

Epoch 1/30


  return dispatch_target(*args, **kwargs)


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fbcf43838e0>

In [17]:
def tokenize_and_convert_to_ids(text):
    """Split ``text`` on whitespace and return the vocabulary ids of its tokens.

    Args:
        text: A Python string; it is tokenized with ``str.split()``.

    Returns:
        The result of ``convert_to_ids`` applied to the whitespace tokens.
    """
    return convert_to_ids(text.split())

In [18]:
def calculate_metrics(dataset):
    """Evaluate ``ae_model`` on ``dataset`` and report chunk-level metrics.

    Each batch is run through the module-level ``ae_model``; the arg-max over
    the last output axis is taken as the predicted tag id. Positions whose
    true tag id or predicted tag id is 0 (``'[PAD]'`` in ``mapping``) are
    dropped, the remaining ids are decoded through ``mapping`` and passed to
    ``evaluate``.

    Args:
        dataset: Iterable of ``(x, y)`` batches such as ``val_dataset``.
            Presumably ``x`` holds padded token ids and ``y`` padded tag ids
            -- confirm against ``utils.map_record_to_training_data``.

    Returns:
        Whatever ``evaluate`` returns (its return value is not visible from
        this notebook); previously the result was discarded.

    Raises:
        ValueError: If ``dataset`` yields no batches.
    """
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        # verbose=0 keeps Keras from printing a progress bar for every batch.
        output = ae_model.predict(x, verbose=0)
        predictions = np.reshape(np.argmax(output, axis=-1), [-1])
        true_tag_ids = np.reshape(y, [-1])

        # NOTE(review): besides removing padding positions, this mask also
        # drops real tokens for which the model predicted id 0, so those
        # mistakes are not counted -- reported scores may be optimistic.
        mask = (true_tag_ids > 0) & (predictions > 0)
        all_true_tag_ids.append(true_tag_ids[mask])
        all_predicted_tag_ids.append(predictions[mask])

    if not all_true_tag_ids:
        # np.concatenate([]) would fail with a far less helpful message.
        raise ValueError("calculate_metrics received a dataset with no batches")

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    # Decode integer ids back into tag strings for the evaluator.
    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    return evaluate(real_tags, predicted_tags)

In [19]:
# Report the metrics of the trained model on the validation split.
calculate_metrics(val_dataset)

processed 3132 tokens with 216 phrases; found: 274 phrases; correct: 151.
accuracy:  63.14%; (non-O)
accuracy:  93.52%; precision:  55.11%; recall:  69.91%; FB1:  61.63
              ASP: precision:  55.11%; recall:  69.91%; FB1:  61.63  274
