In [None]:
import json
import spacy
import optuna
import evaluate

import numpy as np
import pandas as pd
import tensorflow as tf

from ast import literal_eval
from datasets import Dataset, DatasetDict
from src import utils

from tensorflow.keras import layers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.optimizers.experimental import AdamW

from transformers import (
    TFBertModel,
    DataCollatorForTokenClassification
)

In [None]:
# Global parameters
# CSV file read by the data-loading cell (path is relative to the working directory).
data_path = '../datasets/stratified/tv.csv'
# Checkpoint identifier handed to `utils.build_tokenizer` and to the general-domain encoder.
general_domain_checkpoint = 'neuralmind/bert-base-portuguese-cased'
# Checkpoint identifier handed to the domain-specific encoder.
specific_domain_checkpoint = 'jcfneto/bert-tv-portuguese'
# Length every example is padded/truncated to by the tokenizer call.
max_length = 512
# Number of tag classes (O, B-ASP, I-ASP).
# NOTE(review): not referenced in the visible cells; the models rely on n_classes=3 directly.
num_classes = 3
# Batch size used when each fold is converted to a tf.data dataset.
batch_size = 8

In [None]:
data = pd.read_csv(data_path)

# The list-valued columns come back from the CSV as strings; parse them
# into real Python lists.
cols = ['tokens', 'aspect_tags']
for col in cols:
    data[col] = data[col].apply(literal_eval)

nlp = spacy.load('pt_core_news_sm')

# One list of coarse POS tags per row, produced by spaCy from the
# whitespace-joined tokens.
postags = [
    [token.pos_ for token in nlp(' '.join(tokens))]
    for tokens in data.tokens
]

# Rows for which spaCy produced a different number of tokens than the
# dataset has: their POS tags cannot be paired one-to-one with the tokens.
indexs = [
    i
    for i, (tokens, tags) in enumerate(zip(data.tokens, postags))
    if len(tags) != len(tokens)
]

data['pos'] = postags

# Report the shape before and after discarding the misaligned rows.
print(data.shape)
data = data.drop(index=indexs)
print(data.shape)

# Aspect-tag mapping (BIO scheme with a single entity type).
id2label = {0: 'O', 1: 'B-ASP', 2: 'I-ASP'}
label2id = {v: k for k, v in id2label.items()}
label_names = ['O', 'B-ASP', 'I-ASP']

# Distinct POS tags that occur anywhere in the data.
pos = data.pos.values
pos = set([tag for p in pos for tag in p])

# One evenly spaced value in [-1, 1] per POS tag. The count follows the
# tagset size: a fixed count would make `zip` silently drop every tag beyond
# it, and `postag2id` would then fail with a KeyError on those tags.
ids = np.linspace(-1, 1, len(pos))

# Iterate the tags in sorted order so the tag -> id assignment is the same on
# every run (iteration order of a set of strings is not stable across
# interpreter sessions).
pos2id = {tag: i for i, tag in zip(ids, sorted(pos))}
id2pos = {i: tag for tag, i in pos2id.items()}

def tag2id(example):
    """Maps a sequence of aspect tag names to their integer ids.

    Args:
        example: Iterable of tag names present in `label2id`.

    Returns:
        List with the id of every tag, in the same order.
    """
    ids = []
    for tag in example:
        ids.append(label2id[tag])
    return ids

def postag2id(example):
    """Maps a sequence of POS tag names to their numeric ids.

    Args:
        example: Iterable of POS tag names present in `pos2id`.

    Returns:
        List with the id of every tag, in the same order.
    """
    ids = []
    for tag in example:
        ids.append(pos2id[tag])
    return ids

# Replace the tag names stored in each row by their ids.
data['aspect_tags'] = data['aspect_tags'].apply(tag2id)
data['pos'] = data['pos'].apply(postag2id)

# Tokenizer built from the general-domain checkpoint.
tokenizer = utils.build_tokenizer(general_domain_checkpoint)

In [None]:
def align_labels_with_tokens(labels: list, word_ids: list):
    """Expands word-level labels to one label per subword token.

    Positions whose word id is ``None`` receive -100. The first subword of a
    word receives the word's label. Every further subword of the same word
    receives the word's label, except that an odd label is incremented by one
    (with this notebook's ids that turns B-ASP into I-ASP).

    Args:
        labels: Word-level label ids, indexed by word id.
        word_ids: Word id of every subword token (``None`` where the token
            belongs to no word).

    Returns:
        List with one label id per entry of `word_ids`.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token that belongs to no word.
            aligned.append(-100)
        elif word_id != previous_word:
            # First subword of a new word keeps the word's label.
            aligned.append(labels[word_id])
        else:
            # Continuation of the current word: odd labels move to the next id.
            tag = labels[word_id]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word_id
    return aligned


def tokenize_and_align_labels(examples: dict):
    """Tokenizes a batch of pre-split sentences and aligns the aspect labels.

    Every sentence is padded/truncated to `max_length` subword tokens. Only
    the aspect tags are aligned; any other column of `examples` (such as the
    POS ids) is not read and is not part of the result.

    Args:
        examples: Batch with a 'tokens' entry (list of token lists) and an
            'aspect_tags' entry (list of word-level label id lists).

    Returns:
        The object returned by the tokenizer call, with an additional
        'aspect_labels' entry holding one aligned label list per sentence.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        padding='max_length',
        max_length=max_length,
        truncation=True,
        is_split_into_words=True,
    )
    # `word_ids(i)` gives, for sentence i, the word each subword token came from.
    tokenized_inputs['aspect_labels'] = [
        align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(examples['aspect_tags'])
    ]
    return tokenized_inputs

In [None]:
# Columns carried into the per-fold Hugging Face datasets.
cols = ['tokens', 'aspect_tags', 'pos']

# One dataset per fold value, keyed 'fold_<value>' in order of first appearance.
data_ds = DatasetDict()
for fold in data.fold.unique():
    fold_rows = data.loc[data.fold == fold, cols]
    data_ds[f'fold_{fold}'] = Dataset.from_pandas(
        fold_rows,
        preserve_index=False
    )


# Tokenize every fold; the original columns are replaced by the tokenizer
# output plus the aligned 'aspect_labels'.
for fold in data_ds:
    raw_split = data_ds[fold]
    data_ds[fold] = raw_split.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_split.column_names
    )


# Collator settings; the column list is taken from one tokenized fold.
columns = data_ds['fold_1'].column_names
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors='tf'
)


# Convert every fold into a batched, unshuffled tf.data dataset.
for fold in data_ds:
    data_ds[fold] = data_ds[fold].to_tf_dataset(
        columns=columns,
        collate_fn=data_collator,
        shuffle=False,
        batch_size=batch_size
    )

# Split by position in the fold list: the first eight folds are chained into
# the training set, the ninth is the test set and the tenth the validation set.
folds = list(data_ds)
train = data_ds[folds[0]]

for fold in folds[1:8]:
    train = train.concatenate(data_ds[fold])

test, validation = data_ds[folds[8]], data_ds[folds[9]]

In [None]:
class F1ScoreCallback(Callback):
    """Callback that scores the model with `val` at the end of every epoch.

    The score is written into the epoch logs under the key "val_f1_score";
    the training cells of this notebook read it back from
    ``history.history['val_f1_score']``.

    Args:
        validation_data: Iterable of batches passed to `val`.
        label_names: Tag names indexed by class id, passed to `val`.
    """

    def __init__(self, validation_data, label_names):
        super().__init__()
        self.validation_data = validation_data
        self.label_names = label_names

    def on_epoch_end(self, epoch, logs=None):
        """Computes the F1 score and stores it in `logs`.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Dict of epoch metrics; updated in place when provided.
        """
        # Only substitute a dict when none was given: `logs or {}` would also
        # replace an empty dict and the score would be written to a throwaway.
        if logs is None:
            logs = {}
        logs["val_f1_score"] = val(
            self.model, self.validation_data, self.label_names
        )


def _collect_tag_sequences(model, test_data, label_names):
    """Predicts every batch and collects predicted and reference tag names.

    Positions whose reference label is -100 are skipped. All remaining
    positions of all batches are appended to one flat list each.

    Args:
        model: Object with a `predict_on_batch(batch)` method returning
            per-token class scores with the classes on the last axis.
        test_data: Iterable of batches; each batch has an "aspect_labels"
            entry with the reference label ids.
        label_names: Tag names indexed by class id.

    Returns:
        Tuple `(predicted tag names, reference tag names)`.
    """
    all_predictions = []
    all_labels = []
    for batch in test_data:
        scores = model.predict_on_batch(batch)
        labels = np.asarray(batch["aspect_labels"])
        predictions = np.argmax(scores, axis=-1)
        # Boolean masking keeps row-major order, i.e. sentence by sentence,
        # token by token, without a Python-level loop over single tensors.
        keep = labels != -100
        all_predictions.extend(label_names[idx] for idx in predictions[keep])
        all_labels.extend(label_names[idx] for idx in labels[keep])
    return all_predictions, all_labels


def val(model, test_data, label_names):
    """Returns the overall seqeval F1 score of `model` on `test_data`.

    Args:
        model: Model to evaluate (see `val2`).
        test_data: Iterable of batches with an "aspect_labels" entry.
        label_names: Tag names indexed by class id.

    Returns:
        The 'overall_f1' entry of the seqeval results.
    """
    return val2(model, test_data, label_names)['overall_f1']


def val2(model, test_data, label_names):
    """Returns the full seqeval results of `model` on `test_data`.

    NOTE(review): all sentences are passed to seqeval as ONE sequence, so a
    predicted entity can span the boundary between two sentences. This is kept
    as is so scores stay comparable with earlier runs; confirm it is intended.

    Args:
        model: Object with a `predict_on_batch(batch)` method.
        test_data: Iterable of batches with an "aspect_labels" entry.
        label_names: Tag names indexed by class id.

    Returns:
        The dict returned by the seqeval metric's `compute`.
    """
    metric = evaluate.load("seqeval")
    all_predictions, all_labels = _collect_tag_sequences(
        model, test_data, label_names
    )
    return metric.compute(
        predictions=[all_predictions],
        references=[all_labels]
    )


def expand_data(data):
    """Returns `data` as a dict.

    A dict is returned unchanged (same object). Anything else must provide
    `keys()`; its keys are paired, in order, with the items produced by
    iterating over it.

    Args:
        data: A dict, or an iterable object that has a `keys()` method.

    Returns:
        `data` itself when it is a dict, otherwise a new dict.
    """
    if isinstance(data, dict):
        return data
    return dict(zip(data.keys(), data))

## Concat

In [None]:
class AEConcatPosTag(Model):
    """Token classifier on top of two concatenated BERT encoders.

    Both encoders receive the same inputs; their `last_hidden_state` outputs
    are concatenated on the last axis, passed through dropout and projected
    to `n_classes` softmax probabilities per token.

    NOTE(review): despite the class name, no POS-tag input is read here.

    Args:
        bert_general_domain: Encoder loaded from the general-domain checkpoint.
        bert_specific_domain: Encoder loaded from the domain-specific checkpoint.
        dropout_rate: Rate of the dropout applied before the classifier.
        n_classes: Number of output classes per token.
        train_bert_layer: Whether the two encoders are trainable.
    """

    def __init__(
        self,
        bert_general_domain,
        bert_specific_domain,
        dropout_rate,
        n_classes=3,
        train_bert_layer=False,
    ):
        # Zero-argument super(): the class name is bound again later in this
        # notebook, and a name-based super() call would then fail for
        # instances of this earlier definition.
        super().__init__()
        self.bert_general_domain = bert_general_domain
        self.bert_specific_domain = bert_specific_domain

        self.bert_general_domain.trainable = train_bert_layer
        self.bert_specific_domain.trainable = train_bert_layer

        self.concat = layers.Concatenate()

        self.dropout = layers.Dropout(dropout_rate)

        self.classifier = layers.Dense(
            n_classes, activation='softmax'
        )

    def call(self, inputs, training=False):
        """Computes per-token class probabilities.

        Args:
            inputs: Mapping with 'input_ids', 'token_type_ids' and
                'attention_mask' entries.
            training: Forwarded to the encoders and to the dropout layer.

        Returns:
            Softmax output of the classifier (one distribution per token).
        """
        # Pass the tensors by name. A positional list is matched to the
        # encoder's parameters by position, and the previous
        # [input_ids, token_type_ids, attention_mask] order handed the segment
        # ids to the attention-mask parameter and vice versa.
        encoder_inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'token_type_ids': inputs['token_type_ids'],
        }

        output_general_domain = self.bert_general_domain(
            encoder_inputs, training=training
        )
        output_specific_domain = self.bert_specific_domain(
            encoder_inputs, training=training
        )

        concatened = self.concat([
            output_general_domain['last_hidden_state'],
            output_specific_domain['last_hidden_state'],
        ])

        # The classifier ends in a softmax, so these are probabilities; the
        # loss in this notebook is configured with from_logits=False.
        return self.classifier(self.dropout(concatened, training=training))

    def train_step(self, data):
        """Runs one optimization step on a batch.

        Args:
            data: Batch accepted by `expand_data`, holding the three encoder
                inputs and the 'aspect_labels' targets.

        Returns:
            Dict mapping each tracked metric name to its current result.
        """
        data = expand_data(data)
        inputs = {
            k: data[k] for k in (
                'input_ids',
                'token_type_ids',
                'attention_mask',
            )
        }
        labels = data['aspect_labels']

        with tf.GradientTape() as tape:
            probabilities = self(inputs, training=True)
            loss = self.compiled_loss(
                labels,
                probabilities,
                regularization_losses=self.losses
            )

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.compiled_metrics.update_state(labels, probabilities)
        return {m.name: m.result() for m in self.metrics}

In [None]:
# Load both encoders and build the concat tagger; the encoders are fine-tuned.
general_encoder = TFBertModel.from_pretrained(general_domain_checkpoint)
specific_encoder = TFBertModel.from_pretrained(specific_domain_checkpoint)
model = AEConcatPosTag(
    bert_general_domain=general_encoder,
    bert_specific_domain=specific_encoder,
    dropout_rate=0.3,
    train_bert_layer=True,
)

# The model outputs probabilities (from_logits=False); -100 is the label
# written by `align_labels_with_tokens` for positions without a word.
loss_fn = SparseCategoricalCrossentropy(from_logits=False, ignore_class=-100)
optimizer = AdamW(learning_rate=2.0863364780543777e-05)
model.compile(optimizer=optimizer, loss=loss_fn)

# NOTE(review): the per-epoch score is computed on `test`, while `validation`
# is not used in the visible cells - confirm this is intended.
f1_score_callback = F1ScoreCallback(test, label_names)

history = model.fit(train, epochs=10, callbacks=[f1_score_callback])

# First epoch (0-based position in the history) with the highest score.
best_epoch, val_f1_score = max(
    enumerate(history.history['val_f1_score']),
    key=lambda entry: entry[1],
)

print(f'Best f1 score is {val_f1_score} on epoch {best_epoch}')

## Concat + FFN

In [None]:
class AEConcatPosTag(Model):
    """Token classifier: two concatenated BERT encoders plus a feed-forward head.

    Both encoders receive the same inputs; their `last_hidden_state` outputs
    are concatenated on the last axis and passed through dropout, one to
    three ReLU dense layers (each followed by dropout) and a softmax
    classifier with `n_classes` outputs per token.

    NOTE(review): this definition reuses the name of the class defined in the
    "Concat" section and replaces it once this cell has run. Despite the
    class name, no POS-tag input is read here.

    Args:
        bert_general_domain: Encoder loaded from the general-domain checkpoint.
        bert_specific_domain: Encoder loaded from the domain-specific checkpoint.
        dropout_rate: Rate shared by every dropout layer.
        first_layer_units: Units of the first dense layer.
        second_layer_units: Units of the optional second dense layer;
            the layer is skipped when this is None or 0.
        third_layer_units: Units of the optional third dense layer;
            the layer is skipped when this is None or 0.
        n_classes: Number of output classes per token.
        train_bert_layer: Whether the two encoders are trainable.
    """

    def __init__(
        self,
        bert_general_domain,
        bert_specific_domain,
        dropout_rate,
        first_layer_units,
        second_layer_units=None,
        third_layer_units=None,
        n_classes=3,
        train_bert_layer=False,
    ):
        # Zero-argument super() does not depend on which class the (reused)
        # name AEConcatPosTag currently refers to.
        super().__init__()
        self.bert_general_domain = bert_general_domain
        self.bert_specific_domain = bert_specific_domain

        self.bert_general_domain.trainable = train_bert_layer
        self.bert_specific_domain.trainable = train_bert_layer

        self.concat = layers.Concatenate()
        self.dropout = layers.Dropout(dropout_rate)

        self.dense1 = layers.Dense(first_layer_units, activation='relu')
        self.dropout1 = layers.Dropout(dropout_rate)

        self.second_layer_units = second_layer_units
        if second_layer_units:
            self.dense2 = layers.Dense(second_layer_units, activation='relu')
            self.dropout2 = layers.Dropout(dropout_rate)

        self.third_layer_units = third_layer_units
        if third_layer_units:
            self.dense3 = layers.Dense(third_layer_units, activation='relu')
            self.dropout3 = layers.Dropout(dropout_rate)

        self.classifier = layers.Dense(
            n_classes, activation='softmax'
        )

    def call(self, inputs, training=False):
        """Computes per-token class probabilities.

        Args:
            inputs: Mapping with 'input_ids', 'token_type_ids' and
                'attention_mask' entries.
            training: Forwarded to the encoders and to every dropout layer.

        Returns:
            Softmax output of the classifier (one distribution per token).
        """
        # Pass the tensors by name. A positional list is matched to the
        # encoder's parameters by position, and the previous
        # [input_ids, token_type_ids, attention_mask] order handed the segment
        # ids to the attention-mask parameter and vice versa.
        encoder_inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'token_type_ids': inputs['token_type_ids'],
        }

        output_general_domain = self.bert_general_domain(
            encoder_inputs, training=training
        )
        output_specific_domain = self.bert_specific_domain(
            encoder_inputs, training=training
        )

        concatened = self.concat([
            output_general_domain['last_hidden_state'],
            output_specific_domain['last_hidden_state'],
        ])

        hidden = self.dropout(concatened, training=training)

        hidden = self.dense1(hidden)
        hidden = self.dropout1(hidden, training=training)

        if self.second_layer_units:
            hidden = self.dense2(hidden)
            hidden = self.dropout2(hidden, training=training)

        if self.third_layer_units:
            hidden = self.dense3(hidden)
            hidden = self.dropout3(hidden, training=training)

        # The classifier ends in a softmax, so these are probabilities; the
        # loss in this notebook is configured with from_logits=False.
        return self.classifier(hidden)

    def train_step(self, data):
        """Runs one optimization step on a batch.

        Args:
            data: Batch accepted by `expand_data`, holding the three encoder
                inputs and the 'aspect_labels' targets.

        Returns:
            Dict mapping each tracked metric name to its current result.
        """
        data = expand_data(data)
        inputs = {
            k: data[k] for k in (
                'input_ids',
                'token_type_ids',
                'attention_mask',
            )
        }
        labels = data['aspect_labels']

        with tf.GradientTape() as tape:
            probabilities = self(inputs, training=True)
            loss = self.compiled_loss(
                labels,
                probabilities,
                regularization_losses=self.losses
            )

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.compiled_metrics.update_state(labels, probabilities)
        return {m.name: m.result() for m in self.metrics}

In [None]:
# Load both encoders and build the concat + feed-forward tagger; the encoders
# are fine-tuned.
general_encoder = TFBertModel.from_pretrained(general_domain_checkpoint)
specific_encoder = TFBertModel.from_pretrained(specific_domain_checkpoint)
model = AEConcatPosTag(
    bert_general_domain=general_encoder,
    bert_specific_domain=specific_encoder,
    dropout_rate=0.30000000000000004,
    first_layer_units=768,
    second_layer_units=1536,
    third_layer_units=512,
    n_classes=3,
    train_bert_layer=True,
)

# The model outputs probabilities (from_logits=False); -100 is the label
# written by `align_labels_with_tokens` for positions without a word.
loss_fn = SparseCategoricalCrossentropy(from_logits=False, ignore_class=-100)
optimizer = AdamW(learning_rate=2.006300676202351e-05)
model.compile(optimizer=optimizer, loss=loss_fn)

# NOTE(review): the per-epoch score is computed on `test`, while `validation`
# is not used in the visible cells - confirm this is intended.
f1_score_callback = F1ScoreCallback(test, label_names)

history = model.fit(train, epochs=23, callbacks=[f1_score_callback])

# First epoch (0-based position in the history) with the highest score.
best_epoch, val_f1_score = max(
    enumerate(history.history['val_f1_score']),
    key=lambda entry: entry[1],
)

print(f'Best f1 score is {val_f1_score} on epoch {best_epoch}')