In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from numpy.random import seed
from itertools import chain
from tensorflow.keras import Model,Input
from tensorflow.keras.layers import LSTM,Embedding,Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D,Bidirectional

In [None]:
# Read the NER corpus as latin1 text; malformed rows are skipped instead of raising.
data = pd.read_csv('ner.csv', encoding="latin1", on_bad_lines='skip')
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Restrict the frame to the sentence id, the token and its tag.
cols_to_keep = ['sentence_idx', 'word', 'tag']
data = data.loc[:, cols_to_keep]
data.head(1)

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Forward-fill missing values from the previous row.
# `fillna(method='ffill')` is deprecated in pandas; `ffill()` is the supported equivalent.
data = data.ffill()

In [None]:
# Vocabulary in order of first appearance. Iterating a `set` of strings gives a
# different order on every kernel start, which would make the word ids (and any
# saved model weights tied to them) irreproducible.
words = data["word"].unique().tolist()
# Extra vocabulary entry reserved for padding; it is deliberately the last element.
words.append("ENDPAD")
num_words = len(words)

print(f"Total number of unique words in dataset: {num_words}")

In [None]:
# Distinct tags in order of first appearance, so tag ids stay stable between runs
# (a `set` of strings has no reproducible iteration order).
tags = data["tag"].unique().tolist()
num_tags = len(tags)
print("List of tags: " + ', '.join(tags))
print(f"Total Number of tags {num_tags}")

In [None]:
class GetSentence(object):
    """Group a token-level frame into per-sentence lists of ``(word, tag)`` pairs.

    Args:
        data: Frame with ``sentence_idx``, ``word`` and ``tag`` columns.

    Attributes:
        n_sent: Counter initialised to 1 (not used inside this class).
        data: The frame passed to the constructor.
        grouped: Result of grouping by ``sentence_idx``; one list of
            ``(word, tag)`` tuples per sentence.
        sentences: The grouped values as a plain list, in group order.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        # Select the payload columns explicitly so `apply` never operates on the
        # grouping column (pandas deprecates that implicit behaviour).
        self.grouped = (
            self.data.groupby("sentence_idx")[["word", "tag"]]
            .apply(self._to_pairs)
        )
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_pairs(group):
        """Return the rows of one sentence as a list of ``(word, tag)`` tuples."""
        return list(zip(group["word"].tolist(), group["tag"].tolist()))

In [None]:
# Build the per-sentence (word, tag) lists from the token-level frame.
getter = GetSentence(data)
sentence = getter.sentences
# Inspect one sentence (position 10 in the list) as a sanity check.
sentence[10]

In [None]:
# 0-based ids. "ENDPAD" is the last entry of `words`, so it gets id
# `num_words - 1` -- exactly the value used to pad X -- and every id stays below
# `num_words`, the Embedding's `input_dim`. The previous `i + 1` offset made the
# pad value collide with a real word and pushed "ENDPAD" out of range.
word_idx = {w: i for i, w in enumerate(words)}
tag_idx = {t: i for i, t in enumerate(tags)}

In [None]:
max_len = 50
# Encode every sentence as the vocabulary ids of its words.
X = [
    [word_idx[word] for word, _tag in tagged_sentence]
    for tagged_sentence in sentence
]
# Pad at the end of each sequence up to `max_len` with the id `num_words - 1`.
X = pad_sequences(sequences=X, maxlen=max_len, padding='post', value=num_words - 1)

In [None]:
# Encode tags as ids, pad at the end with the id of 'O', then one-hot encode
# each padded sentence.
y = [
    [tag_idx[tag] for _word, tag in tagged_sentence]
    for tagged_sentence in sentence
]
y = pad_sequences(sequences=y, maxlen=max_len, padding='post', value=tag_idx['O'])
y = [to_categorical(tag_ids, num_classes=num_tags) for tag_ids in y]

In [None]:
# Hold out 10% of the sentences for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [None]:
# Sanity check: number of examples in each split.
len(x_train), len(x_test), len(y_train), len(y_test)

In [None]:
# Sequence tagger: embedding -> spatial dropout -> BiLSTM -> per-token softmax.

# Width of the word vectors. It used to be passed as `max_len`, which tied the
# embedding size to the sequence length; the value (50) is unchanged.
EMBEDDING_DIM = 50

input_word = Input(shape=(max_len,))
# `input_length` is omitted: it is deprecated in current Keras and the sequence
# length is already fixed by the Input shape.
x = Embedding(input_dim=num_words, output_dim=EMBEDDING_DIM)(input_word)
x = SpatialDropout1D(0.1)(x)
x = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(x)
# One softmax over the tag set for every time step.
out = TimeDistributed(Dense(num_tags, activation='softmax'))(x)
model = Model(input_word, out)

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 3 epochs, keeping 20% of the training split aside for validation.
model.fit(
    x=x_train,
    y=np.array(y_train),
    batch_size=64,
    epochs=3,
    validation_split=0.2,
    verbose=1,
)

# Second test

In [None]:
import pandas as pd
import tensorflow as tf

from ast import literal_eval
from collections import defaultdict
from datasets import Dataset, DatasetDict
from src import utils
from tensorflow import keras
from tensorflow.keras import layers
from transformers import create_optimizer, AutoTokenizer, TFBertModel

In [None]:
def tokenize_and_align_labels(examples: dict):
    """Tokenize pre-split sentences and attach labels aligned to the sub-word tokens.

    Args:
        examples: Batch with a 'tokens' entry (the pre-split words of each
            sentence) and an 'aspect_tags' entry (one tag sequence per sentence).

    Returns:
        The tokenizer output for the batch, with an added 'labels' entry that
        holds, per sentence, the result of ``utils.align_labels_with_tokens``.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        max_length=100,
        # Replaces the deprecated `pad_to_max_length=True`: every sequence is
        # padded to exactly `max_length` tokens.
        padding='max_length',
    )
    # `word_ids(i)` maps each sub-word token of sentence i back to its source word.
    tokenized_inputs['labels'] = [
        utils.align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(examples['aspect_tags'])
    ]
    return tokenized_inputs


class BiLSTM(layers.Layer):
    """Bidirectional LSTM block that returns the full output sequence.

    Args:
        units: Number of units of the wrapped LSTM.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.bilstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(units, return_sequences=True))

    def call(self, inputs):
        """Apply the bidirectional LSTM to ``inputs`` and return its output."""
        return self.bilstm(inputs)

    def get_config(self):
        """Return the layer config including ``units``.

        Without this override a model containing the layer cannot be
        serialized, because the constructor takes an extra argument.
        """
        config = super().get_config()
        config.update({'units': self.units})
        return config

In [None]:
# Read the data.
data_ds = pd.read_csv('../datasets/processed/tv_stratified.csv')

# Change the column format: parse the Python literals stored as text.
for col in ('tokens', 'aspect_tags'):
    data_ds[col] = data_ds[col].map(literal_eval)

In [None]:
# Name of the pretrained checkpoint; the BERT model is later loaded from the same name.
# NOTE(review): the identifier is spelled "chekpoint" and is referenced under that
# spelling by another cell, so renaming it must be done in both places at once.
model_chekpoint = 'bert-base-multilingual-cased'
# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_chekpoint)

In [None]:
# Tag mapping: the position of a name in `label_names` is its integer id.
label_names = ['O', 'B-ASP', 'I-ASP']
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
# Wrap the two columns in a Dataset, then tokenize in batches; the original
# columns are dropped so only the tokenizer output and the labels remain.
raw_dataset = Dataset.from_pandas(data_ds[['tokens', 'aspect_tags']])
data = raw_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_dataset.column_names,
)

In [None]:
# Convert each tokenizer output column, and the labels, to an int32 tensor.
feature_names = ('input_ids', 'attention_mask', 'token_type_ids')
input_data = {
    feature: tf.convert_to_tensor(data[feature], dtype=tf.int32)
    for feature in feature_names
}
labels = tf.convert_to_tensor(data['labels'], dtype=tf.int32)

In [None]:
# Load the pretrained BERT model from the checkpoint (only the model is loaded here).
bert_model = TFBertModel.from_pretrained(model_chekpoint)

In [None]:
# Layers: three int32 inputs -> BERT -> BiLSTM -> per-token softmax over 3 classes.
inputs = [
    layers.Input(shape=(None,), dtype=tf.int32, name=name)
    for name in ('input_ids', 'attention_mask', 'token_type_ids')
]
# First element of the BERT output is used as the token representation.
embedding = bert_model(inputs)[0]
bilstm = BiLSTM(units=32)(embedding)
token_classifier = layers.TimeDistributed(layers.Dense(units=3, activation='softmax'))
outputs = token_classifier(bilstm)
model = keras.Model(inputs, outputs)

In [None]:
# Render a diagram of the model.
keras.utils.plot_model(model)

In [None]:
# Define a custom loss function that ignores positions without a real tag id
def custom_loss(y_true, y_pred):
    """Mean sparse cross-entropy over the positions that carry a real tag id.

    Args:
        y_true: Integer tag ids, one per token position.
        y_pred: Per-token class probabilities with the classes on the last axis
            (the model's final layer already applies softmax).

    Returns:
        Scalar loss: the summed per-token cross-entropy of the labelled
        positions divided by their count (0 when no position is labelled).
    """
    num_classes = tf.shape(y_pred)[-1]
    y_true = tf.cast(y_true, tf.int32)
    # Every id in [0, num_classes) is a real tag -- id 0 is 'O' -- so masking
    # `y_true == 0` would drop the 'O' class from training. Only ids outside
    # the class range are ignored.
    # NOTE(review): assumes utils.align_labels_with_tokens marks padding/special
    # tokens with an out-of-range id (e.g. -100) -- confirm against that helper.
    valid = tf.logical_and(y_true >= 0, y_true < num_classes)
    # Ignored positions get a placeholder class so the loss op never sees an
    # out-of-range label; their contribution is zeroed by the mask below.
    safe_labels = tf.where(valid, y_true, tf.zeros_like(y_true))
    # y_pred holds probabilities, not logits, so softmax must not be applied again.
    per_token = tf.keras.losses.sparse_categorical_crossentropy(
        safe_labels, y_pred, from_logits=False)
    weights = tf.cast(valid, per_token.dtype)
    # Average over labelled positions only, so padding does not dilute the loss.
    return tf.math.divide_no_nan(
        tf.reduce_sum(per_token * weights), tf.reduce_sum(weights))

# Compile the model with the custom loss function
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=custom_loss,
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

In [None]:
# Run one training step, passing all of `input_data` and `labels` as a single batch.
model.train_on_batch(input_data, labels)