In [None]:
from datasets import load_dataset
import keras_tuner
import nltk
import random
import tensorflow as tf
from tensorflow import keras
from transformers import (
    AutoTokenizer, 
    BertConfig, 
    DataCollatorForLanguageModeling,
    TFBertForPreTraining
)

# Fetch the Punkt sentence-tokenizer data that `nltk.tokenize.sent_tokenize`
# (used when splitting documents into sentences) loads at call time.
nltk.download("punkt")

In [None]:
# --- Next-sentence-prediction (NSP) sample construction ---
BLOCK_SIZE = 512       # token budget of one sample, special tokens included
NSP_PROB = 0.50        # probability that sentence B is drawn at random
SHORT_SEQ_PROB = 0.1   # probability of aiming for a shorter-than-maximum sample
MAX_LENGTH = 512       # every sample is padded to this many tokens

# --- Masked language modelling ---
MLM_PROB = 0.15        # fraction of tokens the data collator masks

# --- Training ---
MAX_EPOCHS = 10        # training epoch budget

TRAIN_BATCH_SIZE = 8

MODEL_CHECKPOINT = 'neuralmind/bert-base-portuguese-cased'

# --- Reproducibility ---
# The NSP pair construction draws from Python's `random` module; seeding it
# makes the generated training samples repeatable across kernel restarts.
SEED = 42
random.seed(SEED)

In [None]:
# Read the preprocessed review corpus with the generic "text" loader.
dataset = load_dataset(
    path="text",
    data_files='../corpus/preprocessed/all_reviews_opt.txt',
)

In [None]:
# Tokenizer of the pretrained checkpoint; its vocabulary is reused as-is.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Tokens left for actual text once the special tokens of a sentence pair
# have been accounted for.
special_tokens_per_pair = tokenizer.num_special_tokens_to_add(pair=True)
max_num_tokens = BLOCK_SIZE - special_tokens_per_pair

In [None]:
# `tokenizer` and `max_num_tokens` are created once in the preceding cell;
# they are intentionally not re-created here (the tokenizer used to be loaded twice).

def _truncate_seq_pair(tokens_a, tokens_b, max_tokens):
    """Shrink `tokens_a` / `tokens_b` in place until together they fit `max_tokens`.

    One token at a time is dropped from whichever list is currently longer,
    from its front or its back with equal probability, so neither the start
    nor the end of a segment is systematically favoured.
    """
    max_tokens = max(max_tokens, 0)
    while len(tokens_a) + len(tokens_b) > max_tokens:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        if random.random() < 0.5:
            del longer[0]
        else:
            longer.pop()


def prepare_train_features(examples):
    """Build next-sentence-prediction (NSP) training samples from raw documents.

    Meant to be used with `Dataset.map(..., batched=True)`: the number of
    returned samples generally differs from the number of input documents.

    Arguments:
      examples: A dictionary with 1 key ("text")
        text: List of raw documents (str)
    Returns:
      examples: The same dictionary, without "text" and with 4 keys
        input_ids: List of samples; each is a tokenized sentence pair with
          special tokens, padded to `MAX_LENGTH` (int)
        token_type_ids: List of integers (0 or 1): 0 for sentence no. 1 and
          padding, 1 for sentence no. 2
        attention_mask: List of integers (0 or 1): 1 for non-padded tokens,
          0 for padded ones
        next_sentence_label: List of integers (0 or 1): 0 if the second
          sentence actually follows the first, 1 if it was sampled from
          somewhere else in the batch
    """

    # Hard cap on len(tokens_a) + len(tokens_b): `tokenizer.pad` only pads, it
    # never truncates, so anything longer would exceed MAX_LENGTH.
    pair_budget = min(
        max_num_tokens, MAX_LENGTH - tokenizer.num_special_tokens_to_add(pair=True)
    )

    # Remove un-wanted samples: empty / whitespace-only lines and lines that
    # start with " =".
    documents = [
        d.strip() for d in examples["text"] if d.strip() and not d.startswith(" =")
    ]
    # Split each document into its individual sentences.
    # NOTE(review): `sent_tokenize` is called with its default language while
    # MODEL_CHECKPOINT is a Portuguese model - confirm the default is adequate.
    sentences = [nltk.tokenize.sent_tokenize(document) for document in documents]

    # Convert the sentences into token ids. Sentences without tokens and
    # documents without sentences are dropped, so a randomly picked document
    # always has something to sample from.
    tokenized_documents = []
    for doc in sentences:
        doc_ids = [
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent)) for sent in doc
        ]
        doc_ids = [ids for ids in doc_ids if ids]
        if doc_ids:
            tokenized_documents.append(doc_ids)

    # Define the outputs
    examples["input_ids"] = []
    examples["token_type_ids"] = []
    examples["attention_mask"] = []
    examples["next_sentence_label"] = []

    for doc_index, document in enumerate(tokenized_documents):

        current_chunk = []  # buffer holding the segments of the sample being built
        current_length = 0
        i = 0

        # We *usually* want to fill up the entire sequence since we are padding
        # to `MAX_LENGTH` anyways, so short sequences are generally wasted
        # computation. However, we *sometimes* (SHORT_SEQ_PROB of the time)
        # want to use shorter sequences to minimize the mismatch between
        # pretraining and fine-tuning. `target_seq_length` is only a rough
        # target; `pair_budget` is the hard limit.
        target_seq_length = max_num_tokens

        if random.random() < SHORT_SEQ_PROB:
            target_seq_length = random.randint(2, max_num_tokens)

        while i < len(document):
            segment = document[i]
            current_chunk.append(segment)
            current_length += len(segment)
            if i == len(document) - 1 or current_length >= target_seq_length:
                if current_chunk:
                    # `a_end` is how many segments from `current_chunk` go into the `A`
                    # (first) sentence.
                    a_end = 1
                    if len(current_chunk) >= 2:
                        a_end = random.randint(1, len(current_chunk) - 1)

                    tokens_a = []
                    for j in range(a_end):
                        tokens_a.extend(current_chunk[j])

                    tokens_b = []

                    if len(current_chunk) == 1 or random.random() < NSP_PROB:
                        is_random_next = True
                        target_b_length = target_seq_length - len(tokens_a)

                        # This should rarely go for more than one iteration for large
                        # corpora. However, just to be careful, we try to make sure that
                        # the random document is not the same as the document
                        # we're processing.
                        for _ in range(10):
                            random_document_index = random.randint(
                                0, len(tokenized_documents) - 1
                            )
                            if random_document_index != doc_index:
                                break

                        random_document = tokenized_documents[random_document_index]
                        random_start = random.randint(0, len(random_document) - 1)
                        for j in range(random_start, len(random_document)):
                            tokens_b.extend(random_document[j])
                            if len(tokens_b) >= target_b_length:
                                break
                        # We didn't actually use these segments so we "put them back" so
                        # they don't go to waste.
                        num_unused_segments = len(current_chunk) - a_end
                        i -= num_unused_segments
                    else:
                        is_random_next = False
                        for j in range(a_end, len(current_chunk)):
                            tokens_b.extend(current_chunk[j])

                    # The last segment added to a chunk can push it past the
                    # target, so enforce the hard limit before adding special tokens.
                    _truncate_seq_pair(tokens_a, tokens_b, pair_budget)

                    input_ids = tokenizer.build_inputs_with_special_tokens(
                        tokens_a, tokens_b
                    )
                    # add token type ids, 0 for sentence a, 1 for sentence b
                    token_type_ids = tokenizer.create_token_type_ids_from_sequences(
                        tokens_a, tokens_b
                    )

                    padded = tokenizer.pad(
                        {"input_ids": input_ids, "token_type_ids": token_type_ids},
                        padding="max_length",
                        max_length=MAX_LENGTH,
                    )

                    examples["input_ids"].append(padded["input_ids"])
                    examples["token_type_ids"].append(padded["token_type_ids"])
                    examples["attention_mask"].append(padded["attention_mask"])
                    examples["next_sentence_label"].append(1 if is_random_next else 0)
                    current_chunk = []
                    current_length = 0
            i += 1

    # The raw text column is no longer needed (and no longer lines up with the
    # generated samples).
    del examples["text"]

    return examples

In [None]:
# Turn the raw documents into NSP samples. The mapping is batched because one
# batch of documents yields a different number of samples than it has rows.
tokenized_dataset = dataset.map(
    function=prepare_train_features,
    num_proc=1,
    remove_columns=["text"],
    batched=True,
)

In [None]:
# Collator that applies masked-language-modelling masking to each batch and
# returns TensorFlow tensors.
collater = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    return_tensors="tf",
    mlm_probability=MLM_PROB,
    mlm=True,
)

In [None]:
# Shuffled, batched tf.data pipeline over the "train" split; the collator is
# applied per batch.
train = tokenized_dataset["train"].to_tf_dataset(
    collate_fn=collater,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
    label_cols=["labels", "next_sentence_label"],
    columns=["input_ids", "token_type_ids", "attention_mask"],
)

In [None]:
def build_bert_model(hp):
    """Build and compile a BERT pre-training model for one tuner trial.

    Arguments:
      hp: An instance of `keras_tuner.HyperParameters`; the architecture and
        optimizer hyperparameters are sampled from it.
    Returns:
      A `TFBertForPreTraining` model compiled with an AdamW optimizer. No loss
      is passed to `compile`.
    """

    config = BertConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=hp.Choice('hidden_size', values=[256, 512, 768]),
        num_hidden_layers=hp.Choice(
            'num_hidden_layers', values=[6, 8, 10, 12]
        ),
        # BERT requires `hidden_size` to be a multiple of the number of
        # attention heads. Every value below divides 256, 512 and 768, so no
        # sampled combination is rejected (12 heads, previously offered, only
        # works with a hidden size of 768).
        num_attention_heads=hp.Choice(
            'num_attention_heads', values=[4, 8, 16]
        ),
        intermediate_size=hp.Choice(
            'intermediate_size', values=[1024, 2048, 3072]
        ),
        hidden_dropout_prob=hp.Float(
            'hidden_dropout_prob', min_value=0.1, max_value=0.5, step=0.1
        ),
    )

    # Randomly initialised model: only the configuration is supplied, no
    # pretrained weights are loaded here.
    model = TFBertForPreTraining(config)
    optimizer = keras.optimizers.experimental.AdamW(
        learning_rate=hp.Float(
            'learning_rate', min_value=2e-5, max_value=1e-3, sampling='log'
        )
    )
    model.compile(optimizer=optimizer)

    return model

In [None]:
class EarlyStoppingPercent(tf.keras.callbacks.Callback):
    """Stop training when the training loss stops improving by a relative margin.

    After each epoch the loss is compared with the best loss seen so far. An
    epoch counts as an improvement only if the loss dropped by more than
    `min_percent_change`, expressed as a fraction of the best loss (0.01 means
    1 %). After `patience` consecutive epochs without such an improvement,
    training is stopped.

    Arguments:
      patience: Number of consecutive non-improving epochs tolerated.
      min_percent_change: Minimum relative decrease (fraction, not percent
        points) of the loss that counts as an improvement.
    """
    def __init__(
            self,
            patience=3,
            min_percent_change=0.01
        ):
        super().__init__()
        self.patience = patience
        self.min_percent_change = min_percent_change
        self.best_loss = float("inf")
        self.wait = 0
        self.best_weights = None  # kept for compatibility; not used by this callback

    def on_train_begin(self, logs=None):
        # Start every `fit()` call from a clean slate so that state from a
        # previous run cannot trigger a premature stop.
        self.best_loss = float("inf")
        self.wait = 0

    def on_epoch_end(self, epoch, logs=None):
        current_loss = (logs or {}).get("loss")
        if current_loss is None:
            # Nothing to compare against for this epoch.
            return

        # First recorded epoch: just remember the loss.
        if self.best_loss == float("inf"):
            self.best_loss = current_loss
            return

        if self.best_loss != 0:
            # abs() keeps the sign meaningful should the loss ever be negative.
            percent_change = (self.best_loss - current_loss) / abs(self.best_loss)
        else:
            # A relative change with respect to zero is undefined; treat any
            # further decrease as an improvement and anything else as none.
            percent_change = float("inf") if current_loss < self.best_loss else 0.0

        if percent_change > self.min_percent_change:
            self.best_loss = current_loss
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.model.stop_training = True
                print(f" - Epoch {epoch + 1}: Early stopping triggered.")

        print(f' - Best value is {self.best_loss}.')

# Require a relative loss decrease of more than 10 % per epoch; give up after
# 3 consecutive epochs without it.
early_stopping = EarlyStoppingPercent(patience=3, min_percent_change=0.1)

In [None]:
# Bayesian-optimisation search over the space defined in `build_bert_model`,
# minimising the training loss.
#
# `project_name` must be a plain name: KerasTuner joins it onto `directory`,
# and joining an absolute path discards `directory` altogether. The pair below
# resolves to the location the results were effectively written to before
# (/aspect_extraction/notebooks/results/bert_mod_<batch size>).
tuner = keras_tuner.BayesianOptimization(
    hypermodel=build_bert_model,
    objective='loss',
    max_trials=20,
    overwrite=True,
    directory='/aspect_extraction/notebooks/results',
    project_name=f'bert_mod_{TRAIN_BATCH_SIZE}',
)

tuner.search(
    train,
    epochs=MAX_EPOCHS,
    callbacks=[early_stopping]
)

In [None]:
# Hyperparameters of the single best trial found by the search.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Names of the tuned hyperparameters, in the order they are reported.
params = (
    'hidden_size', 'num_hidden_layers', 'num_attention_heads',
    'intermediate_size', 'hidden_dropout_prob', 'learning_rate',
)

for param in params:
    print(f'The best value for {param} is {best_hps.get(param)}.')