# Experimentación no determinística con modelos BERT

El objetivo de este notebook consiste en incluir el código necesario para **entrenar y validar múltiples modelos** y, posteriormente, evaluar su calidad calculando la **media de los valores de accuracy y AUC**, comprobando así si las modificaciones realizadas mejoran su capacidad predictiva.

Como este notebook está pensado para ejecutarse en un entorno en la nube, como Google Colab, será necesario disponer de los **conjuntos de entrenamiento y validación en ficheros** para ahorrar tiempo de computación y recursos, centrándose únicamente en los modelos. Adicionalmente, se deberá disponer de un fichero *requirements.txt* para **instalar las librerías** necesarias dentro del entorno cloud escogido.

In [None]:
# Install libraries based on a requirements file
!pip install -r requirements.txt

# Import required libraries for data
import numpy as np
import pandas as pd

# Import required libraries for building BERT models
import tensorflow as tf
from transformers import *

# Import a library to calculate the validation metrics
from sklearn import metrics

# Import time library to measure the time spent on the experiments
import time

In [None]:
# Read the train and test datasets from CSV files.
# NOTE: the <path ...> placeholders below are template markers and must be
# replaced with real path strings before this cell can run.
train_df = pd.read_csv(<path to the train dataset>)
test_df = pd.read_csv(<path to the test dataset>)

# Keep only the rows whose 'language' column matches the target language
train_df = train_df[train_df['language'] == 'en'] # alternative value: 'es'
test_df = test_df[test_df['language'] == 'en'] # alternative value: 'es'

print(f'Number of train samples: {train_df.shape[0]}')
print(f'Number of test samples: {test_df.shape[0]}')

# Module-level accumulators for the validation metrics: validate_bert_model
# appends one value to each list per call. They are only reset when this
# cell is re-run, so re-running the experiment cell alone keeps adding to
# the values collected by previous runs.
train_accuracy_values = []
test_accuracy_values = []
train_auc_values = []
test_auc_values = []

In [None]:
def tokenize_texts(texts: list, labels: list, bert_tokenizer: BertTokenizer,
                   max_length: int = 64):
    '''
    Encodes a list of documents into fixed-length inputs for BERT models.

    Every document is converted into token ids with the provided tokenizer
    and is explicitly padded or truncated to `max_length` positions, so all
    the rows share the same length and can be stacked into rectangular
    Numpy arrays.

    Parameters
    ----------
    texts : list
        A list of strings with the documents to encode as BERT tokens.
    labels : list
        A list of integers with the class labels to encode as a Numpy array.
    bert_tokenizer : BertTokenizer
        A pre-trained BERT tokenizer to use to encode the documents for BERT models.
    max_length : int, optional
        Number of token positions per encoded document, special tokens
        included. Defaults to 64.

    Returns
    -------
    dict
        A dictionary with three Numpy arrays: 'tokenized_texts' (token ids),
        'attention_masks' (attention masks produced by the tokenizer) and
        'labels' (the provided class labels).
    '''
    tokenized_texts = []
    attention_masks = []

    # Tokenize the provided texts using BERT tokenizer
    for doc in texts:
        bert_inp = bert_tokenizer.encode_plus(
            doc,
            add_special_tokens=True,
            max_length=max_length,
            # 'pad_to_max_length' is deprecated: request the padding and the
            # truncation explicitly so short documents are padded and long
            # ones are cut to the very same length
            padding='max_length',
            truncation=True,
            return_attention_mask=True)

        # Get the tokenized texts along with the attention masks
        tokenized_texts.append(bert_inp['input_ids'])
        attention_masks.append(bert_inp['attention_mask'])

    return {
        'tokenized_texts': np.asarray(tokenized_texts),
        'attention_masks': np.asarray(attention_masks),
        'labels': np.asarray(labels),
    }

def validate_bert_model(
    train_labels: list, train_pred_labels: list,
    test_labels: list, test_pred_labels: list):
    '''
    Scores one trained BERT model and records the results.

    Accuracy and AUC are computed for both the train and the test
    predictions, and each value is appended to its module-level list
    (train_accuracy_values, test_accuracy_values, train_auc_values and
    test_auc_values). Nothing is returned.

    Parameters
    ----------
    train_labels : list
        A list of integers with the real train class labels.
    train_pred_labels : list
        A list of integers with the predicted train class labels.
    test_labels : list
        A list of integers with the real test class labels.
    test_pred_labels : list
        A list of integers with the predicted test class labels.
    '''
    # One entry per metric to record: (destination list, scoring function,
    # real labels, predicted labels). Accuracy goes first, then AUC, with
    # train before test in both cases.
    scoring_plan = (
        (train_accuracy_values, metrics.accuracy_score,
         train_labels, train_pred_labels),
        (test_accuracy_values, metrics.accuracy_score,
         test_labels, test_pred_labels),
        (train_auc_values, metrics.roc_auc_score,
         train_labels, train_pred_labels),
        (test_auc_values, metrics.roc_auc_score,
         test_labels, test_pred_labels),
    )

    # Compute each score and store it right away
    for destination, score_fn, real_labels, pred_labels in scoring_plan:
        destination.append(score_fn(real_labels, pred_labels))

In [None]:
# Experiment settings
N_ITERATIONS = 30

# BERT tokenizer settings
PRETRAINED_BERT_MODEL = 'bert-base-uncased' # en
                        # 'dccuchile/bert-base-spanish-wwm-uncased' # es
BERT_TOKENIZER_OBJ = BertTokenizer.from_pretrained(PRETRAINED_BERT_MODEL)

# Train settings
NUM_LABELS = 2
BATCH_SIZE = 16
NUM_EPOCHS = 100
VALID_RATE = 0.2
LEARNING_RATE = 2e-5
MODEL_CALLBACKS = [tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.01,
    patience=15,
    restore_best_weights=True)]
LOSS_FUNCTION = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
VALID_METRICS = ['accuracy']

## START MEASURING TIME
start = time.time()

# The test dataset is the same for every experiment, so it is tokenized
# only once instead of on each iteration
bert_encoded_test_data = tokenize_texts(
    texts=list(test_df['clean_text'].values),
    labels=list(test_df['task1'].values),
    bert_tokenizer=BERT_TOKENIZER_OBJ)

for _ in range(N_ITERATIONS):
    # Shuffle the rows of the train dataset. sample() works over rows by
    # default, whereas axis=1 would only reorder the columns and leave the
    # sample order untouched. Keras takes the validation split from the
    # last rows of the data, so shuffling the rows also changes which
    # samples are used for validation in each experiment.
    train_df = train_df.sample(frac=1)

    bert_encoded_train_data = tokenize_texts(
        texts=list(train_df['clean_text'].values),
        labels=list(train_df['task1'].values),
        bert_tokenizer=BERT_TOKENIZER_OBJ)

    # Create a new pretrained BERT model
    bert_model = TFBertForSequenceClassification.from_pretrained(
        PRETRAINED_BERT_MODEL,
        num_labels=NUM_LABELS)

    # Each model gets its own optimizer: an optimizer instance keeps
    # internal state bound to the variables of the model it was trained
    # with, so it must not be shared between independent experiments
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    bert_model.compile(
        loss=LOSS_FUNCTION,
        optimizer=optimizer,
        metrics=VALID_METRICS)

    # Train the BERT model
    bert_model.fit(
        [
            bert_encoded_train_data['tokenized_texts'],
            bert_encoded_train_data['attention_masks']
        ],
        bert_encoded_train_data['labels'],
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHS,
        validation_split=VALID_RATE,
        callbacks=MODEL_CALLBACKS)

    # Create the predictions over the train dataset
    train_preds = bert_model.predict(
        [
            bert_encoded_train_data['tokenized_texts'],
            bert_encoded_train_data['attention_masks']
        ])
    train_pred_labels = np.argmax(train_preds.logits, axis=1)

    # Create the predictions over the test dataset
    test_preds = bert_model.predict(
        [
            bert_encoded_test_data['tokenized_texts'],
            bert_encoded_test_data['attention_masks']
        ])
    test_pred_labels = np.argmax(test_preds.logits, axis=1)

    # Evaluate the model over train and test datasets
    validate_bert_model(
        train_labels=bert_encoded_train_data['labels'],
        train_pred_labels=train_pred_labels,
        test_labels=bert_encoded_test_data['labels'],
        test_pred_labels=test_pred_labels
    )

    # Release the finished model before building the next one so that the
    # memory usage does not grow with the number of experiments
    del bert_model, optimizer
    tf.keras.backend.clear_session()

## FINISH MEASURING TIME
end = time.time()

# Calculate the average of each metric
print(f'Avg train acc: {round(sum(train_accuracy_values)/len(train_accuracy_values), 3)}')
print(f'Avg train auc: {round(sum(train_auc_values)/len(train_auc_values), 3)}')
print(f'Avg test acc: {round(sum(test_accuracy_values)/len(test_accuracy_values), 3)}')
print(f'Avg test auc: {round(sum(test_auc_values)/len(test_auc_values), 3)}')

# Add the execution time
print(f'Total time: {round(((end-start)/60), 3)} min')