# Experimentación no determinística con modelos LSTM y BiLSTM

El objetivo de este notebook es incluir el código necesario para **entrenar y validar múltiples modelos** y, posteriormente, evaluar su calidad calculando la **media de los valores de accuracy y AUC**, comprobando así si las modificaciones realizadas mejoran su capacidad predictiva.

Como este notebook está pensado para ejecutarse en un entorno en la nube, como Google Colab, será necesario disponer de los **conjuntos de entrenamiento y validación en ficheros** para ahorrar tiempo de computación y recursos, centrándose únicamente en los modelos. Adicionalmente, se deberá disponer de un fichero *requirements.txt* para **instalar las librerías** necesarias dentro del entorno cloud escogido.

In [None]:
# Install libraries based on a requirements file
!pip install -r requirements.txt

# Import required libraries for data
import numpy as np
import pandas as pd

# Import required libraries for building LSTM and BiLSTM models
from keras.models import Model
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from keras.layers import LSTM, Activation, Dense, Input, Embedding, Bidirectional

# Import time library to measure the time spent on the experiments
import time

In [None]:
# Read the datasets
train_df = pd.read_csv(<path to the train dataset>)
test_df = pd.read_csv(<path to the test dataset>)

# Filter the datasets by language
train_df = train_df[train_df['language'] == 'en'] # 'es'
test_df = test_df[test_df['language'] == 'en'] # 'es'

print(f'Number of train samples: {train_df.shape[0]}')
print(f'Number of test samples: {test_df.shape[0]}')

# Variables to store the validation metrics per each experiment
train_accuracy_values = []
test_accuracy_values = []
train_auc_values = []
test_auc_values = []

In [None]:
def get_train_test_matrix(
    train_df: pd.DataFrame, test_df: pd.DataFrame, 
    max_n_words: int, sequence_len: int):
    '''
    Turns the 'clean_text' column of the train and test datasets
    into fixed-length integer sequences, using a tokenizer that
    is fitted on the train texts only.

    Parameters
    ----------
    train_df : Pandas dataframe
        Training samples. Its 'clean_text' and 'task1' columns
        are read.
    test_df: Pandas dataframe
        Testing samples. Its 'clean_text' and 'task1' columns
        are read.
    max_n_words : int
        Value passed to the tokenizer as `num_words`.
    sequence_len : int
        Value passed to `pad_sequences` as `maxlen`, i.e. the
        length of every returned sequence.
    
    Returns
    -------
    A dictionary with the keys 'tokenizer' (the fitted tokenizer),
    'train_matrix' and 'test_matrix' (the padded sequences) and
    'train_labels' and 'test_labels' (lists built from the 'task1'
    column of each dataset).
    '''
    def to_matrix(texts):
        # Map every text to its word indices and pad to a fixed length
        return pad_sequences(
            sequences=tokenizer.texts_to_sequences(texts),
            maxlen=sequence_len)

    # The vocabulary is learnt from the train texts only
    train_texts = train_df['clean_text'].astype('str')
    tokenizer = Tokenizer(num_words=max_n_words)
    tokenizer.fit_on_texts(train_texts)

    # Encode both datasets with the same fitted tokenizer
    train_matrix = to_matrix(train_texts)
    test_matrix = to_matrix(test_df['clean_text'].astype('str'))

    encoded = {'tokenizer': tokenizer}
    encoded['train_matrix'] = train_matrix
    encoded['train_labels'] = list(train_df['task1'].values)
    encoded['test_matrix'] = test_matrix
    encoded['test_labels'] = list(test_df['task1'].values)
    return encoded


def get_embedding_matrix(embedding_file: str, tokenizer: Tokenizer, sequence_len: int):
    '''
    Loads the word vectors stored in a text file and builds the
    embedding matrix for the vocabulary of the provided tokenizer.

    Each non-blank line of the file is expected to hold a word
    followed by its vector components, separated by whitespace.
    If a word appears more than once, the last occurrence wins.

    Parameters
    ----------
    embedding_file : str
        The path to the UTF-8 text file which contains the embeddings.
    tokenizer : Keras Tokenizer
        A fitted tokenizer; only its `word_index` mapping
        (word -> row index) is read.
    sequence_len : int
        The number of columns of the matrix, i.e. the size of each
        embedding vector in the file.
    
    Returns
    -------
    A Numpy matrix of shape (len(tokenizer.word_index)+1, sequence_len)
    where row `i` holds the embedding of the word whose index is `i`.
    Rows of words missing from the file (and any row not referenced
    by `word_index`) are left as zeros.
    '''
    embeddings_index = {}

    # The context manager closes the file even if parsing fails. The
    # encoding is explicit so non-ASCII words (e.g. Spanish vocabulary)
    # are decoded the same way regardless of the platform default.
    with open(embedding_file, encoding='utf-8') as embedding_lines:
        for line in embedding_lines:
            # Split once per line: first field is the word, rest the vector
            fields = line.split()

            # Blank lines carry no word, skip them instead of failing
            if not fields:
                continue

            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

    # Initialize the embedding matrix with zeros
    embedding_matrix = np.zeros(shape=(len(tokenizer.word_index)+1, sequence_len))

    # Copy the loaded vector of every vocabulary word into its row
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)

        # Words not found in the file keep their all-zeros row
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix


def validate_lstm_model(
    model: Model, 
    train_matrix: np.ndarray, train_labels: list, 
    test_matrix: np.ndarray, test_labels: list):
    '''
    Evaluates a trained model over the train and test data and
    records the resulting accuracy and AUC in the module-level
    lists `train_accuracy_values`, `train_auc_values`,
    `test_accuracy_values` and `test_auc_values`.

    The values at positions 1 and 2 of the `model.evaluate` result
    are stored as accuracy and AUC respectively, which assumes the
    model was compiled with the metrics in that order (position 0
    presumably being the loss).

    Parameters
    ----------
    model : Keras model
        A trained Keras model to evaluate.
    train_matrix : Numpy matrix
        The encoded train documents.
    train_labels : list
        The encoded train class labels.
    test_matrix : Numpy matrix
        The encoded test documents.
    test_labels : list
        The encoded test class labels.

    Returns
    -------
    Nothing; the metrics are appended to the module-level lists.
    '''
    # Each entry: encoded documents, labels and the two lists to update
    splits = (
        (train_matrix, train_labels, train_accuracy_values, train_auc_values),
        (test_matrix, test_labels, test_accuracy_values, test_auc_values),
    )

    # Train data is evaluated first, then test data
    for matrix, labels, accuracy_log, auc_log in splits:
        scores = model.evaluate(x=matrix, y=np.array(labels))
        accuracy_log.append(scores[1])
        auc_log.append(scores[2])

In [None]:
# Experiment settings
N_ITERATIONS = 30

# Tokenizer and embedding settings
MAX_N_WORDS = 1000
EMBEDDING_FILE_PATH = <path to an embedding file>
SEQUENCE_MAX_LEN = <int and depends on the number of vectors of the embedding file>

# Training settings
BATCH_SIZE = 16
N_EPOCHS = 100
VALID_RATE = 0.2
MODEL_CALLBACKS = [EarlyStopping(
    monitor='val_auc',
    min_delta=0.001,
    patience=15,
    restore_best_weights=True)]
LOSS_FUNCTION = 'binary_crossentropy'
OPTIMIZER = 'adam'
VALID_METRICS = ['accuracy', 'AUC']

## START MEASURING TIME
start = time.time()

for i in range(0, N_ITERATIONS):
    lstm_data = get_train_test_matrix(
        train_df=train_df.sample(frac=1, axis=1),
        test_df=test_df,
        max_n_words=MAX_N_WORDS,
        sequence_len=SEQUENCE_MAX_LEN
    )

    lstm_embedding_matrix = get_embedding_matrix(
        embedding_file=EMBEDDING_FILE_PATH,
        tokenizer=lstm_data['tokenizer'],
        sequence_len=SEQUENCE_MAX_LEN
    )

    # Model ARCHITECTURE
    input_layer = Input(
        name='inputs',
        shape=[SEQUENCE_MAX_LEN])

    ## Embedding layer: pre-trained embeddings
    layer = Embedding(
        input_dim=len(lstm_data['tokenizer'].word_index)+1,
        output_dim=SEQUENCE_MAX_LEN,
        weights=[lstm_embedding_matrix],
        input_length=MAX_N_WORDS,
        trainable=False)(input_layer)

    ################ PUT HERE THE DESIRED ARCHITECTURE ################
    # E.g.: a two bidirectional LSTM with two layers of 128 neurons
    layer = Bidirectional(LSTM(units=128, return_sequences=True))(layer)
    layer = Bidirectional(LSTM(units=128))(layer)

    layer = Dense(
        name='output',
        units=1)(layer)

    output_layer = Activation(activation='sigmoid')(layer)
    ################ PUT HERE THE DESIRED ARCHITECTURE ################

    # Create an object for the model
    lstm_model = Model(
        inputs=input_layer,
        outputs=output_layer)

    # Compile the model 
    lstm_model.compile(
        loss=LOSS_FUNCTION,
        optimizer=OPTIMIZER,
        metrics=VALID_METRICS)

    # Train the built model
    lstm_model.fit(
        x=lstm_data['train_matrix'], 
        y=np.array(lstm_data['train_labels']),
        batch_size=BATCH_SIZE,
        epochs=N_EPOCHS,
        validation_split=VALID_RATE,
        callbacks=MODEL_CALLBACKS,
        verbose=0)

    # Evaluate the trained LSTM model over train and test datasets
    validate_lstm_model(
        model=lstm_model,
        train_matrix=lstm_data['train_matrix'],
        train_labels=lstm_data['train_labels'],
        test_matrix=lstm_data['test_matrix'],
        test_labels=lstm_data['test_labels']
    )

## FINISH MEASURING TIME
end = time.time()

# Calculate the average of each metric
print(f'Avg train acc: {round(sum(train_accuracy_values)/len(train_accuracy_values), 3)}') 
print(f'Avg train auc: {round(sum(train_auc_values)/len(train_auc_values), 3)}') 
print(f'Avg test acc: {round(sum(test_accuracy_values)/len(test_accuracy_values), 3)}') 
print(f'Avg test auc: {round(sum(test_auc_values)/len(test_auc_values), 3)}') 

# Add the execution time
print(f'Total time: {round(((end-start)/60), 3)} min') 