# LSTM-GloVe Model for Sentiment Analysis in text data

En este notebook se describe el proceso de carga, preprocesamiento, embedding, construcción y entrenamiento de un modelo que emplea LSTM y GloVe
para el set de datos de texto.

In [1]:
# Input/output locations, relative to this notebook.
DATA_PATH = '../../data/text/cleaned/final.csv'
# CSV that the "Opcion 2" section samples rows from and preprocesses.
DATA_REDUCED_PATH = '../../data/text/cleaned/reduced_final_eng_clean.csv'
DATA_OUTPUT_PATH = '../../data/text/cleaned/preprocessed/final.pkl'
# NOTE(review): not referenced by any active cell visible in this notebook.
CHUNK_SIZE = 10**6
# The two columns the cells below access (df['text'], data['priority']).
COLUMNS = ['text','priority']
# token -> embedding vector; filled in place by add_to_dict() from the GloVe file.
words = {}
# Largest number of known tokens seen in a sentence; updated by sentence_to_words_vectors().
MAX_LEN = 0

# PRE PROCESAMIENTO DE DATOS

## Funciones

### Preprocesamiento
Es necesario remover de nuestros datos la información irrelevante, como etiquetas, puntuación, números y caracteres especiales.

In [2]:
import re

# An '@' followed by at least one character that is neither a space nor '>'.
TAG_RE = re.compile(r'@[^> ]+')


def remove_at_sign(sentence: str):
    '''
    Strip @-mentions from a string.

    Every '@' together with the run of characters that follows it (up to, but
    not including, the next space or '>') is deleted. Whitespace around the
    mention is left untouched, and a lone '@' followed by a space is kept.
    :param sentence: String that may contain @-mentions
    :return: sentence with every @-mention removed
    '''
    return re.sub(TAG_RE, '', sentence)

In [3]:
import langid
from deep_translator import GoogleTranslator

def translate_sentence(sentence: str):
    """
    Return the sentence translated when it is not detected as English.

    The language is detected with ``langid``. Text detected as 'en', or text
    of 5000 characters or more, is returned unchanged; anything else is passed
    to ``GoogleTranslator`` with automatic source detection (no target
    language is given, so the library default applies).
    :param sentence: The string/sentence to translate
    :return: The translator's output, or the input itself when no translation was requested
    """
    detected_language = langid.classify(sentence)[0]
    needs_translation = detected_language != 'en' and len(sentence) < 5000
    if not needs_translation:
        return sentence
    return GoogleTranslator(source='auto').translate(sentence)

ModuleNotFoundError: No module named 'langid'

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus that preprocess_text() reads.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# Compiled lazily on first use so the NLTK stop-word list is read (and the
# regex built) once, instead of once per sentence.
_STOPWORD_PATTERN = None


def _stopword_pattern():
    """Return the cached regex matching an English stop word and the separator after it."""
    global _STOPWORD_PATTERN
    if _STOPWORD_PATTERN is None:
        alternatives = '|'.join(re.escape(word) for word in stopwords.words('english'))
        # `(?:\s|$)` also matches a stop word that is the last word of the
        # sentence; requiring a trailing `\s` left that one in place.
        _STOPWORD_PATTERN = re.compile(r'\b(?:' + alternatives + r')\b(?:\s|$)')
    return _STOPWORD_PATTERN


def preprocess_text(sentence: str):
    '''
    Normalize a sentence for embedding lookup.

    Steps, in order: lowercase, drop @-mentions, replace every non-letter with
    a space, drop single letters that are surrounded by whitespace, collapse
    whitespace runs, and finally remove English stop words.
    :param sentence: String to be cleaned
    :return: lowercase sentence containing only letters and spaces, without English stop words
    '''

    cleaned_sentence = sentence.lower()
    cleaned_sentence = remove_at_sign(cleaned_sentence)
    # Raw strings: '\s' in a normal literal is an invalid escape sequence.
    cleaned_sentence = re.sub(r'[^a-zA-Z]', ' ', cleaned_sentence)
    cleaned_sentence = re.sub(r'\s+[a-zA-Z]\s', ' ', cleaned_sentence)
    cleaned_sentence = re.sub(r'\s+', ' ', cleaned_sentence)

    #Translate
    #cleaned_sentence = translate_sentence(cleaned_sentence)

    # Removal of stop words
    cleaned_sentence = _stopword_pattern().sub('', cleaned_sentence)

    return cleaned_sentence

In [None]:
import numpy as np

def add_to_dict(dictionary, filename):
    """
    Load word vectors from a text file into ``dictionary`` (in place).

    Each line is expected to hold a token followed by its vector components,
    separated by single spaces. Blank lines, lines without components and
    lines whose components cannot be parsed as floats are skipped.

    :param dictionary: Mapping that receives ``token -> numpy float array``.
    :type dictionary: dict
    :param filename: Path of the embeddings text file.
    :type filename: str
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; iterate the handle lazily instead of loading every line.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            token, *components = line.rstrip('\n').split(' ')
            if not components:
                # Blank line or a token with no vector: nothing to store.
                continue

            try:
                dictionary[token] = np.array(components, dtype=float)
            except ValueError:
                # Non-numeric component (e.g. a token that itself contains spaces).
                continue


### Tokenización y Lematización
Una vez cargada la información de los tokens de GloVe se procede a tokenizar y lematizar cada
una de las oraciones en nuestro set de datos.

In [None]:
# NLTK data packages; presumably the ones WordNetLemmatizer needs — confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
from nltk import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()


def sentence_to_token_list(sentence: str):
    """
    Tokenize and lemmatize a sentence, keeping only lemmas present in ``words``.

    :param sentence: Sentence to convert.
    :type sentence: str
    :return: Lemmas, in sentence order, that are keys of the global ``words`` dictionary.
    :rtype: list
    """
    kept_lemmas = []
    for raw_token in tokenizer.tokenize(sentence):
        lemma = lemmatizer.lemmatize(raw_token)
        # Only lemmas with a known embedding are useful downstream.
        if lemma in words:
            kept_lemmas.append(lemma)
    return kept_lemmas


Con los tokens anteriores, que sabemos están presentes en el diccionario `words`, pasamos a su representación vectorial:

In [None]:
def sentence_to_words_vectors(sentence: str, word_dict=words):
    """
    Turn a sentence into a matrix with one embedding vector per known token.

    Side effect: raises the global ``MAX_LEN`` to the number of rows of the
    returned matrix whenever that number is larger than the current value.

    :param sentence: Sentence to convert.
    :type sentence: str
    :param word_dict: Token -> vector mapping. Defaults to the ``words`` dictionary
        object that existed when this function was defined.
    :type word_dict: dict
    :return: Array built from the vectors of the sentence's tokens found in ``word_dict``.
    :rtype: numpy.ndarray
    """
    global MAX_LEN

    # sentence_to_token_list() filters against the global `words`; the extra
    # membership test covers a caller-supplied `word_dict`.
    vectors = [
        word_dict[token]
        for token in sentence_to_token_list(sentence)
        if token in word_dict
    ]
    array = np.array(vectors, dtype=float)

    # Track the longest sentence seen so far.
    MAX_LEN = max(MAX_LEN, array.shape[0])

    return array


## Preprocesamiento

In [None]:
import pandas as pd
import pickle

In [None]:
import pandas as pd

def process_data(df: pd.DataFrame):
    """
    Clean and vectorize the 'text' column of ``df``.

    The column is overwritten in place, first with the output of
    ``preprocess_text`` and then with the output of
    ``sentence_to_words_vectors``; the same DataFrame object is returned.

    :param df: DataFrame holding a 'text' column.
    :type df: pd.DataFrame
    :return: The DataFrame that was passed in, with 'text' replaced.
    :rtype: pd.DataFrame
    """
    separator = '-'*20

    # Stage 1: text clean-up.
    print(separator, '\nCleaning text')
    cleaned_text = df['text'].apply(preprocess_text)
    df['text'] = cleaned_text

    # Stage 2: tokens -> word-vector matrix.
    print(separator, '\nTokenizing text')
    vectorized_text = df['text'].apply(sentence_to_words_vectors)
    df['text'] = vectorized_text

    return df


In [None]:
# add_to_dict(words, './GloVe/glove.6B/glove.6B.50d.txt')

# first = True
# count = 1
# data_array = []
# for chunk in pd.read_csv(DATA_PATH, chunksize=10**5,nrows=10**6):
#     print("PROCESSING " + str(count))
#     data = process_data(chunk)
#     data_array.append(data)
#     count = count+1

Dado que las matrices de vectores de cada oración tienen un número diferente de filas (cada oración cuenta con un número diferente de palabras), es necesario identificar el tamaño máximo de los textos para su "estandarización":

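Todas las matrices se llevarán a la forma `(MAX_LEN, 50)`, donde `MAX_LEN` es el tamaño máximo observado (el texto original menciona 112 como tamaño máximo y `(35, 50)` como forma final; estos valores no coinciden entre sí y deben verificarse). Los valores faltantes de cada matriz se rellenan con 0s al inicio.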

In [None]:
# import tensorflow as tf
# for i in enumerate(data_array):
#     data_array[i] = tf.keras.utils.pad_sequences(data_array[i], maxlen=MAX_LEN, dtype='float16')

## Data Generator

In [None]:
import keras
from keras.utils import Sequence
import numpy as np

class DataGenerator(Sequence):
    """
    Skeleton Keras ``Sequence`` over a collection of DataFrames.

    ``data_array`` is iterated and each element is measured with ``len()``, so
    it is expected to be an iterable of DataFrames (or other sized objects).
    Batch construction is not implemented yet: ``__getitem__`` raises instead
    of silently returning ``None``.

    :param data_array: Iterable of sized chunks to draw batches from.
    :param batch_size: Number of rows per batch.
    :param shuffle: Whether the row order is reshuffled after every epoch.
    """

    def __init__(self, data_array, batch_size=32, shuffle=True):
        'Initialization'
        super().__init__()
        # The original stored this as `df_array` but read `data_array`, which
        # raised AttributeError in __len__. Both names are kept.
        self.data_array = data_array
        self.df_array = data_array
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def _total_rows(self):
        'Total number of rows across all chunks'
        return sum(len(df) for df in self.data_array)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(self._total_rows())
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Batches, not rows: the last batch may be partial.
        return int(np.ceil(self._total_rows() / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        raise NotImplementedError(
            'DataGenerator.__getitem__ is not implemented: batch construction '
            'from data_array still has to be defined.'
        )



### Opcion 2

In [None]:
# Fill `words` from the GloVe file (the models below expect 50 values per token).
add_to_dict(words, './GloVe/glove.6B/glove.6B.50d.txt')

In [None]:
import tensorflow as tf

import random

# Fixed seed so the sampled subset is reproducible between runs (same value
# the train/validation/test split uses as random_state).
SAMPLE_SEED = 3

# Count physical lines; the `with` block closes the handle, which the bare
# open() inside a generator expression never did.
# NOTE(review): assumes one CSV record per line (no quoted embedded newlines).
with open(DATA_REDUCED_PATH, 'rb') as csv_file:
    num_lines = sum(1 for _ in csv_file)
size = 10 ** 6

# Line 0 is the header, so there are num_lines - 1 data rows. Skip exactly the
# surplus over `size`; never ask for a negative sample when the file is small.
num_rows = max(num_lines - 1, 0)
num_to_skip = max(num_rows - size, 0)
skip_idx = random.Random(SAMPLE_SEED).sample(range(1, num_lines), num_to_skip)
# Read the file, skipping the selected rows.
data = pd.read_csv(DATA_REDUCED_PATH, skiprows=skip_idx)

In [None]:
# Clean and vectorize data['text']; this also updates the global MAX_LEN.
data = process_data(data)

## División Entrenamiento-Validación-Test

In [None]:
# Width of the softmax output layer used by every model in this notebook.
NUM_CLASSES = 7

# Zero-pad every sentence matrix to MAX_LEN rows.
X = tf.keras.utils.pad_sequences(data['text'], maxlen=MAX_LEN, dtype='float16')

# The models are compiled with categorical_crossentropy and the heatmap code
# takes argmax(y_test, axis=1), so the targets must be one-hot rows rather than
# a column of scalar labels. factorize(sort=True) maps the sorted distinct
# labels to 0..k-1, so 0-based and 1-based label sets both work.
class_codes, class_values = pd.factorize(data['priority'], sort=True)
if (class_codes < 0).any():
    raise ValueError("'priority' contains missing values; cannot one-hot encode them")
if len(class_values) > NUM_CLASSES:
    raise ValueError(
        f"'priority' has {len(class_values)} distinct values, expected at most {NUM_CLASSES}"
    )
y = tf.keras.utils.to_categorical(class_codes, num_classes=NUM_CLASSES)
del data

In [None]:
from sklearn.model_selection import train_test_split

# 70% train; the remaining 30% is halved into validation and test (15% each).
X_train, X_rest, y_train, y_rest =  train_test_split(X, y, test_size=0.3, random_state=3)
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=3)

# Modelos

In [None]:
from keras.models import Sequential
from keras import layers
from keras.layers import Embedding, Lambda, LSTM, Flatten, Dense, Input, Dropout, Bidirectional, GlobalMaxPooling1D
from keras.optimizers import Adam, RMSprop, SGD
from kerastuner import RandomSearch, HyperParameters
from keras.layers import Conv1D, MaxPooling1D

def print_hyperparameters(json_data):
    """
    Print the hyperparameters stored under the "values" key, one per line.

    Lines are emitted as ``name: value`` in ascending order of name.

    :param json_data: Mapping with a "values" entry that maps names to values.
    :type json_data: dict
    """
    values = json_data["values"]

    for key in sorted(values):
        value = values[key]
        print(f"{key}: {value}")


In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Divide the learning rate by 10 after 3 stagnant epochs, down to 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=1e-5)
# Keep only the best model seen, written to 'saved/'.
cp = ModelCheckpoint('saved/', save_best_only=True)

# NOTE(review): this one list (same callback instances, same 'saved/' path) is
# passed to every search()/fit() call below, so all models share the checkpoint
# location — confirm that is intended.
callbacks = [cp, early_stopping, reduce_lr]

## GloVe-LSTM

In [None]:
def build_model(hp):
    """
    Build and compile the bidirectional GloVe-LSTM model for one tuner trial.

    Architecture: Input(MAX_LEN, 50) -> BiLSTM -> BiLSTM -> GlobalMaxPooling1D
    -> Dense/ReLU -> Dropout -> Dense/ReLU -> Dense(7, softmax). The model is
    created inside a new ``tf.distribute.MirroredStrategy`` scope.

    Registered hyperparameters: lstm_units_1, lstm_dropout, lstm_units_2,
    lstm_dropout_2, dense_units_1, dense_dropout_1, dense_units_2,
    learning_rate.

    :param hp: Object exposing ``Int`` and ``Float`` to declare hyperparameters.
    :return: Compiled model (Adam, categorical_crossentropy, accuracy).
    """

    # NOTE(review): a new strategy object is created on every call — confirm
    # this is intended rather than sharing one strategy.
    mirrored_strategy = tf.distribute.MirroredStrategy()

    with mirrored_strategy.scope():
    
        model = Sequential()
        model.add(Input(shape=(MAX_LEN, 50)))

        # Hyperparameters for LSTM 1
        lstm_units = hp.Int("lstm_units_1", min_value=128, max_value=256, step=32)
        lstm_dropout = hp.Float("lstm_dropout", min_value=0.1, max_value=0.5, step=0.1)

        # The same rate is used for input dropout and recurrent dropout.
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=lstm_dropout, recurrent_dropout=lstm_dropout)))

        # Hyperparameters for LSTM 2
        lstm_units = hp.Int("lstm_units_2", min_value=64, max_value=128, step=32)
        lstm_dropout = hp.Float("lstm_dropout_2", min_value=0.1, max_value=0.5, step=0.1)

        model.add(Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=lstm_dropout, recurrent_dropout=lstm_dropout)))
        model.add(GlobalMaxPooling1D())

        # Hyperparameters for dense layer 1
        dense_units = hp.Int("dense_units_1", min_value=32, max_value=128, step=32)
        dense_dropout = hp.Float("dense_dropout_1", min_value=0.1, max_value=0.5, step=0.1)

        model.add(Dense(dense_units, activation='relu'))
        model.add(Dropout(dense_dropout))

        # Hyperparameters for dense layer 2
        dense_units = hp.Int("dense_units_2", min_value=32, max_value=128, step=32)
        model.add(Dense(dense_units, activation='relu'))

        # Model output: one probability per class
        model.add(Dense(7, activation='softmax'))

        # Optimizer hyperparameters (other experiments showed Adam performed best)
        learning_rate = hp.Float("learning_rate", min_value=1e-5, max_value=1e-3, sampling="LOG")
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # NOTE(review): compile() runs after the strategy scope has been exited.
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define el objeto de búsqueda aleatoria
# Random-search tuner for build_model, ranked by validation accuracy.
tuner = RandomSearch(
    build_model,
    objective="val_accuracy",
    max_trials=20,  # Number of models to try
    executions_per_trial=1,
    directory='./saved/fine_tuned/',
    project_name='HP_LSTM_Glove_text'
)

# Search-space summary
tuner.search_space_summary()

In [None]:
# Run the GloVe-LSTM search; 10% of the training rows are held out per trial
# to compute the val_* metrics.
BATCH_SIZE=1024
tuner.search(X_train, y_train,
                    epochs=10,
                    validation_split=0.1,
                    batch_size=BATCH_SIZE,
                    callbacks=callbacks)



In [None]:
# Keep the best trial's hyperparameters; `tuner` is re-bound further down for
# another model, so they must be captured here.
best_hp_gloveLstm = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Mejores hiperparámetros encontrados:")
# Same sorted "name: value" listing used for the other models, instead of the
# raw config dict (which also dumps the whole search space).
print_hyperparameters(best_hp_gloveLstm.get_config())


In [None]:
# from keras.models import Sequential
# from keras.layers import Embedding, LSTM, Flatten, Dense, Input, Dropout, Bidirectional, GlobalMaxPooling1D

# model = Sequential()
# model.add(Input(shape=(MAX_LEN, 50)))
# model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.25, recurrent_dropout=0.25)))
# model.add(Bidirectional(LSTM(64, return_sequences=True, dropout=0.25, recurrent_dropout=0.25)))
# model.add(GlobalMaxPooling1D())
# model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(7, activation='softmax'))

## Test

In [None]:
from keras.models import load_model

# Load whatever model was last written to 'saved/' (the ModelCheckpoint path).
lstm_basic_model = load_model('saved/')

In [None]:
# Evaluate on the held-out test split; the model is compiled with one metric,
# so score is [loss, accuracy].
score = lstm_basic_model.evaluate(X_test, y_test, verbose=1)

In [None]:
# Index 1 is the accuracy metric (index 0 is the loss).
print('Test Accuracy:', score[1])

## GloVe-LSTM-DO

In [None]:
# Fixed-architecture baseline: LSTM(64) -> Dropout(0.2) -> LSTM(32) -> Dense(128) -> softmax(7).
mirrored_strategy = tf.distribute.MirroredStrategy()

with mirrored_strategy.scope():
    lstm_dropout_dense = Sequential(name='Lstm-dout-dense')
    lstm_dropout_dense.add(Input(shape=(MAX_LEN, 50)))
    lstm_dropout_dense.add(LSTM(64, return_sequences=True))
    lstm_dropout_dense.add(Dropout(0.2))
    # No return_sequences here: only the final state is passed on.
    lstm_dropout_dense.add(LSTM(32))
    lstm_dropout_dense.add(Flatten())
    lstm_dropout_dense.add(Dense(128, activation='relu'))
    lstm_dropout_dense.add(Dense(7, activation='softmax'))

lstm_dropout_dense.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
BATCH_SIZE=1024
# NOTE(review): early_stop is defined but the shared `callbacks` list (which
# has its own EarlyStopping) is what fit() receives — confirm which is wanted.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Train on the training split only. The original passed the full X, y, which
# contains the validation and test rows, so validation and test scores were
# computed on data the model had already seen.
lstm_dropout_dense.fit(X_train, y_train, epochs=10, batch_size=BATCH_SIZE, callbacks=callbacks, validation_data=(X_val, y_val), validation_batch_size=BATCH_SIZE)

In [None]:
from keras.models import Sequential
from keras.layers import Input, LSTM, Dropout, Flatten, Dense
from keras.optimizers import Adam, RMSprop, SGD
from kerastuner import RandomSearch, HyperParameters

def build_model_gloveLstmDo(hp):
    """
    Build and compile the LSTM -> Dropout -> LSTM -> Dense model for one tuner trial.

    Registered hyperparameters: lstm_units_1, dropout_rate, lstm_units_2,
    dense_units, learning_rate.

    :param hp: Hyperparameter container supplied by the tuner.
    :type hp: kerastuner.HyperParameters
    :return: Compiled model (Adam, categorical_crossentropy, accuracy).
    :rtype: keras.models.Sequential
    """
    model = Sequential(name='GloVe-LSTM-DO')
    model.add(Input(shape=(MAX_LEN, 50)))

    # Hyperparameters for the first LSTM layer
    lstm_units_1 = hp.Int("lstm_units_1", min_value=32, max_value=128, step=32)
    model.add(LSTM(lstm_units_1, return_sequences=True))

    # Hyperparameters for Dropout
    dropout_rate = hp.Float("dropout_rate", min_value=0.1, max_value=0.5, step=0.1)
    model.add(Dropout(dropout_rate))

    # Hyperparameters for the second LSTM layer (final state only)
    lstm_units_2 = hp.Int("lstm_units_2", min_value=16, max_value=64, step=16)
    model.add(LSTM(lstm_units_2))

    model.add(Flatten())

    # Hyperparameters for the Dense layer
    dense_units = hp.Int("dense_units", min_value=32, max_value=256, step=32)
    model.add(Dense(dense_units, activation='relu'))

    model.add(Dense(7, activation='softmax'))

    # Hyperparameters for the optimizer (log-uniform learning rate)
    learning_rate = hp.Float("learning_rate", min_value=1e-5, max_value=1e-3, sampling="LOG")
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Random-search tuner for build_model_gloveLstmDo, ranked by validation accuracy.
gloveLstmDoTuner = RandomSearch(
    build_model_gloveLstmDo,
    objective="val_accuracy",
    max_trials=20,  # Number of models to try
    executions_per_trial=1,
    directory='./saved/fine_tuned/',
    project_name='HP_LSTM_dropout_dense'
)

# Search-space summary
gloveLstmDoTuner.search_space_summary()


In [None]:
# Realiza la búsqueda
# Run the search; 10% of the training rows are held out per trial.
gloveLstmDoTuner.search(X_train, y_train,
             epochs=10,
             validation_split=0.1,
             batch_size=4096,
             callbacks=callbacks)

# Hyperparameters of the single best trial.
best_hp_gloveLstmDo = gloveLstmDoTuner.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# lstm_dropout_dense.summary()

In [None]:
# lstm_dropout_dense.compile(optimizer='adam',
#               loss='categorical_crossentropy',
#               metrics=['acc'])

In [None]:
# early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
# history_2 = lstm_dropout_dense.fit(X_train, y_train,
#                     validation_split=0.2, epochs=10,
#                     batch_size=BATCH_SIZE,
#                     callbacks=[early_stop])

In [None]:
# score = lstm_dropout_dense.evaluate(X_test, y_test, verbose=1)
# print('Test Accuracy:', score[1])

# 3LSTM-DO-CNN-Dense

In [None]:
# from keras.layers import Conv1D, MaxPooling1D
# from keras.models import Sequential
# from keras.layers import Input, LSTM, Dropout, Flatten, Dense
# from keras.optimizers import Adam, RMSprop, SGD
# from kerastuner import RandomSearch, HyperParameters

# lstm3_do_cnn_dense = Sequential(name='3LSTM-DO-CNN-Dense')
# lstm3_do_cnn_dense.add(Input(shape=(MAX_LEN, 50)))

# #LSTM
# lstm3_do_cnn_dense.add(LSTM(256, name='LSTM1', return_sequences=True))
# lstm3_do_cnn_dense.add(Dropout(0.2, name='DO1'))

# lstm3_do_cnn_dense.add(LSTM(128, name='LSTM2', return_sequences=True))
# lstm3_do_cnn_dense.add(Dropout(0.2, name='DO2'))

# lstm3_do_cnn_dense.add(LSTM(64, name='LSTM3', return_sequences=True))
# lstm3_do_cnn_dense.add(Dropout(0.2, name='DO3'))

# #CNN
# lstm3_do_cnn_dense.add(Conv1D(128, kernel_size=3, strides=1, padding='same', activation='relu'))
# lstm3_do_cnn_dense.add(MaxPooling1D(pool_size=3, strides=2, padding='same'))
# lstm3_do_cnn_dense.add(Dropout(0.2))
# lstm3_do_cnn_dense.add(Flatten(name='F1'))

# #Fully connected
# lstm3_do_cnn_dense.add(Dense(64, activation='relu'))
# lstm3_do_cnn_dense.add(Dense(7, activation='softmax'))

# lstm3_do_cnn_dense.summary()

In [None]:
def build_model_gloveLstmDoCnn(hp):
    """
    Build and compile the 3x(LSTM + Dropout) -> Conv1D -> Dense model for one tuner trial.

    Registered hyperparameters: lstm_units_1..3, lstm_dropout_1..3,
    conv_filters, cnn_dropout, dense_units, learning_rate.

    :param hp: Hyperparameter container supplied by the tuner.
    :type hp: kerastuner.HyperParameters
    :return: Compiled model (Adam, categorical_crossentropy, accuracy).
    :rtype: keras.models.Sequential
    """
    # The docstring used to sit after the first statements with a 5-space
    # indent, which is an IndentationError; it now leads the body.
    model = Sequential(name='3LSTM-DO-CNN-Dense')
    model.add(Input(shape=(MAX_LEN, 50)))

    # Hyperparameters for the first LSTM layer
    lstm_units_1 = hp.Int("lstm_units_1", min_value=128, max_value=512, step=32)
    model.add(LSTM(lstm_units_1, name='LSTM1', return_sequences=True))
    lstm_dropout_1 = hp.Float("lstm_dropout_1", min_value=0.1, max_value=0.5, step=0.1)
    model.add(Dropout(lstm_dropout_1, name='DO1'))

    # Hyperparameters for the second LSTM layer
    lstm_units_2 = hp.Int("lstm_units_2", min_value=64, max_value=256, step=32)
    model.add(LSTM(lstm_units_2, name='LSTM2', return_sequences=True))
    lstm_dropout_2 = hp.Float("lstm_dropout_2", min_value=0.1, max_value=0.5, step=0.1)
    model.add(Dropout(lstm_dropout_2, name='DO2'))

    # Hyperparameters for the third LSTM layer (sequences kept for the Conv1D)
    lstm_units_3 = hp.Int("lstm_units_3", min_value=32, max_value=128, step=16)
    model.add(LSTM(lstm_units_3, name='LSTM3', return_sequences=True))
    lstm_dropout_3 = hp.Float("lstm_dropout_3", min_value=0.1, max_value=0.5, step=0.1)
    model.add(Dropout(lstm_dropout_3, name='DO3'))

    # Hyperparameters for the Conv1D layer
    conv_filters = hp.Int("conv_filters", min_value=32, max_value=256, step=32)
    model.add(Conv1D(conv_filters, kernel_size=3, strides=1, padding='same', activation='relu'))

    model.add(MaxPooling1D(pool_size=3, strides=2, padding='same'))

    cnn_dropout = hp.Float("cnn_dropout", min_value=0.1, max_value=0.5, step=0.1)
    model.add(Dropout(cnn_dropout))

    model.add(Flatten(name='F1'))

    # Hyperparameters for the Dense layer
    dense_units = hp.Int("dense_units", min_value=32, max_value=256, step=32)
    model.add(Dense(dense_units, activation='relu'))

    model.add(Dense(7, activation='softmax'))

    # Hyperparameters for the optimizer (log-uniform learning rate)
    learning_rate = hp.Float("learning_rate", min_value=1e-5, max_value=1e-3, sampling="LOG")
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Random-search tuner for build_model_gloveLstmDoCnn. This re-binds `tuner`,
# replacing the tuner object used for the first model.
tuner = RandomSearch(
    build_model_gloveLstmDoCnn,
    objective="val_accuracy",
    max_trials=20,  # Number of models to try
    executions_per_trial=1,
    directory='./saved/fine_tuned/',
    project_name='HP_3LSTM-DO-CNN-Dense'
)

# Search-space summary
tuner.search_space_summary()

In [None]:
# Run the search for the LSTM/CNN model; 10% of the training rows are held out per trial.
BATCH_SIZE=4096
tuner.search(X_train, y_train,
                    epochs=10,
                    validation_split=0.1,
                    batch_size=BATCH_SIZE,
                    callbacks=callbacks)

# Hyperparameters of the single best trial.
best_hp_gloveLstmDoCnn = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Mejores hiperparámetros encontrados:")
print_hyperparameters(best_hp_gloveLstmDoCnn.get_config())

# Test

In [None]:
# Side-by-side listing of the best hyperparameters found for the three models.
print("MODELO 1")
print_hyperparameters(best_hp_gloveLstm.get_config())
print(" ==================================================== ")
print("MODELO 2")
print_hyperparameters(best_hp_gloveLstmDo.get_config())
print(" ==================================================== ")
print("MODELO 3")
print_hyperparameters(best_hp_gloveLstmDoCnn.get_config())

In [None]:
def best_model_1(hyper_parameters):
    """
    Rebuild the bidirectional GloVe-LSTM model from a set of chosen hyperparameters.

    Mirrors the architecture of ``build_model``. Values are read with
    ``hyper_parameters.get(name)`` for: lstm_units_1, lstm_dropout,
    lstm_units_2, lstm_dropout_2, dense_units_1, dense_dropout_1,
    dense_units_2 and learning_rate.

    :param hyper_parameters: Object exposing ``get(name)`` for the names above.
    :return: Compiled model (Adam, categorical_crossentropy, accuracy).
    """
    model = Sequential()
    model.add(Input(shape=(MAX_LEN, 50)))

    model.add(Bidirectional(LSTM(hyper_parameters.get('lstm_units_1'),
                                 return_sequences=True,
                                 dropout=hyper_parameters.get('lstm_dropout'),
                                 recurrent_dropout=hyper_parameters.get('lstm_dropout'))))

    model.add(Bidirectional(LSTM(hyper_parameters.get('lstm_units_2'),
                                 return_sequences=True,
                                 dropout=hyper_parameters.get('lstm_dropout_2'),
                                 recurrent_dropout=hyper_parameters.get('lstm_dropout_2'))))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(hyper_parameters.get('dense_units_1'), activation='relu'))
    model.add(Dropout(hyper_parameters.get('dense_dropout_1')))
    model.add(Dense(hyper_parameters.get('dense_units_2'), activation='relu'))
    model.add(Dense(7, activation='softmax'))

    # Use the tuned learning rate, as best_model_2 and best_model_3 do; the
    # plain 'adam' string silently discarded it.
    optimizer = tf.keras.optimizers.Adam(learning_rate=hyper_parameters.get('learning_rate'))

    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Retrain the best GloVe-LSTM configuration for up to 30 epochs; 20% of the
# training rows are held out for the val_* metrics.
best_model_gloveLstm = best_model_1(best_hp_gloveLstm)
history_gloveLstm = best_model_gloveLstm.fit(X_train, y_train,
             epochs=30,
             validation_split=0.2,
             batch_size=1024,
             callbacks=callbacks)

In [None]:
def best_model_2(hp):
    """
    Rebuild the LSTM -> Dropout -> LSTM -> Dense model from chosen hyperparameters.

    Values are read with ``hp.get(name)`` for: lstm_units_1, dropout_rate,
    lstm_units_2, dense_units and learning_rate.

    :param hp: Object exposing ``get(name)`` for the names above.
    :return: Compiled model (Adam, categorical_crossentropy, accuracy).
    :rtype: keras.models.Sequential
    """
    # The docstring used to sit after the first statements with a 6-space
    # indent, which is an IndentationError; it now leads the body.
    model = Sequential(name='GloVe-LSTM-DO')
    model.add(Input(shape=(MAX_LEN, 50)))

    # First LSTM layer
    lstm_units_1 = hp.get("lstm_units_1")
    model.add(LSTM(lstm_units_1, return_sequences=True))

    # Dropout
    dropout_rate = hp.get("dropout_rate")
    model.add(Dropout(dropout_rate))

    # Second LSTM layer (final state only)
    lstm_units_2 = hp.get("lstm_units_2")
    model.add(LSTM(lstm_units_2))

    model.add(Flatten())

    # Dense layer
    dense_units = hp.get("dense_units")
    model.add(Dense(dense_units, activation='relu'))

    model.add(Dense(7, activation='softmax'))

    # Optimizer
    learning_rate = hp.get("learning_rate")
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Retrain the best LSTM-Dropout configuration for up to 30 epochs; 20% of the
# training rows are held out for the val_* metrics.
best_model_gloveLstmDo = best_model_2(best_hp_gloveLstmDo)
history_gloveLstmDo = best_model_gloveLstmDo.fit(X_train, y_train,
             epochs=30,
             validation_split=0.2,
             # 4096 matches the batch size the hyperparameters were tuned
             # with; the previous 4093 looks like a typo.
             batch_size=4096,
             callbacks=callbacks)

In [None]:
def best_model_3(hp):
    """
    Rebuild the 3x(LSTM + Dropout) -> Conv1D -> Dense model from chosen hyperparameters.

    Values are read with ``hp.get(name)`` for: lstm_units_1..3,
    lstm_dropout_1..3, conv_filters, cnn_dropout, dense_units and
    learning_rate.

    :param hp: Object exposing ``get(name)`` for the names above.
    :return: Compiled model (Adam, categorical_crossentropy, accuracy).
    :rtype: keras.models.Sequential
    """
    # The docstring now leads the body; placed after the first statements it
    # was only a discarded string expression and never became __doc__.
    model = Sequential(name='3LSTM-DO-CNN-Dense')
    model.add(Input(shape=(MAX_LEN, 50)))

    # First LSTM layer
    lstm_units_1 = hp.get("lstm_units_1")
    model.add(LSTM(lstm_units_1, name='LSTM1', return_sequences=True))
    lstm_dropout_1 = hp.get("lstm_dropout_1")
    model.add(Dropout(lstm_dropout_1, name='DO1'))

    # Second LSTM layer
    lstm_units_2 = hp.get("lstm_units_2")
    model.add(LSTM(lstm_units_2, name='LSTM2', return_sequences=True))
    lstm_dropout_2 = hp.get("lstm_dropout_2")
    model.add(Dropout(lstm_dropout_2, name='DO2'))

    # Third LSTM layer (sequences kept for the Conv1D)
    lstm_units_3 = hp.get("lstm_units_3")
    model.add(LSTM(lstm_units_3, name='LSTM3', return_sequences=True))
    lstm_dropout_3 = hp.get("lstm_dropout_3")
    model.add(Dropout(lstm_dropout_3, name='DO3'))

    # Conv1D layer
    conv_filters = hp.get("conv_filters")
    model.add(Conv1D(conv_filters, kernel_size=3, strides=1, padding='same', activation='relu'))

    model.add(MaxPooling1D(pool_size=3, strides=2, padding='same'))

    cnn_dropout = hp.get("cnn_dropout")
    model.add(Dropout(cnn_dropout))

    model.add(Flatten(name='F1'))

    # Dense layer
    dense_units = hp.get("dense_units")
    model.add(Dense(dense_units, activation='relu'))

    model.add(Dense(7, activation='softmax'))

    # Optimizer
    learning_rate = hp.get("learning_rate")
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Retrain the best LSTM/CNN configuration for up to 30 epochs; 20% of the
# training rows are held out for the val_* metrics.
best_model_gloveLstmDoCnn = best_model_3(best_hp_gloveLstmDoCnn)
history_gloveLstmDoCnn = best_model_gloveLstmDoCnn.fit(X_train, y_train,
             epochs=30,
             validation_split=0.2,
             # 4096 matches the batch size the hyperparameters were tuned
             # with; the previous 4093 looks like a typo.
             batch_size=4096,
             callbacks=callbacks)

### Test

In [None]:
# Test-split evaluation of the retrained GloVe-LSTM model: [loss, accuracy].
print("Glove LSTM")
scoreGloveLstm = best_model_gloveLstm.evaluate(X_test, y_test, verbose=1)
print('Test Accuracy:', scoreGloveLstm[1])

In [None]:
# Test-split evaluation of the retrained LSTM-Dropout model: [loss, accuracy].
print("Glove LSTM Do")
scoreGloveLstmDo = best_model_gloveLstmDo.evaluate(X_test, y_test, verbose=1)
print('Test Accuracy:', scoreGloveLstmDo[1])

In [None]:
# Test-split evaluation of the retrained LSTM/CNN model: [loss, accuracy].
print("Glove LSTM Do Cnn")
scoreGloveLstmDoCnn = best_model_gloveLstmDoCnn.evaluate(X_test, y_test, verbose=1)
print('Test Accuracy:', scoreGloveLstmDoCnn[1])

### Summary

In [None]:
# Persist the GloVe-LSTM model as HDF5 and print its layer summary.
print("Glove LSTM")
best_model_gloveLstm.save('./saved/GloveLSTM.h5')
best_model_gloveLstm.summary()

In [None]:
# Persist the LSTM-Dropout model as HDF5 and print its layer summary.
print("Glove LSTM Do")
best_model_gloveLstmDo.save('./saved/GloveLSTMDo.h5')
best_model_gloveLstmDo.summary()

In [None]:
# Persist the LSTM/CNN model as HDF5 and print its layer summary.
print("Glove LSTM Do Cnn")
best_model_gloveLstmDoCnn.save('./saved/GloveLSTMDoCNN.h5')
best_model_gloveLstmDoCnn.summary()

### Graficas

#### Val vs Loss

In [None]:
from matplotlib import pyplot as plt

def plot_val_vs_loss(history):
    """
    Plot training vs. validation loss and accuracy, one panel each.

    Reads the 'loss', 'val_loss', 'accuracy' and 'val_accuracy' series from
    ``history.history`` and uses ``len(history.epoch)`` for the x axis.

    :param history: Object returned by ``model.fit``.
    :type history: keras.callbacks.History
    """
    epochs = range(len(history.epoch))
    metrics = history.history

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2)
    fig.set_size_inches(20, 6)

    # The val_* series come from the validation split used during fit(), not
    # from the held-out test set, so they are labelled "Validation".
    loss_ax.plot(epochs, metrics['loss'], label='Training Loss')
    loss_ax.plot(epochs, metrics['val_loss'], label='Validation Loss')
    loss_ax.set_title('Training & Validation Loss')
    loss_ax.legend()
    loss_ax.set_xlabel("Epochs")

    acc_ax.plot(epochs, metrics['accuracy'], label='Training Accuracy')
    acc_ax.plot(epochs, metrics['val_accuracy'], label='Validation Accuracy')
    acc_ax.set_title('Training & Validation Accuracy')
    acc_ax.legend()
    acc_ax.set_xlabel("Epochs")
    plt.show()

In [None]:
# Learning curves of the GloVe-LSTM model.
plot_val_vs_loss(history_gloveLstm)

In [None]:
# Learning curves of the LSTM-Dropout model.
plot_val_vs_loss(history_gloveLstmDo)

In [None]:
# Learning curves of the LSTM/CNN model.
plot_val_vs_loss(history_gloveLstmDoCnn)

In [None]:
# Model outputs on the test split; each model ends in a 7-unit softmax layer.
y_pred_GloveLstm = best_model_gloveLstm.predict(X_test)
y_pred_GloveLstmDo = best_model_gloveLstmDo.predict(X_test)
y_pred_GloveLstmDoCnn = best_model_gloveLstmDoCnn.predict(X_test)

## Metricas de regresion

### Modelo Glove LSTM:

In [None]:
# Regression-style diagnostics for the GloVe-LSTM model, computed element-wise
# between the model outputs and y_test.

# SSE: sum of squared errors
sse_gloveLstm = np.sum((y_pred_GloveLstm - y_test) ** 2)

# SSR: squared deviation of the predictions from the target mean
ssr_gloveLstm = np.sum((y_pred_GloveLstm - np.mean(y_test)) ** 2)

# SSTO: total sum of squares of the targets
ssto_gloveLstm = np.sum((y_test - np.mean(y_test)) ** 2)

# MAE
mae_gloveLstm = np.mean(np.abs(y_pred_GloveLstm - y_test))

# MSE
mse_gloveLstm = np.mean((y_pred_GloveLstm - y_test) ** 2)

# RMSE
rmse_gloveLstm = np.sqrt(mse_gloveLstm)

# RMSLE
rmsle_gloveLstm = np.sqrt(np.mean((np.log1p(y_pred_GloveLstm) - np.log1p(y_test)) ** 2))

# R2
r2_gloveLstm = 1 - (sse_gloveLstm / ssto_gloveLstm)

# Adjusted R2
n_gloveLstm = len(y_test)
# p is the number of fitted parameters. The previous len(y_pred) is the number
# of samples, which pinned n - p - 1 to -1 and made k = n + 1 for AIC/BIC.
# NOTE(review): the model's weight count is used here — confirm this is the
# parameter count wanted for these statistics.
p_gloveLstm = best_model_gloveLstm.count_params()
dof_gloveLstm = n_gloveLstm - p_gloveLstm - 1
if dof_gloveLstm > 0:
    adjusted_r2_gloveLstm = 1 - (1 - r2_gloveLstm) * (n_gloveLstm - 1) / dof_gloveLstm
else:
    # Undefined when there are not more samples than parameters.
    adjusted_r2_gloveLstm = float('nan')

# Variance of the predictions
variance_pred_gloveLstm = np.var(y_pred_GloveLstm)

# Variance of the targets
variance_target_gloveLstm = np.var(y_test)

# AIC
n_samples_gloveLstm = len(y_test)
residuals_gloveLstm = y_pred_GloveLstm - y_test
rss_gloveLstm = np.sum(residuals_gloveLstm ** 2)
k_gloveLstm = p_gloveLstm + 1  # number of parameters including the constant term
aic_gloveLstm = n_samples_gloveLstm * np.log(rss_gloveLstm / n_samples_gloveLstm) + 2 * k_gloveLstm

# BIC
bic_gloveLstm = n_samples_gloveLstm * np.log(rss_gloveLstm / n_samples_gloveLstm) + k_gloveLstm * np.log(n_samples_gloveLstm)

print("SSE - Glove LSTM:", sse_gloveLstm)
print("SSR - Glove LSTM:", ssr_gloveLstm)
print("SSTO - Glove LSTM:", ssto_gloveLstm)
print("MAE - Glove LSTM:", mae_gloveLstm)
print("MSE - Glove LSTM:", mse_gloveLstm)
print("RMSE - Glove LSTM:", rmse_gloveLstm)
print("RMSLE - Glove LSTM:", rmsle_gloveLstm)
print("R2 - Glove LSTM:", r2_gloveLstm)
print("Adjusted R2 - Glove LSTM:", adjusted_r2_gloveLstm)
print("Varianza Predicciones - Glove LSTM:", variance_pred_gloveLstm)
print("Varianza Objetivo - Glove LSTM:", variance_target_gloveLstm)
print("AIC - Glove LSTM:", aic_gloveLstm)
print("BIC - Glove LSTM:", bic_gloveLstm)


### Modelo GloveLstmDO

In [None]:
# Regression-style diagnostics for the LSTM-Dropout model, computed
# element-wise between the model outputs and y_test.

# SSE: sum of squared errors
sse_gloveLstmDo = np.sum((y_pred_GloveLstmDo - y_test) ** 2)

# SSR: squared deviation of the predictions from the target mean
ssr_gloveLstmDo = np.sum((y_pred_GloveLstmDo - np.mean(y_test)) ** 2)

# SSTO: total sum of squares of the targets
ssto_gloveLstmDo = np.sum((y_test - np.mean(y_test)) ** 2)

# MAE
mae_gloveLstmDo = np.mean(np.abs(y_pred_GloveLstmDo - y_test))

# MSE
mse_gloveLstmDo = np.mean((y_pred_GloveLstmDo - y_test) ** 2)

# RMSE
rmse_gloveLstmDo = np.sqrt(mse_gloveLstmDo)

# RMSLE
rmsle_gloveLstmDo = np.sqrt(np.mean((np.log1p(y_pred_GloveLstmDo) - np.log1p(y_test)) ** 2))

# R2
r2_gloveLstmDo = 1 - (sse_gloveLstmDo / ssto_gloveLstmDo)

# Adjusted R2
n_gloveLstmDo = len(y_test)
# p is the number of fitted parameters. The previous len(y_pred) is the number
# of samples, which pinned n - p - 1 to -1 and made k = n + 1 for AIC/BIC.
# NOTE(review): the model's weight count is used here — confirm this is the
# parameter count wanted for these statistics.
p_gloveLstmDo = best_model_gloveLstmDo.count_params()
dof_gloveLstmDo = n_gloveLstmDo - p_gloveLstmDo - 1
if dof_gloveLstmDo > 0:
    adjusted_r2_gloveLstmDo = 1 - (1 - r2_gloveLstmDo) * (n_gloveLstmDo - 1) / dof_gloveLstmDo
else:
    # Undefined when there are not more samples than parameters.
    adjusted_r2_gloveLstmDo = float('nan')

# Variance of the predictions
variance_pred_gloveLstmDo = np.var(y_pred_GloveLstmDo)

# Variance of the targets
variance_target_gloveLstmDo = np.var(y_test)

# AIC
n_samples_gloveLstmDo = len(y_test)
residuals_gloveLstmDo = y_pred_GloveLstmDo - y_test
rss_gloveLstmDo = np.sum(residuals_gloveLstmDo ** 2)
k_gloveLstmDo = p_gloveLstmDo + 1  # number of parameters including the constant term
aic_gloveLstmDo = n_samples_gloveLstmDo * np.log(rss_gloveLstmDo / n_samples_gloveLstmDo) + 2 * k_gloveLstmDo

# BIC
bic_gloveLstmDo = n_samples_gloveLstmDo * np.log(rss_gloveLstmDo / n_samples_gloveLstmDo) + k_gloveLstmDo * np.log(n_samples_gloveLstmDo)

print("SSE - Glove LSTM Do:", sse_gloveLstmDo)
print("SSR - Glove LSTM Do:", ssr_gloveLstmDo)
print("SSTO - Glove LSTM Do:", ssto_gloveLstmDo)
print("MAE - Glove LSTM Do:", mae_gloveLstmDo)
print("MSE - Glove LSTM Do:", mse_gloveLstmDo)
print("RMSE - Glove LSTM Do:", rmse_gloveLstmDo)
print("RMSLE - Glove LSTM Do:", rmsle_gloveLstmDo)
print("R2 - Glove LSTM Do:", r2_gloveLstmDo)
print("Adjusted R2 - Glove LSTM Do:", adjusted_r2_gloveLstmDo)
print("Varianza Predicciones - Glove LSTM Do:", variance_pred_gloveLstmDo)
print("Varianza Objetivo - Glove LSTM Do:", variance_target_gloveLstmDo)
print("AIC - Glove LSTM Do:", aic_gloveLstmDo)
print("BIC - Glove LSTM Do:", bic_gloveLstmDo)


### Modelo GloveLstmDoCnn

In [None]:
# Regression-style diagnostics for the LSTM/CNN model, computed element-wise
# between the model outputs and y_test.

# SSE: sum of squared errors
sse_gloveLstmDoCnn = np.sum((y_pred_GloveLstmDoCnn - y_test) ** 2)

# SSR: squared deviation of the predictions from the target mean
ssr_gloveLstmDoCnn = np.sum((y_pred_GloveLstmDoCnn - np.mean(y_test)) ** 2)

# SSTO: total sum of squares of the targets
ssto_gloveLstmDoCnn = np.sum((y_test - np.mean(y_test)) ** 2)

# MAE
mae_gloveLstmDoCnn = np.mean(np.abs(y_pred_GloveLstmDoCnn - y_test))

# MSE
mse_gloveLstmDoCnn = np.mean((y_pred_GloveLstmDoCnn - y_test) ** 2)

# RMSE
rmse_gloveLstmDoCnn = np.sqrt(mse_gloveLstmDoCnn)

# RMSLE
rmsle_gloveLstmDoCnn = np.sqrt(np.mean((np.log1p(y_pred_GloveLstmDoCnn) - np.log1p(y_test)) ** 2))

# R2
r2_gloveLstmDoCnn = 1 - (sse_gloveLstmDoCnn / ssto_gloveLstmDoCnn)

# Adjusted R2
n_gloveLstmDoCnn = len(y_test)
# p is the number of fitted parameters. The previous len(y_pred) is the number
# of samples, which pinned n - p - 1 to -1 and made k = n + 1 for AIC/BIC.
# NOTE(review): the model's weight count is used here — confirm this is the
# parameter count wanted for these statistics.
p_gloveLstmDoCnn = best_model_gloveLstmDoCnn.count_params()
dof_gloveLstmDoCnn = n_gloveLstmDoCnn - p_gloveLstmDoCnn - 1
if dof_gloveLstmDoCnn > 0:
    adjusted_r2_gloveLstmDoCnn = 1 - (1 - r2_gloveLstmDoCnn) * (n_gloveLstmDoCnn - 1) / dof_gloveLstmDoCnn
else:
    # Undefined when there are not more samples than parameters.
    adjusted_r2_gloveLstmDoCnn = float('nan')

# Variance of the predictions
variance_pred_gloveLstmDoCnn = np.var(y_pred_GloveLstmDoCnn)

# Variance of the targets
variance_target_gloveLstmDoCnn = np.var(y_test)

# AIC
n_samples_gloveLstmDoCnn = len(y_test)
residuals_gloveLstmDoCnn = y_pred_GloveLstmDoCnn - y_test
rss_gloveLstmDoCnn = np.sum(residuals_gloveLstmDoCnn ** 2)
k_gloveLstmDoCnn = p_gloveLstmDoCnn + 1  # number of parameters including the constant term
aic_gloveLstmDoCnn = n_samples_gloveLstmDoCnn * np.log(rss_gloveLstmDoCnn / n_samples_gloveLstmDoCnn) + 2 * k_gloveLstmDoCnn

# BIC
bic_gloveLstmDoCnn = n_samples_gloveLstmDoCnn * np.log(rss_gloveLstmDoCnn / n_samples_gloveLstmDoCnn) + k_gloveLstmDoCnn * np.log(n_samples_gloveLstmDoCnn)

print("SSE - Glove LSTM Do Cnn:", sse_gloveLstmDoCnn)
print("SSR - Glove LSTM Do Cnn:", ssr_gloveLstmDoCnn)
print("SSTO - Glove LSTM Do Cnn:", ssto_gloveLstmDoCnn)
print("MAE - Glove LSTM Do Cnn:", mae_gloveLstmDoCnn)
print("MSE - Glove LSTM Do Cnn:", mse_gloveLstmDoCnn)
print("RMSE - Glove LSTM Do Cnn:", rmse_gloveLstmDoCnn)
print("RMSLE - Glove LSTM Do Cnn:", rmsle_gloveLstmDoCnn)
print("R2 - Glove LSTM Do Cnn:", r2_gloveLstmDoCnn)
print("Adjusted R2 - Glove LSTM Do Cnn:", adjusted_r2_gloveLstmDoCnn)
print("Varianza Predicciones - Glove LSTM Do Cnn:", variance_pred_gloveLstmDoCnn)
print("Varianza Objetivo - Glove LSTM Do Cnn:", variance_target_gloveLstmDoCnn)
print("AIC - Glove LSTM Do Cnn:", aic_gloveLstmDoCnn)
print("BIC - Glove LSTM Do Cnn:", bic_gloveLstmDoCnn)


## Heatmap

In [None]:
# Show the model output for the first test sample.
y_pred_GloveLstm[0]

In [None]:
# NumPy copy of the test targets, passed to plot_heatmap() below.
y_test_np =  np.array(y_test)

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_heatmap(y_pred, y_test, labels):
    """
    Plot the confusion matrix of a classifier as a heatmap.

    Both inputs are reduced to class indices with ``argmax`` along axis 1, so
    each must be 2-D with one column per class.

    :param y_pred: Per-class scores predicted by the model.
    :type y_pred: numpy.ndarray
    :param y_test: True targets, one-hot encoded.
    :type y_test: numpy.ndarray
    :param labels: Display names of the classes, in class-index order.
    :type labels: list[str]
    """
    # The docstring's 5-space indent did not match the 4-space body and raised
    # an IndentationError; it is now aligned with the body.

    # Scores / one-hot rows -> class indices
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    # Fix the class set explicitly so the matrix always has one row and column
    # per entry of `labels`, even when a class is absent from this sample.
    class_ids = list(range(len(labels)))
    cm = confusion_matrix(y_true_classes, y_pred_classes, labels=class_ids)

    # Draw the heatmap with Seaborn
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm', cbar=False, xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Display names for the 7 output classes, in class-index order.
labels = ['Clase 1', 'Clase 2', 'Clase 3', 'Clase 4', 'Clase 5', 'Clase 6', 'Clase 7']
# Confusion matrix of the GloVe-LSTM model.
plot_heatmap(y_pred_GloveLstm, y_test_np, labels)

In [None]:
# Confusion matrix of the LSTM-Dropout model.
plot_heatmap(y_pred_GloveLstmDo, y_test_np, labels)

In [None]:
# Confusion matrix of the LSTM/CNN model.
plot_heatmap(y_pred_GloveLstmDoCnn, y_test_np, labels)