In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from gensim.models import Word2Vec

2023-05-01 16:57:12.804485: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-01 16:57:13.368893: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/yonosoysantiago/miniconda3/envs/tf211_conda/lib/:/home/yonosoysantiago/miniconda3/envs/tf211_conda/lib/
2023-05-01 16:57:13.368951: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/yonosoysanti

In [2]:
# Load the dataset
df = pd.read_csv("../../data/cleaned/out.csv")

# Text preprocessing
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

df['text'] = df['text'].apply(preprocess_text)
df['text'].head()

0     tiffanylue i know i was listenin to bad habit...
1    layin n bed with a headache ughhhh waitin on y...
2                      funeral ceremony gloomy friday 
3                 wants to hang out with friends soon 
4     dannycastillo we want to trade with someone w...
Name: text, dtype: object

In [3]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['text'] = df['text'].apply(remove_stopwords)
df['text'].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yonosoysantiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    tiffanylue know listenin bad habit earlier sta...
1              layin n bed headache ughhhh waitin call
2                       funeral ceremony gloomy friday
3                              wants hang friends soon
4    dannycastillo want trade someone houston ticke...
Name: text, dtype: object

In [4]:
tokenized_texts = [text.split() for text in df['text']]
word2vec = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1)

In [5]:
def text_to_vector(text):
    words = text.split()
    word_vectors = [word2vec.wv[word] for word in words if word in word2vec.wv]
    if not word_vectors:
        return np.zeros(word2vec.vector_size).tolist()
    return np.mean(word_vectors, axis=0).tolist()


df['vector'] = df['text'].apply(text_to_vector)

In [6]:
encoder = OneHotEncoder(sparse=False)
encoded_labels = encoder.fit_transform(df['label'].values.reshape(-1, 1))




In [7]:
encoded_labels.shape

(65989, 7)

In [8]:
encoded_labels[0]

array([1., 0., 0., 0., 0., 0., 0.])

In [9]:
df.head()

Unnamed: 0,label,text,vector
0,0,tiffanylue know listenin bad habit earlier sta...,"[-0.0985688716173172, 0.45782729983329773, -0...."
1,0,layin n bed headache ughhhh waitin call,"[-0.27400386333465576, 0.4046027362346649, -0...."
2,0,funeral ceremony gloomy friday,"[-0.17208629846572876, 0.18125255405902863, -0..."
3,1,wants hang friends soon,"[-0.37086427211761475, 0.8183702230453491, -0...."
4,6,dannycastillo want trade someone houston ticke...,"[-0.18290771543979645, 0.665810227394104, -0.1..."


In [10]:
def convert_to_tensor(arg):
    arg = tf.convert_to_tensor(arg, dtype=tf.float32)
    return arg

In [11]:
X = np.array([np.array(vec) for vec in df['vector'].to_numpy()], dtype=np.float32)
y = encoded_labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
type(X_train[0])

numpy.ndarray

In [13]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


X_train shape: (52791, 100)
y_train shape: (52791, 7)


In [14]:
from keras.models import Sequential
from keras import layers
from keras.layers import Embedding, Lambda, LSTM, Flatten, Dense, Input, Dropout, Bidirectional, GlobalMaxPooling1D
from keras.optimizers import Adam, RMSprop, SGD
from kerastuner import RandomSearch, HyperParameters

def build_model(hp):

    # Model parameters
    input_dim = 100  # Word2Vec vector size
    output_dim = 7   # Number of sentiment labels

    # # Build the LSTM model
    # model = Sequential()
    # model.add(Embedding(input_dim=input_dim+1, output_dim=input_dim, input_length=input_dim))
    # model.add(LSTM(hidden_units))
    # model.add(Dense(output_dim, activation='softmax'))
    model = Sequential()

    # Embedding
    model.add(Embedding(input_dim=input_dim+1, output_dim=output_dim, input_length=input_dim))

    # Hiperparámetros para LSTM
    lstm_units = hp.Int("lstm_units", min_value=32, max_value=256, step=32)
    lstm_dropout = hp.Float("lstm_dropout", min_value=0.1, max_value=0.5, step=0.1)
    
    model.add(Bidirectional(LSTM(lstm_units, dropout=lstm_dropout)))

    # Salida del modelo
    model.add(Dense(7, activation='softmax'))

    # Hiperparámetros para el optimizador (En otras pruebas se vio que Adam era el mejor)
    learning_rate = hp.Float("learning_rate", min_value=1e-5, max_value=1e-3, sampling="LOG")
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define el objeto de búsqueda aleatoria
tuner = RandomSearch(
    build_model,
    objective="val_accuracy",
    max_trials=20,  # Número de modelos a probar
    executions_per_trial=1,
    directory='./saved/fine_tuned/',
    project_name='HP_LSTM-Word2Vec-OHE'
)

# Resumen de la búsqueda
tuner.search_space_summary()

  from kerastuner import RandomSearch, HyperParameters
2023-05-01 16:57:19.958443: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-01 16:57:19.958648: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-01 16:57:19.962580: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-01 16:57:19.962777: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-01 16

Search space summary
Default search space size: 3
lstm_units (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': 'linear'}
lstm_dropout (Float)
{'default': 0.1, 'conditions': [], 'min_value': 0.1, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
learning_rate (Float)
{'default': 1e-05, 'conditions': [], 'min_value': 1e-05, 'max_value': 0.001, 'step': None, 'sampling': 'log'}


In [15]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=1e-5)
cp = ModelCheckpoint('saved/', save_best_only=True)

callbacks = [cp, early_stopping, reduce_lr]

In [16]:
BATCH_SIZE=4096
tuner.search(X_train, y_train,
                    epochs=10,
                    validation_split=0.1,
                    batch_size=BATCH_SIZE,
                    callbacks=callbacks)

best_hp_random = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Mejores hiperparámetros encontrados:")
print(best_hp_random)

Trial 8 Complete [00h 01m 25s]
val_accuracy: 0.2140151560306549

Best val_accuracy So Far: 0.24715909361839294
Total elapsed time: 00h 09m 47s

Search: Running Trial #9

Value             |Best Value So Far |Hyperparameter
224               |64                |lstm_units
0.5               |0.1               |lstm_dropout
1.645e-05         |2.21e-05          |learning_rate

Epoch 1/10








INFO:tensorflow:Assets written to: saved/assets


INFO:tensorflow:Assets written to: saved/assets


Epoch 2/10



INFO:tensorflow:Assets written to: saved/assets


INFO:tensorflow:Assets written to: saved/assets


Epoch 3/10



INFO:tensorflow:Assets written to: saved/assets


INFO:tensorflow:Assets written to: saved/assets


Epoch 4/10



INFO:tensorflow:Assets written to: saved/assets


INFO:tensorflow:Assets written to: saved/assets


Epoch 5/10



INFO:tensorflow:Assets written to: saved/assets


INFO:tensorflow:Assets written to: saved/assets


Epoch 6/10



INFO:tensorflow:Assets written to: saved/assets


INFO:tensorflow:Assets written to: saved/assets


Epoch 7/10



INFO:tensorflow:Assets written to: saved/assets


INFO:tensorflow:Assets written to: saved/assets


Epoch 8/10



INFO:tensorflow:Assets written to: saved/assets


INFO:tensorflow:Assets written to: saved/assets


Epoch 9/10



INFO:tensorflow:Assets written to: saved/assets


INFO:tensorflow:Assets written to: saved/assets


Epoch 10/10

KeyboardInterrupt: 

In [None]:
# Model parameters
input_dim = 100  # Word2Vec vector size
output_dim = 7   # Number of sentiment labels
hidden_units = 32  # LSTM hidden units

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=input_dim+1, output_dim=input_dim, input_length=input_dim))
model.add(LSTM(hidden_units))
model.add(Dense(output_dim, activation='softmax'))
model.summary()
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=4096, validation_split=0.2)