In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import sys
import warnings
warnings.filterwarnings("ignore")
import os
import joblib
sys.path.append(os.path.abspath('../src'))
from data_tokenizer import procesar_texto
import data_loader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, recall_score, precision_score
from keras.layers import Dense, LSTM
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Directory holding the pickled features and labels. This is the location the
# notebook was originally run from and is machine-specific; pass `models_dir`
# to division_datos to read the artifacts from somewhere else.
_DIR_MODELOS_POR_DEFECTO = 'C:/Users/gerb2/Documents/DEEPLEARNING/taller2_tweets/Modelo_Sentimientos/models'


def division_datos(tfidf, mensajes=True, models_dir=None, estratificar=False):
    '''
    Load the pre-computed features and labels and split them into train,
    validation and test sets.

    The test set is 20% of the data; the validation set is 10% of the
    remaining 80% (8% of the total), leaving 72% for training. Both splits
    use random_state=42.

    Arguments:
        * tfidf: True to load the TF-IDF features (`tweets_tfidf.pkl`),
          False to load the count-based features (`tweets_tf.pkl`).
        * mensajes: Defaults to True. When True, prints the dimensions of the
          full, train, validation and test matrices, the first rows of the
          test matrix and the first five labels.
        * models_dir: Directory containing the `.pkl` files. Defaults to the
          directory the notebook was originally written against.
        * estratificar: Defaults to False (original behaviour). When True,
          both splits are stratified on the labels so every partition keeps
          the class proportions.

    Returns:
        * x_train, x_test, x_val
        * y_train, y_test, y_val
    '''
    if models_dir is None:
        models_dir = _DIR_MODELOS_POR_DEFECTO

    # Only the feature matrix and the labels are needed here; the fitted
    # vectorizer is not used by the split, so it is no longer loaded.
    nombre_features = 'tweets_tfidf.pkl' if tfidf else 'tweets_tf.pkl'

    # NOTE: joblib.load unpickles its input; only point this at trusted files.
    x = joblib.load(os.path.join(models_dir, nombre_features))
    y = joblib.load(os.path.join(models_dir, 'labels.pkl'))

    # Split into train / test, then carve the validation set out of train
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=42,
        stratify=y if estratificar else None,
    )
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.10, random_state=42,
        stratify=y_train if estratificar else None,
    )

    if mensajes:
        print("Dimensiones de X completa:", x.shape)
        print("Dimensiones de X train:", x_train.shape)
        print("Dimensiones de X val:", x_val.shape)
        print("Dimensiones de X test:", x_test.shape)

        print("\nPrimeros registros X test:")
        # Slice before densifying so only five rows are converted
        print(x_test[:5].toarray())

        print("\nPrimeras 5 etiquetas")
        print(y[:5])

    return x_train, x_test, x_val, y_train, y_test, y_val

In [None]:
# Vocabulary and sequence-length parameters
MAX_VOCAB_SIZE = 10_000
MAX_SEQUENCE_LENGTH = 130

In [None]:
# Load the TF-IDF features and split them into train / test / validation sets,
# printing the diagnostic summary.
(x_train, x_test, x_val,
 y_train, y_test, y_val) = division_datos(tfidf=True, mensajes=True)

In [None]:
# Baseline LSTM binary classifier.
#
# NOTE(review): x_train comes from division_datos(tfidf=True), i.e. TF-IDF
# weights, whereas an Embedding layer looks rows up by integer token index.
# Presumably this model should be fed padded integer token sequences
# (pad_sequences is imported above) — confirm and adapt the input pipeline.
modelo = Sequential([
    # input_length is omitted: it is deprecated in recent Keras releases and
    # the layer infers the sequence length from the data it receives.
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=64),
    # Default tanh activation (the standard LSTM formulation). No input_shape
    # here: the layer takes its input shape from the Embedding output.
    LSTM(50),
    # Sigmoid so the output is a probability, as binary cross-entropy and the
    # 0.5 decision threshold both assume.
    Dense(1, activation='sigmoid'),
])

# Binary classification: cross-entropy loss, with accuracy tracked per epoch
modelo.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Validate on the held-out validation split; the test set is kept for the
# final evaluation only.
history = modelo.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))

In [None]:
699/699 ━━━━━━━━━━━━━━━━━━━━ 54s 72ms/step - loss: 0.0636 - val_loss: 0.0649
Epoch 2/10
699/699 ━━━━━━━━━━━━━━━━━━━━ 52s 75ms/step - loss: 0.0658 - val_loss: 0.0650
Epoch 3/10
699/699 ━━━━━━━━━━━━━━━━━━━━ 81s 73ms/step - loss: 0.0629 - val_loss: 0.0649
Epoch 4/10
699/699 ━━━━━━━━━━━━━━━━━━━━ 52s 74ms/step - loss: 0.0654 - val_loss: 0.0649
Epoch 5/10
699/699 ━━━━━━━━━━━━━━━━━━━━ 79s 69ms/step - loss: 0.0632 - val_loss: 0.0650
Epoch 6/10
699/699 ━━━━━━━━━━━━━━━━━━━━ 88s 77ms/step - loss: 0.0644 - val_loss: 0.0649
Epoch 7/10
699/699 ━━━━━━━━━━━━━━━━━━━━ 79s 73ms/step - loss: 0.0650 - val_loss: 0.0649
Epoch 8/10
699/699 ━━━━━━━━━━━━━━━━━━━━ 83s 75ms/step - loss: 0.0650 - val_loss: 0.0649
Epoch 9/10
699/699 ━━━━━━━━━━━━━━━━━━━━ 52s 74ms/step - loss: 0.0644 - val_loss: 0.0649
Epoch 10/10
699/699 ━━━━━━━━━━━━━━━━━━━━ 82s 74ms/step - loss: 0.0663 - val_loss: 0.0650

In [None]:
# Compile the model for binary classification: Adam optimizer,
# binary cross-entropy loss, accuracy reported as a metric.
# NOTE(review): 'binary_crossentropy' given by name expects probabilities in
# [0, 1]; confirm the model's final layer uses a sigmoid activation.
modelo.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Oversample the training split with SMOTE (seeded for reproducibility).
# Only the training data is resampled; validation and test data are untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Report the shapes before and after resampling
for titulo, features, labels in (
    ("Original shape:", x_train, y_train),
    ("Resampled shape:", X_train_resampled, y_train_resampled),
):
    print(titulo, features.shape, labels.shape)

In [None]:
# Train on the SMOTE-resampled training data (no class weights are used;
# the class imbalance is handled by the oversampling itself).
# Validation uses the held-out validation split so the test set stays
# untouched until the final evaluation.
history = modelo.fit(
    X_train_resampled, y_train_resampled,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=32
)

In [None]:
Epoch 1/10
1301/1301 ━━━━━━━━━━━━━━━━━━━━ 121s 90ms/step - accuracy: 0.5046 - loss: 0.7575 - val_accuracy: 0.9303 - val_loss: 0.6741
Epoch 2/10
1301/1301 ━━━━━━━━━━━━━━━━━━━━ 130s 81ms/step - accuracy: 0.5001 - loss: 0.6941 - val_accuracy: 0.0697 - val_loss: 0.6944
Epoch 3/10
1301/1301 ━━━━━━━━━━━━━━━━━━━━ 143s 82ms/step - accuracy: 0.5070 - loss: 0.6939 - val_accuracy: 0.9303 - val_loss: 0.6714
Epoch 4/10
1301/1301 ━━━━━━━━━━━━━━━━━━━━ 141s 81ms/step - accuracy: 0.4957 - loss: 0.6944 - val_accuracy: 0.9303 - val_loss: 0.6778
Epoch 5/10
1301/1301 ━━━━━━━━━━━━━━━━━━━━ 110s 84ms/step - accuracy: 0.4990 - loss: 0.6943 - val_accuracy: 0.0697 - val_loss: 0.6981
Epoch 6/10
1301/1301 ━━━━━━━━━━━━━━━━━━━━ 140s 83ms/step - accuracy: 0.5019 - loss: 0.6941 - val_accuracy: 0.0697 - val_loss: 0.7346
Epoch 7/10
1301/1301 ━━━━━━━━━━━━━━━━━━━━ 139s 80ms/step - accuracy: 0.5004 - loss: 0.6939 - val_accuracy: 0.0697 - val_loss: 0.6970
Epoch 8/10
1301/1301 ━━━━━━━━━━━━━━━━━━━━ 145s 82ms/step - accuracy: 0.5015 - loss: 0.6940 - val_accuracy: 0.0697 - val_loss: 0.7222
Epoch 9/10
1301/1301 ━━━━━━━━━━━━━━━━━━━━ 139s 80ms/step - accuracy: 0.5008 - loss: 0.6939 - val_accuracy: 0.9303 - val_loss: 0.6712
Epoch 10/10
1301/1301 ━━━━━━━━━━━━━━━━━━━━ 145s 82ms/step - accuracy: 0.5002 - loss: 0.6939 - val_accuracy: 0.9303 - val_loss: 0.6762

In [None]:
# Final evaluation on the test split: evaluate() returns the loss followed by
# the compiled metric (accuracy).
loss, accuracy = modelo.evaluate(x=x_test, y=y_test)

print(f"Loss en test: {loss:.4f}")
print(f"Accuracy en test: {accuracy:.4f}")

In [None]:
175/175 ━━━━━━━━━━━━━━━━━━━━ 3s 20ms/step - accuracy: 0.9279 - loss: 0.6763
Loss en test: 0.6762
Accuracy en test: 0.9303

In [None]:
# Fuller metrics on the test split: per-class precision / recall / F1 and the
# confusion matrix.

# Model scores, turned into 0/1 class predictions with a 0.5 threshold
y_pred = modelo.predict(x_test)
y_pred_classes = (y_pred > 0.5).astype("int32")

# Text report
print(classification_report(y_test, y_pred_classes))

# Confusion matrix drawn as an annotated heatmap on an explicit Axes
cm = confusion_matrix(y_test, y_pred_classes)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(xlabel="Predicción", ylabel="Real", title="Matriz de Confusión")
plt.show()

In [None]:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      5203
           1       0.00      0.00      0.00       390

    accuracy                           0.93      5593
   macro avg       0.47      0.50      0.48      5593
weighted avg       0.87      0.93      0.90      5593

In [None]:
# Visualize training: one figure per metric, training vs. validation curves.

def _graficar_curva(historial, metrica, nombre, eje_y, titulo):
    """Plot the training and validation curves of one metric.

    Arguments:
        * historial: mapping of metric name to per-epoch values; must contain
          both `metrica` and `'val_' + metrica`.
        * metrica: key of the training metric (e.g. 'accuracy', 'loss').
        * nombre: prefix used for the two legend entries.
        * eje_y: label for the y axis.
        * titulo: figure title.
    """
    fig, ax = plt.subplots()
    ax.plot(historial[metrica], label=f'{nombre} entrenamiento')
    ax.plot(historial[f'val_{metrica}'], label=f'{nombre} validación')
    ax.set(xlabel='Épocas', ylabel=eje_y, title=titulo)
    ax.legend()
    plt.show()


# The plotted metric is accuracy, so it is labelled as such rather than as
# "Precisión", which names a different metric (precision).
_graficar_curva(history.history, 'accuracy', 'Accuracy',
                'Accuracy', 'Accuracy durante el entrenamiento')
_graficar_curva(history.history, 'loss', 'Loss',
                'Pérdida', 'Pérdida durante el entrenamiento')