In [45]:
import pandas as pd

import spacy
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tqdm import tqdm


In [46]:
# --- Input resources (relative to the notebook's working directory) ---
path_data = "../Resources/RNN_Movie_Db.csv"
glove_path = "../Resources/glove.6B.300d.txt"

# --- Text-encoding / model hyperparameters ---
VOCAB_SIZE = 10_000   # tokenizer vocabulary cap and number of embedding rows
MAX_LEN = 200         # every encoded review is padded/truncated to this length
EMBEDDING_DIM = 300   # width of each embedding vector; has to agree with the GloVe file

# spaCy English pipeline used by clean_texts for lemmas and stop-word/punctuation flags
nlp = spacy.load('en_core_web_sm')

In [47]:
def load_dataset(path):
    """Read the reviews CSV and return it with cleaned text and integer labels.

    Rows containing any missing value are dropped and the index is rebuilt.
    The ``text`` column is run through ``clean_texts`` and stored as strings;
    the ``label`` column is cast to ``int``.

    Args:
        path: Location of the CSV file. It must provide ``text`` and
            ``label`` columns.

    Returns:
        pandas.DataFrame: The processed frame with a fresh 0..n-1 index.
    """
    frame = pd.read_csv(path).dropna().reset_index(drop=True)

    # clean_texts receives the raw reviews as a NumPy array of strings
    raw_reviews = frame['text'].astype(str).values
    frame['text'] = clean_texts(raw_reviews)

    frame['text'] = frame['text'].astype(str)
    frame['label'] = frame['label'].astype(int)
    return frame

In [52]:
def clean_texts(texts):
    """Lower-case and lemmatize review texts, dropping stop words, punctuation and whitespace.

    Each text is converted to ``str``, lower-cased and processed with the
    module-level spaCy pipeline ``nlp``. The lemmas of the surviving tokens
    are joined with single spaces. A progress bar is shown while processing
    and the total number of kept tokens is printed at the end.

    Args:
        texts: Sized collection of texts (``len(texts)`` is used for the
            progress-bar total).

    Returns:
        list[str]: One cleaned string per input text, in input order.
    """
    cleaned = []
    total_tokens = 0
    lowered = (str(t).lower() for t in texts)

    # Only lemmas and lexical flags are read below, so the dependency parser
    # and the entity recognizer are skipped; they are not needed for
    # lemmatization and account for much of the per-document cost.
    docs = nlp.pipe(lowered, batch_size=64, disable=["parser", "ner"])

    for doc in tqdm(docs, total=len(texts), desc="Procesando reseñas"):
        # Whitespace-only tokens (runs of spaces, newlines) are neither stop
        # words nor punctuation; filter them explicitly so they do not end up
        # in the cleaned text or inflate the token count.
        tokens = [
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct or token.is_space)
        ]
        cleaned.append(" ".join(tokens))
        total_tokens += len(tokens)

    print(f"Total tokens procesados: {total_tokens}")
    return cleaned

In [49]:
def tokenize_and_pad(texts, vocab_size=10000, max_len=200):
    """Integer-encode texts with a freshly fitted Keras tokenizer and fix their length.

    A ``Tokenizer`` limited to ``vocab_size`` words (with ``"<OOV>"`` as the
    out-of-vocabulary token) is fitted on ``texts``. The same texts are then
    converted to integer sequences, which are padded and truncated at the end
    so that each one has exactly ``max_len`` entries.

    Args:
        texts: Texts used both to fit the tokenizer and to be encoded.
        vocab_size: Value passed to the tokenizer as ``num_words``.
        max_len: Length of every returned sequence.

    Returns:
        tuple: ``(padded, tokenizer)`` - the padded sequence array and the
        fitted tokenizer.
    """
    tok = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
    tok.fit_on_texts(texts)

    fixed_length = pad_sequences(
        tok.texts_to_sequences(texts),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )
    return fixed_length, tok

In [50]:
def load_glove_embeddings(glove_file_path, vocab_size, tokenizer, embedding_dim=300):
    """Build an embedding matrix for the tokenizer's vocabulary from a GloVe text file.

    The file is expected to hold one entry per line: a word followed by its
    whitespace-separated vector components. Row ``i`` of the result receives
    the vector of the word whose ``tokenizer.word_index`` value is ``i``.
    Rows for words that are absent from the file, and rows no word maps to,
    stay all-zero.

    The file is streamed and only lines for words that can actually be placed
    in the matrix are parsed, so the full GloVe table is never held in memory.

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe text file.
        vocab_size: Number of rows of the matrix; words whose index is
            ``>= vocab_size`` are ignored.
        tokenizer: Object exposing a ``word_index`` mapping of word -> int.
        embedding_dim: Number of components per vector (matrix columns).

    Returns:
        numpy.ndarray: Array of shape ``(vocab_size, embedding_dim)``.

    Raises:
        ValueError: If a line for a needed word does not carry exactly
            ``embedding_dim`` components, or a component is not a number.
    """
    # Words that can land in the matrix; every other line is skipped unparsed.
    wanted = {
        word: index
        for word, index in tokenizer.word_index.items()
        if index < vocab_size
    }
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at end of file).
                continue

            # `is None` rather than truthiness: index 0 would be a valid row.
            index = wanted.get(values[0])
            if index is None:
                continue

            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    f"{glove_file_path}, line {line_number}: expected "
                    f"{embedding_dim} vector components for {values[0]!r}, "
                    f"found {len(values) - 1}"
                )
            # If a word occurs more than once, the last occurrence wins.
            embedding_matrix[index] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

In [None]:
# Load the CSV; this also runs clean_texts over every review
df = load_dataset(path=path_data)

Procesando reseñas:   3%|▎         | 1153/40000 [00:47<26:29, 24.44it/s]

In [None]:
# Targets: the label column as a NumPy array
y = df['label'].values

# Features: integer-encoded reviews of length MAX_LEN; the fitted tokenizer
# is kept because its word index is needed to build the embedding matrix
X, tokenizer = tokenize_and_pad(
    df['text'].values,
    max_len=MAX_LEN,
    vocab_size=VOCAB_SIZE,
)

In [None]:
# GloVe vectors arranged by tokenizer index, used as the Embedding layer's weights
embedding_matrix = load_glove_embeddings(
    glove_path,
    VOCAB_SIZE,
    tokenizer,
    embedding_dim=EMBEDDING_DIM,
)

In [None]:
# Architecture: frozen pre-trained embeddings -> two stacked LSTMs, each
# followed by dropout -> a single sigmoid unit for the binary label.
# NOTE(review): the inputs are zero-padded at the end and the Embedding layer
# is created without mask_zero, so the LSTMs also consume the padding steps -
# worth evaluating whether masking improves results.
model = Sequential([
    Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False,
    ),
    LSTM(128, return_sequences=True),  # emits the full sequence for the next LSTM
    Dropout(0.5),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Show the model summary
model.summary()

In [None]:
# Hold out 20% of the samples for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Train for 5 epochs in mini-batches of 32, evaluating on the validation split
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=5,
)

In [None]:
# Model outputs for the validation samples
y_pred_prob = model.predict(x=X_val)

# Threshold at 0.5: outputs strictly above it become 1, everything else 0
y_pred = (0.5 < y_pred_prob).astype("int32")

In [None]:
# Per-class precision / recall / F1 on the validation split, printed with 4 decimals
print(
    classification_report(
        y_true=y_val,
        y_pred=y_pred,
        digits=4,
    )
)

In [None]:
# Confusion matrix of the validation predictions (rows: actual, columns: predicted)
cm = confusion_matrix(y_val, y_pred)

# Same tick labels on both axes, in class order 0 then 1
class_names = ["Negativa", "Positiva"]

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicción')
ax.set_ylabel('Real')
ax.set_title('Matriz de Confusión')
plt.show()