In [None]:
#====================================================================================================#
#                                                                                                    #
#                                                        ██╗   ██╗   ████████╗ █████╗ ██████╗        #
#      Competición - INAR AEC2                           ██║   ██║   ╚══██╔══╝██╔══██╗██╔══██╗       #
#                                                        ██║   ██║█████╗██║   ███████║██║  ██║       #
#      created:        04/12/2025  -  09:30:00           ██║   ██║╚════╝██║   ██╔══██║██║  ██║       #
#      last change:    04/12/2025  -  11:15:45           ╚██████╔╝      ██║   ██║  ██║██████╔╝       #
#                                                         ╚═════╝       ╚═╝   ╚═╝  ╚═╝╚═════╝        #
#                                                                                                    #
#      Ismael Hernandez Clemente                         ismael.hernandez@live.u-tad.com             #
#                                                                                                    #
#      Github:                                           https://github.com/ismaelucky342            #
#                                                                                                    #
#====================================================================================================#

# Iteración 1: Modelo Baseline (Bi-LSTM)
**Estado: DEPRECATED**

En este primer experimento, he implementado una arquitectura base utilizando una LSTM bidireccional. El objetivo es establecer un punto de referencia (baseline) para comparar futuras mejoras. He utilizado una configuración estándar de preprocesamiento y entrenamiento.

In [None]:
# Import the libraries needed for data processing, visualisation and modelling with TensorFlow/Keras.
import os
# Both environment variables are set before TensorFlow is imported further down in this cell.
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'  
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns  
import warnings
# NOTE(review): this silences every Python warning for the rest of the notebook,
# deprecation notices included.
warnings.filterwarnings('ignore')
# Single seed reused for NumPy, TensorFlow/Keras and the train/validation split.
seed = 42
np.random.seed(seed)
import tensorflow as tf
# NOTE(review): `keras` is bound from `tensorflow`, while the imports below go through the
# top-level `keras` package — confirm both resolve to the same Keras installation.
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
# NOTE(review): Conv1D is imported but not used in the visible cells.
from keras.layers import (
    Embedding, LSTM, Bidirectional, Dense, Dropout, 
    GlobalMaxPooling1D, Conv1D, SpatialDropout1D
)
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
# Seed TensorFlow and Keras as well, in addition to the NumPy seed set above.
tf.random.set_seed(seed)
keras.utils.set_random_seed(seed)
# NOTE(review): classification_report is imported but not used in the visible cells.
from sklearn.metrics import matthews_corrcoef, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
# Display options for DataFrame previews: at most 36 rows and 150 characters per column.
pd.set_option('display.max_rows', 36)
pd.set_option("display.max_colwidth", 150)

In [None]:
# Hyperparameters for text preprocessing, the model architecture and training.
MAX_WORDS = 10000  # vocabulary cap: Tokenizer `num_words` and Embedding `input_dim`
MAX_LEN = 200  # sequences are padded/truncated to this many tokens
EMBEDDING_DIM = 100  # Embedding `output_dim`
LSTM_UNITS = 128  # units of the LSTM wrapped by the Bidirectional layer
DENSE_UNITS = 64  # units of the hidden Dense layer
DROPOUT_RATE = 0.5  # Dropout rate applied after the hidden Dense layer
SPATIAL_DROPOUT = 0.2  # SpatialDropout1D rate applied after the Embedding layer
BATCH_SIZE = 32  # batch size for `model.fit` and for the test-set `model.predict`
EPOCHS = 50  # upper bound on epochs; EarlyStopping may end training sooner
VALIDATION_SPLIT = 0.2  # fraction of the training data held out by `train_test_split`
LEARNING_RATE = 1e-3  # initial learning rate of the AdamW optimizer

In [None]:
# Load the training set (indexed by row_id) and preview its first ten rows.
train_csv_path = "/kaggle/input/u-tad-spam-not-spam-2025-edition/train.csv"
train = pd.read_csv(train_csv_path, index_col="row_id")
train.head(n=10)

In [None]:
# Analyse the distribution of text lengths (in words), overall and per class.
train['text_length'] = train['text'].apply(lambda x: len(str(x).split()))

# Legend names per class value (0 = Not SPAM, 1 = SPAM, as stated on the boxplot axis).
class_names = {0: 'Not SPAM', 1: 'SPAM'}
mean_length = train['text_length'].mean()

fig, (ax_all, ax_by_class, ax_box) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: overall histogram with the mean marked.
ax_all.hist(train['text_length'], bins=50, edgecolor='black', alpha=0.7)
ax_all.axvline(mean_length, color='red', linestyle='--', label=f'Media: {mean_length:.1f}')
ax_all.set_title('Distribución de Longitud de Textos')
ax_all.set_xlabel('Número de palabras')
ax_all.set_ylabel('Frecuencia')
ax_all.legend()
ax_all.grid(True, alpha=0.3)

# Panel 2: one histogram per class. Each class is drawn explicitly with its own label;
# passing a list of labels to a grouped `.hist()` hands the whole list to every group,
# so the legend does not identify the classes correctly.
for class_value, lengths in train.groupby('spam_label')['text_length']:
    ax_by_class.hist(
        lengths,
        bins=30,
        alpha=0.7,
        label=class_names.get(class_value, str(class_value)),
    )
ax_by_class.set_title('Longitud por Clase')
ax_by_class.set_xlabel('Número de palabras')
ax_by_class.set_ylabel('Frecuencia')
ax_by_class.legend()
ax_by_class.grid(True, alpha=0.3)

# Panel 3: boxplot of length by class.
sns.boxplot(data=train, x='spam_label', y='text_length', ax=ax_box)
ax_box.set_title('Boxplot Longitud por Clase')
ax_box.set_xlabel('Clase (0=Not SPAM, 1=SPAM)')
ax_box.set_ylabel('Número de palabras')
ax_box.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Tokenise the texts and prepare the training and validation data.
# Missing texts become empty strings so the tokenizer always receives str objects.
X_train_text = train['text'].fillna('').astype(str).values
y_train = train['spam_label'].values

# Split row indices first, so the vocabulary can be fitted on the training rows only.
# Fitting the tokenizer on every row would leak validation vocabulary into the model
# and make the validation metrics optimistic. The split settings are unchanged
# (same test_size, random_state and stratify), so the row partition stays the same.
train_idx, val_idx = train_test_split(
    np.arange(len(X_train_text)),
    test_size=VALIDATION_SPLIT,
    random_state=seed,
    stratify=y_train
)

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text[train_idx])

# Encode every row with the training-only vocabulary; unseen words map to the OOV token.
X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')

X_train_final, X_val = X_train_pad[train_idx], X_train_pad[val_idx]
y_train_final, y_val = y_train[train_idx], y_train[val_idx]

In [None]:
# Build the Bi-LSTM model and compile it.
def build_model():
    """Assemble the baseline spam classifier (not compiled).

    Layer stack: Embedding -> SpatialDropout1D -> Bidirectional LSTM returning the
    full sequence -> GlobalMaxPooling1D -> Dense (ReLU) -> Dropout -> single sigmoid
    output unit. All sizes and rates come from the notebook-level hyperparameters.

    Returns:
        A `Sequential` model named 'spam_classifier_v1'.
    """
    token_embedding = Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_DIM,
        input_length=MAX_LEN,
        name='embedding',
    )
    embedding_dropout = SpatialDropout1D(SPATIAL_DROPOUT)
    # return_sequences=True keeps every timestep so the pooling layer can reduce over time.
    recurrent = Bidirectional(
        LSTM(LSTM_UNITS, return_sequences=True),
        name='bidirectional_lstm',
    )
    pooling = GlobalMaxPooling1D()
    hidden = Dense(DENSE_UNITS, activation='relu', name='dense_1')
    hidden_dropout = Dropout(DROPOUT_RATE)
    probability = Dense(1, activation='sigmoid', name='output')

    layer_stack = [
        token_embedding,
        embedding_dropout,
        recurrent,
        pooling,
        hidden,
        hidden_dropout,
        probability,
    ]
    return Sequential(layer_stack, name='spam_classifier_v1')
# Instantiate the network, compile it for binary classification and print its architecture.
model = build_model()

optimizer = keras.optimizers.AdamW(learning_rate=LEARNING_RATE, weight_decay=1e-4)
tracked_metrics = [
    'accuracy',
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=tracked_metrics)
model.summary()

In [None]:
# Configure the callbacks and train the model.
# All three callbacks watch the validation loss.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)
best_checkpoint = ModelCheckpoint(
    filepath='best_spam_model.keras',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)
# Halve the learning rate after 3 epochs without improvement, never going below 1e-6.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)
callbacks = [early_stopping, best_checkpoint, lr_on_plateau]

history = model.fit(
    x=X_train_final,
    y=y_train_final,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Predict on the validation set, compute the MCC and plot the confusion matrix.
y_pred_proba = model.predict(X_val)
# Probabilities strictly above 0.5 are classified as SPAM (1).
y_pred = (y_pred_proba > 0.5).astype(int).flatten()

mcc_score = matthews_corrcoef(y_val, y_pred)
cm = confusion_matrix(y_val, y_pred)

class_labels = ['Not SPAM', 'SPAM']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title(f'Confusion Matrix (MCC: {mcc_score:.4f})')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Plot the learning curves to analyse how the baseline model behaved during training.
def plot_learning_curves(history):
    """Draw training vs. validation accuracy and loss, one point per epoch.

    Args:
        history: object whose `.history` mapping holds the per-epoch lists
            'accuracy', 'val_accuracy', 'loss' and 'val_loss' (here, the value
            returned by `model.fit`).
    """
    # Read all four series up front so a missing key fails before any figure is created.
    logs = history.history
    curves = {key: logs[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')}
    epochs_range = range(1, len(curves['accuracy']) + 1)

    # (training key, validation key, training label, validation label, panel title)
    panels = [
        ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
         'Training and Validation Accuracy'),
        ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
         'Training and Validation Loss'),
    ]

    plt.figure(figsize=(12, 5))
    for position, (train_key, val_key, train_label, val_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs_range, curves[train_key], label=train_label)
        plt.plot(epochs_range, curves[val_key], label=val_label)
        plt.title(title)
        plt.legend()
        plt.grid(True, alpha=0.3)
    plt.show()

plot_learning_curves(history)

In [None]:
# Load the test set (indexed by row_id) and preview its first rows.
test_csv_path = "/kaggle/input/u-tad-spam-not-spam-2025-edition/test.csv"
test = pd.read_csv(test_csv_path, index_col="row_id")
test.head()

In [None]:
# Tokenise the test texts with the tokenizer fitted on the training data and predict.
# Missing or non-string entries are coerced to strings first: the tokenizer expects
# str inputs, and a single NaN row would otherwise abort the whole prediction step.
X_test_text = test['text'].fillna('').astype(str).values
X_test_seq = tokenizer.texts_to_sequences(X_test_text)
# Same padding/truncation settings as the training sequences.
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')
y_pred_proba_test = model.predict(X_test_pad, batch_size=BATCH_SIZE)
# Probabilities strictly above 0.5 are classified as SPAM (1).
y_pred_test = (y_pred_proba_test > 0.5).astype(int).flatten()

In [None]:
# Create the submission file from the sample template.
# NOTE(review): predictions are assigned by position, which assumes the sample
# submission rows are in the same order as test.csv — confirm.
sample_submission_path = "/kaggle/input/u-tad-spam-not-spam-2025-edition/sample_submission.csv"
submission = pd.read_csv(sample_submission_path).assign(spam_label=y_pred_test)
submission.to_csv('submission.csv', index=False)

## Análisis de la Iteración 1
Este modelo baseline ha logrado un MCC de aproximadamente 0.86. Aunque es un buen comienzo, las curvas de aprendizaje muestran un overfitting claro: la pérdida de entrenamiento sigue bajando mientras que la de validación comienza a subir o se estanca. 

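**Conclusión:** Aunque el modelo ya incluye SpatialDropout1D, Dropout y weight decay (AdamW), no es suficiente para evitar el sobreajuste. Necesito reforzar las técnicas de regularización (Dropout más agresivo, penalización L2) en la siguiente iteración para mejorar la generalización del modelo.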