In [None]:
# Iteracion 5 - Mi intento con arquitectura hibrida CNN + LSTM
# Despues del desastre de DistilBERT (V4), volvi a LSTM pero con un twist
# ¿Y si combino CNN (para capturar patrones locales) con LSTM (para el contexto)?

# La idea:
# - CNN: Captura n-gramas tipo "click here", "free money", "urgent!!!" (patrones tipicos de spam)
# - LSTM: Captura el contexto general del mensaje
# - Junto las features de ambos y clasifico

# Spoiler: Funciono mejor que V4 pero no supero a V2 (0.81-0.86 vs 0.8885)

In [None]:
# Environment variables meant to silence noisy warnings/log output.
# They are set here, before the cell that imports TensorFlow runs.
import os

os.environ.update({
    'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'python',
    'TF_CPP_MIN_LOG_LEVEL': '2',
    'TF_ENABLE_ONEDNN_OPTS': '0',
})

In [None]:
# Basic imports
import pandas as pd  # CSV loading / tabular data
import numpy as np  # Numerical operations
import matplotlib.pyplot as plt  # Plotting
import seaborn as sns  # Nicer-looking plots
import warnings
warnings.filterwarnings('ignore')  # Suppress all Python warnings for the rest of the notebook

# Single seed shared by every random number generator seeded below
# (and reused later as random_state for the train/validation split).
seed = 42
np.random.seed(seed)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Embedding, Dense, Dropout, LSTM, Bidirectional, 
    Conv1D, GlobalMaxPooling1D, Concatenate, Input, SpatialDropout1D  # NEW in this version: CNN layers
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Seed TensorFlow and Keras with the same value used for NumPy above.
tf.random.set_seed(seed)
keras.utils.set_random_seed(seed)

from sklearn.metrics import matthews_corrcoef, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# pandas display options: cap printed rows and widen text columns.
pd.set_option('display.max_rows', 36)
pd.set_option("display.max_colwidth", 150)

In [None]:
# Hyperparameters - V5 hybrid CNN+LSTM architecture
# Many different combinations were tried (this was a long experiment);
# the values below are the ones kept in this notebook.

# Text processing
MAX_WORDS = 50000  # Vocabulary size cap (raised to capture more patterns); also the embedding input_dim
MAX_LEN = 200  # Sequence length after padding/truncation; back to 200 (250 was too much)

# Embedding
EMBEDDING_DIM = 128  # 150 overfit, so lowered to 128

# CNN branch - captures n-grams
CNN_FILTERS = 128  # Number of filters per Conv1D layer
CNN_KERNEL_SIZES = [2, 3, 4, 5]  # One Conv1D per size: bigrams, trigrams, 4-grams and 5-grams

# LSTM branch - captures context
LSTM_UNITS = 96  # Same as V2 (worked well there)

# Dense classifier
DENSE_UNITS = 64  # Reduced to limit overfitting

# Regularization - strong, to keep overfitting under control
SPATIAL_DROPOUT = 0.3  # Spatial dropout applied to the embeddings
DROPOUT = 0.5  # Dropout in the classifier head
L2_REG = 1e-4  # L2 regularization factor

# Training
BATCH_SIZE = 32
EPOCHS = 15
LEARNING_RATE = 1e-3  # Passed to Adam; the author calls this a low rate (NOTE(review): 1e-3 is also Keras Adam's documented default)
VALIDATION_SPLIT = 0.2  # Fraction of the training data held out for validation

In [None]:
# Load the training data and prepare padded integer sequences.
train = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/train.csv", index_col="row_id")
y_train = train['spam_label'].values

# Split row positions first (stratified on the label). Splitting positions with
# the same test_size / random_state / stratify selects the same rows as
# splitting the padded matrix directly, but lets the tokenizer be fitted on the
# training portion only.
train_idx, val_idx = train_test_split(
    np.arange(len(train)), test_size=VALIDATION_SPLIT, random_state=seed, stratify=y_train
)

# Fit the vocabulary on training rows only: fitting on every row would leak
# validation vocabulary into preprocessing and make the validation score look
# better than it would on unseen text. Words not seen here map to '<OOV>'.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(train['text'].iloc[train_idx])

# Convert every text to word indices, then pad/truncate at the end to MAX_LEN.
X_train_seq = tokenizer.texts_to_sequences(train['text'])
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# Materialise the train/validation arrays from the split positions.
X_train_final, X_val = X_train_pad[train_idx], X_train_pad[val_idx]
y_train_final, y_val = y_train[train_idx], y_train[val_idx]

In [None]:
def build_cnn_lstm_model():
    """
    Build the (uncompiled) hybrid CNN + LSTM binary classifier.

    Architecture:
    1. Input -> Embedding -> SpatialDropout1D (shared by both branches).
    2. CNN branch: one Conv1D per entry in CNN_KERNEL_SIZES (n-grams of
       different sizes), each followed by GlobalMaxPooling1D.
    3. LSTM branch: a bidirectional LSTM over the same embedded sequence.
    4. The pooled CNN features and the LSTM output are concatenated.
    5. Dense classifier with dropout and a single sigmoid output unit.

    All sizes and regularization strengths are read from the module-level
    hyperparameter constants (MAX_LEN, MAX_WORDS, EMBEDDING_DIM, CNN_FILTERS,
    CNN_KERNEL_SIZES, LSTM_UNITS, DENSE_UNITS, SPATIAL_DROPOUT, DROPOUT, L2_REG).

    Returns:
        A keras Model named 'CNN_LSTM_Hybrid_V5'; the caller compiles it.
    """
    # Input: one padded sequence of MAX_LEN token ids per sample.
    inputs = Input(shape=(MAX_LEN,))

    # Embedding layer shared by the CNN and LSTM branches.
    # The sequence length is already fixed by `inputs`, so the redundant
    # `input_length` argument (deprecated in Keras 3) is not passed.
    x = Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_DIM
    )(inputs)
    x = SpatialDropout1D(SPATIAL_DROPOUT)(x)  # Spatial dropout to regularize the embeddings

    # CNN branch: one Conv1D per kernel size.
    cnn_outputs = []
    for kernel_size in CNN_KERNEL_SIZES:
        # Conv1D over windows of `kernel_size` tokens (n-grams of that size).
        conv = Conv1D(
            filters=CNN_FILTERS,
            kernel_size=kernel_size,
            activation='relu',
            kernel_regularizer=l2(L2_REG),
            padding='same'
        )(x)
        # Global max pooling keeps the strongest activation of each filter.
        pool = GlobalMaxPooling1D()(conv)
        cnn_outputs.append(pool)

    # LSTM branch: bidirectional LSTM for longer-range, sequential context.
    # Input dropout and recurrent dropout are both fixed at 0.3 here.
    lstm_out = Bidirectional(
        LSTM(
            LSTM_UNITS,
            kernel_regularizer=l2(L2_REG),
            recurrent_regularizer=l2(L2_REG),
            dropout=0.3,
            recurrent_dropout=0.3
        )
    )(x)

    # Merge all features (every pooled CNN output + the LSTM output).
    concatenated = Concatenate()(cnn_outputs + [lstm_out])

    # Final classifier head.
    dense = Dense(
        DENSE_UNITS,
        activation='relu',
        kernel_regularizer=l2(L2_REG)
    )(concatenated)
    dense = Dropout(DROPOUT)(dense)

    # Single sigmoid unit for the binary output.
    outputs = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=inputs, outputs=outputs, name='CNN_LSTM_Hybrid_V5')
    return model

# Instantiate the hybrid network.
model = build_cnn_lstm_model()

# Compile with Adam, binary cross-entropy, and the metrics tracked during training.
adam_optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
tracked_metrics = [
    'accuracy',
    keras.metrics.Precision(),
    keras.metrics.Recall(),
    keras.metrics.AUC(),
]
model.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

# Print the layer-by-layer summary.
model.summary()

In [None]:
# Train with early stopping (to limit overfitting). All callbacks watch val_loss.
# Stop after 5 epochs without improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1
)
# Save the best model to disk (the file is not reloaded in the cells shown here).
best_checkpoint = ModelCheckpoint(
    'best_cnn_lstm_model.keras', monitor='val_loss', save_best_only=True, verbose=1
)
# Halve the learning rate after 3 stagnant epochs, down to a floor of 1e-6.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1
)
callbacks = [early_stopping, best_checkpoint, lr_on_plateau]

fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_train_final, y_train_final, **fit_options)

In [None]:
# Evaluate on the validation split: probabilities -> hard labels at a 0.5 threshold.
y_pred_proba = model.predict(X_val, batch_size=BATCH_SIZE, verbose=0)
y_pred = (y_pred_proba > 0.5).astype(int).flatten()
mcc_score = matthews_corrcoef(y_val, y_pred)

print(f"MCC: {mcc_score:.4f}")
print(classification_report(y_val, y_pred, target_names=['Not SPAM', 'SPAM']))

# Confusion matrix heatmap, drawn on an explicit figure/axes pair.
cm = confusion_matrix(y_val, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title(f'CNN+LSTM Hybrid (MCC: {mcc_score:.4f})')
ax.set_ylabel('True')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Predict on the test set and write the submission file.
test = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/test.csv", index_col="row_id")

# Reuse the tokenizer fitted earlier so test text maps to the same word indices,
# then pad/truncate exactly as was done for the training sequences.
X_test_seq = tokenizer.texts_to_sequences(test['text'])
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# Sigmoid outputs thresholded at 0.5 and flattened to a 1-D array of 0/1 labels.
y_pred_test = model.predict(X_test_pad, batch_size=BATCH_SIZE, verbose=0)
y_pred_test = (y_pred_test > 0.5).astype(int).flatten()

submission = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/sample_submission.csv")

# Align predictions with the submission rows by row_id rather than by position,
# so a different row order in sample_submission.csv cannot silently mislabel rows.
predictions_by_id = pd.Series(y_pred_test, index=test.index)
if "row_id" in submission.columns and predictions_by_id.index.is_unique:
    aligned = predictions_by_id.reindex(submission["row_id"])
    if aligned.isna().any():
        raise ValueError("sample_submission.csv contains row_id values that are missing from test.csv")
    submission["spam_label"] = aligned.to_numpy().astype(int)
else:
    # No usable id to align on: fall back to positional assignment.
    submission["spam_label"] = y_pred_test
submission.to_csv('submission.csv', index=False)

# Hard-coded summary of the results the author observed for this iteration
# (these lines are static text, not values computed in this notebook run).
print(f"Submission created. Test predictions: {len(y_pred_test)}")
print("\nResultado V5:")
print("MCC publico: Entre 0.81-0.86 (vario segun los hyperparametros)")
print("No supero a V2 (0.8885) pero fue mejor que V4 (0.6456)")
print("\nLecciones:")
print("  - La arquitectura hibrida funciona pero añade complejidad")
print("  - Dificil balancear CNN y LSTM para no overfittear")
print("  - A veces mas simple (solo LSTM) es mejor")