In [None]:
#====================================================================================================#
#                                                                                                    #
#                                                        ██╗   ██╗   ████████╗ █████╗ ██████╗        #
#      Competición - SPAM/NOT SPAM - ITERACIÓN 5        ██║   ██║   ╚══██╔══╝██╔══██╗██╔══██╗       #
#      HYBRID CNN + LSTM ARCHITECTURE                   ██║   ██║█████╗██║   ███████║██║  ██║       #
#      created:        05/12/2025  -  12:00:00           ██║   ██║╚════╝██║   ██╔══██║██║  ██║       #
#      last change:    05/12/2025  -  12:00:00           ╚██████╔╝      ██║   ██║  ██║██████╔╝       #
#                                                         ╚═════╝       ╚═╝   ╚═╝  ╚═╝╚═════╝        #
#                                                                                                    #
#      Ismael Hernandez Clemente                         ismael.hernandez@live.u-tad.com             #
#                                                                                                    #
#      Github:                                           https://github.com/ismaelucky342            #
#                                                                                                    #
#      ARQUITECTURA HÍBRIDA CNN+LSTM:                                                               #
#      - CNN: Captura n-gramas locales (patrones spam típicos)                                     #
#      - LSTM: Captura dependencias largas (contexto)                                              #
#      - Múltiples kernel sizes (2,3,4,5) para diferentes n-gramas                                 #
#      - Concatenación de features CNN + LSTM                                                      #
#      - Dropout y L2 regularization                                                                #
#      - Basado en arquitectura exitosa V2 pero con CNN añadido                                    #
#                                                                                                    #
#====================================================================================================#

In [None]:
import os

# Runtime switches exported in one call; this cell runs before the cell that
# imports TensorFlow, so the library sees them when it is loaded.
os.environ.update({
    'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'python',  # protobuf backend selection
    'TF_CPP_MIN_LOG_LEVEL': '2',                         # TensorFlow native log verbosity
    'TF_ENABLE_ONEDNN_OPTS': '0',                        # oneDNN optimizations switch
})

In [None]:
# Data handling and plotting libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning from this point on (this also hides
# deprecation notices raised by the libraries imported below).
warnings.filterwarnings('ignore')

# Single seed reused for NumPy, TensorFlow/Keras and the train/validation split.
seed = 42
np.random.seed(seed)

# Deep-learning stack: tokenization, padding, layers, callbacks, regularizer.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Embedding, Dense, Dropout, LSTM, Bidirectional, 
    Conv1D, GlobalMaxPooling1D, Concatenate, Input, SpatialDropout1D
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Seed TensorFlow and Keras with the same value as NumPy above.
tf.random.set_seed(seed)
keras.utils.set_random_seed(seed)

# Metrics and the stratified split helper.
from sklearn.metrics import matthews_corrcoef, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Notebook display limits for DataFrames.
pd.set_option('display.max_rows', 36)
pd.set_option("display.max_colwidth", 150)

## Iteración 5 - ARQUITECTURA HÍBRIDA CNN+LSTM

**Post-mortem V4:**
- DistilBERT falló estrepitosamente (0.6456 vs 0.8885)
- Transfer learning no siempre funciona
- Dataset spam-specific requiere especialización

**Nueva estrategia V5:**
1. **CNN**: Detecta patrones locales ("free money", "click here", etc.)
2. **LSTM**: Mantiene contexto largo (lo que funcionó en V2)
3. **Híbrido**: Combina ambos → lo mejor de ambos mundos
4. **Base V2**: Parte de hiperparámetros exitosos de V2

**Objetivo:** MCC > 0.89 (superar V2 sin el riesgo de DistilBERT)

In [None]:
# Hyperparameters for the V5 hybrid CNN+LSTM model (balanced regularization).
# The rationale notes below are translated from the author's tuning log.

# --- Text processing ---
MAX_WORDS = 50_000
MAX_LEN = 200  # restored: 250 was too much

# --- Embedding ---
EMBEDDING_DIM = 128  # reduced: 150 overfits

# --- CNN branch (moderate capacity) ---
CNN_FILTERS = 128  # restored: 256 was excessive
CNN_KERNEL_SIZES = list(range(2, 6))  # kernel widths 2, 3, 4, 5 (no 6-grams)

# --- LSTM branch (controlled capacity) ---
LSTM_UNITS = 96  # restored: capacity/generalization balance

# --- Dense classifier ---
DENSE_UNITS = 64  # restored: 128 caused overfitting

# --- Regularization (stronger, to fight overfitting) ---
SPATIAL_DROPOUT = 0.3
DROPOUT = 0.5  # raised again
L2_REG = 0.0001  # raised: more weight penalty

# --- Training (more conservative) ---
BATCH_SIZE = 32
EPOCHS = 15  # upper bound; early stopping is expected to cut training short
LEARNING_RATE = 0.001  # reduced: 2e-3 converged too quickly
VALIDATION_SPLIT = 0.2

In [None]:
# Load the labelled training data, indexed by row_id.
train = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/train.csv", index_col="row_id")

# Coerce the text column to plain strings before tokenizing: pandas reads an
# empty message as float NaN, which the Keras Tokenizer cannot lower-case/split.
train_texts = train['text'].fillna('').astype(str)

# Tokenization: keep the MAX_WORDS most frequent tokens, map the rest to <OOV>.
# NOTE(review): the vocabulary is fitted on all of `train`, including the rows
# that later become the validation split, so validation scores may be slightly
# optimistic.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Integer-encode each message, then pad/truncate at the end to MAX_LEN tokens.
X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
y_train = train['spam_label'].values

# Stratified train/validation split so both parts keep the spam ratio.
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_pad, y_train, test_size=VALIDATION_SPLIT, random_state=seed, stratify=y_train
)

## Arquitectura Híbrida

```
Input → Embedding → SpatialDropout
         ↓                ↓
    CNN Branch      LSTM Branch
         ↓                ↓
  [Conv1D(2)]       [Bi-LSTM(96)]
  [Conv1D(3)]    (salida final, 2×96)
  [Conv1D(4)]             │
  [Conv1D(5)]             │
         ↓                │
  [GlobalMaxPool x4]      │
         ↓                ↓
         └── Concatenate ─┘
                 ↓
        Dense(64) + Dropout
                 ↓
         Dense(1, sigmoid)
```

In [None]:
def build_cnn_lstm_model():
    """Build the (uncompiled) hybrid CNN + bidirectional-LSTM binary classifier.

    The architecture is driven by the module-level hyperparameters
    (MAX_LEN, MAX_WORDS, EMBEDDING_DIM, SPATIAL_DROPOUT, CNN_FILTERS,
    CNN_KERNEL_SIZES, LSTM_UNITS, L2_REG, DENSE_UNITS, DROPOUT):

    * a shared embedding followed by spatial dropout feeds both branches;
    * the CNN branch applies one Conv1D per kernel size and global-max-pools
      each result into a single vector;
    * the LSTM branch is a single bidirectional LSTM that is not asked for
      `return_sequences`, so it yields one vector per input sequence;
    * all branch vectors are concatenated and classified by one dense layer
      with dropout and a final sigmoid unit.

    Returns:
        A `Model` named 'CNN_LSTM_Hybrid_V5_Balanced' mapping integer token
        sequences of length MAX_LEN to a single sigmoid output.
    """
    # Input: one row of MAX_LEN token ids per message.
    inputs = Input(shape=(MAX_LEN,))

    # Shared embedding. `input_length` is deliberately not passed: the sequence
    # length is already fixed by `Input`, and the argument is deprecated in Keras 3.
    x = Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_DIM
    )(inputs)
    x = SpatialDropout1D(SPATIAL_DROPOUT)(x)

    # CNN branch: a single conv layer per kernel size (kept shallow to limit overfitting).
    cnn_outputs = []
    for kernel_size in CNN_KERNEL_SIZES:
        conv = Conv1D(
            filters=CNN_FILTERS,
            kernel_size=kernel_size,
            activation='relu',
            kernel_regularizer=l2(L2_REG),
            padding='same'
        )(x)
        pool = GlobalMaxPooling1D()(conv)
        cnn_outputs.append(pool)

    # LSTM branch: a single bidirectional layer (kept shallow to limit overfitting).
    lstm_out = Bidirectional(
        LSTM(
            LSTM_UNITS,
            kernel_regularizer=l2(L2_REG),
            recurrent_regularizer=l2(L2_REG),
            dropout=0.3,
            recurrent_dropout=0.3
        )
    )(x)

    # Merge the pooled CNN features with the LSTM summary vector.
    concatenated = Concatenate()(cnn_outputs + [lstm_out])

    # Simple classifier head (one hidden layer) to limit overfitting.
    dense = Dense(
        DENSE_UNITS,
        activation='relu',
        kernel_regularizer=l2(L2_REG)
    )(concatenated)
    dense = Dropout(DROPOUT)(dense)

    outputs = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=inputs, outputs=outputs, name='CNN_LSTM_Hybrid_V5_Balanced')
    return model

# Instantiate the hybrid network defined by build_cnn_lstm_model().
model = build_cnn_lstm_model()

# Conservative optimizer: Adam at the base learning rate from the config cell.
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# Metrics reported during training alongside the binary cross-entropy loss.
tracked_metrics = [
    'accuracy',
    keras.metrics.Precision(),
    keras.metrics.Recall(),
    keras.metrics.AUC(),
]

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=tracked_metrics)

model.summary()

In [None]:
# Training with early stopping, best-model checkpointing and LR decay,
# all driven by the validation loss.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1
)
best_checkpoint = ModelCheckpoint(
    'best_cnn_lstm_model.keras', monitor='val_loss', save_best_only=True, verbose=1
)
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1
)
callbacks = [early_stopping, best_checkpoint, lr_on_plateau]

fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_train_final, y_train_final, **fit_options)

In [None]:
# Evaluate on the held-out validation split.
y_pred_proba = model.predict(X_val, batch_size=BATCH_SIZE, verbose=0)
# Threshold the sigmoid outputs at 0.5 and flatten to a 1-D label vector.
y_pred = (y_pred_proba > 0.5).astype(int).reshape(-1)
mcc_score = matthews_corrcoef(y_val, y_pred)

print(f"MCC: {mcc_score:.4f}")
print(classification_report(y_val, y_pred, target_names=['Not SPAM', 'SPAM']))

# Confusion matrix drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title(f'CNN+LSTM Hybrid (MCC: {mcc_score:.4f})')
ax.set_ylabel('True')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Test predictions
test = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/test.csv", index_col="row_id")

# Coerce to plain strings first: a missing message is read as float NaN,
# which the tokenizer cannot process.
test_texts = test['text'].fillna('').astype(str)

# Encode with the tokenizer fitted on the training data and pad exactly as in training.
X_test_seq = tokenizer.texts_to_sequences(test_texts)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
y_pred_test_proba = model.predict(X_test_pad, batch_size=BATCH_SIZE, verbose=0)
y_pred_test = (y_pred_test_proba > 0.5).astype(int).flatten()

submission = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/sample_submission.csv")
if "row_id" in submission.columns and test.index.is_unique:
    # Align predictions by row_id instead of trusting that the sample
    # submission lists rows in the same order as test.csv.
    predictions_by_id = pd.Series(y_pred_test, index=test.index)
    aligned_labels = submission["row_id"].map(predictions_by_id)
    if aligned_labels.isna().any():
        raise ValueError("sample_submission.csv contains row_id values missing from test.csv")
    submission["spam_label"] = aligned_labels.astype(int)
else:
    # Fallback: positional assignment, guarded by an explicit length check.
    if len(submission) != len(y_pred_test):
        raise ValueError(
            f"Submission has {len(submission)} rows but {len(y_pred_test)} predictions were produced"
        )
    submission["spam_label"] = y_pred_test
submission.to_csv('submission.csv', index=False)

print(f"Submission created. Test predictions: {len(y_pred_test)}")

In [None]:
# Comparative summary of every iteration against the current run.
banner = "=" * 80
v2_mcc = 0.8885  # score printed below for V2, used as the bar to beat

print(banner)
print("EVOLUCIÓN COMPLETA")
print(banner)
print("V1 LSTM: 0.8665")
print("V2 LSTM+L2: 0.8885 ← BEST LSTM")
print("V3 LSTM+L2x5: 0.8733 (over-regularized)")
print("V4 DistilBERT: 0.6456 ← FRACASO")
print(f"V5 CNN+LSTM: {mcc_score:.4f}")
print(banner)

# Pick the verdict line: beats V2, roughly ties it, or falls short.
if mcc_score > v2_mcc:
    verdict = f"ÉXITO: +{(mcc_score - v2_mcc):.4f} sobre V2"
elif mcc_score > 0.88:
    verdict = "Resultado competitivo con V2"
else:
    verdict = "V2 sigue siendo mejor. Usar V2 para submission final."
print(verdict)
print(banner)