In [None]:
import os

# Environment switches applied in this first cell, ahead of the TensorFlow import
# in the next cell (presumably they are read at import time — TODO confirm).
os.environ.update({
    'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'python',
    'TF_CPP_MIN_LOG_LEVEL': '2',
    'TF_ENABLE_ONEDNN_OPTS': '0',
})


In [None]:
# --- Data handling and plotting ---------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Silence all Python warnings from here on (this also covers warnings raised
# while importing the libraries below).
import warnings
warnings.filterwarnings('ignore')

# --- Deep-learning stack ----------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import (
    Bidirectional,
    Concatenate,
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    GlobalMaxPooling1D,
    Input,
    LSTM,
    SpatialDropout1D,
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2

# --- Evaluation and data splitting ------------------------------------------
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef
from sklearn.model_selection import train_test_split

# --- Reproducibility: one seed shared by NumPy, TensorFlow and Keras --------
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
keras.utils.set_random_seed(seed)

# --- Notebook display options -----------------------------------------------
pd.set_option('display.max_rows', 36)
pd.set_option("display.max_colwidth", 150)

In [None]:
# --- Text vectorisation ---
MAX_WORDS = 50_000   # Tokenizer `num_words` and Embedding `input_dim`
MAX_LEN = 200        # every sequence is padded/truncated to this many tokens

# --- Network architecture ---
EMBEDDING_DIM = 128
CNN_FILTERS = 128                # filters per Conv1D branch
CNN_KERNEL_SIZES = [2, 3, 4, 5]  # one Conv1D branch per kernel size
LSTM_UNITS = 96                  # units per direction of the Bi-LSTM
DENSE_UNITS = 64

# --- Regularisation ---
SPATIAL_DROPOUT = 0.3  # applied right after the embedding
DROPOUT = 0.5          # applied after the dense layer
L2_REG = 1e-4          # L2 factor for the conv, LSTM and dense kernels

# --- Training ---
BATCH_SIZE = 32
EPOCHS = 15
LEARNING_RATE = 1e-3
VALIDATION_SPLIT = 0.2  # fraction of the training data held out for validation

In [None]:
# Load the labelled training set; `row_id` becomes the index.
train = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/train.csv", index_col="row_id")

# pd.read_csv converts empty cells (and literal tokens such as "NA" or "null")
# to float NaN, which the Tokenizer cannot lower-case or split. Coerce the column
# to plain strings so such rows become empty texts instead of crashing.
train_texts = train['text'].fillna('').astype(str)

# Word-level vocabulary capped at MAX_WORDS; unseen words map to '<OOV>'.
# NOTE(review): the vocabulary is fitted on every training text, including the
# rows that end up in the validation split below.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Integer-encode, then pad/truncate at the end so every row has MAX_LEN tokens.
X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
y_train = train['spam_label'].values

# Stratified hold-out split so both parts keep the same spam ratio.
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_pad, y_train, test_size=VALIDATION_SPLIT, random_state=seed, stratify=y_train
)

In [None]:
def build_cnn_lstm_model(
    max_len=None,
    vocab_size=None,
    embedding_dim=None,
    cnn_filters=None,
    cnn_kernel_sizes=None,
    lstm_units=None,
    dense_units=None,
    spatial_dropout=None,
    dropout=None,
    l2_reg=None,
    lstm_dropout=0.3,
    lstm_recurrent_dropout=0.3,
):
    """
    Build the (uncompiled) hybrid CNN + Bi-LSTM binary classifier.

    Architecture:
    1. Input -> Embedding -> SpatialDropout1D
    2. CNN branch: one Conv1D + GlobalMaxPooling1D per kernel size
       (captures n-grams of different lengths)
    3. LSTM branch: a bidirectional LSTM over the same embedded sequence
       (captures sequential context)
    4. Concatenation of all CNN and LSTM features
    5. Dense classifier with dropout and a single sigmoid output

    Every argument is optional. Arguments left as ``None`` fall back to the
    matching module-level constant (MAX_LEN, MAX_WORDS, EMBEDDING_DIM,
    CNN_FILTERS, CNN_KERNEL_SIZES, LSTM_UNITS, DENSE_UNITS, SPATIAL_DROPOUT,
    DROPOUT, L2_REG), looked up when the function is called.

    Args:
        max_len: Length of the integer token sequences fed to the model.
        vocab_size: Embedding ``input_dim``.
        embedding_dim: Embedding output dimension.
        cnn_filters: Number of filters in each Conv1D branch.
        cnn_kernel_sizes: Iterable of kernel sizes, one Conv1D branch each.
        lstm_units: Units per direction of the bidirectional LSTM.
        dense_units: Units of the hidden dense layer.
        spatial_dropout: SpatialDropout1D rate applied after the embedding.
        dropout: Dropout rate applied after the hidden dense layer.
        l2_reg: L2 factor for the conv, LSTM and dense kernels.
        lstm_dropout: Input dropout rate inside the LSTM.
        lstm_recurrent_dropout: Recurrent dropout rate inside the LSTM.

    Returns:
        A Keras ``Model`` named ``'CNN_LSTM_Hybrid_V5'``.
    """
    # Resolve defaults at call time so edits to the config cell are picked up
    # without having to re-run this definition.
    max_len = MAX_LEN if max_len is None else max_len
    vocab_size = MAX_WORDS if vocab_size is None else vocab_size
    embedding_dim = EMBEDDING_DIM if embedding_dim is None else embedding_dim
    cnn_filters = CNN_FILTERS if cnn_filters is None else cnn_filters
    cnn_kernel_sizes = CNN_KERNEL_SIZES if cnn_kernel_sizes is None else cnn_kernel_sizes
    lstm_units = LSTM_UNITS if lstm_units is None else lstm_units
    dense_units = DENSE_UNITS if dense_units is None else dense_units
    spatial_dropout = SPATIAL_DROPOUT if spatial_dropout is None else spatial_dropout
    dropout = DROPOUT if dropout is None else dropout
    l2_reg = L2_REG if l2_reg is None else l2_reg

    inputs = Input(shape=(max_len,))

    # `input_length` is intentionally not passed: the sequence length is already
    # fixed by the Input layer, and the argument is deprecated in Keras 3.
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)
    x = SpatialDropout1D(spatial_dropout)(x)

    # CNN branch: each kernel size yields one max-pooled feature vector.
    cnn_outputs = [
        GlobalMaxPooling1D()(
            Conv1D(
                filters=cnn_filters,
                kernel_size=kernel_size,
                activation='relu',
                kernel_regularizer=l2(l2_reg),
                padding='same',
            )(x)
        )
        for kernel_size in cnn_kernel_sizes
    ]

    # LSTM branch.
    # NOTE(review): the notebook post-pads its sequences and no mask is applied
    # here, so the LSTM also steps over padding tokens.
    lstm_out = Bidirectional(
        LSTM(
            lstm_units,
            kernel_regularizer=l2(l2_reg),
            recurrent_regularizer=l2(l2_reg),
            dropout=lstm_dropout,
            recurrent_dropout=lstm_recurrent_dropout,
        )
    )(x)

    # Classifier head on top of the merged CNN + LSTM features.
    concatenated = Concatenate()(cnn_outputs + [lstm_out])
    dense = Dense(
        dense_units,
        activation='relu',
        kernel_regularizer=l2(l2_reg),
    )(concatenated)
    dense = Dropout(dropout)(dense)
    outputs = Dense(1, activation='sigmoid')(dense)

    return Model(inputs=inputs, outputs=outputs, name='CNN_LSTM_Hybrid_V5')
# Instantiate the hybrid network with its default hyper-parameters.
model = build_cnn_lstm_model()

# Adam with the configured learning rate, binary cross-entropy loss, and
# accuracy / precision / recall / AUC tracked during training.
adam_optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
tracked_metrics = [
    'accuracy',
    keras.metrics.Precision(),
    keras.metrics.Recall(),
    keras.metrics.AUC(),
]
model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=tracked_metrics)

model.summary()

In [None]:
# All three callbacks watch the validation loss.
# Stop after 5 epochs without improvement and roll back to the best weights.
stop_early = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)
# Write the model to disk only when the validation loss improves.
save_best = ModelCheckpoint('best_cnn_lstm_model.keras', monitor='val_loss', save_best_only=True, verbose=1)
# Halve the learning rate after 3 stagnant epochs, never going below 1e-6.
shrink_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)
callbacks = [stop_early, save_best, shrink_lr]

# Fit on the training split and evaluate on the hold-out split after each epoch.
history = model.fit(
    x=X_train_final,
    y=y_train_final,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Predicted spam probabilities for the hold-out split, thresholded at 0.5.
y_pred_proba = model.predict(X_val, batch_size=BATCH_SIZE, verbose=0)
y_pred = (y_pred_proba.ravel() > 0.5).astype(int)

# Matthews correlation coefficient and confusion matrix on the hold-out split.
mcc_score = matthews_corrcoef(y_val, y_pred)
cm = confusion_matrix(y_val, y_pred)

# Confusion-matrix heatmap with the MCC shown in the title.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title(f'CNN+LSTM Hybrid (MCC: {mcc_score:.4f})')
ax.set_ylabel('True')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Load the unlabelled test set; `row_id` becomes the index.
test = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/test.csv", index_col="row_id")

# pd.read_csv converts empty cells (and literal tokens such as "NA" or "null")
# to float NaN, which the Tokenizer cannot process. Coerce to plain strings so
# such rows are scored as empty texts instead of crashing the notebook.
test_texts = test['text'].fillna('').astype(str)

# Same encoding and padding as used for the training data.
X_test_seq = tokenizer.texts_to_sequences(test_texts)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# Keep probabilities and hard labels under separate names; labels use the 0.5
# threshold also applied on the validation split.
y_pred_test_proba = model.predict(X_test_pad, batch_size=BATCH_SIZE, verbose=0)
y_pred_test = (y_pred_test_proba > 0.5).astype(int).flatten()

submission = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/sample_submission.csv")
# NOTE(review): labels are assigned positionally; this assumes
# sample_submission.csv lists its rows in the same order as test.csv — confirm.
submission["spam_label"] = y_pred_test
submission.to_csv('submission.csv', index=False)
