In [None]:
# Runtime switches for protobuf / TensorFlow. They are set in the very first
# cell so they are already in place when TensorFlow is imported by a later cell
# (presumably both libraries read them at import time -- confirm if reordering).
import os
import sys

# Ask protobuf to use its pure-Python implementation.
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
# Reduce TensorFlow's native (C++) log verbosity.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Switch off the oneDNN option flag.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# Best-effort diagnostic: report the protobuf version when it is available.
# A problem here must never stop the notebook, but it should not be hidden
# either, so the failure is reported instead of silently swallowed.
try:
    import google.protobuf
except Exception as exc:  # narrower than a bare `except`: lets Ctrl-C through
    print(f"protobuf could not be imported: {exc!r}")
else:
    if hasattr(google.protobuf, '__version__'):
        print(f"protobuf version: {google.protobuf.__version__}")


In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.model_selection import train_test_split

seed = 42
np.random.seed(seed)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, LSTM, Bidirectional, Dense, Dropout, 
    GlobalMaxPooling1D, Conv1D, SpatialDropout1D
)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
# Reuse the notebook-wide `seed` for TensorFlow and Keras so runs are repeatable.
# NOTE(review): per the Keras docs `set_random_seed` also re-seeds Python's
# `random` and NumPy -- confirm for the installed version.
tf.random.set_seed(seed)
keras.utils.set_random_seed(seed)
# Pandas display limits: at most 36 rows and 150 characters per cell.
pd.set_option('display.max_rows', 36)
pd.set_option("display.max_colwidth", 150)


In [None]:
# --- Text vectorisation ---
MAX_WORDS = 10000         # vocabulary size passed to the Tokenizer / Embedding
MAX_LEN = 200             # sequences are padded / truncated to this many tokens
EMBEDDING_DIM = 100       # width of each embedding vector

# --- Architecture ---
LSTM_UNITS = 64           # units per direction of the bidirectional LSTM
DENSE_UNITS = 32          # units of the hidden dense layer

# --- Regularisation ---
DROPOUT_RATE = 0.7        # dropout applied after the hidden dense layer
SPATIAL_DROPOUT = 0.4     # spatial dropout applied to the embedding output
L2_REG = 5e-4             # L2 factor shared by the LSTM and dense regularisers

# --- Training ---
BATCH_SIZE = 32
EPOCHS = 50
VALIDATION_SPLIT = 0.2    # fraction of the training data held out for validation
LEARNING_RATE = 5e-4
CLIPNORM = 1.0            # gradient-norm clipping threshold given to the optimizer

# Closed-form parameter count of the network, term by term:
#   embedding table
#   + bidirectional LSTM (4 gates, two directions, +1 for the bias)
#   + hidden dense layer (its input is the 2 * LSTM_UNITS concatenation) + bias
#   + single-unit output layer + bias
estimated_params_v3 = (
    MAX_WORDS * EMBEDDING_DIM +
    4 * LSTM_UNITS * (EMBEDDING_DIM + LSTM_UNITS + 1) * 2 +
    (LSTM_UNITS * 2) * DENSE_UNITS + DENSE_UNITS +
    DENSE_UNITS + 1
)


In [None]:
# Labelled training set, indexed by its `row_id` column.
train_csv_path = "/kaggle/input/u-tad-spam-not-spam-2025-edition/train.csv"
train = pd.read_csv(train_csv_path, index_col="row_id")
# Last expression of the cell: preview the first ten rows.
train.head(10)

In [None]:
def _count_words(raw_text):
    """Return the number of whitespace-separated tokens in `raw_text` (coerced to str)."""
    return len(str(raw_text).split())


# Adds a `text_length` column to `train` (word count per message).
train['text_length'] = train['text'].map(_count_words)
mean_length = train['text_length'].mean()

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: overall distribution of message length, with the mean marked.
ax_hist.hist(train['text_length'], bins=50, edgecolor='black', alpha=0.7)
ax_hist.set_title('Distribución de Longitud de Textos')
ax_hist.set_xlabel('Número de palabras')
ax_hist.set_ylabel('Frecuencia')
ax_hist.axvline(mean_length, color='red', linestyle='--', label=f'Media: {mean_length:.1f}')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

# Right panel: message length broken down by class label.
sns.boxplot(data=train, x='spam_label', y='text_length', ax=ax_box)
ax_box.set_title('Longitud por Clase')
ax_box.set_xlabel('Clase (0=Not SPAM, 1=SPAM)')
ax_box.set_ylabel('Número de palabras')
ax_box.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()


In [None]:
# Cast to str so a missing / non-string message cannot crash the tokenizer;
# this mirrors the `str(x)` guard used when computing `text_length`.
X_train_text = train['text'].astype(str).values
y_train = train['spam_label'].values

# Word-level vocabulary capped at MAX_WORDS; unseen words map to "<OOV>".
# NOTE(review): the vocabulary is fitted on all training rows, i.e. before the
# validation rows are split off below -- confirm this is acceptable.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

# Integer-encode, then pad / truncate at the END of each sequence to MAX_LEN.
X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# Stratified hold-out so the validation set keeps the class balance of y_train.
# `train_test_split` is brought into scope by the notebook's import cell.
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_pad, y_train,
    test_size=VALIDATION_SPLIT,
    random_state=seed,
    stratify=y_train
)


In [None]:
def build_model_v3():
    """Build the (uncompiled) V3 spam classifier.

    Layer stack, in order:
      1. integer token-id input of length ``MAX_LEN``
      2. trainable embedding (``MAX_WORDS`` x ``EMBEDDING_DIM``)
      3. spatial dropout on the embedded sequence
      4. bidirectional LSTM returning the full sequence, with L2 penalties on
         its kernel, recurrent and bias weights
      5. global max pooling over the time axis
      6. ReLU dense layer with L2 penalties, followed by dropout
      7. single sigmoid unit

    All sizes and rates are read from the module-level hyperparameter constants.

    Returns:
        A ``Sequential`` model named ``'spam_classifier_v3_shock_therapy'``.
    """
    model = Sequential([
        # Explicit input spec instead of the deprecated `input_length`
        # argument of Embedding.
        keras.Input(shape=(MAX_LEN,), dtype='int32'),
        Embedding(
            input_dim=MAX_WORDS,
            output_dim=EMBEDDING_DIM,
            name='embedding'
        ),
        SpatialDropout1D(SPATIAL_DROPOUT),
        Bidirectional(
            LSTM(
                LSTM_UNITS,
                return_sequences=True,  # the pooling layer below needs every timestep
                kernel_regularizer=l2(L2_REG),
                recurrent_regularizer=l2(L2_REG),
                bias_regularizer=l2(L2_REG)
            ),
            name='bidirectional_lstm'
        ),
        GlobalMaxPooling1D(),
        Dense(
            DENSE_UNITS,
            activation='relu',
            kernel_regularizer=l2(L2_REG),
            bias_regularizer=l2(L2_REG),
            name='dense_1'
        ),
        Dropout(DROPOUT_RATE),
        Dense(1, activation='sigmoid', name='output')
    ], name='spam_classifier_v3_shock_therapy')
    return model
model = build_model_v3()

# AdamW with decoupled weight decay and gradient-norm clipping.
optimizer_settings = dict(
    learning_rate=LEARNING_RATE,
    weight_decay=1e-4,
    clipnorm=CLIPNORM,
)
optimizer = keras.optimizers.AdamW(**optimizer_settings)

# Metrics are given explicit names so their history keys are predictable
# ('precision', 'val_precision', ...).
tracked_metrics = [
    'accuracy',
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=tracked_metrics)

# Build with a (batch, MAX_LEN) input so the summary can show shapes and counts.
model.build(input_shape=(None, MAX_LEN))
model.summary()


In [None]:
# Stop once val_loss has not improved for 2 epochs and roll back to the best weights.
stop_on_plateau = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)
# Keep only the best model (by val_loss) on disk.
keep_best_checkpoint = ModelCheckpoint(
    'best_spam_model_v3.keras',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)
# Halve the learning rate after 1 epoch without val_loss improvement, down to 1e-6.
halve_lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=1,
    min_lr=1e-6,
    verbose=1,
)
callbacks = [stop_on_plateau, keep_best_checkpoint, halve_lr_on_plateau]

fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_train_final, y_train_final, **fit_options)


In [None]:
# Hard predictions on the validation split: probability > 0.5 means class 1.
y_pred_proba = model.predict(X_val)
y_pred = (y_pred_proba > 0.5).astype(int).flatten()

# `matthews_corrcoef` / `confusion_matrix` are brought into scope by the
# notebook's import cell.
mcc_score = matthews_corrcoef(y_val, y_pred)
# Pin the label order so the matrix is always 2x2 and lines up with the two
# tick labels, even if one class is never predicted.
cm = confusion_matrix(y_val, y_pred, labels=[0, 1])

class_names = ['Not SPAM', 'SPAM']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_title(f'Confusion Matrix V3 - Shock Therapy (MCC: {mcc_score:.4f})')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Metrics of the LAST recorded epoch.
# NOTE(review): the EarlyStopping callback in this notebook is configured with
# restore_best_weights=True, so `mcc_score` may come from an earlier epoch's
# weights than these last-epoch losses -- confirm which epoch should be reported.
final_train_loss = history.history['loss'][-1]
final_val_loss = history.history['val_loss'][-1]
final_train_acc = history.history['accuracy'][-1]
final_val_acc = history.history['val_accuracy'][-1]
# Gap between validation and training loss, used here as the overfitting measure.
overfitting_delta = abs(final_val_loss - final_train_loss)

# Hard-coded results of the two earlier model versions, for comparison.
v1_data = {'train_loss': 0.0055, 'val_loss': 0.1895, 'mcc': 0.8665, 'params': 1251009}
v2_data = {'train_loss': 0.0411, 'val_loss': 0.2075, 'mcc': 0.8885, 'params': 1160609}
v3_data = {
    'train_loss': final_train_loss,
    'val_loss': final_val_loss,
    'mcc': mcc_score,
    'params': model.count_params()
}
v1_delta = abs(v1_data['val_loss'] - v1_data['train_loss'])
v2_delta = abs(v2_data['val_loss'] - v2_data['train_loss'])
v3_delta = overfitting_delta

print(f"Overfitting delta  V1: {v1_delta:.4f}  V2: {v2_delta:.4f}  V3: {v3_delta:.4f}")

# (description, passed?) pairs that define success for V3.
success_v3 = [
    ("Overfitting Delta < 0.08", v3_delta < 0.08),
    ("MCC >= 0.87", v3_data['mcc'] >= 0.87),
    ("Val Loss < 0.14", v3_data['val_loss'] < 0.14),
    ("Train Loss > 0.05", v3_data['train_loss'] > 0.05),
    ("Mejora vs V2", v3_delta < v2_delta)
]
for criterion, success in success_v3:
    status = "OK" if success else "FAIL"
    print(f"[{status}] {criterion}")

# bool() normalises NumPy booleans before counting.
total_success = sum(bool(success) for _, success in success_v3)
print(f"Criteria met: {total_success}/{len(success_v3)}")
if total_success >= 4:
    print("Verdict: V3 meets its targets.")
elif total_success >= 3:
    print("Verdict: V3 partially meets its targets.")
else:
    print("Verdict: V3 does not meet its targets.")


In [None]:
def plot_learning_curves(history, title_prefix=""):
    """Plot train vs. validation curves for loss, accuracy, precision and recall.

    Draws a 2x2 grid (one panel per metric, in that order, row by row) and
    shows the figure.

    Args:
        history: object whose ``.history`` mapping holds, for each of
            ``loss``, ``accuracy``, ``precision`` and ``recall``, one list under
            that key and one under ``val_<key>`` (one value per epoch).
        title_prefix: optional text placed before each panel title.

    Raises:
        KeyError: if any of the expected keys is missing from ``history.history``.
    """
    # (history key, axis/title name, short name used in the legend)
    panels = [
        ('loss', 'Loss', 'Loss'),
        ('accuracy', 'Accuracy', 'Acc'),
        ('precision', 'Precision', 'Prec'),
        ('recall', 'Recall', 'Recall'),
    ]
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    for ax, (key, full_name, short_name) in zip(axes.flat, panels):
        ax.plot(history.history[key], label=f'Train {short_name}', linewidth=2, color='red')
        ax.plot(history.history[f'val_{key}'], label=f'Val {short_name}', linewidth=2, color='darkred')
        # strip() avoids a leading space in the title when no prefix is given.
        ax.set_title(f'{title_prefix} {full_name}'.strip(), fontsize=12, fontweight='bold')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(full_name)
        ax.legend()
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
# Draw the four learning-curve panels for the run stored in `history`.
plot_learning_curves(history, title_prefix="V3 Shock Therapy")

In [None]:
# Test set, indexed by its `row_id` column.
test_csv_path = "/kaggle/input/u-tad-spam-not-spam-2025-edition/test.csv"
test = pd.read_csv(test_csv_path, index_col="row_id")
# Last expression of the cell: preview the first rows.
test.head()

In [None]:
# Cast to str so a missing / non-string message cannot crash the tokenizer.
X_test_text = test['text'].astype(str).values
# Reuse the already-fitted tokenizer and the same padding settings as training.
X_test_seq = tokenizer.texts_to_sequences(X_test_text)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')
y_pred_proba_test = model.predict(X_test_pad, batch_size=BATCH_SIZE)
# Same 0.5 decision threshold as the validation evaluation.
y_pred_test = (y_pred_proba_test > 0.5).astype(int).flatten()


In [None]:
submission = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/sample_submission.csv")

# Predictions are in test.csv row order and are assigned positionally below.
# If the template exposes a `row_id` column (presumably it does -- confirm),
# make sure its order matches, otherwise labels would land on the wrong rows.
if "row_id" in submission.columns and not np.array_equal(
    submission["row_id"].to_numpy(), test.index.to_numpy()
):
    raise ValueError("sample_submission.csv row_id order does not match test.csv")

submission["spam_label"] = y_pred_test
submission.to_csv('submission.csv', index=False)
submission.head(10)

In [None]:
# Side-by-side summary of the three model versions. The V1 / V2 columns are
# hard-coded strings from earlier runs; the V3 column is formatted from the
# values computed in this notebook.
comparison_df = pd.DataFrame({
    'Métrica': [
        'Validation MCC',
        'Train Loss',
        'Val Loss',
        'Overfitting Delta',
        'Train Accuracy',
        'Val Accuracy',
        'Total Parameters',
        'Training Epochs'
    ],
    'V1 Baseline': [
        '0.8665',
        '0.0055',
        '0.1895',
        '0.1840',
        '99.91%',
        '95.77%',
        '1,251,009',
        '8'
    ],
    # NOTE(review): 0.2075 - 0.0411 = 0.1664, but the delta below reads 0.1663;
    # presumably it was computed from unrounded losses -- confirm.
    'V2 Moderate': [
        '0.8885',
        '0.0411',
        '0.2075',
        '0.1663',
        '99.56%',
        '95.07%',
        '1,160,609',
        '6'
    ],
    'V3 Shock': [
        f'{mcc_score:.4f}',
        f'{final_train_loss:.4f}',
        f'{final_val_loss:.4f}',
        f'{overfitting_delta:.4f}',
        f'{final_train_acc:.2%}',
        f'{final_val_acc:.2%}',
        f'{model.count_params():,}',
        f'{len(history.history["loss"])}'  # number of epochs actually run
    ]
})
# Show the table; without this the cell builds the frame but displays nothing.
comparison_df
