In [None]:
import os

# Runtime switches for protobuf / TensorFlow. They are set here, ahead of the
# cell that imports TensorFlow, so they are already in the environment when it
# loads. NOTE(review): presumably these select the pure-Python protobuf
# backend, quieten TF's C++ logging and turn off oneDNN optimizations —
# confirm against the TensorFlow version in use.
os.environ.update({
    'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'python',
    'TF_CPP_MIN_LOG_LEVEL': '2',
    'TF_ENABLE_ONEDNN_OPTS': '0',
})

In [None]:
# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every Python warning for the rest of the session (including
# deprecation notices from the libraries imported below).
warnings.filterwarnings('ignore')

# Single seed shared by NumPy, TensorFlow/Keras and the train/validation split
seed = 42
np.random.seed(seed)

# Deep-learning stack: text preprocessing, layers, callbacks, regularizers
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Seed the TensorFlow and Keras random generators with the same value
tf.random.set_seed(seed)
keras.utils.set_random_seed(seed)

# Evaluation metrics and the hold-out split helper
from sklearn.metrics import matthews_corrcoef, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Notebook display limits for DataFrames
pd.set_option('display.max_rows', 36)
pd.set_option("display.max_colwidth", 150)

## Iteración 6 - ENSEMBLE

**Post-mortem V1-V5:**
- V1: 0.8665 (baseline, overfitting)
- V2: 0.8885 ← BEST single model
- V3: 0.8733 (complementario a V2)
- V4: 0.6456 (DistilBERT fracaso)
- V5: 0.8593 (CNN+LSTM no mejora)

**Estrategia V6:**
1. **Entrenar V2 y V3** con misma seed para reproducibilidad
2. **Weighted ensemble**: V2 (60%) + V3 (40%)
3. **Threshold optimization**: Buscar umbral óptimo > 0.5

**Objetivo:** MCC > 0.90 (aprovechar complementariedad)

In [None]:
# ---- Settings shared by both models ----
MAX_WORDS = 50_000        # tokenizer vocabulary cap / embedding input_dim
MAX_LEN = 200             # padded / truncated sequence length
EMBEDDING_DIM = 128
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2    # fraction of the training data held out for validation

# ---- V2: moderate regularization ----
V2_LSTM_UNITS = 96
V2_DROPOUT = 0.4
V2_L2_REG = 0.00005
V2_EPOCHS = 10
V2_LR = 0.001

# ---- V3: extreme regularization ----
V3_LSTM_UNITS = 64
V3_DROPOUT = 0.5
V3_L2_REG = 0.0001
V3_EPOCHS = 10
V3_LR = 0.001

# ---- Ensemble blend weights ----
V2_WEIGHT = 0.6  # V2 gets the larger share because it is the better individual model
V3_WEIGHT = 0.4

In [None]:
# Read the labelled training set, indexed by row_id
train = pd.read_csv(
    "/kaggle/input/u-tad-spam-not-spam-2025-edition/train.csv",
    index_col="row_id",
)

# Build the vocabulary from the training texts; unknown words map to '<OOV>'.
# NOTE(review): the vocabulary is fitted on every training row, including the
# rows that end up in the validation split below.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=MAX_WORDS)
tokenizer.fit_on_texts(train['text'])

# Texts -> integer sequences -> fixed-length matrix (padded/truncated at the end)
X_train_seq = tokenizer.texts_to_sequences(train['text'])
X_train_pad = pad_sequences(
    X_train_seq,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)
y_train = train['spam_label'].values

# Stratified hold-out split, reproducible through the shared seed
split_parts = train_test_split(
    X_train_pad,
    y_train,
    test_size=VALIDATION_SPLIT,
    stratify=y_train,
    random_state=seed,
)
X_train_final, X_val, y_train_final, y_val = split_parts

print(f"Train: {len(X_train_final)}, Val: {len(X_val)}")

## Modelo V2 - Moderate Regularization

In [None]:
def build_v2_model():
    """Build and compile the V2 bidirectional-LSTM classifier.

    Layer stack: Input -> Embedding -> Bidirectional(LSTM) -> Dropout ->
    Dense(1, sigmoid). Sizes and regularization strength come from the
    V2_* constants plus the shared MAX_WORDS / MAX_LEN / EMBEDDING_DIM.

    Returns:
        A Sequential model compiled with Adam (learning rate V2_LR),
        binary cross-entropy loss and the accuracy metric.
    """
    model = Sequential([
        # The input shape is declared explicitly rather than through the
        # Embedding `input_length` argument, which newer Keras releases
        # deprecate. Declaring it here also builds the model immediately,
        # so summary() can report output shapes and parameter counts.
        keras.Input(shape=(MAX_LEN,), dtype='int32'),
        Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIM),
        Bidirectional(LSTM(
            V2_LSTM_UNITS,
            kernel_regularizer=l2(V2_L2_REG),
            recurrent_regularizer=l2(V2_L2_REG)
        )),
        Dropout(V2_DROPOUT),
        Dense(1, activation='sigmoid')
    ], name='V2_LSTM_Moderate')

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=V2_LR),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

model_v2 = build_v2_model()
model_v2.summary()

In [None]:
# Fit V2; both callbacks watch the validation loss
callbacks_v2 = [
    EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        verbose=1,
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-6,
        verbose=1,
    ),
]

print(f"\n{'=' * 80}")
print("TRAINING V2 (Moderate Regularization)")
print('=' * 80)

history_v2 = model_v2.fit(
    x=X_train_final,
    y=y_train_final,
    validation_data=(X_val, y_val),
    epochs=V2_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks_v2,
    verbose=1,
)

# Validation probabilities are kept as returned by predict() for the ensemble
# step; hard labels use the default 0.5 cut-off.
y_pred_v2_proba = model_v2.predict(X_val, batch_size=BATCH_SIZE, verbose=0)
y_pred_v2 = (y_pred_v2_proba.ravel() > 0.5).astype(int)
mcc_v2 = matthews_corrcoef(y_val, y_pred_v2)

print(f"\nV2 Validation MCC: {mcc_v2:.4f}")

## Modelo V3 - Extreme Regularization

In [None]:
def build_v3_model():
    """Build and compile the V3 bidirectional-LSTM classifier.

    Layer stack: Input -> Embedding -> Bidirectional(LSTM) -> Dropout ->
    Dense(1, sigmoid). Sizes and regularization strength come from the
    V3_* constants plus the shared MAX_WORDS / MAX_LEN / EMBEDDING_DIM.

    Returns:
        A Sequential model compiled with Adam (learning rate V3_LR),
        binary cross-entropy loss and the accuracy metric.
    """
    model = Sequential([
        # The input shape is declared explicitly rather than through the
        # Embedding `input_length` argument, which newer Keras releases
        # deprecate. Declaring it here also builds the model immediately,
        # so summary() can report output shapes and parameter counts.
        keras.Input(shape=(MAX_LEN,), dtype='int32'),
        Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIM),
        Bidirectional(LSTM(
            V3_LSTM_UNITS,
            kernel_regularizer=l2(V3_L2_REG),
            recurrent_regularizer=l2(V3_L2_REG)
        )),
        Dropout(V3_DROPOUT),
        Dense(1, activation='sigmoid')
    ], name='V3_LSTM_Extreme')

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=V3_LR),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

model_v3 = build_v3_model()
model_v3.summary()

In [None]:
# Fit V3; both callbacks watch the validation loss
callbacks_v3 = [
    EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        verbose=1,
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-6,
        verbose=1,
    ),
]

print(f"\n{'=' * 80}")
print("TRAINING V3 (Extreme Regularization)")
print('=' * 80)

history_v3 = model_v3.fit(
    x=X_train_final,
    y=y_train_final,
    validation_data=(X_val, y_val),
    epochs=V3_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks_v3,
    verbose=1,
)

# Validation probabilities are kept as returned by predict() for the ensemble
# step; hard labels use the default 0.5 cut-off.
y_pred_v3_proba = model_v3.predict(X_val, batch_size=BATCH_SIZE, verbose=0)
y_pred_v3 = (y_pred_v3_proba.ravel() > 0.5).astype(int)
mcc_v3 = matthews_corrcoef(y_val, y_pred_v3)

print(f"\nV3 Validation MCC: {mcc_v3:.4f}")

## Ensemble V2 + V3

In [None]:
# Weighted average of the two models' validation probabilities
y_pred_ensemble_proba = (V2_WEIGHT * y_pred_v2_proba.flatten() +
                          V3_WEIGHT * y_pred_v3_proba.flatten())

# Test different thresholds
print("\n" + "="*80)
print("THRESHOLD OPTIMIZATION")
print("="*80)

# Candidate cut-offs 0.30, 0.32, ..., 0.68. np.linspace is used instead of
# np.arange because arange with a fractional step can gain or lose the last
# element through floating-point rounding; rounding to two decimals keeps the
# values clean (e.g. exactly 0.5 for the default cut-off).
candidate_thresholds = np.round(np.linspace(0.30, 0.68, 20), 2)

# Start below the lowest possible MCC (-1) so the reported threshold is always
# the one that actually produced the reported score, even if no candidate
# scores above zero.
best_threshold = 0.5
best_mcc = -np.inf

for threshold in candidate_thresholds:
    y_pred_ensemble = (y_pred_ensemble_proba > threshold).astype(int)
    mcc = matthews_corrcoef(y_val, y_pred_ensemble)
    print(f"Threshold {threshold:.2f}: MCC {mcc:.4f}")

    # Strict '>' keeps the lowest threshold when several tie for the best score
    if mcc > best_mcc:
        best_mcc = mcc
        best_threshold = float(threshold)

# NOTE(review): the threshold is tuned on the same validation split used for
# early stopping, so best_mcc is an optimistic estimate.
print("\n" + "="*80)
print(f"BEST THRESHOLD: {best_threshold:.2f} → MCC {best_mcc:.4f}")
print("="*80)

In [None]:
# Final ensemble prediction with best threshold
y_pred_ensemble_final = (y_pred_ensemble_proba > best_threshold).astype(int)

# The individual models are scored at a 0.5 cut-off, so the ensemble is also
# scored at 0.5: this separates the gain from averaging from the gain that
# comes purely from tuning the threshold.
mcc_ensemble_default = matthews_corrcoef(
    y_val, (y_pred_ensemble_proba > 0.5).astype(int)
)

print("\n" + "="*80)
print("ENSEMBLE RESULTS")
print("="*80)
print(f"V2 alone: {mcc_v2:.4f}")
print(f"V3 alone: {mcc_v3:.4f}")
print(f"Ensemble @0.50: {mcc_ensemble_default:.4f}")
print(f"Ensemble: {best_mcc:.4f}")
print("="*80)

best_individual_mcc = max(mcc_v2, mcc_v3)
if best_mcc > best_individual_mcc:
    improvement = best_mcc - best_individual_mcc
    print(f"✅ MEJORA: +{improvement:.4f} sobre mejor modelo individual")
else:
    print("⚠️ Ensemble no mejora modelos individuales")

class_names = ['Not SPAM', 'SPAM']
print("\n" + classification_report(y_val, y_pred_ensemble_final, target_names=class_names))

# Confusion matrix. labels=[0, 1] forces a 2x2 matrix so the tick labels always
# line up with the classes (assumes spam_label is encoded as 0/1, as the
# target_names ordering above already does).
fig, ax = plt.subplots(figsize=(8, 6))
cm = confusion_matrix(y_val, y_pred_ensemble_final, labels=[0, 1])
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title(f'Ensemble V2+V3 (MCC: {best_mcc:.4f})')
ax.set_ylabel('True')
ax.set_xlabel('Predicted')
plt.show()

## Test Predictions

In [None]:
# Load test data
test = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/test.csv", index_col="row_id")

# Encode the test texts with the tokenizer and padding settings used for training
X_test_seq = tokenizer.texts_to_sequences(test['text'])
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# V2 test predictions
y_test_v2_proba = model_v2.predict(X_test_pad, batch_size=BATCH_SIZE, verbose=0)

# V3 test predictions
y_test_v3_proba = model_v3.predict(X_test_pad, batch_size=BATCH_SIZE, verbose=0)

# Ensemble test predictions: same weights and threshold as on validation
y_test_ensemble_proba = (V2_WEIGHT * y_test_v2_proba.flatten() +
                          V3_WEIGHT * y_test_v3_proba.flatten())

y_test_ensemble = (y_test_ensemble_proba > best_threshold).astype(int)

# Create submission
submission = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/sample_submission.csv")

if len(submission) != len(y_test_ensemble):
    raise ValueError(
        f"sample_submission has {len(submission)} rows but "
        f"{len(y_test_ensemble)} predictions were produced"
    )

if "row_id" in submission.columns and test.index.is_unique:
    # Match predictions to the submission rows by row_id rather than by
    # position, so a different row order in sample_submission.csv cannot
    # silently attach labels to the wrong rows.
    predictions_by_id = pd.Series(y_test_ensemble, index=test.index)
    aligned = predictions_by_id.reindex(submission["row_id"])
    if aligned.isna().any():
        raise ValueError("sample_submission contains row_id values missing from test.csv")
    submission["spam_label"] = aligned.to_numpy().astype(int)
else:
    # No usable id column: fall back to positional assignment.
    submission["spam_label"] = y_test_ensemble

submission.to_csv('submission.csv', index=False)

print(f"Submission created with {len(y_test_ensemble)} predictions")
print(f"Using ensemble threshold: {best_threshold:.2f}")
print(f"Expected MCC: ~{best_mcc:.4f}")

In [None]:
# Final summary: validation MCC of every iteration. V1-V5 are figures recorded
# from earlier runs; V6 is the ensemble score computed in this notebook.
V2_REFERENCE_MCC = 0.8885   # best individual model so far (V2)
TARGET_MCC = 0.90           # goal set for this iteration

print("\n" + "="*80)
print("EVOLUCIÓN COMPLETA - TODAS LAS ITERACIONES")
print("="*80)
print("V1 LSTM baseline:        0.8665")
print(f"V2 LSTM + L2 moderate:   {V2_REFERENCE_MCC:.4f} ← Best individual")
print("V3 LSTM + L2 extreme:    0.8733")
print("V4 DistilBERT:           0.6456 ← Fracaso total")
print("V5 CNN+LSTM hybrid:      0.8593")
print(f"V6 Ensemble V2+V3:       {best_mcc:.4f}")
print("="*80)

if best_mcc > TARGET_MCC:
    print(f"🎯 OBJETIVO ALCANZADO: MCC > {TARGET_MCC:.2f}")
elif best_mcc > V2_REFERENCE_MCC:
    print(f"✅ MEJORA: +{(best_mcc - V2_REFERENCE_MCC):.4f} sobre V2")
else:
    print("⚠️ Usar V2 individual para submission")
print("="*80)