In [None]:
#====================================================================================================#
#      Competición - SPAM/NOT SPAM - ITERACIÓN 7                                                    #
#      V7: FEATURE ENGINEERING + LSTM HÍBRIDO                                                       #
#      Basado en análisis del dataset                                                               #
#                                                                                                    #
#      HALLAZGOS DEL ANÁLISIS:                                                                      #
#      - URLs en SPAM: 19.6% vs 1.7% Not SPAM (+17.9%)                                             #
#      - Money words en SPAM: 42.3% vs 18.9% (+23.4%)                                              #
#      - Palabras clave: nbsp, width, font, pills, viagra, prescription                            #
#      - Problema: 1201 textos Not SPAM muy cortos confunden al modelo                             #
#                                                                                                    #
#      ESTRATEGIA V7:                                                                               #
#      ✅ LSTM base V3 (funciona bien)                                                              #
#      ✅ + Features manuales concatenadas (URLs, money, HTML, pharma)                              #
#      ✅ + Class weights para imbalance                                                            #
#====================================================================================================#

In [None]:
import os
# Environment flags, set in this cell so they are already in place when the
# TensorFlow import in the next cell runs.
os.environ.update({
    'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'python',
    'TF_CPP_MIN_LOG_LEVEL': '2',
    'TF_ENABLE_ONEDNN_OPTS': '0',
})

In [None]:
import re
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

seed = 42
np.random.seed(seed)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Bidirectional, Dense, Dropout,
    SpatialDropout1D, Concatenate, BatchNormalization
)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

tf.random.set_seed(seed)
keras.utils.set_random_seed(seed)

from sklearn.metrics import matthews_corrcoef, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Hyperparameters: V7 = V3 base configuration + manual features

# Text representation
MAX_WORDS = 10_000       # tokenizer num_words and embedding input_dim
MAX_LEN = 200            # padded / truncated sequence length
EMBEDDING_DIM = 100

# Network size and regularization (exactly the V3 values)
LSTM_UNITS = 64
DENSE_UNITS = 32
SPATIAL_DROPOUT = 0.4
DROPOUT_RATE = 0.7
L2_REG = 0.0005

# Training loop
BATCH_SIZE = 32
EPOCHS = 50
VALIDATION_SPLIT = 0.2   # used as test_size of the train/validation split
LEARNING_RATE = 0.0005
CLIPNORM = 1.0

# Number of manual (hand-crafted) features
N_FEATURES = 12

In [None]:
# Read the competition train / test CSVs, using their row_id column as index
train = pd.read_csv(
    "/kaggle/input/u-tad-spam-not-spam-2025-edition/train.csv",
    index_col="row_id",
)
test = pd.read_csv(
    "/kaggle/input/u-tad-spam-not-spam-2025-edition/test.csv",
    index_col="row_id",
)

# Sanity check: dataset sizes and the relative frequency of each label
class_balance = train['spam_label'].value_counts(normalize=True).to_dict()
print(f"Train: {len(train)} | Test: {len(test)}")
print(f"Class balance: {class_balance}")

In [None]:
# FEATURE ENGINEERING - Basado en análisis del dataset
def extract_features(df):
    """Derive the hand-crafted spam indicators from the ``text`` column of ``df``.

    Every feature is computed from one normalized copy of the text, in which
    missing values become the empty string and any other value is converted
    with ``str``. A missing text therefore yields zeros for the length, count
    and ratio columns instead of being measured as the literal string
    "nan"/"None" by some features and as "no match" by others.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        DataFrame indexed like ``df`` with twelve numeric columns, in this
        order: word_count, char_count, has_url, url_count, has_html,
        money_words, pharma_words, urgent_words, has_subject, caps_ratio,
        special_chars, is_short.
    """
    # Single normalization point: all features below see the same strings.
    text = df['text'].map(lambda value: '' if pd.isna(value) else str(value)).astype(str)
    # Lower-cased copy used by the keyword / pattern features.
    text_lower = text.str.lower()

    def char_ratio(predicate):
        """Per-text share of characters satisfying ``predicate`` (0.0 for empty text)."""
        return text.map(lambda s: sum(1 for c in s if predicate(c)) / max(len(s), 1))

    features = pd.DataFrame(index=df.index)

    # 1. Text length, in whitespace-separated words and in characters
    features['word_count'] = text.str.split().str.len()
    features['char_count'] = text.str.len()

    # 2. URLs: presence flag (also triggered by 'href') and number of occurrences
    features['has_url'] = text_lower.str.contains(r'http|www\.|href', regex=True).astype(int)
    features['url_count'] = text_lower.str.count(r'http|www\.')

    # 3. HTML markup: any tag or one of the typical markup tokens
    features['has_html'] = text_lower.str.contains(
        r'<[^>]+>|nbsp|width=|font|border|valign', regex=True
    ).astype(int)

    # 4. Money / free-offer vocabulary (substring matches, not whole words)
    features['money_words'] = text_lower.str.count(
        r'\$|free|money|cash|win|prize|offer|discount|sale|cheap'
    )

    # 5. Pharma vocabulary
    features['pharma_words'] = text_lower.str.count(
        r'viagra|cialis|pills?|prescription|pharmacy|medication|drug'
    )

    # 6. Urgency vocabulary
    features['urgent_words'] = text_lower.str.count(r'urgent|immediately|hurry|limited|act now')

    # 7. Text begins with a subject line
    features['has_subject'] = text_lower.str.startswith('subject:').astype(int)

    # 8. Share of upper-case characters
    features['caps_ratio'] = char_ratio(lambda c: c.isupper())

    # 9. Share of characters that are neither alphanumeric nor whitespace
    features['special_chars'] = char_ratio(lambda c: not c.isalnum() and not c.isspace())

    # 10. Very short text (fewer than 10 words)
    features['is_short'] = (features['word_count'] < 10).astype(int)

    return features

print("Extrayendo features...")
# Same extraction for both splits: train first, then test
train_features, test_features = (
    extract_features(frame) for frame in (train, test)
)

# Number of feature columns, followed by their summary statistics on train
print(f"Features extraídas: {train_features.shape[1]}")
print(train_features.describe())

In [None]:
# Standardize the manual features: the statistics are learned from the training
# features and then re-used unchanged for the test features.
# NOTE(review): the scaler is fitted on every training row, including the rows
# that the later train/validation split assigns to validation.
scaler = StandardScaler().fit(train_features)
train_features_scaled, test_features_scaled = (
    scaler.transform(feature_table)
    for feature_table in (train_features, test_features)
)

print(f"Features shape: {train_features_scaled.shape}")

In [None]:
# Tokenization (same as V3): vocabulary learned from the training texts,
# capped at MAX_WORDS, with a dedicated out-of-vocabulary token.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(train['text'])

# Both splits are padded / truncated at the end to exactly MAX_LEN tokens
pad_options = dict(maxlen=MAX_LEN, padding='post', truncating='post')

X_train_seq = tokenizer.texts_to_sequences(train['text'])
X_train_pad = pad_sequences(X_train_seq, **pad_options)
y_train = train['spam_label'].values

X_test_seq = tokenizer.texts_to_sequences(test['text'])
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Train/validation split: sequences, manual features and labels are split in
# one call so that all three stay row-aligned; stratified on the label.
split_parts = train_test_split(
    X_train_pad, train_features_scaled, y_train,
    test_size=VALIDATION_SPLIT, random_state=seed, stratify=y_train
)
(X_train_text, X_val_text,
 X_train_feat, X_val_feat,
 y_train_final, y_val) = split_parts

for label, array in (
    ("Train text shape", X_train_text),
    ("Train features shape", X_train_feat),
    ("Val text shape", X_val_text),
):
    print(f"{label}: {array.shape}")

In [None]:
# Class weights to counter the class imbalance, computed on the training part
# of the split only.
class_labels = np.unique(y_train_final)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train_final
)

# Key every weight by the label it was computed for (compute_class_weight
# returns the weights in the order of `classes`) rather than by its position,
# so the mapping stays correct whatever the label values are.
class_weight_dict = dict(zip(class_labels.tolist(), class_weights.tolist()))
print(f"Class weights: {class_weight_dict}")

In [None]:
def build_v7_hybrid_model(n_features=None):
    """Build the two-input V7 network: a BiLSTM text branch plus a dense feature branch.

    The text branch embeds the padded token ids, applies spatial dropout and a
    bidirectional LSTM. The feature branch passes the manual features through a
    small dense layer with batch normalization and dropout. Both outputs are
    concatenated and fed to a dense classifier with a single sigmoid output.

    Sizes, dropout rates and the L2 factor come from the module-level
    hyperparameters (MAX_LEN, MAX_WORDS, EMBEDDING_DIM, SPATIAL_DROPOUT,
    LSTM_UNITS, L2_REG, DENSE_UNITS, DROPOUT_RATE).

    Args:
        n_features: Width of the manual-feature input. Defaults to the number
            of columns of the module-level ``train_features_scaled``.

    Returns:
        An uncompiled ``Model`` named 'V7_Hybrid_LSTM_Features' whose inputs
        are ``[text_input, features_input]``.
    """
    if n_features is None:
        n_features = train_features_scaled.shape[1]

    # Input 1: token-id sequences
    text_input = Input(shape=(MAX_LEN,), name='text_input')

    # LSTM branch (exactly V3). The sequence length is already fixed by the
    # Input layer, so the deprecated `input_length` argument is not passed.
    x = Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_DIM
    )(text_input)

    x = SpatialDropout1D(SPATIAL_DROPOUT)(x)

    lstm_out = Bidirectional(
        LSTM(
            LSTM_UNITS,
            kernel_regularizer=l2(L2_REG),
            recurrent_regularizer=l2(L2_REG),
            bias_regularizer=l2(L2_REG)
        )
    )(x)

    # Input 2: manual features
    features_input = Input(shape=(n_features,), name='features_input')

    # Feature branch
    feat_dense = Dense(16, activation='relu', kernel_regularizer=l2(L2_REG))(features_input)
    feat_dense = BatchNormalization()(feat_dense)
    feat_dense = Dropout(0.3)(feat_dense)

    # Merge the LSTM output with the processed features
    combined = Concatenate()([lstm_out, feat_dense])

    # Final classifier
    combined = Dense(
        DENSE_UNITS,
        activation='relu',
        kernel_regularizer=l2(L2_REG)
    )(combined)

    combined = Dropout(DROPOUT_RATE)(combined)

    outputs = Dense(1, activation='sigmoid')(combined)

    model = Model(
        inputs=[text_input, features_input],
        outputs=outputs,
        name='V7_Hybrid_LSTM_Features'
    )
    return model

model = build_v7_hybrid_model()

# AdamW with gradient-norm clipping
optimizer = keras.optimizers.AdamW(
    learning_rate=LEARNING_RATE, weight_decay=1e-4, clipnorm=CLIPNORM
)

# Metrics reported during training, next to the binary cross-entropy loss
tracked_metrics = [
    'accuracy',
    keras.metrics.Precision(),
    keras.metrics.Recall(),
    keras.metrics.AUC(),
]

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=tracked_metrics)

model.summary()

In [None]:
# Callbacks (V3 setup); all three monitor the validation loss.

# Stop once val_loss has not improved for 3 epochs (slightly more patience
# because of the added features) and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1
)

# Keep the best model so far on disk.
checkpoint = ModelCheckpoint(
    'best_spam_model_v7.keras', monitor='val_loss', save_best_only=True, verbose=1
)

# Halve the learning rate after 2 epochs without improvement, down to 1e-6.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1
)

callbacks = [early_stopping, checkpoint, lr_on_plateau]

# Two-input training: [token sequences, manual features] -> label
train_inputs = [X_train_text, X_train_feat]
val_inputs = [X_val_text, X_val_feat]

history = model.fit(
    train_inputs, y_train_final,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(val_inputs, y_val),
    callbacks=callbacks,
    class_weight=class_weight_dict,
    verbose=1
)

In [None]:
# Evaluation on the validation split
val_inputs_eval = [X_val_text, X_val_feat]
y_pred_proba = model.predict(val_inputs_eval, batch_size=BATCH_SIZE, verbose=0).flatten()

# Fixed decision threshold (not tuned)
best_threshold = 0.5
y_pred = (y_pred_proba > best_threshold).astype(int)
mcc_val = matthews_corrcoef(y_val, y_pred)

# Debug figures taken from the training history.
# NOTE(review): these are the losses of the last epoch that ran; the callbacks
# use restore_best_weights=True, so the evaluated weights may come from an
# earlier epoch.
train_curve = history.history['loss']
val_curve = history.history['val_loss']
final_epoch = len(train_curve)
train_loss_final = train_curve[-1]
val_loss_final = val_curve[-1]
overfitting_delta = val_loss_final - train_loss_final

# Gap between the mean predicted probability of true SPAM and true Not SPAM rows
spam_probs = y_pred_proba[y_val == 1]
notspam_probs = y_pred_proba[y_val == 0]
separation = spam_probs.mean() - notspam_probs.mean()

rule = "="*80
report_lines = [
    rule,
    "V7 - HYBRID LSTM + FEATURES RESULTS",
    rule,
    f"Val MCC: {mcc_val:.4f}",
    f"Epochs: {final_epoch}",
    f"Train Loss: {train_loss_final:.4f} | Val Loss: {val_loss_final:.4f}",
    f"Overfitting Δ: {overfitting_delta:.4f}",
    f"Class Separation: {separation:.4f}",
    rule,
    classification_report(y_val, y_pred, target_names=['Not SPAM', 'SPAM']),
    rule,
    f"\nCOMPARACIÓN:",
    f"V3 (baseline):  0.8849 val / 0.87 test",
    f"V7 (hybrid):    {mcc_val:.4f} val",
]
for line in report_lines:
    print(line)

In [None]:
# Test predictions, thresholded with the same cut-off as on validation
test_inputs = [X_test_pad, test_features_scaled]
y_test_proba = model.predict(test_inputs, batch_size=BATCH_SIZE, verbose=0).flatten()
y_test_pred = (y_test_proba > best_threshold).astype(int)

# Fill the sample submission and write it out.
# NOTE(review): labels are assigned by position, which presumably relies on
# sample_submission.csv listing rows in the same order as test.csv - confirm.
submission = pd.read_csv("/kaggle/input/u-tad-spam-not-spam-2025-edition/sample_submission.csv")
submission["spam_label"] = y_test_pred
submission.to_csv('submission.csv', index=False)

# Summary of the predicted label distribution
n_predictions = len(y_test_pred)
n_spam = y_test_pred.sum()
spam_share = y_test_pred.mean()

print(f"\nSubmission: {n_predictions} predictions")
print(f"SPAM: {n_spam} ({spam_share*100:.1f}%)")
print(f"Not SPAM: {n_predictions - n_spam} ({(1-spam_share)*100:.1f}%)")