In [None]:
# Resultado (registro): MCC público 0.87334 | MCC privado 0.86075
# Este notebook es la versión final (V6) que usé para la submission.

In [None]:
# Minimal environment setup to keep TensorFlow console output short.
# These variables are set before tensorflow is imported in a later cell.
import os

os.environ.update({
    # Use the pure-Python protobuf implementation (the author's stated
    # preference, for cleaner console messages).
    'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'python',
    # TF C++ log threshold: '2' filters out INFO and WARNING messages,
    # so only errors are printed.
    'TF_CPP_MIN_LOG_LEVEL': '2',
    # Disable oneDNN optimizations, which tend to emit warnings on some installs.
    'TF_ENABLE_ONEDNN_OPTS': '0',
})

In [None]:
# Librerías básicas
import pandas as pd  # manejo de CSV/DFs
import numpy as np  # operaciones numéricas
import matplotlib.pyplot as plt  # visualización
import seaborn as sns
import warnings
# Silence all Python warnings so the notebook output stays readable.
# NOTE(review): this blanket filter also hides deprecation warnings raised by libraries.
warnings.filterwarnings('ignore')

# Global seed for reproducibility (reused by the TF/Keras seeding and the train/validation split)
seed = 42
np.random.seed(seed)

# TensorFlow / Keras (solo lo que uso en el notebook V6)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Seed TensorFlow and Keras with the same value as NumPy.
# Per the Keras docs, set_random_seed seeds Python `random`, NumPy and the backend,
# so the explicit tf.random.set_seed call is redundant but harmless.
tf.random.set_seed(seed)
keras.utils.set_random_seed(seed)

# sklearn: métricas y utilidades
from sklearn.metrics import matthews_corrcoef, classification_report
from sklearn.model_selection import train_test_split

# Pandas display options: show at most 36 rows and up to 150 characters per column
pd.set_option('display.max_rows', 36)
pd.set_option('display.max_colwidth', 150)

In [None]:
# Final hyperparameters, chosen after several tuning iterations.

# --- Text vectorization ---
MAX_WORDS = 10_000     # vocabulary capped at the 10k most frequent words
MAX_LEN = 200          # fixed sequence length after padding/truncation
EMBEDDING_DIM = 100    # size of each embedding vector

# --- Layer widths ---
LSTM_UNITS = 64
DENSE_UNITS = 32

# --- Regularization ---
SPATIAL_DROPOUT = 0.4
DROPOUT_RATE = 0.7
L2_REG = 6e-4

# --- Training loop ---
BATCH_SIZE = 32
EPOCHS = 50            # upper bound on the number of epochs
VALIDATION_SPLIT = 0.2 # fraction of the training data held out for validation
LEARNING_RATE = 5e-4
CLIPNORM = 1.0         # gradient-norm clipping threshold passed to the optimizer

In [None]:
# Read the training set from the Kaggle input path used in the experiments;
# the `row_id` column becomes the DataFrame index.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/u-tad-spam-not-spam-2025-edition/train.csv',
    index_col='row_id',
)

In [None]:
# Text preprocessing: tokenize, map to integer sequences, pad to MAX_LEN, then split.

# Coerce the text column to str first: blank CSV cells are read by pandas as
# float NaN, and the Keras Tokenizer lower-cases every entry, which fails on floats.
# For rows that already hold strings this is a no-op.
train_texts = train['text'].fillna('').astype(str)

# NOTE(review): the vocabulary is fit on every row, including those that end up in
# the validation split below, so validation metrics may be slightly optimistic.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)
X_train_seq = tokenizer.texts_to_sequences(train_texts)
# Pad and truncate at the end of each sequence so all rows have length MAX_LEN.
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
y_train = train['spam_label'].values
# Train/validation split, stratified on the label to preserve the class balance.
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_pad,
    y_train,
    test_size=VALIDATION_SPLIT,
    random_state=seed,
    stratify=y_train,
)

def build_v6_model():
    """Build the (uncompiled) V6 bidirectional-LSTM binary classifier.

    Architecture, in order: integer-sequence input of length ``MAX_LEN`` ->
    trainable embedding (``MAX_WORDS`` x ``EMBEDDING_DIM``) -> spatial dropout ->
    bidirectional LSTM returning only its final output -> ReLU dense layer ->
    dropout -> single sigmoid unit.

    L2 regularization (``L2_REG``) is applied to the LSTM kernel, recurrent and
    bias weights and to the dense layer's kernel and bias.

    Returns:
        A ``keras.Model`` named ``'V6_LSTM_Final'``; the caller is responsible
        for compiling it.
    """
    inputs = Input(shape=(MAX_LEN,), name='input_sequences')

    # `input_length` is intentionally not passed: the sequence length is already
    # fixed by the Input layer, and the argument is deprecated in recent Keras
    # releases (the warning is hidden by the notebook's global warning filter).
    x = Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_DIM,
        name='embedding',
    )(inputs)
    x = SpatialDropout1D(SPATIAL_DROPOUT, name='spatial_dropout')(x)

    lstm_out = Bidirectional(
        LSTM(
            LSTM_UNITS,
            kernel_regularizer=l2(L2_REG),
            recurrent_regularizer=l2(L2_REG),
            bias_regularizer=l2(L2_REG),
            return_sequences=False,
        ),
        name='bidirectional_lstm',
    )(x)

    dense = Dense(
        DENSE_UNITS,
        activation='relu',
        kernel_regularizer=l2(L2_REG),
        bias_regularizer=l2(L2_REG),
        name='dense_classifier',
    )(lstm_out)
    dense = Dropout(DROPOUT_RATE, name='dropout')(dense)

    # Single sigmoid unit: the output is a probability for the positive class.
    outputs = Dense(1, activation='sigmoid', name='output')(dense)
    model = Model(inputs=inputs, outputs=outputs, name='V6_LSTM_Final')
    return model

# Instantiate the network, then attach the optimizer, loss and tracked metrics.
model = build_v6_model()

# AdamW with decoupled weight decay and gradient-norm clipping.
optimizer = keras.optimizers.AdamW(
    learning_rate=LEARNING_RATE,
    weight_decay=1e-4,
    clipnorm=CLIPNORM,
)

model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
    ],
)
model.summary()

In [None]:
# Training callbacks, all driven by validation loss.
callbacks = [
    # Stop after 2 epochs without improvement and roll back to the best weights.
    EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True,
        verbose=1,
    ),
    # Persist only the best model seen so far.
    ModelCheckpoint(
        'best_spam_model_v6.keras',
        monitor='val_loss',
        save_best_only=True,
        verbose=1,
    ),
    # Halve the learning rate after 1 epoch without improvement, down to 1e-6.
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=1,
        min_lr=1e-6,
        verbose=1,
    ),
]

# Fit on the training split and evaluate on the held-out validation split each epoch.
history = model.fit(
    X_train_final,
    y_train_final,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1,
)