In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import shap
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Flatten, Dropout

# Visualización de Resultados
import matplotlib.pyplot as plt

In [18]:
# Load the combined dataset from disk and preview its first rows.
csv_path = 'data/combined_data.csv'
data = pd.read_csv(csv_path)

data.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [19]:
# Features: the raw text column, coerced to str so every entry can be tokenized.
X = data.loc[:, 'text'].astype(str)
# Target: the label column, taken as-is.
y = data.loc[:, 'label']

In [20]:
# Fit a word-level tokenizer on the corpus and turn every text into a list
# of integer token ids. `num_words` caps which ids are emitted.
vocab_cap = 10000
tokenizer = Tokenizer(num_words=vocab_cap)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

In [21]:
# Pad (or truncate) every sequence to a common length of 5000 tokens.
X_padded = pad_sequences(sequences, maxlen=5000)

# Hold out 25% of the rows; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    X_padded,
    y,
    test_size=0.25,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

In [22]:
train_sequences = X_train

# Embedding table size. The tokenizer was created with `num_words`, so
# texts_to_sequences only emits ids below that cap; sizing the embedding by
# the full fitted vocabulary would allocate rows that are never looked up.
# Fall back to the full vocabulary (+1 for the reserved padding id 0) when
# no cap is set or the corpus is smaller than the cap.
full_vocab_rows = len(tokenizer.word_index) + 1
size = min(tokenizer.num_words, full_vocab_rows) if tokenizer.num_words else full_vocab_rows

# X_train is already padded to a fixed width, so the sequence length is just
# the second dimension of the array (no need to scan every row).
seq_len = int(train_sequences.shape[1])

In [23]:
# Binary text classifier: token embedding -> recurrent encoder -> dense head.
model = Sequential()
# `input_length` is intentionally not passed: it is deprecated in Keras 3.
# The input shape is supplied through model.build() below instead.
model.add(Embedding(input_dim=size, output_dim=50))
# SimpleRNN instead of LSTM. With the default return_sequences=False it
# already outputs a 2-D (batch, 64) tensor, so the Flatten layer that used
# to follow it was a no-op and has been removed.
model.add(SimpleRNN(64))
model.add(Dropout(0.2))
model.add(Dense(64, activation="relu"))
# Single sigmoid unit: one probability per example for the binary label.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Build explicitly so that summary() can report output shapes and parameter
# counts before the first call to fit().
model.build(input_shape=(None, seq_len))

# Model summary
model.summary()



In [None]:
# Train the model. Keras holds out part of the training data
# (validation_split) to report val_loss / val_accuracy after each epoch.
fit_options = dict(epochs=10, batch_size=128, validation_split=0.2)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/10
[1m392/392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1382s[0m 4s/step - accuracy: 0.8718 - loss: 0.2921 - val_accuracy: 0.9622 - val_loss: 0.1162
Epoch 2/10
[1m392/392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1280s[0m 3s/step - accuracy: 0.9842 - loss: 0.0526 - val_accuracy: 0.9806 - val_loss: 0.0638
Epoch 3/10
[1m 85/392[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m16:09[0m 3s/step - accuracy: 0.9933 - loss: 0.0219

In [None]:
# Model evaluation.
# The notebook never defines X_test / y_test; the only hold-out data is the
# X_val / y_val split produced by train_test_split. It is not seen during
# training (fit() carves its own validation_split out of X_train), so it is
# used here as the test set.
results = model.evaluate(X_val, y_val)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

In [None]:
# Loss curves: training loss as dots, validation loss as a line, per epoch.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'], 'bo', label='Training loss')
loss_ax.plot(history.history['val_loss'], 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
plt.show()

In [None]:
# Accuracy curves. This cell used to be an exact copy of the loss plot; it
# now shows the accuracy metric that compile(metrics=['accuracy']) records
# alongside the loss.
plt.plot(history.history['accuracy'], 'bo', label='Training accuracy')
plt.plot(history.history['val_accuracy'], 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

## Variables importantes

In [None]:
# Size of the subset used for SHAP, as a fraction of the validation set.
# 0.01 means 1%; change subset_fraction to use a different share.
subset_fraction = 0.01
# Always keep at least one row so the downstream cells have something to explain.
subset_size = max(1, int(subset_fraction * len(X_val)))

# Draw distinct row positions with a seeded generator so that re-running the
# notebook selects the same rows.
subset_rng = np.random.default_rng(42)
indices = subset_rng.choice(len(X_val), size=subset_size, replace=False)

# Select the subset using the sampled positions.
X_val_subset = X_val[indices]

In [None]:
def predict_spam_probability(batch):
    """Return the model's sigmoid output as a 1-D array (one score per row).

    model.predict yields shape (n, 1) for the single-unit output layer;
    flattening it gives SHAP a scalar output per sample, so each explanation
    has a scalar base value and a 1-D vector of attributions, which is what
    the waterfall plot expects. verbose=0 keeps the many predict calls SHAP
    makes from flooding the cell output with progress bars.
    """
    return model.predict(batch, verbose=0).ravel()


# Model-agnostic explainer over the prediction function, using the sampled
# validation rows as the background data.
explainer = shap.Explainer(predict_spam_probability, X_val_subset)

# Derive the evaluation budget from the actual number of input features
# instead of hard-coding it: permutation-based explanations need at least
# 2 * num_features + 1 model evaluations, and the previous fixed value (5857)
# no longer matched the padded sequence length.
max_evals = 2 * X_val_subset.shape[1] + 1
shap_values = explainer(X_val_subset, max_evals=max_evals)

# Per-feature contributions for the first explained example.
shap.plots.waterfall(shap_values[0])

## Cluster

In [None]:
# Keep only the spam examples (label == 1).
# y_val is a pandas Series while X_val is a NumPy array; convert the labels
# to a plain array so the boolean mask is applied strictly by position.
spam_mask = np.asarray(y_val) == 1
spam_data = X_val[spam_mask]

# Dimensionality reduction to 2 components for plotting. The seed keeps the
# projection repeatable in case a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
spam_reduced = pca.fit_transform(spam_data)

# Clustering in the reduced space. n_init is pinned explicitly because its
# default differs between scikit-learn versions, which would otherwise change
# the result (and emit a warning on some versions).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = kmeans.fit_predict(spam_reduced)

In [None]:
# Scatter of the spam examples in the 2-D PCA space, colored by cluster,
# with the KMeans centroids overlaid as large red markers.
cluster_fig, cluster_ax = plt.subplots()
cluster_ax.scatter(
    spam_reduced[:, 0],
    spam_reduced[:, 1],
    c=labels,
    cmap='viridis',
    marker='o',
)
centers = kmeans.cluster_centers_
cluster_ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75)
cluster_ax.set_xlabel('Componente principal 1')
cluster_ax.set_ylabel('Componente principal 2')
cluster_ax.set_title('Clustering de Correos Spam')
plt.show()