In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the labelled messages from the 'spam' sheet of the workbook.
file_path = 'Spam Email Detection.xlsx'
data = pd.read_excel(file_path, sheet_name='spam')

# Preprocess the data: keep only the label (v1) and message (v2) columns and
# drop rows where either one is missing.
df = data[['v1', 'v2']].dropna()

# Cells that contain only digits can come back from Excel as numbers rather
# than text, and the tokenizer expects every entry to be a string. Coerce all
# messages to str here, once, before the data is split, so the training and
# test portions are treated identically.
texts = df['v2'].astype(str).to_numpy()
labels = df['v1'].values

# Encode the labels as integers. LabelEncoder numbers the classes in sorted
# order, so with the two labels 'ham' and 'spam' this gives ham=0, spam=1.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

In [3]:
# Text-vectorisation settings used by the tokenizer and by the padding step.
vocab_size = 10_000       # passed to Tokenizer(num_words=...)
max_length = 100          # fixed sequence length used when padding
trunc_type = 'post'       # where over-long sequences are cut
padding_type = 'post'     # where short sequences are padded
oov_tok = "<OOV>"         # placeholder token for out-of-vocabulary words

# Hold out 20% of the messages for testing; random_state pins the split.
(texts_train, texts_test,
 labels_train, labels_test) = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenize the text: the vocabulary is fitted on the training messages only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(texts_train)
word_index = tokenizer.word_index


In [6]:
# Padding options shared by the training and test sequences so both end up
# with the same fixed length and the same padding/truncation side.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Training messages -> integer sequences -> fixed-length array.
train_sequences = tokenizer.texts_to_sequences(texts_train)
train_padded = pad_sequences(train_sequences, **pad_options)

# Test messages are coerced to str first, then converted the same way using
# the tokenizer that was fitted on the training messages.
texts_test = list(map(str, texts_test))
test_sequences = tokenizer.texts_to_sequences(texts_test)
test_padded = pad_sequences(test_sequences, **pad_options)


In [7]:
# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created, so that weight initialisation and the training that follows
# are repeatable after a kernel restart. 42 matches the random_state used
# for the train/test split.
# NOTE(review): some GPU kernels are non-deterministic, so runs on GPU may
# still differ slightly — confirm on the target hardware.
tf.keras.utils.set_random_seed(42)

# Binary text classifier: token ids -> 16-dim embeddings -> average over the
# sequence -> small ReLU hidden layer -> single sigmoid output.
# NOTE(review): `input_length` is deprecated in Keras 3; drop the argument
# when upgrading — confirm against the installed version.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 16, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Sigmoid output paired with binary cross-entropy; accuracy is tracked as the
# reporting metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [8]:
# Train for a fixed number of epochs, reporting metrics on the held-out split
# after each epoch.
# NOTE(review): the same split is used both as validation data during
# training and for the final evaluation below, so the final numbers repeat
# the last epoch's validation metrics rather than measuring a separate set.
num_epochs = 10
validation_set = (test_padded, labels_test)
history = model.fit(
    train_padded,
    labels_train,
    epochs=num_epochs,
    validation_data=validation_set,
    verbose=2,
)

# Evaluate the model on the held-out split.
loss, accuracy = model.evaluate(x=test_padded, y=labels_test, verbose=2)


Epoch 1/10
140/140 - 2s - loss: 0.4762 - accuracy: 0.8661 - val_loss: 0.3612 - val_accuracy: 0.8655 - 2s/epoch - 16ms/step
Epoch 2/10
140/140 - 1s - loss: 0.3438 - accuracy: 0.8661 - val_loss: 0.3320 - val_accuracy: 0.8655 - 877ms/epoch - 6ms/step
Epoch 3/10
140/140 - 1s - loss: 0.2991 - accuracy: 0.8661 - val_loss: 0.2738 - val_accuracy: 0.8655 - 670ms/epoch - 5ms/step
Epoch 4/10
140/140 - 1s - loss: 0.2028 - accuracy: 0.9031 - val_loss: 0.1676 - val_accuracy: 0.9480 - 660ms/epoch - 5ms/step
Epoch 5/10
140/140 - 1s - loss: 0.1115 - accuracy: 0.9699 - val_loss: 0.1083 - val_accuracy: 0.9695 - 657ms/epoch - 5ms/step
Epoch 6/10
140/140 - 1s - loss: 0.0661 - accuracy: 0.9821 - val_loss: 0.0832 - val_accuracy: 0.9740 - 645ms/epoch - 5ms/step
Epoch 7/10
140/140 - 1s - loss: 0.0465 - accuracy: 0.9872 - val_loss: 0.0738 - val_accuracy: 0.9767 - 652ms/epoch - 5ms/step
Epoch 8/10
140/140 - 1s - loss: 0.0356 - accuracy: 0.9895 - val_loss: 0.0665 - val_accuracy: 0.9794 - 667ms/epoch - 5ms/step
Ep

In [9]:
# Report the loss and accuracy returned by model.evaluate, one per line.
print(f"Loss: {loss}\nAccuracy: {accuracy}")

Loss: 0.06155131384730339
Accuracy: 0.9820627570152283
