In [None]:
%pip install pandas numpy matplotlib tensorflow nltk

In [1]:
# This is a study notebook
import pandas as pd
SEED = 42
df = pd.read_csv("Emails.csv") # dowloaded from -< https://media.geeksforgeeks.org/wp-content/uploads/20250320162008521713/spam_ham_dataset.csv
df.head()
df.info()
df["label"].value_counts(normalize=True)

spam_df = df[df["label"] == "spam"]
ham_df = df[df["label"] == "ham"]

# downsample ham to size of spam
ham_sampled = ham_df.sample(n=len(spam_df),random_state=SEED)

balanced_df = pd.concat([spam_df, ham_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)
balanced_df["label"].value_counts(normalize=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


label
spam    0.5
ham     0.5
Name: proportion, dtype: float64

In [3]:
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords') # like "the" "a" "for"
stop_words = set(stopwords.words('english'))

def preprocess_text(text: str) -> str:
    # lower
    text = text.lower()
    # remove "subject:" prefix if present
    if text.startswith("subject:"):
        text = text[len("subject:"):]
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # tokenize by spaces
    tokens = text.split()
    # remove stopwords
    tokens = [t for t in tokens if t not in stop_words]
    return " ".join(tokens)

balanced_df["clean_text"] = balanced_df["text"].astype(str).apply(preprocess_text)
balanced_df[["text", "clean_text"]].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\helia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,clean_text
0,Subject: affordable lev ! itra & v ( iagra ! o...,affordable lev itra v iagra overnight delivery
1,Subject: urgent news alert ! ( otcbb : gspm ) ...,urgent news alert otcbb gspm gold hot stock pr...
2,Subject: quality medications available with di...,quality medications available discounts prices...
3,Subject: 98 - 0432\r\ncan you please extend si...,98 0432 please extend sitara deal 156657 3 1 0...
4,Subject: we have all your favorite programs at...,favorite programs incredibly low prices window...


In [4]:
from sklearn.model_selection import train_test_split

X = balanced_df["clean_text"].values
y = balanced_df["label"].map({"ham": 0, "spam": 1}).values  # binary labels

# train+val vs test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 10000   # vocabulary size
max_len = 100       # sequence length (number of tokens per email)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)  # fit only on training data

def texts_to_padded(texts):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=max_len, padding="post", truncating="post")

X_train_seq = texts_to_padded(X_train)
X_test_seq  = texts_to_padded(X_test)

In [6]:
import tensorflow as tf
from tensorflow.keras import layers, models

vocab_size = min(max_words, len(tokenizer.word_index) + 1)
embedding_dim = 32

model = models.Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    layers.LSTM(16),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid")  # probability of spam
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",       
    metrics=["accuracy"]
)

model.summary()




In [9]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    min_lr=1e-5
)

history = model.fit(
    X_train_seq, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_seq, y_test),
    callbacks=[early_stop, reduce_lr],
    verbose=2
)


Epoch 1/20
75/75 - 3s - 44ms/step - accuracy: 0.5321 - loss: 0.6912 - val_accuracy: 0.5717 - val_loss: 0.6824 - learning_rate: 0.0010
Epoch 2/20
75/75 - 1s - 19ms/step - accuracy: 0.8532 - loss: 0.3944 - val_accuracy: 0.9200 - val_loss: 0.2879 - learning_rate: 0.0010
Epoch 3/20
75/75 - 1s - 19ms/step - accuracy: 0.9525 - loss: 0.1957 - val_accuracy: 0.9433 - val_loss: 0.2237 - learning_rate: 0.0010
Epoch 4/20
75/75 - 1s - 19ms/step - accuracy: 0.9666 - loss: 0.1475 - val_accuracy: 0.9383 - val_loss: 0.2459 - learning_rate: 0.0010
Epoch 5/20
75/75 - 1s - 19ms/step - accuracy: 0.9704 - loss: 0.1316 - val_accuracy: 0.9467 - val_loss: 0.2152 - learning_rate: 0.0010
Epoch 6/20
75/75 - 1s - 19ms/step - accuracy: 0.9771 - loss: 0.1057 - val_accuracy: 0.9533 - val_loss: 0.1989 - learning_rate: 0.0010
Epoch 7/20
75/75 - 1s - 20ms/step - accuracy: 0.9779 - loss: 0.1040 - val_accuracy: 0.9433 - val_loss: 0.2113 - learning_rate: 0.0010
Epoch 8/20
75/75 - 1s - 19ms/step - accuracy: 0.9800 - loss: 0

In [10]:
test_loss, test_accuracy = model.evaluate(X_test_seq, y_test)
print('Test Loss :',test_loss)
print('Test Accuracy :',test_accuracy)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9683 - loss: 0.1462
Test Loss : 0.14618441462516785
Test Accuracy : 0.9683333039283752


In [11]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix


y_test_pred_prob = model.predict(X_test_seq).ravel()
y_test_pred = (y_test_pred_prob >= 0.5).astype(int)

print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, target_names=["ham", "spam"]))

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[[291   9]
 [ 10 290]]
              precision    recall  f1-score   support

         ham       0.97      0.97      0.97       300
        spam       0.97      0.97      0.97       300

    accuracy                           0.97       600
   macro avg       0.97      0.97      0.97       600
weighted avg       0.97      0.97      0.97       600



In [12]:
def predict_email(text):
    # preprocess and convert to a batch of one
    clean_text = preprocess_text(text)
    seq = texts_to_padded([clean_text])  
    prob_spam = model.predict(seq)[0, 0]
    return float(prob_spam)

example = "Congratulations! You have won a $1000 Walmart gift card. Click here to claim the prize now."
print(predict_email(example))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
0.9724323749542236
