# Email spam classification using Feedforward Neural Networks

---


In this project, our goal is to classify emails as either "spam" or "not spam" also known as ham. We will aim to use a feedforward neural network for Email spam classification is a common problem in natural language processing, where the objective is to automatically detect and filter out unwanted or potentially harmful messages.

Usually I think we would use a more linear model such as a linear regression or Naive Bayes algorithm if we think the relationship between the features and the output is linear, or we think that the dataset is small and we worry about overfitting. But I would like to learn more about neural networks so I will try with a feedforward neural network approach, aiming to see if we can capture any complex interactions from features that may affect the output (for example image or speech recognition).


## Loading the dataset


In [4]:
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def load_data(path):
    df = pd.read_csv(path)
    return df


df1 = load_data('../datasets/email.csv')
df2 = load_data('../datasets/email_spam2.csv')


df1 = df1.iloc[:-1]
df1 = df1[['Category', 'Message']]  # drop extra cols and keep the cols we want
df2 = df2[['Category', 'Message']]








In [3]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df1['Label'] = le.fit_transform(df1['Category'])

df2['Label'] = le.fit_transform(df2['Category'])
df =pd.concat([df1, df2], ignore_index=True)
print(df['Label'].value_counts())


Label
0    4883
1     773
Name: count, dtype: int64


## Split the data to training and testing data (80/20)


In [5]:
from sklearn.model_selection import train_test_split

# Split into training and testing sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    df['Message'],     # input: raw messages
    df['Label'],       # target: 0 = ham, 1 = spam
    test_size=0.2,
    random_state=42
)

print("Training set:")
print(X_train[:3], X_train.shape)
print(y_train[:3], y_train.shape)

print("Testing set:")
print(X_test[:3], X_test.shape)
print(y_test[:3], y_test.shape)








Training set:
5028    PRIVATE! Your 2003 Account Statement for shows...
2767    Married local women looking for discreet actio...
1620              Friends that u can stay on fb chat with
Name: Message, dtype: object (4524,)
5028    1
2767    1
1620    0
Name: Label, dtype: int64 (4524,)
Testing set:
3948                 Sorry, went to bed early, nightnight
5097    Sorry about that this is my mates phone and i ...
3689                           I'll meet you in the lobby
Name: Message, dtype: object (1132,)
3948    0
5097    0
3689    0
Name: Label, dtype: int64 (1132,)


Here, we see that X is the messages (which will be converted to vectorised forms), and y is the label value (1s and 0s)


## Vectorisation (Converting word to vectors/numerical values)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
X_train_vec.shape, X_test_vec.shape


#convert to dense matrix because pytorch/tensorflow doesn't support sparse matrices
X_train_dense = X_train_vec.toarray()
X_test_dense = X_test_vec.toarray()
X_train_dense.shape,y_train.shape

print(X_train_dense.shape,y_train.shape)    







(4524, 3000) (4524,)


# Convert to tensors and batch


In [7]:
#convert to tensors
X_train_tensor = torch.tensor(X_train_dense, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)  # shape (N, 1)
X_test_tensor = torch.tensor(X_test_dense, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)    # shape (N, 1)

from torch.utils.data import TensorDataset, DataLoader
#combine tensors into a dataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
print(train_dataset[0])

#batch them up
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

print(len(train_loader))
print(len(test_loader))

#we can see we've batched it into 70 training batches and 18 testing batches

for X_batch, y_batch in train_loader:
    print(X_batch.shape)
    print(y_batch.shape)  # should now be torch.Size([64, 1])
    break


(tensor([0., 0., 0.,  ..., 0., 0., 0.]), tensor([1.]))
71
18
torch.Size([64, 3000])
torch.Size([64, 1])


Input size: 3000 (Each batch has 64 samples and 3000 features)
First layer: 128 hidden neurons/units/features
Second layer: 64 hidden neurons

Output layer: 1 neuron (since we want to do binary classification). But if we were doing something like letter prediction, we might do 26 neuron output (A-Z)


# Model Architecture FFNN


In [8]:
import torch.nn as nn


class SpamClassifier(nn.Module):


    #input = n_features (3000)
    #hidden layers = n_hidden
    #output = n_output
    def __init__(self,n_features,n_hidden=[128,64],n_output=1):
        #initialize the model
        super(SpamClassifier,self).__init__()

        in_features = n_features
        layers = []

        #for each hidden layer, we add a linear layer and a ReLU activation function. 
        #Input layer will have 3000 features
        #1st hidden layer will have 128 features
        #2nd hidden layer will have 64 features

        for hidden_layer in n_hidden:
            layers.append(nn.Linear(in_features,hidden_layer))
            layers.append(nn.ReLU())
            in_features = hidden_layer

        #output layer will have 1 feature
        layers.append(nn.Linear(in_features,n_output))
        #use sigmoid rather than softmax cos binary classification
        layers.append(nn.Sigmoid())


        #create a sequential model
        self.model = nn.Sequential(*layers)
        
    def forward(self,x):
        x = self.model(x)
        return x




# Set up model, optimiser, and loss function


In [9]:
import torch.optim as optim


#the number of features 
input_size = X_batch.shape[1]
hidden_layers = [128,64]
output_size = 1

#create the model
ffnn_model = SpamClassifier(input_size,hidden_layers,output_size).to(device)
import torchsummary
torchsummary.summary(ffnn_model, (input_size,))

#loss function
loss_fn = nn.BCELoss() 

#optimiser
learning_rate = 0.001
optimiser = optim.AdamW(ffnn_model.parameters(),lr=learning_rate)



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 128]         384,128
              ReLU-2                  [-1, 128]               0
            Linear-3                   [-1, 64]           8,256
              ReLU-4                   [-1, 64]               0
            Linear-5                    [-1, 1]              65
           Sigmoid-6                    [-1, 1]               0
Total params: 392,449
Trainable params: 392,449
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.00
Params size (MB): 1.50
Estimated Total Size (MB): 1.51
----------------------------------------------------------------


# Training Loop


In [10]:
num_epochs = 50
correct = 0
total = 0
for epoch in range(num_epochs):
    ffnn_model.train() #set the model to training mode
    running_loss=0

    for X_batch,y_batch in train_loader:
        #using CUDA
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        #forward pass
        outputs = ffnn_model(X_batch)
        loss = loss_fn(outputs,y_batch)


        #calculate accuracy
        predicted = (outputs > 0.5).float()
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

        #backward pass
        #the goal here is to update the weights of the model by calculating the gradient of the loss function with respect to the weights

        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        running_loss += loss.item()

    train_accuracy = correct / total
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {train_accuracy*100:.2f}%")








Epoch 1/50, Loss: 0.4808, Accuracy: 86.38%
Epoch 2/50, Loss: 0.1475, Accuracy: 90.42%
Epoch 3/50, Loss: 0.0535, Accuracy: 93.22%
Epoch 4/50, Loss: 0.0229, Accuracy: 94.76%
Epoch 5/50, Loss: 0.0137, Accuracy: 95.73%
Epoch 6/50, Loss: 0.0099, Accuracy: 96.41%
Epoch 7/50, Loss: 0.0083, Accuracy: 96.90%
Epoch 8/50, Loss: 0.0074, Accuracy: 97.26%
Epoch 9/50, Loss: 0.0067, Accuracy: 97.54%
Epoch 10/50, Loss: 0.0063, Accuracy: 97.77%
Epoch 11/50, Loss: 0.0058, Accuracy: 97.96%
Epoch 12/50, Loss: 0.0057, Accuracy: 98.12%
Epoch 13/50, Loss: 0.0055, Accuracy: 98.25%
Epoch 14/50, Loss: 0.0053, Accuracy: 98.36%
Epoch 15/50, Loss: 0.0048, Accuracy: 98.46%
Epoch 16/50, Loss: 0.0046, Accuracy: 98.55%
Epoch 17/50, Loss: 0.0046, Accuracy: 98.63%
Epoch 18/50, Loss: 0.0048, Accuracy: 98.70%
Epoch 19/50, Loss: 0.0043, Accuracy: 98.76%
Epoch 20/50, Loss: 0.0042, Accuracy: 98.81%
Epoch 21/50, Loss: 0.0042, Accuracy: 98.87%
Epoch 22/50, Loss: 0.0040, Accuracy: 98.91%
Epoch 23/50, Loss: 0.0039, Accuracy: 98.9

# Evaluation
evaluate test dataset

In [11]:
ffnn_model.eval()  # set to evaluation mode
test_correct = 0
test_total = 0

with torch.no_grad():  
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        outputs = ffnn_model(X_batch)
        predicted = (outputs > 0.5).float()
        test_correct += (predicted == y_batch).sum().item()
        test_total += y_batch.size(0)

test_accuracy = test_correct / test_total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Test Accuracy: 97.88%


In [12]:
from sklearn.metrics import classification_report, confusion_matrix

# Collect all predictions and labels
y_preds = []
y_true = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        outputs = ffnn_model(X_batch)
        predicted = (outputs > 0.5).float()

        y_preds.extend(predicted.cpu().numpy())
        y_true.extend(y_batch.cpu().numpy())

print(classification_report(y_true, y_preds, target_names=['Ham', 'Spam']))


              precision    recall  f1-score   support

         Ham       0.98      0.99      0.99       975
        Spam       0.95      0.90      0.92       157

    accuracy                           0.98      1132
   macro avg       0.97      0.94      0.95      1132
weighted avg       0.98      0.98      0.98      1132



# Save the model

In [None]:
import torch
import pickle
import os

# Ensure the weights directory exists
os.makedirs('../weights', exist_ok=True)

# Save the model weights to ../weights
torch.save(ffnn_model.state_dict(), '../weights/ffnn_model_v2.pth')
with open('../vectorisers/tfidf_vectoriser_v2.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)
