# Email spam classification using Feedforward Neural Networks

---


In this project, our goal is to classify emails as either "spam" or "not spam" (also known as "ham"), using a feedforward neural network. Email spam classification is a common problem in natural language processing, where the objective is to automatically detect and filter out unwanted or potentially harmful messages.

Usually we would reach for a simpler, more linear model such as logistic regression or Naive Bayes if we think the relationship between the features and the output is roughly linear, or if the dataset is small and we worry about overfitting. But I would like to learn more about neural networks, so I will try a feedforward neural network here, aiming to see whether it can capture complex interactions between features that affect the output (the kind of non-linear structure that makes neural networks effective in areas such as image or speech recognition).


## Loading the dataset


In [81]:
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
def load_data(path):
    """Read the CSV source at ``path`` with pandas and return the resulting DataFrame."""
    return pd.read_csv(path)


# Load the two raw e-mail datasets from disk.
df1 = load_data('../datasets/email.csv')
df2 = load_data('../datasets/email_spam2.csv')

# Discard the final row of the first dataset.
# NOTE(review): presumably a trailing junk/footer row in email.csv - confirm against the file.
df1 = df1.iloc[:-1]

# Keep only the label and text columns. The explicit .copy() makes each frame
# independent of the frame it was sliced from, so adding new columns to
# df1/df2 afterwards does not trigger pandas' SettingWithCopyWarning.
df1 = df1[['Category', 'Message']].copy()
df2 = df2[['Category', 'Message']].copy()


In [82]:
from sklearn.preprocessing import LabelEncoder

# Fit ONE encoder on the categories of both datasets so that a given category
# string always maps to the same integer. Calling fit_transform separately on
# each frame refits the encoder every time, so the codes only agree when both
# frames happen to contain exactly the same set of category values.
le = LabelEncoder()
le.fit(pd.concat([df1['Category'], df2['Category']], ignore_index=True))
df1['Label'] = le.transform(df1['Category'])
df2['Label'] = le.transform(df2['Category'])

# Stack both labelled datasets into a single frame with a fresh 0..N-1 index.
df = pd.concat([df1, df2], ignore_index=True)

# Show how many rows fall into each encoded class.
print(df['Label'].value_counts())


Label
0    4883
1     773
Name: count, dtype: int64


## Split the data to training, validation, and testing data (70/10/20)


In [83]:
from sklearn.model_selection import train_test_split



# Two-stage split producing 70% training / 10% validation / 20% testing data.

# Stage 1: hold out 20% of all rows as the test set.
# Inputs are the raw message strings; targets are the integer-encoded labels.
X_temp, X_test, y_temp, y_test = train_test_split(
    df['Message'], df['Label'], test_size=0.2, random_state=42,
)

# Stage 2: 12.5% of the remaining 80% is 10% of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.125, random_state=42,
)

# Report the shape of the inputs and of the labels for every split.
for heading, inputs, targets in (
    ("Training set:", X_train, y_train),
    ("Validation set:", X_val, y_val),
    ("Testing set:", X_test, y_test),
):
    print(heading)
    print(inputs.shape)
    print(targets.shape)








Training set:
(3958,)
(3958,)
Validation set:
(566,)
(566,)
Testing set:
(1132,)
(1132,)


## Vectorisation (converting words to vectors/numerical values)


In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary (capped at 8000 terms, English stop words removed)
# from the training messages only, then apply that same vocabulary to the
# validation and test messages.
vectorizer = TfidfVectorizer(stop_words='english', max_features=8000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec, X_test_vec = (vectorizer.transform(part) for part in (X_val, X_test))

# Expand each vectorised matrix into a regular dense array for the tensor
# conversion that follows.
X_train_dense, X_val_dense, X_test_dense = (
    matrix.toarray() for matrix in (X_train_vec, X_val_vec, X_test_vec)
)

# Feature-matrix shape and label shape for each split.
print(f"Training set: {X_train_dense.shape}, {y_train.shape}")
print(f"Testing set: {X_test_dense.shape}, {y_test.shape}")
print(f"Validation set: {X_val_dense.shape}, {y_val.shape}")



Training set: (3958, 7915), (3958,)
Testing set: (1132, 7915), (1132,)
Validation set: (566, 7915), (566,)


# Convert to tensors and batch


In [85]:
from torch.utils.data import TensorDataset, DataLoader

# Features become float32 tensors.
X_train_tensor = torch.tensor(X_train_dense, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_dense, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_dense, dtype=torch.float32)

# Labels become float32 column vectors of shape (N, 1).
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)[:, None]
y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.float32)[:, None]
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32)[:, None]

# Pair every feature tensor with its labels.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
print(train_dataset[0])

# Mini-batches of 64 samples; every loader reshuffles on each pass.
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Number of batches per loader, printed in the order train, test, validation.
for loader in (train_loader, test_loader, val_loader):
    print(len(loader))

# Pull a single training batch to sanity-check the tensor shapes.
X_batch, y_batch = next(iter(train_loader))
print(X_batch.shape)
print(y_batch.shape)  # labels arrive as (batch, 1)


(tensor([0., 0., 0.,  ..., 0., 0., 0.]), tensor([0.]))
62
18
9
torch.Size([64, 7915])
torch.Size([64, 1])


Input size: 7915 (`max_features=8000` only caps the TF-IDF vocabulary; the fitted vocabulary here has 7915 terms, so each batch has 64 samples with 7915 features each)
First layer: 256 hidden neurons/units/features
Second layer: 128 hidden neurons

Output layer: 1 neuron (since we want to do binary classification). If we were doing something like letter prediction instead, we might use 26 output neurons (A-Z).


# Model Architecture FFNN


In [86]:
import torch.nn as nn


class SpamClassifier(nn.Module):
    """Feed-forward binary classifier: Linear+ReLU hidden layers, then a sigmoid output.

    Args:
        n_features: Size of each input feature vector.
        n_hidden: Width of every hidden layer, in order from input to output.
            Any sequence of ints works (list or tuple). It may be empty, in
            which case the input feeds the output layer directly.
        n_output: Number of output units (1 for binary classification).
    """

    def __init__(self, n_features, n_hidden=(256, 128), n_output=1):
        # The default is an immutable tuple so it can never be shared and
        # mutated across instances the way a list default could be.
        super().__init__()

        # Layer widths from the input up to the last hidden layer,
        # e.g. [n_features, 256, 128] for the defaults.
        widths = [n_features, *n_hidden]

        layers = []
        # One Linear + ReLU pair per hidden layer.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())

        # Output layer. Sigmoid rather than softmax because the task is binary:
        # each output unit is squashed independently into (0, 1).
        layers.append(nn.Linear(widths[-1], n_output))
        layers.append(nn.Sigmoid())

        # The attribute name `model` and the layer order determine the
        # state_dict keys ("model.<index>.weight" / ".bias"), so keep both
        # stable to stay compatible with previously saved weights.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid activations."""
        return self.model(x)




# Set up model, optimiser, and loss function


In [87]:
import torch.optim as optim


# Hyperparameters.
# The input width is read from X_batch, the sample batch left bound by the
# loader sanity check earlier in the notebook.
input_size = X_batch.shape[1]
hidden_layers = [256, 128]
output_size = 1
learning_rate = 0.0001

# Build the network on the selected device and print a per-layer summary.
ffnn_model = SpamClassifier(
    n_features=input_size,
    n_hidden=hidden_layers,
    n_output=output_size,
).to(device)
import torchsummary
torchsummary.summary(ffnn_model, (input_size,))

# Binary cross-entropy on probabilities; the model already ends in a sigmoid.
loss_fn = nn.BCELoss()

# AdamW updates every parameter of the network.
optimiser = optim.AdamW(ffnn_model.parameters(), lr=learning_rate)



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 256]       2,026,496
              ReLU-2                  [-1, 256]               0
            Linear-3                  [-1, 128]          32,896
              ReLU-4                  [-1, 128]               0
            Linear-5                    [-1, 1]             129
           Sigmoid-6                    [-1, 1]               0
Total params: 2,059,521
Trainable params: 2,059,521
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.03
Forward/backward pass size (MB): 0.01
Params size (MB): 7.86
Estimated Total Size (MB): 7.89
----------------------------------------------------------------


# Training Loop


In [88]:
import copy

num_epochs = 50
best_val_acc = 0.0
best_model_state = None

for epoch in range(num_epochs):
    # ---- Training pass -----------------------------------------------------
    ffnn_model.train()  # set the model to training mode
    running_loss = 0.0
    correct = 0
    total = 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Forward pass
        outputs = ffnn_model(X_batch)
        loss = loss_fn(outputs, y_batch)

        # Backward pass: clear stale gradients, backpropagate, update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        running_loss += loss.item()

        # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions
        predicted = (outputs > 0.5).float()
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

    train_accuracy = correct / total
    # Mean of the per-batch mean losses
    train_loss = running_loss / len(train_loader)

    # ---- Validation pass (no gradient tracking, no weight updates) ----------
    ffnn_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch = X_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            val_outputs = ffnn_model(X_val_batch)
            val_loss += loss_fn(val_outputs, y_val_batch).item()

            val_preds = (val_outputs > 0.5).float()
            val_correct += (val_preds == y_val_batch).sum().item()
            val_total += y_val_batch.size(0)

    val_accuracy = val_correct / val_total
    val_loss_avg = val_loss / len(val_loader)

    # Keep a snapshot of the weights from the epoch with the best validation
    # accuracy so far. state_dict() returns references to the live parameter
    # tensors, so it must be deep-copied; otherwise later optimiser steps
    # would silently overwrite the "best" snapshot with newer weights.
    # NOTE(review): this cell only records the snapshot. Call
    # ffnn_model.load_state_dict(best_model_state) afterwards if the best
    # epoch, rather than the last one, is what should be evaluated or saved.
    if val_accuracy > best_val_acc:
        best_val_acc = val_accuracy
        best_model_state = copy.deepcopy(ffnn_model.state_dict())

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy*100:.2f}% | Val Loss: {val_loss_avg:.4f}, Val Acc: {val_accuracy*100:.2f}%")








Epoch 1/50 | Train Loss: 0.6658, Train Acc: 86.18% | Val Loss: 0.6466, Val Acc: 87.81%
Epoch 2/50 | Train Loss: 0.6189, Train Acc: 86.18% | Val Loss: 0.5730, Val Acc: 87.81%
Epoch 3/50 | Train Loss: 0.5115, Train Acc: 86.18% | Val Loss: 0.4275, Val Acc: 87.81%
Epoch 4/50 | Train Loss: 0.3574, Train Acc: 86.31% | Val Loss: 0.2859, Val Acc: 88.69%
Epoch 5/50 | Train Loss: 0.2409, Train Acc: 90.96% | Val Loss: 0.2071, Val Acc: 93.29%
Epoch 6/50 | Train Loss: 0.1754, Train Acc: 94.64% | Val Loss: 0.1652, Val Acc: 95.41%
Epoch 7/50 | Train Loss: 0.1320, Train Acc: 96.61% | Val Loss: 0.1368, Val Acc: 96.47%
Epoch 8/50 | Train Loss: 0.0981, Train Acc: 97.95% | Val Loss: 0.1151, Val Acc: 96.47%
Epoch 9/50 | Train Loss: 0.0722, Train Acc: 98.74% | Val Loss: 0.1014, Val Acc: 97.00%
Epoch 10/50 | Train Loss: 0.0539, Train Acc: 99.19% | Val Loss: 0.0921, Val Acc: 97.17%
Epoch 11/50 | Train Loss: 0.0410, Train Acc: 99.37% | Val Loss: 0.0878, Val Acc: 97.17%
Epoch 12/50 | Train Loss: 0.0322, Train A

# Evaluation


In [89]:
from sklearn.metrics import classification_report, confusion_matrix

# Collect all predictions and labels
# Put the model in inference mode explicitly instead of relying on an earlier
# cell having left it that way.
ffnn_model.eval()

# Predictions and ground-truth labels for the whole test set, as flat lists
# of Python ints.
y_preds = []
y_true = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        outputs = ffnn_model(X_batch)
        # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
        predicted = (outputs > 0.5).int()

        # Flatten each batch and cast the float labels to int so the metric
        # receives matching 1-D integer labels on both sides.
        y_preds.extend(predicted.reshape(-1).cpu().tolist())
        y_true.extend(y_batch.reshape(-1).int().cpu().tolist())

# Per-class precision / recall / F1; class 0 is reported as Ham, class 1 as Spam.
print(classification_report(y_true, y_preds, target_names=['Ham', 'Spam']))


              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       975
        Spam       0.98      0.90      0.94       157

    accuracy                           0.98      1132
   macro avg       0.98      0.95      0.97      1132
weighted avg       0.98      0.98      0.98      1132



# Save the model

In [90]:
import torch
import pickle
import os

# Create both output directories up front: neither torch.save() nor
# open(..., 'wb') creates missing parent directories.
os.makedirs('../weights', exist_ok=True)
os.makedirs('../vectorisers', exist_ok=True)

# Persist the network's current parameters.
torch.save(ffnn_model.state_dict(), '../weights/ffnn_model_v3.pth')

# Persist the fitted TF-IDF vectoriser so the same vocabulary can be reused
# when the model is loaded again.
# NOTE: pickle files can execute code when loaded; only unpickle this file
# from a trusted location.
with open('../vectorisers/tfidf_vectoriser_v3.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)
