In [None]:

import torch

# General-purpose processing always runs on the CPU
default_device = torch.device("cpu")

# BERT training runs on the GPU when CUDA is available; otherwise it shares the CPU device
bert_device = torch.device("cuda") if torch.cuda.is_available() else default_device

print(f"Default device for general processing: {default_device}")
print(f"Device for BERT training: {bert_device}")


Default device for general processing: cpu
Device for BERT training: cuda


# 📧 Phishing Email Detection System Using BERT

In this project, we aim to build a phishing email detection model using deep learning techniques, with a focus on the BERT (Bidirectional Encoder Representations from Transformers) architecture.

Phishing emails are deceptive messages designed to trick users into revealing sensitive information. As attackers increasingly use AI to craft convincing emails, traditional rule-based filters fall short. This motivates the need for a more intelligent, language-aware detection system.

We begin by loading and preprocessing real-world phishing and legitimate email datasets. After tokenizing the data, we will train and evaluate a fine-tuned BERT model, and compare its performance to a logistic regression baseline. Our objective is to build a model that accurately classifies emails as "phishing" or "safe" using language patterns and contextual understanding.





In [None]:
from google.colab import userdata
#KaggkeAPIKey = userdata.get('KaggleAPIKey')

---

# Mounting Google Drive
We mount Google Drive because the dataset files are stored there.

In [None]:
# Mount Google Drive at /content/drive; the dataset folder read by the cleaning
# cell lives under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


---
# Cleaning Data across Datasets
Making all data sets consistent in labeling, data type and format:

1. "body": Holds the body of all emails.
2. "urls": Holds the boolean value for if a url is present or not (1: url, 0: no url)
3. "label": Holds the boolean value for whether an email is phishing or safe (1: phishing, 0: not phishing)


- REMOVING UNPARSABLE/ ILLEGAL DATA
- You can view all data at "APS360_Final_Cleaned_Data" in shared folder

In [None]:
!pip install xlsxwriter
!pip install pandas openpyxl

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.5-py3-none-any.whl.metadata (2.7 kB)
Downloading xlsxwriter-3.2.5-py3-none-any.whl (172 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/172.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.3/172.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.5


In [None]:
import os
import pandas as pd
import re

#Folder with your CSVs; the cleaned workbook is written into the same folder
source_folder = '/content/drive/MyDrive/APS360 Notes/Datasets'
output_excel_path = os.path.join(source_folder, 'APS360_Final_Cleaned_Data.xlsx')

#ANSI terminal escape sequences (ESC [ ... letter) or bare cursor-position codes such as "12;40H"
ansi_pattern = re.compile(r'\x1B\[[0-9;]*[A-Za-z]|[0-9]+;[0-9]+[Hf]')
#Control characters that cannot be stored in an Excel cell. Tab (\x09), line feed (\x0A)
#and carriage return (\x0D) are legal, so emails with CRLF line endings are NOT rejected.
illegal_excel_chars = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F]")

#Function to check if a row contains illegal characters
def row_has_illegal_data(row):
    """Return True if any cell of `row` contains an ANSI sequence or an illegal control character.

    Args:
        row: An iterable of cell values (for example a DataFrame row); every
            cell is converted with ``str`` before it is checked.

    Returns:
        bool: True when at least one cell matches ``ansi_pattern`` or
        ``illegal_excel_chars``; False otherwise (including for an empty row).
    """
    for cell in row:
        text = str(cell)  # convert once and reuse for both pattern checks
        if ansi_pattern.search(text) or illegal_excel_chars.search(text):
            return True
    return False

#Create ExcelWriter object; every CSV in source_folder becomes one sheet of the workbook
with pd.ExcelWriter(output_excel_path, engine='openpyxl') as writer:
    #os.listdir returns names in arbitrary order. Sorting keeps the sheet order, and
    #therefore the row order of the dataset built from these sheets, identical on every run.
    for filename in sorted(os.listdir(source_folder)):
        if not filename.endswith('.csv'):
            continue

        filepath = os.path.join(source_folder, filename)

        try:
            #Malformed lines are skipped instead of aborting the whole file
            df = pd.read_csv(filepath, on_bad_lines='skip', encoding='utf-8', engine='python')
        except Exception as e:
            print(f"Skipping {filename} due to read error: {e}")
            continue

        #Drop rows with illegal characters
        df = df[~df.apply(row_has_illegal_data, axis=1)]

        #Clean and rename columns so every dataset uses 'body' and 'label'
        df.columns = [col.strip() for col in df.columns]
        col_map = {}
        for col in df.columns:
            if col.lower() in ['email text', 'text']:
                col_map[col] = 'body'
            elif col.lower() == 'email type':
                col_map[col] = 'label'
        df = df.rename(columns=col_map)

        #Add 'urls' column if missing: 1 when the body contains 'http', else 0
        if 'urls' not in df.columns and 'body' in df.columns:
            df['urls'] = df['body'].astype(str).apply(lambda x: 1 if 'http' in x else 0)

        #Keep only ['body', 'urls', 'label'] (whichever of them this file provides)
        keep_cols = [col for col in ['body', 'urls', 'label'] if col in df.columns]
        df = df[keep_cols]

        #Write sheet to Excel; sheet name is the file name without extension, cut to 31 characters
        sheet_name = os.path.splitext(filename)[0][:31]
        try:
            df.to_excel(writer, sheet_name=sheet_name, index=False)
        except Exception as e:
            print(f"Failed to write sheet for {filename}: {e}")

print(f"Done! Cleaned Excel file saved at:\n{output_excel_path}")

Done! Cleaned Excel file saved at:
/content/drive/MyDrive/APS360 Notes/Datasets/APS360_Final_Cleaned_Data.xlsx


---
# Combine Data into One Large Dataset

- Takes all csv files and merges into one giant data set.
- Removes empty and null rows.
- Randomly shuffles and rearranges data.
- Makes sure that the "label" and "urls" data is numerical for later processing.

In [None]:
#Force string/int/float/bool labels to the integer 0 or 1
#Will be used later in combination (for cleaning purposes)

def clean_numerics(x):
    """Map a raw label or flag value to the integer 0 or 1.

    The value is converted with ``str``, stripped and lower-cased first.

    Args:
        x: Raw cell value (string, int, float, bool, ...).

    Returns:
        int: 1 for 'phishing email', 'true' or any numeric form equal to 1
        (such as 1, '1' or 1.0); 0 for everything else, including
        'safe email', 'false', 0 and unrecognised text.
    """
    x_str = str(x).strip().lower()
    if x_str in ('phishing email', 'true'):
        return 1
    if x_str in ('safe email', 'false'):
        return 0
    try:
        # Numeric encodings: a float-typed 1.0 stringifies to '1.0', which a
        # plain comparison against '1' would misclassify as 0
        return 1 if float(x_str) == 1 else 0
    except ValueError:
        # Unrecognised text falls back to 0
        return 0

In [None]:
#Read every sheet of the cleaned workbook (dict: sheet name -> DataFrame)
all_sheets = pd.read_excel(output_excel_path, sheet_name=None)

#Stack the sheets, drop incomplete rows, then shuffle with a fixed seed
phishing_df = (
    pd.concat(all_sheets.values(), ignore_index=True)
    .dropna()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

#Normalise both indicator columns to integer 0/1
for numeric_col in ('label', 'urls'):
    phishing_df[numeric_col] = phishing_df[numeric_col].apply(clean_numerics)


---
# Split Tensor Data into Training, Validation, and Testing Datasets

- Randomly split the encoded email data into 70% training, 15% validation, and 15% test sets.
- Each split contains input tensors from the tokenization (input_ids, attention_mask) along with corresponding labels and URL indicators (from phishing_df ).
- This prepares the data for use in training and evaluating an AI classification model.




In [None]:
from sklearn.model_selection import train_test_split
import torch

#Tensor views of the two indicator columns
label = torch.tensor(phishing_df['label'].values)
urls = torch.tensor(phishing_df['urls'].values)

#Split row positions rather than the data itself: 70% training, 30% held out
all_positions = range(len(label))
train_idx, temp_idx = train_test_split(all_positions, test_size=0.3, random_state=42)

#Halve the held-out part into validation and testing (15% of the data each)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)


In [None]:
# email bodies of each split as plain Python lists
body_series = phishing_df['body']
train_bodies = body_series.loc[train_idx].tolist()
val_bodies = body_series.loc[val_idx].tolist()
test_bodies = body_series.loc[test_idx].tolist()

# url indicators and labels of each split as tensors
url_series = phishing_df['urls']
label_series = phishing_df['label']

train_urls = torch.tensor(url_series.loc[train_idx].tolist())
train_labels = torch.tensor(label_series.loc[train_idx].tolist())

val_urls = torch.tensor(url_series.loc[val_idx].tolist())
val_labels = torch.tensor(label_series.loc[val_idx].tolist())

test_urls = torch.tensor(url_series.loc[test_idx].tolist())
test_labels = torch.tensor(label_series.loc[test_idx].tolist())

---
# **Tokenize the training, validation, and testing bodies**
We are now tokenizing the data that we have previously split. This tokenizing code has been repurposed from Asmita's code.

In [None]:

from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer

# Load the tokenizer belonging to the 'bert-base-uncased' checkpoint (the same
# checkpoint name the classification model is loaded from)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class PhishingEmailDataset(Dataset):
    """Dataset that tokenizes one email body per item, on access.

    Args:
        texts: Indexable collection of email bodies.
        urls: Indexable collection of URL indicators, aligned with `texts`.
        labels: Indexable collection of class labels, aligned with `texts`.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` whose result provides
            'input_ids' and 'attention_mask'.
        max_length: Length every item is padded/truncated to (default 256).
    """

    def __init__(self, texts, urls, labels, tokenizer, max_length=256):
        self.texts = texts
        self.urls = urls
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of emails in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize email `idx` and return its tensors with its url flag and label.

        Returns:
            dict with keys 'input_ids', 'attention_mask', 'urls' and 'labels'.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # squeeze(0) drops only the leading size-1 dimension of the encoding;
            # a bare squeeze() would also collapse the sequence dimension when
            # max_length == 1
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'urls': self.urls[idx],
            'labels': self.labels[idx]
        }

#Wrap each split in a PhishingEmailDataset sharing one tokenizer
train_dataset = PhishingEmailDataset(
    texts=train_bodies, urls=train_urls, labels=train_labels, tokenizer=tokenizer
)
val_dataset = PhishingEmailDataset(
    texts=val_bodies, urls=val_urls, labels=val_labels, tokenizer=tokenizer
)
test_dataset = PhishingEmailDataset(
    texts=test_bodies, urls=test_urls, labels=test_labels, tokenizer=tokenizer
)

#Number of emails per batch for all three loaders
batch_size = 8

#Training batches are sampled in random order; validation and test keep dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))
test_loader = DataLoader(test_dataset, batch_size=batch_size, sampler=SequentialSampler(test_dataset))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

---
# **Converting the Tokenized Data to Complete Tensors**


# Tokenization for Logistic Regression

This implementation of tokenization will be used specifically for the logistic regression model. The implementation steps include:
  1. Vectorizing the sentences
  2. Counting the occurrences of words
  3. Vectorizing the numbers for the corresponding words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over unigrams, bigrams and trigrams
vectorizer = CountVectorizer(ngram_range=(1, 3))

# The vocabulary is learned from the training bodies only ...
train_bow = vectorizer.fit_transform(train_bodies)
# ... and then reused unchanged for the validation and test bodies
val_bow, test_bow = (vectorizer.transform(split) for split in (val_bodies, test_bodies))

---
# **Apply Pretrained BERT Model**

- Initialize Pretrained BERT Model Transformer
- Use the AdamW optimizer for optimization (the loss itself is returned by the model)
- Define Dataloaders from tensors
- Train the Model

In [None]:
"""Initialize Pretrained BERT Model Transformer"""

from transformers import BertForSequenceClassification

# Pretrained 'bert-base-uncased' weights with a two-class classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2  # Binary classification
)


import torch

# Reuse the device chosen for BERT training in the first cell. The training loop
# moves every batch to `bert_device`, so the model must live on that same device;
# `device` is kept as an alias for code that refers to it by this name.
device = bert_device
print(f"Using device: {device}")

# Move model to device
model = model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [None]:
"""Use AdamW Optimizer for optimization and loss"""
from torch.optim import AdamW

# AdamW over all model parameters: learning rate 2e-5, weight decay 1e-5.
# NOTE(review): the training cell constructs an optimizer with the same arguments
# again before its loop, so this instance is replaced there.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-5)


In [None]:
import matplotlib.pyplot as plt
from transformers import get_linear_schedule_with_warmup

epochs = 3
log_every = 100  # print a progress line every `log_every` batches instead of after every batch

# Re-create the optimizer (with weight decay) so that re-running this cell
# starts from fresh optimizer state
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-5)

# Learning-rate schedule with warmup over the first 10% of all training steps
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

# Early stopping vars
best_val_loss = float('inf')
patience = 2  # how many epochs without validation-loss improvement before stopping
patience_counter = 0

# Per-epoch history (one entry per completed epoch)
train_loss_list = []
val_loss_list = []
train_err_list = []
val_err_list = []


def _run_batch(batch):
    """Move one batch to `bert_device`, run the model, and return (loss, misclassified count)."""
    b_input_ids = batch['input_ids'].to(bert_device)
    b_attention_mask = batch['attention_mask'].to(bert_device)
    b_labels = batch['labels'].to(bert_device)

    outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
    preds = outputs.logits.argmax(dim=1)
    return outputs.loss, (preds != b_labels).sum().item()


num_train_batches = len(train_loader)

for epoch in range(epochs):
    # ---- Training phase ----
    model.train()
    total_train_loss = 0
    total_train_err = 0

    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        loss, batch_err = _run_batch(batch)

        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the schedule once per optimizer step

        total_train_loss += loss.item()
        total_train_err += batch_err

        # Throttled progress output; the last batch of the epoch is always reported
        if (step + 1) % log_every == 0 or (step + 1) == num_train_batches:
            print(f"Epoch {epoch+1}, Batch {step+1}/{len(train_loader)} completed. Batch loss: {loss.item():.4f}")

    # Loss is averaged over batches, error rate over individual samples
    avg_train_loss = total_train_loss / len(train_loader)
    train_error = total_train_err / len(train_loader.dataset)

    train_loss_list.append(avg_train_loss)
    train_err_list.append(train_error)

    # ---- Validation phase (no gradient tracking) ----
    model.eval()
    total_val_loss = 0
    total_val_err = 0
    with torch.no_grad():
        for batch in val_loader:
            loss, batch_err = _run_batch(batch)
            total_val_loss += loss.item()
            total_val_err += batch_err

    avg_val_loss = total_val_loss / len(val_loader)
    val_error = total_val_err / len(val_loader.dataset)

    val_loss_list.append(avg_val_loss)
    val_err_list.append(val_error)

    print(f"Epoch {epoch+1} done. Train loss: {avg_train_loss:.4f}, Val loss: {avg_val_loss:.4f}, Train err: {train_error:.4f}, Val err: {val_error:.4f}")

    # Early stopping check. Only the checkpoint file holds the best weights;
    # the in-memory model keeps the weights of the last epoch that ran.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')  # Save best model
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 2, Batch 875/5874 completed. Batch loss: 0.0004
Epoch 2, Batch 876/5874 completed. Batch loss: 0.0002
Epoch 2, Batch 877/5874 completed. Batch loss: 0.0004
Epoch 2, Batch 878/5874 completed. Batch loss: 0.0002
Epoch 2, Batch 879/5874 completed. Batch loss: 0.0002
Epoch 2, Batch 880/5874 completed. Batch loss: 0.0002
Epoch 2, Batch 881/5874 completed. Batch loss: 0.0002
Epoch 2, Batch 882/5874 completed. Batch loss: 0.0005
Epoch 2, Batch 883/5874 completed. Batch loss: 0.0002
Epoch 2, Batch 884/5874 completed. Batch loss: 0.0004
Epoch 2, Batch 885/5874 completed. Batch loss: 0.0006
Epoch 2, Batch 886/5874 completed. Batch loss: 0.0013
Epoch 2, Batch 887/5874 completed. Batch loss: 0.0002
Epoch 2, Batch 888/5874 completed. Batch loss: 0.0003
Epoch 2, Batch 889/5874 completed. Batch loss: 0.1441
Epoch 2, Batch 890/5874 completed. Batch loss: 0.0002
Epoch 2, Batch 891/5874 completed. Batch loss: 0.0002
Epoch 2, Batch 89

In [None]:
# Early stopping can end training before `epochs` epochs have run, so take the
# x-axis from the recorded history; otherwise x and y lengths would not match.
epochs_range = range(1, len(train_loss_list) + 1)

fig, (loss_ax, err_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: average loss per epoch
loss_ax.plot(epochs_range, train_loss_list, label='Train Loss')
loss_ax.plot(epochs_range, val_loss_list, label='Val Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Loss over epochs')
loss_ax.legend()

# Right panel: error rate per epoch
err_ax.plot(epochs_range, train_err_list, label='Train Error')
err_ax.plot(epochs_range, val_err_list, label='Val Error')
err_ax.set_xlabel('Epoch')
err_ax.set_ylabel('Error rate')
err_ax.set_title('Error over epochs')
err_ax.legend()

plt.show()


# Cleaning up RAM usage **(Place this block at the bottom of the code for now please)**
We are going to clean up some data to prevent high usage of memory. Please note, you can only run this cell once if you don't reinstantiate these variables

In [None]:
import gc

# Remove the large intermediates from the notebook namespace. Names are removed
# only if they currently exist, so the cell neither fails when a variable was
# never created in this session nor when the cell is run a second time.
for _stale_name in (
    'phishing_df',
    'train_bodies', 'val_bodies', 'test_bodies',
    'tokenizedTraining', 'tokenizedValidation', 'tokenizedTest',
):
    globals().pop(_stale_name, None)
del _stale_name

gc.collect()