In [21]:
!pip install nltk



In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize
from nltk import download
download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [36]:
class NanoTransformer(nn.Module):
    """
        This class implements a simplified Transformer model for sequence classification.
        It uses an embedding layer for tokens, learned positional embeddings,
        a Transformer, and a Linear layer.

        num_emb: The number of unique tokens in the vocabulary. (vocab_size)
        output_size: The size of the output layer (number of classes). (2)
        hidden_size: The dimension of the hidden layer in the Transformer block (default: 128)
        num_heads: The number of heads in the multi-head attention layer (default: 4).
    """
    def __init__(self, num_emb, output_size, hidden_size=128, num_heads=4, max_seq_length=256):
        super(NanoTransformer, self).__init__()

        # Create an embedding for each token
        self.embedding = nn.Embedding(num_emb, hidden_size)  # (vocab_size, 128)

        # Learned positional embeddings
        self.pos_embedding = nn.Embedding(max_seq_length, hidden_size)  # (max_seq_length, 128)

        # Multi-head attention
        self.multihead_attn = nn.MultiheadAttention(hidden_size, num_heads=num_heads, batch_first=True)

        # Feed-forward network
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, hidden_size)
        )

        self.fc_out = nn.Linear(hidden_size, output_size)  # (batch_size, 128) -> (batch_size, 2)

    def forward(self, input_seq):
        # (B, 256) 
        batch_size, seq_length = input_seq.shape  # (32, 160)
        # (1, 6, 777, 111 ,... ) # 0 - LENGTH OF THE VOCABULARY DICTIONARY
        input_embs = self.embedding(input_seq)  # (32, 256, 1) -> (32, 256, 128)

        # Create positional indices
        pos_indices = torch.arange(seq_length, device=input_seq.device)  # (128)

        pos_embs = self.pos_embedding(pos_indices).unsqueeze(0).expand(batch_size, seq_length, -1)  # (1, 160, 128) -> (32, 160, 128)

        embs = input_embs + pos_embs  # (32, 160, 128) + (32, 160, 128)

        output, attn_map = self.multihead_attn(embs, embs, embs)  # (32, 160, 128)
        # print(output.shape)
        output = self.mlp(output)  # (32, 256, 128) @ (128, 2) (32, 160, 2)
        return self.fc_out(output)  # (32, 160, 2)

    
class EmailDatasets():
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [24]:
data = pd.read_csv("/kaggle/input/phishing-emails/cleaned_dataset.csv", encoding='latin1', on_bad_lines='skip')

In [25]:
data

Unnamed: 0.1,Unnamed: 0,Email_Text,Email_Type
0,0,": 6 . 1100 , disc : uniformitarianism , : 1086...",Safe Email
1,1,side * galicismos * * galicismo * spanish term...,Safe Email
2,2,: equistar deal tickets still available assist...,Safe Email
3,3,"Hello hot lil horny toy. one dream About, open...",Phishing Email
4,4,software incredibly low prices ( 86 % lower ) ...,Phishing Email
...,...,...,...
18597,18646,date lonely housewife always wanted date lonel...,Phishing Email
18598,18647,request submitted : access request anita . dup...,Safe Email
18599,18648,": important - prc mtg hi dorn & john , discove...",Safe Email
18600,18649,press clippings - letter californian utilities...,Safe Email


In [26]:
# Find the minimum count between the two classes
min_class_count = data["Email_Type"].value_counts().min()

# Sample `min_class_count` examples from each class
safe_emails = data[data["Email_Type"] == "Safe Email"].sample(n=min_class_count, random_state=42)
phishing_emails = data[data["Email_Type"] == "Phishing Email"].sample(n=min_class_count, random_state=42)

# Combine the balanced samples into a new DataFrame
balanced_df = pd.concat([safe_emails, phishing_emails])

# Shuffle the balanced dataset to ensure random order
data = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the balanced class distribution
print(data["Email_Type"].value_counts())

Email_Type
Phishing Email    7291
Safe Email        7291
Name: count, dtype: int64


In [27]:
# emails = data['Email_Text'].tolist() # Keeps emails as list of strings
# labels = (data['Email_Type'] == 'Phishing Email').astype(int).tolist() # Converts labels into binary values (1 for phishing, 0 for safe) and then to a list
# label_tensor = torch.tensor(labels, dtype=torch.float32) # Convert labels to a PyTorch tensor

In [28]:
# Check for NaN values in the Email_Text column
print(data['Email_Text'].isnull().sum())
# Fill NaN values with an empty string
data['Email_Text'] = data['Email_Text'].fillna("")  # Alternatively, you can drop NaN rows
# Convert the Email_Text column to a list
emails = data['Email_Text'].tolist()  # Now this should be a list of strings without NaNs
# Convert labels to binary values (1 for phishing, 0 for safe)
labels = (data['Email_Type'] == 'Phishing Email').astype(int).tolist()

19


In [29]:
print(len(emails))
print(len(labels))

14582
14582


In [30]:
# Define special tokens
START_TOKEN = '<start>'
END_TOKEN = '<end>'
PADDING_TOKEN = '<pad>'

# Maximum sequence length
MAX_SEQ_LENGTH = 256

def clean_email(email):
    # Convert to lowercase
    email = email.lower()
    # Remove special characters (keep only letters, numbers, and whitespace)
    email = re.sub(r'[^a-z0-9\s]', '', email)
    return email

def tokenize_and_prepare(emails):
    tokenized_sequences = []
    # [email1, email2, ...,]
    for email in emails:
       
        cleaned_email = clean_email(email)

        # Tokenize the cleaned email
        tokens = word_tokenize(cleaned_email)
        # Add start and end tokens
        tokens = [START_TOKEN] + tokens + [END_TOKEN]

        # Truncate or pad to MAX_SEQ_LENGTH
        if len(tokens) > MAX_SEQ_LENGTH:
            tokens = tokens[:MAX_SEQ_LENGTH]  # Cut off to MAX_SEQ_LENGTH
        elif len(tokens) < MAX_SEQ_LENGTH:
            # Pad the sequence with the padding token
            tokens += [PADDING_TOKEN] * (MAX_SEQ_LENGTH - len(tokens))

        tokenized_sequences.append(tokens)

    return tokenized_sequences

In [31]:
# Process the emails
tokenized_sequences = tokenize_and_prepare(emails)

# First, convert tokens to unique indices (for simplicity, just using range for demo)
word_to_index = {token: i for i, token in enumerate(set([token for seq in tokenized_sequences for token in seq]))}

index_sequences = [[word_to_index[token] for token in seq] for seq in tokenized_sequences]

In [32]:
import pickle

# Save the dictionary to a file using pickle
with open('word_to_index.pkl', 'wb') as f:
    pickle.dump(word_to_index, f)

In [33]:
# Convert to tensor
tensor_sequences = torch.tensor(index_sequences)

# Convert the labels to tensors
labels = torch.tensor(labels)

In [34]:
# test_size => percent of training sets used for testing
X_train, X_test, y_train, y_test = train_test_split(tensor_sequences, labels, test_size=0.2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

torch.Size([11665, 256])
torch.Size([2917, 256])
torch.Size([11665])
torch.Size([2917])


In [37]:
# Create class instances of the email dataset and loader
train_dataset = EmailDatasets(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

In [None]:
print(len(word_to_index))

In [52]:
# Check for available device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Assuming the following values based on your dataset and requirements
vocab_size = len(word_to_index)  # Number of unique tokens from your vectorizer
output_size = 2  # For binary classification
hidden_size = 128  # Default hidden size
num_heads = 4  # Default number of heads
max_seq_length = 256

Using device: cuda


In [75]:
# Initialize the model and move it to the device
model = NanoTransformer(num_emb=vocab_size, output_size=output_size, hidden_size=hidden_size, num_heads=num_heads)
model = model.to(device)
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Suitable for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

In [76]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    for batch_emails, batch_labels in train_loader:
        # Move inputs and labels to the device
        # print(batch_emails)
        # print(batch_labels)
        batch_emails = batch_emails.to(device, dtype=torch.long)  # Ensure inputs are on device and of type long
        batch_labels = batch_labels.to(device, dtype=torch.long)  # Ensure labels are on device and of type long

        optimizer.zero_grad()  # Zero the gradients
        outputs = model(batch_emails)  # Forward pass

        # Extract the first index (class token) for loss calculation
        class_token_outputs = outputs[:, 0, :]  # Select the outputs for the class token

        # Calculate the loss using the outputs of the class token
        loss = criterion(class_token_outputs, batch_labels)  # Compute the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update the weights

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/10], Loss: 0.3579
Epoch [2/10], Loss: 0.2285
Epoch [3/10], Loss: 0.1788
Epoch [4/10], Loss: 0.0957
Epoch [5/10], Loss: 0.1133
Epoch [6/10], Loss: 0.0511
Epoch [7/10], Loss: 0.0715
Epoch [8/10], Loss: 0.0258
Epoch [9/10], Loss: 0.0213
Epoch [10/10], Loss: 0.0164


In [55]:
# Initialize a testing loader
# Create class instances of the email dataset and loader
test_dataset = EmailDatasets(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=True)

In [77]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track loss and accuracy
total_loss = 0
correct_predictions = 0
total_samples = 0

# Disable gradient calculation
with torch.no_grad():
    for batch_emails, batch_labels in test_loader:
        # Move inputs and labels to the device (if using a GPU)
        batch_emails = batch_emails.to(device, dtype=torch.long)
        batch_labels = batch_labels.to(device, dtype=torch.long)

        outputs = model(batch_emails)  # Forward pass

        # Extract the first index (class token) for loss calculation
        class_token_outputs = outputs[:, 0, :]

        # Calculate the loss
        loss = criterion(class_token_outputs, batch_labels)
        total_loss += loss.item()

        # Get predicted classes (using argmax or similar method)
        _, predicted = torch.max(class_token_outputs, dim=1)

        # Update correct predictions and total samples
        correct_predictions += (predicted == batch_labels).sum().item()
        total_samples += batch_labels.size(0)

# Calculate average loss and accuracy
average_loss = total_loss / len(test_loader)
accuracy = correct_predictions / total_samples

print(f'Test Loss: {average_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Test Loss: 0.2541, Test Accuracy: 0.9414


In [80]:
# # Define special tokens
# START_TOKEN = '<start>'
# END_TOKEN = '<end>'
# PADDING_TOKEN = '<pad>'

# # Maximum sequence length
# MAX_SEQ_LENGTH = 256

# Predefined word_to_index dictionary
# (Assuming this was created during model training)
# word_to_index = {token: i for i, token in enumerate(set([token for seq in tokenized_sequences for token in seq]))}


# def clean_email(email):
#     # Convert to lowercase and clean up special characters
#     email = email.lower()
#     return email

def preprocess_text(text):
    # Define stop words in English
    stop_words = set(stopwords.words('english'))
    
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Convert to lowercase and remove stop words
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    
    # Join the filtered words back into a string
    processed_text = ' '.join(filtered_words)
    
    return processed_text


def tokenize_and_prepare_single(email):
    # Clean the email
    cleaned_email = clean_email(email)
    
    # Tokenize and add special tokens
    tokens = [START_TOKEN] + word_tokenize(cleaned_email) + [END_TOKEN]

    # Truncate or pad to MAX_SEQ_LENGTH
    if len(tokens) > MAX_SEQ_LENGTH:
        tokens = tokens[:MAX_SEQ_LENGTH]
    else:
        tokens += [PADDING_TOKEN] * (MAX_SEQ_LENGTH - len(tokens))

    # Convert tokens to indices
    index_sequence = [word_to_index.get(token, word_to_index[PADDING_TOKEN]) for token in tokens]
    
    # Convert to tensor of shape (1, T)
    tensor_sequence = torch.tensor(index_sequence).unsqueeze(0)
    
    return tensor_sequence

def inference(email):
    # Preprocess and convert to tensor
    input_tensor = tokenize_and_prepare_single(email)
    
    # Pass the input through the model
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        output = model(input_tensor.to(device))
        output = output[:, 0, :]
    
    return output

import torch

# Assuming preprocess_text and inference are defined

# List of email bodies to process
email_bodies = [
    "Hello [Your Name], We detected suspicious activity on your account. To secure your information and prevent further unauthorized access, your account has been temporarily suspended. Please verify your account details to reactivate it and restore access. Immediate Action Required: Verify Account Now: [Hyperlink - fake link to a phishing site, e.g., http://secure-account-verify.com] Enter your account username and password on the next page to confirm your identity. Please note: If you do not respond within 24 hours, your account will be permanently suspended, and any stored information may be deleted. Thank you for your prompt attention. Sincerely, Customer Support Team Secure Online Services",
    
    "Hello Thank you for signing up with Our Company! We’re thrilled to have you on board and look forward to providing you with the best possible experience. To get started, here are a few steps you can take: Log In to Your Account: Visit our website at https://ourcompany.com/login and enter your username and password. Explore Our Features: Discover all the features we offer to help make your experience seamless and enjoyable. Contact Support: If you need any help, don’t hesitate to reach out! Our support team is here for you 24/7. Please feel free to reply to this email or call us at (123) 456-7890 with any questions. We hope you enjoy using Our Company! Best regards, The Our Company Team support@ourcompany.com (123) 456-7890",

    "Beautiful,Custom Websites $359 Complete! Beautiful, 100% Custom Websites, $359 Complete! Get beautiful, 100% Custom Web Site (or redesigned)В $359!*We references coast coastВ give plenty sites view! Includes 7 pages (you add more), java rollover buttons, feedback forms, more. constructed taste specifications. use templates,В sites completely custom.В В В *Must host us @ $19.95/mo (100 Megs, 20 Email accounts, Control Panel, Front Page, Graphical Statistics, more).В sites view, complete call message center 321-726-2209 (24 hours). call returned promptly.NOTE:В В using web based email program (such Yahoo, Hotmail, etc.) form work.В Instead using form,В CLICK .В В В Name: Phone w/AC*:State: Type Project: New Site:Redesign Flash Intro/bannerВ Current site?:Comments:В В wish receive messages, CLICK В (Please enter email addresses (in body message) wish eliminated future mailings.[8734IcUm5-941FzRj1099DvNh4-450ZrLD0601Vnia4-052sMEW9113qIaU3-562mEwQ9923iBvN2-17@75]",

    ">> get far road, people think >> creating spambayes package containing classifier tokenizer? >> minimize clutter site-packages. Guido> early IMO (if mean leave various tools Guido> it).Well, mentioned classifier tokenize thought importable modules. rest represent script-level code, right? Guido> package this, perhaps use Barry's trick Guido> email package making package toplevel Guido> dir distribution (rather requiring extra directory Guido> level package subdir distro).That would perfect. tried naive way last night, wound .py files package, intent.Skip",

    "Hello, I'm  Sir Leonard Valentinovich Blavatnik  a British businessman, investor, and philanthropist and president (Access Industries). I gave away 20 percent of my personal wealth to charity. And I also pledged to give away the rest of 20% this year 2024 to Individuals. I have decided to donate €570.000,00(Five Hundred and seventy Thousand Euros) to you. If you are interested in my donation, do contact me for more info. You can also read more about me via the link below https://en.m.wikipedia.org/wiki/Len_Blavatnik Warm Regards president(Access Industries)Sir Leonard Valentinovich Blavatnik",

    "Hey there, Hackers! Thank you for signing up for the NewHacks Hackathon on Devpost!  We’re excited to have you on board for an awesome weekend of coding, creativity, and collaboration. This is your chance to work on something amazing, connect with other students, and maybe even win some cool prizes! Here’s what you need to do next: Join the Discord Community: Our official Discord server is where all the action happens—team formation, announcements, mentorship, and more. Be sure to join using this link. Check the Schedule: Keep an eye on the timeline for important hackathon events like workshops, team building, and submission deadlines. Form Your Team: If you don’t have a team yet, don’t worry! Join the Discord server and meet others looking to collaborate.  We can’t wait to see your amazing ideas. Stay tuned for more details, and don’t hesitate to reach out if you have any questions. See you on Discord! Best,The NewHacks Team",

    "This email is from Elon Musk and the Founder, CEO and Chief Engineer of the SpaceX team; early-stage investor, CEO and product architect of Tesla, Inc.; Founder of The Boring Company; and co-founder of Neuralink and OpenAI. With an estimated net worth of around $245 billion. Your email address was randomly selected from the US,Canada and Europe e-mail database and you've won 18,087.71 Tesla Stock at $228.0 per share (TSLA) valued at $4,124,270.00. Please take this email seriously by replying to this email or by contacting (teslastockfoundation.team4@yandex.com) for claim.",

    "We want to inform you about an upcoming change to your Paperspace communications. As you may have heard, Paperspace has been acquired by DigitalOcean, a leading cloud computing platform trusted by millions of developers worldwide. What This Means for You Your existing Paperspace services, projects, and workspaces will continue to operate without interruption. You'll benefit from DigitalOcean's world-class support team and extensive documentation. Coming Changes Starting next week, you'll notice that your communications will come from DigitalOcean instead of Paperspace. Your login credentials will remain the same. Rest assured, there will be no changes to how you access your Paperspace console - you can continue to access all your workspaces and projects through console.paperspace.com as usual. Thank you for being a valued Paperspace customer. We're excited to continue serving you as part of the DigitalOcean family. Best regards,  The DigitalOcean Team",

    "Thank you for signing up for Fall Career Fair! This is a gentle reminder that the event is coming up soon. Please review the following information: Time: Fri, Oct 4, 2024, 10:00 A Location: The Carlu (444 Yonge St #7) We're excited to see you there! For real-time updates on events and event logistics, make sure to follow our social media Cheers, YNCN Team",

    "Hi Kenny Now that you have registered for the VCT Hackathon: Esports Manager Challenge, it is time to start formulating an idea and building your project! Here are a few places to get started if you haven't done so already: 1.) Join the Devpost Discord and check in to the hackathon channel to brainstorm ideas and get technical help and tips by connecting with the developer community 2.) Looking to form a team? If you want to work in a team to create your project, navigate over to the Participants tab and search for other participants looking for teammates. You can message them right from the platform! 3.) Check out the Resources Tab to: Find upcoming workshops and office hours Review technical resources to help you on your building journey Explore ways to connect with the community and get your questions answered 4.) Don't forget - this hackathon also is providing a bonus opportunity to score prizes by providing feedback If you have any hackathon questions, please feel free to reach out via the Discussion Forum or Discord! Happy Building! Best, Cassie Devpost 222 Broadway New York, NY 10038 Want to learn more about hackathons? Join our Discord server!",

    "Hi, my name is Raad Ahammad, and I am a recent CS graduate from University of Texas at Dallas. I am new in participating in hackathons and I am seeking for teammates. I prefer to at least have 1 member to be experienced with hackathons. I can see that you completed 4 projects and share my interest in cybersecurity, machine learning, and programming. The languages I've used for programming include Java, Python, and C++. Please feel free to accept my request. If you're interested in working together, please get in touch by replying to this email or using my email address: raadahammad@gmail.com. If this doesn't sound like a good match, let me know or feel free to ignore this request. Thanks, raadahammad This email was sent to you because you've indicated you're looking for teammates on VCT Hackathon: Esports Manager Challenge. You can change your teammate settings for this hackathon here. If this message contains spam or unwanted messages let us know at support@devpost.com."
]

# Set print options for better visibility
torch.set_printoptions(precision=6, sci_mode=False)

# Loop over each email body
for i, email_body in enumerate(email_bodies, start=1):
    print(email_body)
    processed_text = preprocess_text(email_body)  # Preprocess the email body
    print(processed_text)
    result = inference(processed_text)  # Run inference
    result = torch.nn.functional.softmax(result, dim=1).float()  # Apply softmax
    
    # Print the results
    print(f"Email {i} result: {result}")
    if result[0][0] > result[0][1]:
        print("Not phishing")
    else:
        print("Phishing")
    print("\n")  # Print a newline for better readability between emails


Hello [Your Name], We detected suspicious activity on your account. To secure your information and prevent further unauthorized access, your account has been temporarily suspended. Please verify your account details to reactivate it and restore access. Immediate Action Required: Verify Account Now: [Hyperlink - fake link to a phishing site, e.g., http://secure-account-verify.com] Enter your account username and password on the next page to confirm your identity. Please note: If you do not respond within 24 hours, your account will be permanently suspended, and any stored information may be deleted. Thank you for your prompt attention. Sincerely, Customer Support Team Secure Online Services
hello [ name ] , detected suspicious activity account . secure information prevent unauthorized access , account temporarily suspended . please verify account details reactivate restore access . immediate action required : verify account : [ hyperlink - fake link phishing site , e.g. , http : //secur

In [79]:
# Save the model's state_dict
torch.save(model.state_dict(), 'model_weights.pth')

In [43]:
state_dict = torch.load('/kaggle/input/model-weights/phishing_detection.pth')

# Load the state_dict into the model
model.load_state_dict(state_dict)

  state_dict = torch.load('/kaggle/input/model-weights/phishing_detection.pth')


<All keys matched successfully>