In [63]:
import pandas as pd
import os
import json
import re
import nltk
import string
import shutil
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

nltk.download('stopwords')
nltk.download('wordnet')

# Safe Email = 0
# Phishing Email = 1



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jackt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jackt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Data Collection and Processing

Kaggle Email Dataset

In [64]:
ceas = pd.read_csv('Datasets/Email/CEAS_08.csv')
ling = pd.read_csv('Datasets/Email/Ling.csv')
nazario = pd.read_csv('Datasets/Email/Nazario.csv')
nigerian = pd.read_csv('Datasets/Email/Nigerian_Fraud.csv')
spamAssasin = pd.read_csv('Datasets/Email/SpamAssasin.csv')

dropColumns = ['sender','receiver','date','urls']
ceas  = ceas.drop(dropColumns,axis=1)
nazario = nazario.drop(dropColumns,axis=1)
nigerian = nigerian.drop(dropColumns,axis=1)
spamAssasin = spamAssasin.drop(dropColumns,axis=1)
print('Ceas Emails: '+ str(ceas.shape[0]))
print('Nazario Emails: '+ str(nazario.shape[0]))
print('Nigerian Scammer Emails: '+ str(nigerian.shape[0]))
print('Spam Assasin Emails: '+ str(spamAssasin.shape[0]))

Ceas Emails: 39154
Nazario Emails: 1565
Nigerian Scammer Emails: 3332
Spam Assasin Emails: 5809


Enron Email Dataset

In [65]:
enron = pd.read_csv('Datasets/Email/Enron.csv')
print('Enron Emails: '+ str(enron.shape[0]))
enron.head()


Enron Emails: 29767


Unnamed: 0,subject,body,label
0,"hpl nom for may 25 , 2001",( see attached file : hplno 525 . xls )\r\n- h...,0
1,re : nom / actual vols for 24 th,- - - - - - - - - - - - - - - - - - - - - - fo...,0
2,"enron actuals for march 30 - april 1 , 201","estimated actuals\r\nmarch 30 , 2001\r\nno flo...",0
3,"hpl nom for may 30 , 2001",( see attached file : hplno 530 . xls )\r\n- h...,0
4,"hpl nom for june 1 , 2001",( see attached file : hplno 601 . xls )\r\n- h...,0


Spear Fishing Email Dataset

In [66]:
spear = pd.DataFrame()

directory = 'Datasets/Email/spear_phishing'
for file in os.listdir(directory):
    filepath = os.path.join(directory, file)
    with open(filepath, 'r') as f:
        data = json.load(f)
    temp = pd.DataFrame([data])
    spear = pd.concat([spear, temp], ignore_index=True) 

spear = spear.drop(['sender_name'],axis=1)  
spear = spear.rename(columns={'email_subject': 'subject', 'email_body': 'body'})
spear['label'] = 0

print('Spear Phishing Emails: '+ str(spear.shape[0]))

spear.head()

Spear Phishing Emails: 334


Unnamed: 0,subject,body,label
0,Important Information - Action Required,<html><body style='background-color:white;'><h...,0
1,Invitation to Cutting-Edge Cybersecurity Webinar,"<html><body><h3>Dear Gale Robinson,</h3><p>I h...",0
2,Important Announcement from SafeSecurity,"<html><body><p>Dear Jennifer,</p><br><p>I hope...",0
3,Important Information Regarding Cybersecurity,"<html><body><h3>Dear Ethan Hawk,</h3><p>I hope...",0
4,Important Update Regarding Cybersecurity,"<html><body><h2>Dear Moses Sharp,</h2><p>We ho...",0


SMS Dataset

In [67]:
# Tidy up SMS data so it can be concat on email data
sms = pd.read_csv('Datasets/SMS/Dataset_5971.csv')
sms = sms.drop(['URL','EMAIL','PHONE'],axis=1)

sms = sms.rename(columns={'LABEL': 'label', 'TEXT': 'body'})
sms = sms[['body', 'label']]

# Change all labels to match labels I am using in email dataset
sms['label'] = sms['label'].replace(['ham'], 0)
sms['label'] = sms['label'].replace(['Spam','Smishing','smishing','spam'], 1)

print('SMS Messages: '+ str(sms.shape[0]))
sms.head(-5)

SMS Messages: 5971


  sms['label'] = sms['label'].replace(['Spam','Smishing','smishing','spam'], 1)


Unnamed: 0,body,label
0,Your opinion about me? 1. Over 2. Jada 3. Kusr...,0
1,What's up? Do you want me to come online? If y...,0
2,So u workin overtime nigpun?,0
3,"Also sir, i sent you an email about how to log...",0
4,Please Stay At Home. To encourage the notion o...,1
...,...,...
5961,Kay... Since we are out already,0
5962,Ü log off 4 wat. It's sdryb8i,0
5963,call now 08707509020 Just 20p per min NTT Ltd...,1
5964,Are you angry with me. What happen dear,0


Combining Datasets

In [68]:
# Concat all datasets
combined = pd.concat([ceas,enron,ling,nazario,nigerian,spamAssasin,spear,sms],axis=0)
print('Total Data Points: '+ str(combined.shape[0]))
combined.head(-5)

Total Data Points: 88791


Unnamed: 0,subject,body,label
0,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1
1,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1
2,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1
3,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0
4,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1
...,...,...,...
5961,,Kay... Since we are out already,0
5962,,Ü log off 4 wat. It's sdryb8i,0
5963,,call now 08707509020 Just 20p per min NTT Ltd...,1
5964,,Are you angry with me. What happen dear,0


Cleaning and Normalising Datasets

In [69]:
# Combine subject and body
combined.fillna({'subject': ''}, inplace=True)
combined.fillna({'body': ''}, inplace=True)
combined['text'] = combined['body'] + ' ' + combined['subject']
combined = combined.drop(['subject','body'],axis=1)

# Remove NaN and blank
combined = combined[combined['text'] != '']
null_count = combined.isnull().sum().sum()
print('Number of null values:\n', null_count)
blank_count = (combined['text']=='').sum()
print('Number of blank values:\n',blank_count)

# Remove duplicates
combined.drop_duplicates(subset='text', keep='first', inplace=True)
dupe_count = combined.duplicated(keep='first').sum()
print('Number of dupe:\n', dupe_count)

# Extract embedded links
def extract_urls(text):
    url_pattern = re.compile(r'http[s]?://\S+')
    urls = url_pattern.findall(text)
    return urls

def remove_urls(text):
    url_pattern = re.compile(r'http[s]?://\S+')
    return url_pattern.sub('', text)

#combined['urls'] = combined['text'].apply(extract_urls)
combined['text'] = combined['text'].apply(remove_urls)

# Training the body classification model only requires email bodies and does not need links
# Different model will be trained to classify links using the link dataset
# So for the training set links will be removed

def clean_text(text):
    # punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # numbers
    text = re.sub(r'\d+', '', text)
    # multiple whitespace
    text = re.sub(r'\s+', ' ', text)
    # special characters
    text = re.sub(r'[^\w\s]', '', text)
    # lower case
    text = text.lower()
    # stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # stemming
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    return text

# remove html tags, keep form tags maybe
def remove_html_tags_excluding(text, exclude_tags=['form']):
    soup = BeautifulSoup(text, "html.parser")
    exclude_tags = [tag.lower() for tag in exclude_tags]

    # Remove all tags except the ones in the exclude list
    for tag in soup.find_all(True):
        if tag.name.lower() not in exclude_tags:
            tag.unwrap()

    return str(soup)

# remove infrequent words


# See if Keras can clean and normalise for me
combined['text'] = combined['text'].apply(clean_text)
combined['text'] = combined['text'].apply(remove_html_tags_excluding)

combined.head()


Number of null values:
 0
Number of blank values:
 0
Number of dupe:
 0


Unnamed: 0,label,text
0,1,buck trouble caused small dimension soon becom...
1,1,upgrade sex pleasure technique befriend jenna ...
2,1,daily top cnncom top video story aug pm edt to...
3,0,would anyone object removing list tld basicall...
4,1,welcomefastshippingcustomersupport specialpric...


Balance label weightings

In [70]:
# Split the dataset into safe and phishing emails
safeEmails = combined.loc[combined['label'] == 0]
phishingEmails = combined.loc[combined['label'] == 1]
print('Safe Emails: ',str(safeEmails.shape[0]))
print('Phishing Emails: ',str(phishingEmails.shape[0]))

# Shrink datasets to the same size to ensure fairness in training
safeEmails = safeEmails.head(phishingEmails.shape[0])

print('Safe Emails: ',str(safeEmails.shape[0]))
print('Phishing Emails: ',str(phishingEmails.shape[0]))

combined = pd.concat([safeEmails,phishingEmails],axis=0)
print('Combined Emails: ',str(combined.shape[0]))
combined.head()

Safe Emails:  44763
Phishing Emails:  44006
Safe Emails:  44006
Phishing Emails:  44006
Combined Emails:  88012


Unnamed: 0,label,text
3,0,would anyone object removing list tld basicall...
8,0,wrzzpvsidneycom changed removed added otherbug...
15,0,plelim remind certificate needed hurry expedit...
18,0,carlos e r wrote begin pgp signed message hash...
19,0,steve jacob wrote forwarded message steve jaco...


Write data to two folders in with each email being a .txt

for index, row in combined.iterrows():
    label = row['label']
    text = row['text']
    
    file_name = f"email_{index}.txt"

    if label == 0:
        directory = 'Keras/benign'
    elif label == 1:
        directory = 'Keras/malicious'

    file_path = f'{directory}/{file_name}'

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)


Save Dataframe as CSV

In [71]:
combined = combined.reset_index(drop=True)
combined = combined[['text','label']]
combined.to_csv('Datasets/ProcessedEmails.csv',index=True)