In [1049]:
import pandas as pd
import os
import json
import re
import nltk

# from bs4 import BeautifulSoup

# Safe Email = 0
# Phishing Email = 1



Data Collection and Processing

Kaggle Email Dataset

In [1050]:
ceas = pd.read_csv('Datasets/Email/CEAS_08.csv')
ling = pd.read_csv('Datasets/Email/Ling.csv')
nazario = pd.read_csv('Datasets/Email/Nazario.csv')
nigerian = pd.read_csv('Datasets/Email/Nigerian_Fraud.csv')
spamAssasin = pd.read_csv('Datasets/Email/SpamAssasin.csv')

dropColumns = ['sender','receiver','date','urls']
ceas  = ceas.drop(dropColumns,axis=1)
nazario = nazario.drop(dropColumns,axis=1)
nigerian = nigerian.drop(dropColumns,axis=1)
spamAssasin = spamAssasin.drop(dropColumns,axis=1)
print(ceas.shape)
print(nazario.shape)
print(nigerian.shape)
print(spamAssasin.shape)

(39154, 3)
(1565, 3)
(3332, 3)
(5809, 3)


Enron Email Dataset

In [1051]:
enron = pd.read_csv('Datasets/Email/Enron.csv')
print(enron.shape)
enron.head()


(29767, 3)


Unnamed: 0,subject,body,label
0,"hpl nom for may 25 , 2001",( see attached file : hplno 525 . xls )\r\n- h...,0
1,re : nom / actual vols for 24 th,- - - - - - - - - - - - - - - - - - - - - - fo...,0
2,"enron actuals for march 30 - april 1 , 201","estimated actuals\r\nmarch 30 , 2001\r\nno flo...",0
3,"hpl nom for may 30 , 2001",( see attached file : hplno 530 . xls )\r\n- h...,0
4,"hpl nom for june 1 , 2001",( see attached file : hplno 601 . xls )\r\n- h...,0


Spear Fishing Email Dataset

In [1052]:
spear = pd.DataFrame()

directory = 'Datasets/Email/spear_phishing'
for file in os.listdir(directory):
    filepath = os.path.join(directory, file)
    with open(filepath, 'r') as f:
        data = json.load(f)
    temp = pd.DataFrame([data])
    spear = pd.concat([spear, temp], ignore_index=True) 

spear = spear.drop(['sender_name'],axis=1)  
spear = spear.rename(columns={'email_subject': 'subject', 'email_body': 'body'})
spear['label'] = 0


spear.head()

Unnamed: 0,subject,body,label
0,Important Information - Action Required,<html><body style='background-color:white;'><h...,0
1,Invitation to Cutting-Edge Cybersecurity Webinar,"<html><body><h3>Dear Gale Robinson,</h3><p>I h...",0
2,Important Announcement from SafeSecurity,"<html><body><p>Dear Jennifer,</p><br><p>I hope...",0
3,Important Information Regarding Cybersecurity,"<html><body><h3>Dear Ethan Hawk,</h3><p>I hope...",0
4,Important Update Regarding Cybersecurity,"<html><body><h2>Dear Moses Sharp,</h2><p>We ho...",0


SMS Dataset

In [1053]:
# Tidy up SMS data so it can be concat on email data
sms = pd.read_csv('Datasets/SMS/Dataset_5971.csv')
sms = sms.drop(['URL','EMAIL','PHONE'],axis=1)

sms = sms.rename(columns={'LABEL': 'label', 'TEXT': 'body'})
sms = sms[['body', 'label']]

# Change all labels to match labels I am using in email dataset
sms['label'] = sms['label'].replace(['ham'], 0)
sms['label'] = sms['label'].replace(['Spam','Smishing','smishing','spam'], 1)

print(sms.shape)
sms.head(-5)

# Can I use this without a subject column?

(5971, 2)


  sms['label'] = sms['label'].replace(['Spam','Smishing','smishing','spam'], 1)


Unnamed: 0,body,label
0,Your opinion about me? 1. Over 2. Jada 3. Kusr...,0
1,What's up? Do you want me to come online? If y...,0
2,So u workin overtime nigpun?,0
3,"Also sir, i sent you an email about how to log...",0
4,Please Stay At Home. To encourage the notion o...,1
...,...,...
5961,Kay... Since we are out already,0
5962,Ü log off 4 wat. It's sdryb8i,0
5963,call now 08707509020 Just 20p per min NTT Ltd...,1
5964,Are you angry with me. What happen dear,0


Combining Datasets

In [1054]:
# Concat all datasets
combined = pd.concat([ceas,enron,ling,nazario,nigerian,spamAssasin,spear,sms],axis=0)
combined.head(-5)

Unnamed: 0,subject,body,label
0,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1
1,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1
2,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1
3,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0
4,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1
...,...,...,...
5961,,Kay... Since we are out already,0
5962,,Ü log off 4 wat. It's sdryb8i,0
5963,,call now 08707509020 Just 20p per min NTT Ltd...,1
5964,,Are you angry with me. What happen dear,0


Cleaning and Normalising Datasets

In [1055]:
# Combine subject and body
combined.fillna({'subject': ''}, inplace=True)
combined.fillna({'body': ''}, inplace=True)
combined['text'] = combined['body'] + ' ' + combined['subject']
combined = combined.drop(['subject','body'],axis=1)

# Remove NaN and blank
combined = combined[combined['text'] != '']
null_count = combined.isnull().sum().sum()
print('Number of null values:\n', null_count)
blank_count = (combined['text']=='').sum()
print('Number of blank values:\n',blank_count)

# Remove duplicates
combined = combined.drop_duplicates(subset='text', keep='first')
dupe_count = combined.duplicated(keep='first').sum()
print('Number of dupe:\n', dupe_count)

# Normalise text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Latency, make faster?
combined['text'] = combined['text'].apply(clean_text)

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

combined['text'] = combined['text'].apply(tokenize_and_remove_stopwords)

combined.head(-5)


Number of null values:
 0
Number of blank values:
 0
Number of dupe:
 0


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jackt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: module 'nltk' has no attribute 'stopwords'

Splitting Datasets

In [None]:
# Split the dataset into safe and phishing emails
safeEmails = combined.loc[combined['label'] == 0]
phishingEmails = combined.loc[combined['label'] == 1]

# Shrink datasets to the same size to ensure fairness in training
safeEmails = safeEmails.sample(frac = 1)
phishingEmails = phishingEmails.sample(frac = 1)
phishingEmails = phishingEmails.head(safeEmails.shape[0])

Data Cleaning

URLs

In [None]:
urls = pd.read_csv('Datasets/URL/phishingAndLegitURL.csv')
urlData = urls.drop(['FILENAME', 'URLLength', 'Domain', 'DomainLength', 'IsDomainIP',
       'TLD', 'URLSimilarityIndex', 'CharContinuationRate',
       'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain',
       'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio',
       'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL',
       'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL',
       'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
       'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength',
       'HasTitle', 'Title', 'DomainTitleMatchScore', 'URLTitleMatchScore',
       'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect',
       'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame',
       'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton',
       'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto',
       'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef',
       'NoOfEmptyRef', 'NoOfExternalRef'],axis=1)
urlData

# 1 = safe  0 = phishing

Unnamed: 0,URL,label
0,https://www.southbankmosaics.com,1
1,https://www.uni-mainz.de,1
2,https://www.voicefmradio.co.uk,1
3,https://www.sfnmjournal.com,1
4,https://www.rewildingargentina.org,1
...,...,...
235790,https://www.skincareliving.com,1
235791,https://www.winchester.gov.uk,1
235792,https://www.nononsensedesign.be,1
235793,https://patient-cell-40f5.updatedlogmylogin.wo...,0
