## Import Libraries

In [1]:
import os
import random
import pandas as pd
import numpy as np
import raw_utils as util
import eml_parser
import csv
random.seed(1746)

## Phishing

### Nazario Phishing Corpus

In [2]:
# Paths
cwd = os.getcwd()
nazario_path = os.path.join(cwd, 'datasets/phishing/nazario/')
enron_path = os.path.join(cwd, 'datasets/legitimate/enron/')

csv_path = os.path.join(cwd, 'datasets/csv/')

In [3]:
# Files to be ignored for read_dataset()
files_ignored_recent = ['README.txt', '20051114.mbox',  'phishing0.mbox',  'phishing1.mbox',  'phishing2.mbox',  'phishing3.mbox', 'private-phishing4.mbox']

In [4]:
phishingMessageBodies = []
phishing_recent = util.read_dataset(nazario_path, files_ignored_recent, text_only=True)
phishingMessageBodies = phishing_recent['body'].tolist()

Now reading file: phishing-2016




Now reading file: phishing-2019
Now reading file: phishing-2021
Now reading file: phishing-2018
Now reading file: phishing-2020
Now reading file: phishing-2024
Now reading file: phishing-2023
Now reading file: phishing-2015
Now reading file: phishing-2022
Now reading file: phishing-2017


In [5]:
print("Phishing messages: ", len(phishingMessageBodies))
phishing_recent.shape

Phishing messages:  2770


(2770, 1)

In [6]:
util.save_to_csv(phishing_recent, csv_path, 'nazario_recent.csv')

File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/nazario_recent.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/nazario_recent.csv will be overwritten.


In [7]:
# Clair Fraud Email Database
added = []
with open("datasets/phishing/fradulent_emails.txt", 'r', errors="ignore") as f:
    body = ""
    inBody = False
    for line in f:
        if line.startswith("Status: O"):
            inBody = True
        
        elif line.startswith("From r") and len(body) > 0:
            inBody = False
            added.append(body)
            body = ""

        elif inBody:
            body += line


# convert list to dataframe
phishing_clair = pd.DataFrame(added, columns=['body'])
phishingMessageBodies = list(set(phishingMessageBodies + [a for a in added if len(a) > 0]))
print(len(phishingMessageBodies))

3740


In [8]:
phishing_clair.head()

Unnamed: 0,body
0,\nFROM:MR. JAMES NGOLA.\nCONFIDENTIAL TEL: 233...
1,"\nDear Friend,\n\nI am Mr. Ben Suleman a custo..."
2,"\nDear Sir,\n\nI am Barrister Tunde Dosumu (SA..."
3,\nFROM: WILLIAM DRALLO.\nCONFIDENTIAL TEL: 233...
4,"\nCHALLENGE SECURITIES LTD.\nLAGOS, NIGERIA\n\..."


In [10]:
util.save_to_csv(phishing_clair, csv_path, 'fradulent_emails.csv')

File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/fradulent_emails.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/fradulent_emails.csv will be overwritten.


In [11]:
# SpamAssassin Spam (not exactly phishing, but NVIDIA article used it as phishing so attempting it)
ep = eml_parser.EmlParser(include_raw_body=True)

spamDir = "datasets/phishing/spam_2/spam_2/"
spamFilenames = [os.path.join(spamDir, f) for f in os.listdir(spamDir)]

added = []
files_ignored = []

for filename in spamFilenames:
    try:
        with open(filename, "rb") as f:
            b = f.read()
        
        m = ep.decode_email_bytes(b)
        if len(m["body"]) >= 1:
            body_content = m["body"][0]["content"]
            content_type = m["body"][0].get("content_type", "text/plain")

            # Kiểm tra loại nội dung (text/plain hoặc text/html)
            if content_type == "text/plain":
                added.append(body_content.strip())
            elif content_type == "text/html":
                clean_content = util.parse_html(body_content)
                added.append(clean_content)
        else:
            files_ignored.append(filename)
    except Exception as e:
        files_ignored.append(filename)

print("Files ignored due to errors:", files_ignored)

    
spam = pd.DataFrame(added, columns=['body'])
phishingMessageBodies = list(set(phishingMessageBodies + added))
print(len(phishingMessageBodies))

FROM header parsing failed.
FROM header parsing failed.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
FROM header parsing failed.
FROM header parsing failed.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
FROM header parsing failed.


Files ignored due to errors: ['datasets/phishing/spam_2/spam_2/01214.973b4598b630a989967ff69b19f95d4a', 'datasets/phishing/spam_2/spam_2/00357.049b1dd678979ce56f10dfa9632127a3']
4929


In [12]:
spam.head()

Unnamed: 0,body
0,Hunza Bread\nHunza Bread\nHome made Hunza Brea...
1,Learn from the BEST...for FREE! \n\nLearn to l...
2,NOW ON SALE FOR $129.00 10 100 MG TABLETS\n\n...
3,NEW PRODUCT ANNOUNCEMENT\n\nFrom: OUTSOURCE EN...
4,"Hello ~name~,\n\nIf you ordered a flat-rate do..."


In [14]:
attempted_filename = os.path.join(csv_path, 'spamassassin.csv')
if os.path.exists(attempted_filename):
    print("File", attempted_filename, "already exists.")
    overwrite = input("Do you want to overwrite it? (y/n) ")
    if (overwrite == 'Y' or overwrite == 'y'):
        print("File", attempted_filename, "will be overwritten.")
        spam.to_csv(os.path.join(csv_path, 'spamassasin.csv'), escapechar='\\')
    else:
        print("Aborting, data will not be written.")
else:
    print("Saving to", attempted_filename)
    spam.to_csv(os.path.join(csv_path, 'spamassasin.csv'), escapechar='\\')


Saving to /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/spamassassin.csv


In [15]:
phishingDataFrame = pd.DataFrame(phishingMessageBodies, columns=['body'])

In [17]:
attempted_filename = os.path.join(csv_path, 'raw_phishing.csv')
if os.path.exists(attempted_filename):
    print("File", attempted_filename, "already exists.")
    overwrite = input("Do you want to overwrite it? (y/n) ")
    if (overwrite == 'Y' or overwrite == 'y'):
        print("File", attempted_filename, "will be overwritten.")
        phishingDataFrame.to_csv(os.path.join(csv_path, 'raw_phishing.csv'), escapechar='\\')
    else:
        print("Aborting, data will not be written.")
else:
    print("Saving to", attempted_filename)
    phishingDataFrame.to_csv(os.path.join(csv_path, 'raw_phishing.csv'), escapechar='\\')

File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/raw_phishing.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/raw_phishing.csv will be overwritten.


## Legitimate

### Easy ham and Hard ham

In [18]:
legitimateMessageBodies = []

In [19]:
ep = eml_parser.EmlParser(include_raw_body=True)

easyHamDir = "datasets/legitimate/easy_ham/easy_ham/"
hardHamDir = "datasets/legitimate/hard_ham/hard_ham/"
hamFilenames = [os.path.join(easyHamDir, f) for f in os.listdir(easyHamDir)] + [os.path.join(hardHamDir, f) for f in os.listdir(hardHamDir)]

files_ignored = []

for filename in hamFilenames:
    with open(filename, "rb") as f:
        b = f.read()
    
    m = ep.decode_email_bytes(b)
    if len(m["body"]) >= 1:
        body_content = m["body"][0]["content"]
        content_type = m["body"][0].get("content_type", "text/plain")

        # Kiểm tra loại nội dung (text/plain hoặc text/html)
        if content_type == "text/plain":
            legitimateMessageBodies.append(body_content.strip())
        elif content_type == "text/html":
            clean_content = util.parse_html(body_content)
            legitimateMessageBodies.append(clean_content)
    else:
        files_ignored.append(filename)
print("Files ignored due to errors:", files_ignored)


benign_mails = pd.DataFrame(legitimateMessageBodies, columns=['body'])
print("Legitimate messages:", len(legitimateMessageBodies))

Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.
Header field "message-id" is in an invalid format and cannot be fixed, it will be dropped.

Files ignored due to errors: []
Legitimate messages: 2578


In [20]:
benign_mails.head()

Unnamed: 0,body
0,Matthias Saou (matthias@rpmforge.net) wrote*:\...
1,|:::::::::::::::::::::::::::::::::::::::::::::...
2,"Hello Bill,\n\nMonday, September 30, 2002, 5:4..."
3,>>>Chris Garrigues said:\n > > From: Brent We...
4,"from slate's ""today's papers"": \nThe New York..."


In [21]:
util.save_to_csv(benign_mails, csv_path, 'easy_hard_ham.csv')

File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/easy_hard_ham.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/easy_hard_ham.csv will be overwritten.


### Enron mails

In [22]:
filename = util.sample_enron_to_mbox(enron_path, 4000)
enron_4000 = util.mbox_to_df(filename, enron_path+'/mbox', text_only=True)
legitimateMessageBodies += enron_4000['body'].tolist()
util.save_to_csv(enron_4000, csv_path, 'enron_text_4000.csv')

3028 folders will be checked.
300452 emails found.
Extracting 4000 random emails.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/legitimate/enron/mbox/enron_4000.mbox will be overwritten.
1 emails skipped: Headers contain non-ascii characters, or otherwise corrupted email data.
/home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/legitimate/enron/mbox/enron_4000.mbox was created successfully.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/enron_text_4000.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/enron_text_4000.csv will be overwritten.


In [23]:
print(len(legitimateMessageBodies))

6578


In [24]:
legitimateDataFrame = pd.DataFrame(legitimateMessageBodies, columns=['body'])

In [25]:
util.save_to_csv(legitimateDataFrame, csv_path, 'raw_legitimate.csv')

Saving to /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/raw_legitimate.csv
