In [153]:
# Standard libraries
import os
import bz2
import tarfile
import glob
from copy import deepcopy

# Data Cleansing
import pandas as pd
import re

# URL libs
import requests
from bs4 import BeautifulSoup

In [218]:
# Global variables to specify where to get the email data (lib_url) and where to save it (main_dir, sub_dir)
# These will be referenced in the data curation phase of execution.
lib_url = "https://spamassassin.apache.org/old/publiccorpus/"
main_dir = "data"
sub_dir = "extracted"

In [136]:
def create_directory_structure(main_dir, sub_dir):
    """
    Function to create the directory structure on disk in the case that it doesn't already exist
    The directory will be created in the same directory as the source file and will use the structure main_dir/sub_dir
    Input:
        main_dir: the top level of the directory structure
        sub_dir: sublevel in the directory structure
    Returns:
        No return.
    """
    try:
        os.mkdir(".\\" + main_dir)
    except:
        print("Directory already exists.")
        try:
            os.mkdir(".\\"+ main_dir +"\\"+ sub_dir +"\\")
        except:
            print("Directory already exists.")
    print("Directory Structure Created")

In [139]:
def download_email_records(url):
    """
    Function to download the page source from online directory and create a list of filenames with .tar.bz2 extensions
    Input:
        url: the lib_url defined in the source
    Returns:
        downloadable: a list of urls made of the lib_url web address and the filenames extracted from the hyperlines in the page source.
    """
    soup = BeautifulSoup(requests.get(lib_url).text)
    urls = soup.find_all('a')
    filenames = [url['href'] for url in urls if "bz2" in str(url)]
    downloadable = [lib_url + filename for filename in filenames]
    print("Email archive urls extracted")
    return downloadable

In [138]:
def save_email_records(file_urls):
    """
    Function to download the email archives and write them to disk in the directory hierarchy
    Input:
        file_urls: a list of urls pointing to the email archives
    Returns:
        No return.
    """
    for i, url in enumerate(file_urls):
        dl = requests.get(url, allow_redirects = True)
        open(".//" + main_dir + "//"+filenames[i], 'wb').write(dl.content)
    print("Email archives downloaded from url")

In [137]:
def extract_email_records(main_dir, sub_dir):
    """
    Function to extract the email records from the downloaded email archives.
    Loops through all files in each level of the directory hierarchy and extracts the data from all tar.bz2 archives to disk.
    Input:
        main_dir: the top level of the directory structure
        sub_dir: sublevel in the directory structure
    Returns:
        No return.
    """
    for filepath in glob.glob(".\\" + main_dir + "\\*.tar.bz2"):
        #zipfile = bz2.BZ2File(filepath)
        #data = zipfile.read()
        #newfile = filepath[:-4]
        #open(newfile, "wb").write(data)
        tar = tarfile.open(filepath, "r:bz2")
        tar.extractall(os.path.join(main_dir+ "\\"+ sub_dir, filepath[7:-8]))
        tar.close()
    print("Email records extracted")

In [140]:
def dl_and_create_email_records(url, main_dir, sub_dir):
    urls = download_email_records(url)
    create_directory_structure(main_dir = main_dir, sub_dir = sub_dir)
    save_email_records(file_urls = urls)
    extract_email_records(main_dir = main_dir, sub_dir = sub_dir)
    print("Email records downloaded & extracted")

In [141]:
def get_target_directory_details(target_dir):
    sub_directories = glob.glob(target_dir + "\\extracted\\*\\*")
#     print(target_dir)
#     print(os.path.join(target_dir, "\\extracted\\*\\*"))
#     print(sub_directories)
    names = [(x.split("\\")[-1], "HAM" if x.find("ham") >=0 else "SPAM") for x in sub_directories]
    email_type_names = list(zip(names, sub_directories))
    print("Target directories extracted")
    return email_type_names

In [142]:
def parse_email(email, line_names, target):
    """
    Take in the filename of an email document.
    Extract any information relating to the predefined tags
    Store any information after the subject line as body - to be further processed later
    """
    try:
        with open(email) as file:
            body_start = False # Changed to True after reading the subject tag.
            body = []
            value_dict = {}
            value_dict['target'] = target

            for line in file.readlines():
                line_start = line.split(":")[0]+":"
                if body_start:
                    body.append(line.strip())
                if line_start in line_names:
                    line_contents = re.findall(r":\s(.*)", line)[0]
                    value_dict[line_start] = line_contents
                if line_start == "Subject:":
                    body_start = True
            value_dict['body'] = "\n".join(body)
            return value_dict
    except Exception as e:
        print(f"{e}: Error: Can't read file {email}")

In [143]:
def get_email_target_mappings(main_dir):
    email_type_names = get_target_directory_details(".\\" + main_dir)
    directories = [x[1] for x in email_type_names]
    targets = [x[0][1] for x in email_type_names]
    target_mapping = list(zip(directories, targets))
    print("Target mappings extracted")
    return target_mapping

In [144]:
def get_directory_file_listing(dir_path):
    print(dir_path + "\\")
    return glob.glob(dir_path + "\\*")

In [145]:
def extract_email_data_to_dictionary(main_dir):
    line_names = ["To:", "From:", "MIME-Version:", "Content-Type:",
                 "Content-Transfer-Encoding:", "X-Mailer:", "Subject:",
                 "Precedence:"]

    target_mapping = get_email_target_mappings(main_dir = main_dir)
    email_contents = {}
    for target in target_mapping:
        for file in get_directory_file_listing(target[0]):
            email_contents[file.split("\\")[-1]] = parse_email(file, line_names, target[1])
    print("Emails extracted to dictionary")
    return email_contents

In [146]:
def convert_dict_to_dataframe(email_dict):
    df = pd.DataFrame.from_dict(email_dict).transpose().reset_index()
    print("DataFrame generated")
    return df

In [147]:
def generate_base_email_dataframe():
    email_dict = extract_email_data_to_dictionary(main_dir = main_dir)
    print("Base dataframe ready for cleansing")
    return convert_dict_to_dataframe(email_dict)

In [148]:
dl_and_create_email_records(lib_url, main_dir, sub_dir)
base_email_df = generate_base_email_dataframe()

Email archive urls extracted
Directory Structure Created
Email archives downloaded from url
Email records extracted
Email records downloaded & extracted
Target directories extracted
Target mappings extracted
.\data\extracted\20021010_easy_ham\easy_ham\
.\data\extracted\20021010_hard_ham\hard_ham\
.\data\extracted\20021010_spam\spam\
'charmap' codec can't decode byte 0x81 in position 3082: character maps to <undefined>: Error: Can't read file .\data\extracted\20021010_spam\spam\0123.68e87f8b736959b1ab5c4b5f2ce7484a
list index out of range: Error: Can't read file .\data\extracted\20021010_spam\spam\0255.42a6feb4435a0a68929075c0926f085d
'charmap' codec can't decode byte 0x81 in position 2588: character maps to <undefined>: Error: Can't read file .\data\extracted\20021010_spam\spam\0273.51c482172b47ce926021aa7cc2552549
'charmap' codec can't decode byte 0x81 in position 2503: character maps to <undefined>: Error: Can't read file .\data\extracted\20021010_spam\spam\0330.a4df526233e524104c3b3

In [149]:
base_email_df

Unnamed: 0,index,target,From:,To:,Subject:,MIME-Version:,Content-Type:,Precedence:,body,X-Mailer:,Content-Transfer-Encoding:
0,0001.ea7e79d3153e7469e7a9c3e0af6a357e,HAM,Robert Elz <kre@munnari.OZ.AU>,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,Re: New Sequences Window,1.0,text/plain; charset=us-ascii,bulk,In-Reply-To: <1029945287.4797.TMDA@deepeddy.vi...,,
1,0002.b3120c4bcbf3101e661161ee7efcb8bf,HAM,Steve Burt <Steve_Burt@cursor-system.com>,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...",[zzzzteana] RE: Alexander,1.0,text/plain; charset=US-ASCII,bulk,Reply-To: zzzzteana@yahoogroups.com\nContent-T...,Internet Mail Service (5.5.2653.19),7bit
2,0003.acfc5ad94bbd27118a0d8685d18c89dd,HAM,"""Tim Chapman"" <timc@2ubh.com>",zzzzteana <zzzzteana@yahoogroups.com>,[zzzzteana] Moscow bomber,1.0,text/plain; charset=US-ASCII,bulk,Reply-To: zzzzteana@yahoogroups.com\nContent-T...,Microsoft Outlook Express Macintosh Edition - ...,7bit
3,0004.e8d5727378ddde5c3be181df593f1712,HAM,Monty Solomon <monty@roscom.com>,undisclosed-recipient: ;,[IRR] Klez: The Virus That Won't Die,1.0,"text/plain; charset=""us-ascii""",bulk,Sender: irregulars-admin@tb.tf\nErrors-To: irr...,,
4,0005.8c3b9e9c0f3f183ddaf7592a11b99957,HAM,Tony Nugent <tony@linuxworks.com.au>,Exmh Users Mailing List <exmh-users@example.com>,Re: Insert signature,,,bulk,X-Loop: exmh-users@example.com\nSender: exmh-u...,nmh-1.0.4 exmh-2.4,
...,...,...,...,...,...,...,...,...,...,...,...
9345,01396.e80a10644810bc2ae3c1b58c5fd38dfa,SPAM,Professional_Career_Development_Institute@Frug...,yyyy@netnoteinc.com,Busy? Home Study Makes Sense!,,text/html,,Id-Frugaljoe: yyyy####netnoteinc.com\nDate: Tu...,,
9346,01397.f75f0dd0dd923faefa3e9cc5ecb8c906,SPAM,"""IQ - TBA"" <tba@insiq.us>",<yyyy@spamassassin.taint.org>,Preferred Non-Smoker Rates for Smokers,1.0,text/html;,,"To: <yyyy@spamassassin.taint.org>\nDate: Tue, ...",Microsoft CDO for Windows 2000,quoted-printable
9347,01398.8ca7045aae4184d56e8509dc5ad6d979,SPAM,Mike <raye@yahoo.lv>,Mailing.List@user2.pro-ns.net,"How to get 10,000 FREE hits per day to any web...",,"text/plain; charset=""iso-8859-1""",,Sender: Mike <raye@yahoo.lv>\nMime-Version: 1....,Microsoft Outlook Build 10.0.2616,
9348,01399.2319643317e2c5193d574e40a71809c2,SPAM,"""Mr. Clean"" <cweqx@dialix.oz.au>",<Undisclosed.Recipients@webnote.net>,Cannabis Difference,1.0,text/plain;,,"Date: Wed, 05 Aug 2020 04:01:50 -1900\nMIME-Ve...",,7bit


In [215]:
cleansed_email_df = deepcopy(base_email_df)

In [170]:
cleansed_email_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9350 entries, 0 to 9349
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   index                       9350 non-null   object
 1   target                      9331 non-null   object
 2   From:                       9329 non-null   object
 3   To:                         9006 non-null   object
 4   Subject:                    9322 non-null   object
 5   MIME-Version:               6208 non-null   object
 6   Content-Type:               8052 non-null   object
 7   Precedence:                 5304 non-null   object
 8   body                        9331 non-null   object
 9   X-Mailer:                   3650 non-null   object
 10  Content-Transfer-Encoding:  4604 non-null   object
dtypes: object(11)
memory usage: 803.6+ KB


In [211]:
def rename_columns_remove_colon_from_column_name(df):
    """
    Function to remove the colon from the column headers and force the text to lowercase
    Input:
        df: the target dataframe
    Returns:
        df: df with renamed columns
    """
    df.columns = [x.replace(":", "").lower() for x in df.columns]
    return df

In [212]:
def extract_email_components_to_features(df, user_types):
    """
    Function to extract components of the to & from columns to new features
        fullname: the fullname of the sender that prefixs the email address
        email: the full email address contained in '<email_address>'
        username: the username from the email address (everything before @)
        domain: the domain of the email address (everything after @)
    Inputs:
        df: the target dataframe
        user_types: list of types of user that will be processed i.e. ['to'], ['from'], ['to', 'from']
    Returns:
        df: df with additional features added.
    """
    for user_type in user_types:
        # Split the FROM column into full name, username and domain
        df[str(user_type + '_fullname')] = df[str(user_type)].str.split("<", n = 1).str[0].str.replace('"', "")

        # Extract the from email
        df[str(user_type + '_email')] = df[str(user_type)].str.split("<").str[1].str.replace('>', "")

        # Extract the from username
        df[str(user_type + '_username')] = df[str(user_type + '_email')].str.extract(r'(.*)[@]')

        # Extract the from domain
        df[str(user_type + '_domain')] = df[str(user_type + '_email')].str.extract(r'[@](.*)')
    
    return df

In [216]:
# try functions
cleansed_email_df = rename_columns_remove_colon_from_column_name(cleansed_email_df)
cleansed_email_df = extract_email_components_to_features(cleansed_email_df, ['to', 'from'])

In [217]:
cleansed_email_df.head()

Unnamed: 0,index,target,from,to,subject,mime-version,content-type,precedence,body,x-mailer,content-transfer-encoding,to_fullname,to_email,to_username,to_domain,from_fullname,from_email,from_username,from_domain
0,0001.ea7e79d3153e7469e7a9c3e0af6a357e,HAM,Robert Elz <kre@munnari.OZ.AU>,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,Re: New Sequences Window,1.0,text/plain; charset=us-ascii,bulk,In-Reply-To: <1029945287.4797.TMDA@deepeddy.vi...,,,Chris Garrigues,cwg-dated-1030377287.06fa6d@DeepEddy.Com,cwg-dated-1030377287.06fa6d,DeepEddy.Com,Robert Elz,kre@munnari.OZ.AU,kre,munnari.OZ.AU
1,0002.b3120c4bcbf3101e661161ee7efcb8bf,HAM,Steve Burt <Steve_Burt@cursor-system.com>,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...",[zzzzteana] RE: Alexander,1.0,text/plain; charset=US-ASCII,bulk,Reply-To: zzzzteana@yahoogroups.com\nContent-T...,Internet Mail Service (5.5.2653.19),7bit,'zzzzteana@yahoogroups.com',zzzzteana@yahoogroups.com,zzzzteana,yahoogroups.com,Steve Burt,Steve_Burt@cursor-system.com,Steve_Burt,cursor-system.com
2,0003.acfc5ad94bbd27118a0d8685d18c89dd,HAM,"""Tim Chapman"" <timc@2ubh.com>",zzzzteana <zzzzteana@yahoogroups.com>,[zzzzteana] Moscow bomber,1.0,text/plain; charset=US-ASCII,bulk,Reply-To: zzzzteana@yahoogroups.com\nContent-T...,Microsoft Outlook Express Macintosh Edition - ...,7bit,zzzzteana,zzzzteana@yahoogroups.com,zzzzteana,yahoogroups.com,Tim Chapman,timc@2ubh.com,timc,2ubh.com
3,0004.e8d5727378ddde5c3be181df593f1712,HAM,Monty Solomon <monty@roscom.com>,undisclosed-recipient: ;,[IRR] Klez: The Virus That Won't Die,1.0,"text/plain; charset=""us-ascii""",bulk,Sender: irregulars-admin@tb.tf\nErrors-To: irr...,,,undisclosed-recipient: ;,,,,Monty Solomon,monty@roscom.com,monty,roscom.com
4,0005.8c3b9e9c0f3f183ddaf7592a11b99957,HAM,Tony Nugent <tony@linuxworks.com.au>,Exmh Users Mailing List <exmh-users@example.com>,Re: Insert signature,,,bulk,X-Loop: exmh-users@example.com\nSender: exmh-u...,nmh-1.0.4 exmh-2.4,,Exmh Users Mailing List,exmh-users@example.com,exmh-users,example.com,Tony Nugent,tony@linuxworks.com.au,tony,linuxworks.com.au
