In [1]:
import os 
import tarfile 
import urllib.request 

In [2]:
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
ham_Url = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
Spam_Url = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
Spam_Path = os.path.join("datasets", "spam")

In [3]:
# Download examples of spam and ham from Apache SpamAssassin’s public datasets
# Unzip the datasets
def fetch_spam_data(ham_url = ham_Url, spam_url = Spam_Url, spam_path=Spam_Path):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in ( ("ham.tar.bz2", ham_url), ("spam.tar.biz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()

In [4]:
fetch_spam_data()

In [5]:
HAM_DIR = os.path.join(Spam_Path, "easy_ham")
SPAM_DIR = os.path.join(Spam_Path, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [6]:
import email
import email.policy

def load_email(is_spam, filename, spam_path = Spam_Path):
    directory = "spam"  if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [7]:
ham_emails = [load_email(is_spam = False, filename = name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [8]:
# Get email structure: emails may contain multipart with images and attachments
def get_email_structure(email):
    if isinstance(email, str):
        payload =email.get_payload()
        if isinstance(payload, list):
            return "multipart({})".format(",".join([get_email_structure(sub_email)
                                                    for sub_email in payload
                                                   ]))
        else:
            return email.get_content_type()        

In [9]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures
    
structures_counter(ham_emails).most_common()

[(None, 2500)]

In [10]:
structures_counter(spam_emails).most_common()

[(None, 500)]

In [11]:
for header, value in spam_emails[0].items():
    print(header,":",value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [12]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
X

array([<email.message.EmailMessage object at 0x000001C2C48DFA48>,
       <email.message.EmailMessage object at 0x000001C2C48DF7C8>,
       <email.message.EmailMessage object at 0x000001C2C48DF788>, ...,
       <email.message.EmailMessage object at 0x000001C2C6813648>,
       <email.message.EmailMessage object at 0x000001C2C6822948>,
       <email.message.EmailMessage object at 0x000001C2C68275C8>],
      dtype=object)