# Spam Filter (Exercise 3.4)

A spam/ham email classifier built on the Apache SpamAssassin public corpus, based on the provided solutions.

In [35]:
import email
import email.parser
import email.policy
import tarfile
import urllib.request
from pathlib import Path

## Loading Data

In [38]:
def fetch_spam_data(spam_root="http://spamassassin.apache.org/old/publiccorpus/",
                    spam_path=None):
    """Download and unpack the SpamAssassin corpora that are not yet on disk.

    For each corpus, the archive ``<spam_root><filename>.tar.bz2`` is
    downloaded to ``<spam_path>/<name>.tar.bz2`` and extracted into
    ``spam_path``, unless the directory ``<spam_path>/<name>`` already exists.

    Args:
        spam_root: Base URL the archive file names are appended to. It must
            end with a separator, since it is concatenated verbatim.
        spam_path: Directory that receives the archives and the extracted
            corpora. Defaults to ``datasets/spam`` under the current working
            directory. It is created if missing.

    Returns:
        A list with one ``pathlib.Path`` per corpus directory, in the order
        easy_ham, easy_ham_2, hard_ham, spam, spam_2.
    """
    # (extracted directory name, archive base name) for every corpus.
    files = (("easy_ham", "20030228_easy_ham"),
             ("easy_ham_2", "20030228_easy_ham_2"),
             ("hard_ham", "20030228_hard_ham"),
             ("spam", "20030228_spam"),
             ("spam_2", "20030228_spam_2"))

    if spam_path is None:
        spam_path = Path() / "datasets" / "spam"
    else:
        spam_path = Path(spam_path)
    spam_path.mkdir(parents=True, exist_ok=True)

    for name, filename in files:
        if (spam_path / name).is_dir():
            # Already extracted on a previous run; nothing to download.
            continue

        url = spam_root + filename + ".tar.bz2"
        path = spam_path / (name + ".tar.bz2")
        print("Downloading", url, "to", path)
        urllib.request.urlretrieve(url, path)

        # The context manager closes the archive even if extraction fails.
        with tarfile.open(path) as tar_bz2_file:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: refuse members that
                # would escape spam_path (absolute paths, "..", bad links).
                tar_bz2_file.extractall(path=spam_path, filter="data")
            else:
                # Older Python without extraction filters.
                tar_bz2_file.extractall(path=spam_path)

    return [spam_path / name for name, _ in files]

In [39]:
# Download/extract any missing corpora; `dirs` holds one directory path per corpus.
dirs = fetch_spam_data()

In [40]:
def get_emails(dirs):
    """Parse the email message files found in the given directories.

    Args:
        dirs: Iterable of ``pathlib.Path`` directories to scan
            (non-recursively).

    Returns:
        A list of messages parsed with ``email.policy.default``, ordered by
        directory and then by sorted path within each directory.
    """
    # Only entries whose file name is longer than this are treated as emails.
    # NOTE(review): presumably this skips short-named index files (e.g. "cmds")
    # shipped alongside the messages -- confirm against the corpus layout.
    min_name_length = 20

    # One parser is enough: it keeps no per-message state between parse() calls.
    parser = email.parser.BytesParser(policy=email.policy.default)

    emails = []
    for d in dirs:
        for filepath in sorted(d.iterdir()):
            if len(filepath.name) <= min_name_length:
                continue
            if not filepath.is_file():
                # A long-named subdirectory is not a message; open() would fail.
                continue
            with open(filepath, "rb") as f:
                emails.append(parser.parse(f))

    return emails

In [41]:
# fetch_spam_data() lists the three ham corpora first (easy_ham, easy_ham_2,
# hard_ham), followed by the two spam corpora (spam, spam_2).
ham_emails = get_emails(dirs[:3])
spam_emails = get_emails(dirs[3:])