# Spam Filter (Exercise 3.4)

A spam/ham classifier based on the provided solutions.

In [116]:
import numpy as np
import pandas as pd

#downloading tar files
import tarfile
from pathlib import Path
import urllib.request

#emails
import email
import email.policy

#parsing HTML
from bs4 import BeautifulSoup
from html import unescape

#processing data
import nltk
import urlextract
import re


#sklearn
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

## Loading Data

We load `20030228_easy_ham`, `20030228_easy_ham_2`, `20030228_hard_ham`, `20030228_spam` and `20030228_spam_2` from [spamassassin.apache.org/old/publiccorpus/](https://spamassassin.apache.org/old/publiccorpus/), for a combined total of 4150 ham emails and 1897 spam emails of varying difficulty.

In [45]:
def fetch_spam_data():
    """Download and unpack the SpamAssassin corpora that are not yet on disk.

    For each corpus, the archive is downloaded into ``datasets/spam``
    (relative to the current working directory) and extracted there, unless
    a directory with the corpus name already exists.

    Returns:
        list[Path]: The corpus directories, in the order ``easy_ham``,
        ``easy_ham_2``, ``hard_ham``, ``spam``, ``spam_2``.
    """
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"

    # (directory name expected after extraction, archive base name)
    files = (("easy_ham", "20030228_easy_ham"),
             ("easy_ham_2", "20030228_easy_ham_2"),
             ("hard_ham", "20030228_hard_ham"),
             ("spam", "20030228_spam"),
             ("spam_2", "20030228_spam_2"))

    spam_path = Path() / "datasets" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)

    for name, filename in files:
        # An existing directory is taken to mean the corpus was already
        # fetched, so the download is skipped.
        if (spam_path / name).is_dir():
            continue

        url = spam_root + filename + ".tar.bz2"
        path = (spam_path / name).with_suffix(".tar.bz2")
        print("Downloading", url, "to", path)
        urllib.request.urlretrieve(url, path)

        # The context manager closes the archive even if extraction fails.
        # NOTE(review): the archive is fetched over plain HTTP and extracted
        # without member filtering; consider HTTPS and an extraction filter
        # if the runtime's tarfile supports one.
        with tarfile.open(path) as tar_bz2_file:
            tar_bz2_file.extractall(path=spam_path)

    return [spam_path / name for name, _ in files]

In [46]:
def get_emails(dirs):
    """Parse every message file found in the given directories.

    Args:
        dirs: Iterable of ``pathlib.Path`` directories to read.

    Returns:
        list: One parsed message per accepted file, following the order of
        ``dirs`` and, within each directory, sorted path order.
    """
    # Imported explicitly: the module-level ``import email`` and
    # ``import email.policy`` do not themselves load ``email.parser``.
    from email.parser import BytesParser

    parser = BytesParser(policy=email.policy.default)
    emails = []
    for d in dirs:
        for filepath in sorted(d.iterdir()):
            # Only entries whose name is longer than 20 characters are
            # parsed (presumably to skip non-message helper files; confirm
            # against the corpus layout).
            if len(filepath.name) <= 20:
                continue
            with open(filepath, "rb") as f:
                emails.append(parser.parse(f))

    return emails

In [47]:
# Download (if needed) and locate the five corpus directories.
dirs = fetch_spam_data()

# fetch_spam_data returns the three ham directories first, then the two spam ones.
ham_emails = get_emails(dirs[:3])
spam_emails = get_emails(dirs[3:])

### Train/Test Split

In [50]:
# Ham messages come first, then spam; dtype=object keeps each parsed
# message as a single array element.
X = np.array([*ham_emails, *spam_emails], dtype=object)
# Labels follow the same ordering: 0 for ham, 1 for spam.
y = np.array([0 for _ in ham_emails] + [1 for _ in spam_emails])

# Hold out 20% of the messages for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=93,
)

## Preprocessing Pipeline

### Parsing Emails Structure

We create a function which walks through the parts of an email and returns the text of the first plain-text or HTML part it finds; HTML content is converted to plain text with Beautiful Soup. If the email has no such part, nothing is returned.

In [63]:
def email_to_text(email):
    """Return the text of the first plain-text or HTML part of a message.

    The parts are visited in ``email.walk()`` order. The first part whose
    content type is ``text/plain`` or ``text/html`` decides the result: plain
    text is returned as is, HTML is reduced to its text with BeautifulSoup.

    Args:
        email: A parsed message object providing ``walk()``.

    Returns:
        str | None: The extracted text, or ``None`` when the message has no
        ``text/plain`` or ``text/html`` part.
    """
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Best-effort fallback when the content cannot be retrieved;
            # KeyboardInterrupt and SystemExit still propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.get_text(separator='\n', strip=True)
        return unescape(text)
    return None

#### Viewing email object details

In [106]:
def email_info(email, spam):
    """Print a short summary of a message followed by its text content.

    The summary lists the spam flag, whether any part is HTML, and the
    ``From`` and ``Subject`` headers; a horizontal rule then separates it
    from the text produced by ``email_to_text``.

    Args:
        email: Parsed message object to describe.
        spam: Label of the message; shown as a boolean.
    """
    print("\033[1mSpam:\033[0m", bool(spam))

    # The generator is consumed lazily, so the walk stops at the first HTML part.
    content_types = (part.get_content_type() for part in email.walk())
    contains_html = "text/html" in content_types
    print("\033[1mHas HTML:\033[0m", contains_html)

    sender = email["From"]
    print("\033[1mFrom:\033[0m", sender)
    subject = email["Subject"]
    print("\033[1mSubject:\033[0m", subject)

    rule = "\u2594" * 120
    print(rule)

    body = email_to_text(email)
    print(body)

In [115]:
# Show a single example: per the recorded output, index 4156 is a spam message.
email_info(X[4156], y[4156])

[1mSpam:[0m True
[1mHas HTML:[0m False
[1mFrom:[0m fort@bluemail.dk
[1mSubject:[0m FORTUNE 500 COMPANY HIRING, AT HOME REPS.
▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are looking for individuals who
want to work from home.

This is an opportunity to make an excellent income.  No experience
is required.  We will train you.

So if you are looking to be employed from home with a career that has
vast opportunities, then go:

http://www.basetel.com/wealthnow

We are looking for energetic and self motivated people.  If that is you
than click on the link and fill out the form, and one of our
employement specialist will contact you.

To be removed from our link simple go to:

http://www.basetel.com/remove.html


4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40




### Custom Transformers

How to process the email data:
- Transform content of email to plain text
- Transform plain text for Word Counter
    - lower case
    - replace urls with "URL"
    - replace numbers with "NUMBER"
    - remove any punctuation
    - remove stop words
    - perform stemming to group similar words
- Count occurrences of each word in text content
- Create a 0/1 binary tag showing if an email contains html
- Vectorise data with a sparse matrix

Throughout, any choice made in the steps above should be exposed as a parameter so that it can be tuned during hyper-parameter tuning.