**Build a spam classificator (a more challenging exercise):**
- **[X] Download spam examples and standart e-mails of public datasets from Apache SpamAssassin (https://spamassassin.apache.org/old/publiccorpus/);**
- **[X] Unzip the datasets and try to get familiarized with the data format;**
- **[X] Split the datasets in a training set and a test set**
- **[ ] Write a data preparation pipeline to convert each e-mail in a vector of characteristics. Your preparation pipeline should transform an e-mail to a vector (sparse) that indicates the presence or not of each possible word. For example, if all e-mails have only four words, 'Hello', 'how', 'are', 'you', then the e-mail 'Hello you Hello Hello you' would be converted to a vector [1, 0, 0, 1] (meaning that 'Hello' is present, 'how' is absent, 'are' is absent and 'you' is present), or [3, 0, 0, 2] if you prefer to count the number of occurences of each word;**
- **[X] Maybe you want to add hyperparameters to your preparation pipeline to control wether or not to remove the headers of e-mails, convert each e-mail to lowercase, remove ponctuation, replace all URLs to "URL", replace all numbers to "NUMBER", or even reduce, that is, remove word endings. There are libraries in Python availble to do that.**

- **[ ] Following, try many classifiers and see if you can build a good spam classifier with high revocation and precision.** 

**I will try to get more than 97% recall**

In [1]:
# Load data
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
SPAM_FILE = '20021010_spam.tar.bz2'
HAM_FILE = '20021010_easy_ham.tar.bz2'
SPAM_URL = DOWNLOAD_ROOT + SPAM_FILE
HAM_URL = DOWNLOAD_ROOT + HAM_FILE
PATH = os.path.join("datasets", "spam")

def fetch_data(spam_url=SPAM_URL, path=PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename, url in (('ham.tar.bz2', HAM_URL), ('spam.tar.bz2', SPAM_URL)):
        path = os.path.join(PATH, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=PATH)
        tar_bz2_file.close()

fetch_data()

In [2]:
# Take a look in the files
file = open('./datasets/spam/easy_ham/0001.ea7e79d3153e7469e7a9c3e0af6a357e', 'r')
print(file.read()[:500], '[...]')

From exmh-workers-admin@redhat.com  Thu Aug 22 12:36:23 2002
Return-Path: <exmh-workers-admin@example.com>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36
	for <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)
Received: from listman.e [...]


In [3]:
# Split e-mails
import email
import email.policy

HAM_DIR = os.path.join(PATH, "easy_ham")
SPAM_DIR = os.path.join(PATH, "spam")
ham_filenames = [i for i in os.listdir(HAM_DIR)]
spam_filenames = [i for i in os.listdir(SPAM_DIR)]

def load_email(is_spam, filename, spam_path=PATH):
    directory = "spam" if is_spam else "easy_ham"
    f = open(os.path.join(spam_path, directory, filename), "rb")
    return email.parser.BytesParser(policy=email.policy.default).parse(f)

ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [4]:
print('Ham files:', len(ham_emails),'. Spam files:', len(spam_emails))

Ham files: 2551 . Spam files: 501


In [5]:
# Delete multipart
ham_emails = [i for i in ham_emails if i.is_multipart()==False]
spam_emails = [i for i in spam_emails if i.is_multipart()==False]

In [6]:
# Split train and test set
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array([i for i in (ham_emails+spam_emails)])
y = np.concatenate((np.ones(len(ham_emails)), np.zeros(len(spam_emails))))
test_size = len(spam_emails)/len(ham_emails)

X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=test_size)

In [7]:
# Return text from e-mail
def to_text(text):
    return str(text.get_payload())

In [8]:
# Transform to lowercase
# Replace urls to 'URL'
# Replace numbers to 'NUMBER'
# Remove '\n'
# Remove punctuation
import re
import string

def text_transform(t):
    t = t.lower()
    t = re.sub(r'http\S+', 'URL', t)
    t = re.sub(r'www\S+', 'URL', t)
    t = re.sub(r'\d\S+', 'NUMBER', t)
    t = re.sub(r'\n', ' ', t)
    t = t.translate(str.maketrans(' ', ' ', string.punctuation))
    return t

In [9]:
# Create sparse vector
from sklearn.feature_extraction.text import CountVectorizer

def to_vector(text):
    text=[text]
    vectrans = CountVectorizer()
    vectrans.fit(text)
    vector = vectrans.transform(text)
    return vector.toarray()[0]

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
from email import parser

class emailToString(BaseEstimator, TransformerMixin, parser.BytesParser):
    def __init__(self):
        self.X = None
    
    def fit(self, emails):
        X_train_fitted = []
        for email in emails:
            text = to_text(email)
            text = text_transform(text)
            X_train_fitted.append(text)
        X_train_fitted = np.array(X_train_fitted)
        return X_train_fitted
        
    def transform(self, emails):
        X_train_transformed = []
        for email in emails:
            vector = to_vector(email)
            X_train_transformed.append(vector)
        X_train_transformed = np.array(X_train_transformed)
        return X_train_transformed

In [11]:
# Build pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('text_transform', emailToString()),
])

In [12]:
X_train_new = pipeline.fit(X_train)

TypeError: fit() takes 2 positional arguments but 3 were given