**Build a spam classificator (a more challenging exercise). Steps:**
- **[X] Download spam examples and standart e-mails of public datasets from Apache SpamAssassin (https://spamassassin.apache.org/old/publiccorpus/);**
- **[X] Unzip the datasets and try to get familiarized with the data format;**
- **[X] Split the datasets in a training set and a test set**
- **[X] Write a data preparation pipeline to convert each e-mail in a vector of characteristics. Your preparation pipeline should transform an e-mail to a vector (sparse) that indicates the presence or not of each possible word. For example, if all e-mails have only four words, 'Hello', 'how', 'are', 'you', then the e-mail 'Hello you Hello Hello you' would be converted to a vector [1, 0, 0, 1] (meaning that 'Hello' is present, 'how' is absent, 'are' is absent and 'you' is present), or [3, 0, 0, 2] if you prefer to count the number of occurences of each word;**
- **[X] Maybe you want to add hyperparameters to your preparation pipeline to control wether or not to remove the headers of e-mails, convert each e-mail to lowercase, remove ponctuation, replace all URLs to "URL", replace all numbers to "NUMBER", or even reduce, that is, remove word endings. There are libraries in Python availble to do that.**

- **[X] Following, try Logistic Regressor and see if you can build a good spam classifier with high revocation and precision.** 

**I will try to get more than 97% recall**

In [1]:
# Load data
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
SPAM_FILE = '20021010_spam.tar.bz2'
HAM_FILE = '20021010_easy_ham.tar.bz2'
SPAM_URL = DOWNLOAD_ROOT + SPAM_FILE
HAM_URL = DOWNLOAD_ROOT + HAM_FILE
PATH = os.path.join("datasets", "spam")

def fetch_data(spam_url=SPAM_URL, path=PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename, url in (('ham.tar.bz2', HAM_URL), ('spam.tar.bz2', SPAM_URL)):
        path = os.path.join(PATH, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=PATH)
        tar_bz2_file.close()

fetch_data()

In [2]:
# Take a look in the files
file = open('./datasets/spam/easy_ham/0001.ea7e79d3153e7469e7a9c3e0af6a357e', 'r')
print(file.read()[:500], '[...]')

From exmh-workers-admin@redhat.com  Thu Aug 22 12:36:23 2002
Return-Path: <exmh-workers-admin@example.com>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36
	for <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)
Received: from listman.e [...]


In [3]:
# Split e-mails
import email
import email.policy

HAM_DIR = os.path.join(PATH, "easy_ham")
SPAM_DIR = os.path.join(PATH, "spam")
ham_filenames = [i for i in os.listdir(HAM_DIR)]
spam_filenames = [i for i in os.listdir(SPAM_DIR)]

def load_email(is_spam, filename, spam_path=PATH):
    directory = "spam" if is_spam else "easy_ham"
    f = open(os.path.join(spam_path, directory, filename), "rb")
    return email.parser.BytesParser(policy=email.policy.default).parse(f)

ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [4]:
print('Ham files:', len(ham_emails),'. Spam files:', len(spam_emails))

Ham files: 2551 . Spam files: 501


In [5]:
# Delete multipart
ham_emails = [i for i in ham_emails if i.is_multipart()==False]
spam_emails = [i for i in spam_emails if i.is_multipart()==False]

In [6]:
# Split train and test set
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array([i for i in (ham_emails+spam_emails)])
y = np.concatenate((np.ones(len(ham_emails)), np.zeros(len(spam_emails))))
test_size = len(spam_emails)/len(ham_emails)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

In [7]:
class emailHandler():
    def __init__(self, email_list):        
        import email
        
        ty = type(email_list)
        tyem = type(email.message.EmailMessage())
        # If email_list is not a list or array...
        if not (ty == np.ndarray or ty == list):
            raise Exception('Object type not supported. Please pass a list or numpy array.')
        # If objects in email_list are not emails...
        elif not (type(email_list[0]) == tyem):
            raise Exception('Please pass a list or array of email.message.EmailMessage objects.',
                            'Use the email library to transform your objects.')
        else:
            # Define list of e-mails
            self.email_list = email_list
    
    # Take a list of email objects and transform in a list of email texts
    def create_email_list(self):
        import re
        import string
        
        # Return text from e-mail
        def to_text(text):
            return str(text.get_payload())

        # Transform to lowercase, replace urls to 'URL', replace numbers to 'NUMBER'
        # Remove '\n', remove punctuation
        def text_transform(t):
            t = t.lower()
            t = re.sub(r'http\S+', 'URL', t)
            t = re.sub(r'www\S+', 'URL', t)
            t = re.sub(r'\d\S+', 'NUMBER', t)
            t = re.sub(r'\n', ' ', t)
            t = t.translate(str.maketrans(' ', ' ', string.punctuation))
            return t

        # Return array of e-mails texts
        
        X_train_fitted = []
        for email in self.email_list:
            text = to_text(email)
            text = text_transform(text)
            X_train_fitted.append(text)
        return np.array(X_train_fitted)
    
    # Create word vocabulary
    def make_vocabulary(self):
        all_strings = self.create_email_list()
        self.vocabulary = []
        for i in range(len(all_strings)):
            words_in_string = all_strings[i].split()
            for word in words_in_string:
                if word not in self.vocabulary:
                    self.vocabulary.append(word)
        return self.vocabulary
    
    # Transform each email in a vector, where each instance is the number of how many times the word
    # appears in that email. The index for each word is stablished by the make_vocabulary() function
    def create_vector(self, vocabulary):
        all_strings = self.create_email_list()
        X_all = []
        for email in all_strings:
            words_in_email = []
            for word in vocabulary:
                words_in_email.append(email.count(word))
            X_all.append(words_in_email)
        return np.array(X_all)
    
    def fit(self, X=0, y=0):
        self.make_vocabulary()
        pass
    
    def transform(self):
        vectors = self.create_vector(self.vocabulary)
        return vectors

In [8]:
# Create list of vectors

# eH has training set emails
eH = emailHandler(X_train)
eH.fit()
X_train_transformed = eH.transform()
X_train_transformed

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
# Apply Logistic Regression on training set
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(solver="liblinear", class_weight='balanced', max_iter=100000)
score = cross_val_score(log_reg, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.981, total=   6.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.3s remaining:    0.0s


[CV] .................................... , score=0.985, total=   4.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.7s remaining:    0.0s


[CV] .................................... , score=0.986, total=   3.9s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   14.6s finished


0.9840749870948032

In [10]:
# Apply on test set

# Define vocabulary from both X_test and X_train
voc_eH = emailHandler(X)
vocabulary = voc_eH.make_vocabulary()

# Define X_test vector
test_eH = emailHandler(X_test) 
X_test_transformed = test_eH.create_vector(vocabulary) # Create a vector with general vocabulary
cross_val_score(log_reg, X_test_transformed, y_test, cv=3, verbose=3).mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.981, total=   0.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] .................................... , score=0.949, total=   0.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV] .................................... , score=0.955, total=   0.4s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s finished


0.9617834394904459

In [11]:
# Check precision and recall score
from sklearn.metrics import precision_score, recall_score

# Use general vocabulary
X_train_transformed = eH.create_vector(vocabulary)

log_reg.fit(X_train_transformed, y_train)
y_pred = log_reg.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 99.26%
Recall: 99.01%
