In [475]:
"""
Reads 6,000+ emails from the Spamassassin Public Corpus
and predicts spam emails through a linear SVM classifier
"""

%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split

import re
import os
import chardet
from collections import defaultdict
from stemming.porter2 import stem

# Print NumPy floats with three digits of precision.
np.set_printoptions(precision=3)

# ANSI escape codes for terminal text styling, keyed by effect name.
ansi = dict(
    underline='\033[4m',
    bold='\033[1m',
    end='\033[0m',
)

In [476]:
# Every character listed here is turned into a single space before tokenizing.
_PUNCTUATION_TABLE = str.maketrans(
    {c: ' ' for c in r"""@$/#%.-^:&*+=[]?!(){},''">_<;√"""})


def processEmail(email, regexes):
    """Normalize an email body and return its list of stemmed tokens.

    Steps, in order: lowercase, turn newlines into spaces, strip HTML tags,
    replace digit runs / URLs / email addresses / dollar signs with the
    placeholder words 'number', 'httpaddr', 'emailaddr' and 'dollar', blank
    out punctuation, split on whitespace and stem every token.

    Parameters
    ----------
    email : str
        Raw text of the email body.
    regexes : dict
        Compiled patterns under the keys 'html', 'num', 'url', 'email'
        and 'dollar'.

    Returns
    -------
    list of str
        Stemmed tokens. Empty, whitespace-only or punctuation-only input
        yields an empty list (never ``['']``), so no empty-string token can
        leak into the vocabulary.
    """
    email = email.lower()
    email = email.replace('\n', ' ')

    # Order matters: 'num' runs before 'url' and 'email', so digits inside
    # URLs/addresses are already rewritten when those patterns are applied.
    email = regexes['html'].sub(' ', email)
    email = regexes['num'].sub('number', email)
    email = regexes['url'].sub('httpaddr', email)
    email = regexes['email'].sub('emailaddr', email)
    email = regexes['dollar'].sub('dollar', email)

    # Remove punctuation in a single pass.
    email = email.translate(_PUNCTUATION_TABLE)

    # str.split() with no argument collapses whitespace runs, ignores
    # leading/trailing whitespace and returns [] for blank input.
    return [stem(token) for token in email.split()]

In [575]:
def makeVocabulary(datasets_dir, regexes, min_frequency=5):
    """Build the word-frequency vocabulary and the per-email table.

    Walks ``datasets_dir`` recursively, strips each file's header (everything
    up to the first blank line), tokenizes the body with ``processEmail`` and
    counts how often every token occurs across the whole corpus.

    Parameters
    ----------
    datasets_dir : str
        Root directory that is walked for email files.
    regexes : dict
        Compiled patterns forwarded unchanged to ``processEmail``.
    min_frequency : int, optional
        Tokens seen fewer than this many times are dropped from the
        vocabulary. Defaults to 5.

    Returns
    -------
    vocab : pandas.DataFrame
        Columns 'words' and 'frequency', sorted by descending frequency.
    emails : pandas.DataFrame
        One row per file with columns 'contents' (body text), 'sets' (set of
        tokens), 'spam' (True when the containing directory name includes
        'spam') and 'category' (the containing directory name).
    """
    word_counts = defaultdict(int)  # Keep track of frequency of words
    records = []  # List of dicts for easy translation into a DataFrame

    for dirpath, _, files in os.walk(datasets_dir):
        dirname = os.path.basename(os.path.normpath(dirpath))
        print('', end='\r')
        for file in files:
            if file == '.DS_Store':
                continue  # macOS folder metadata, not an email
            print('Training file {} in {}'.format(file.split('.', 1)[0], dirname), end='\r')

            # Undecodable bytes are skipped; the context manager closes the handle.
            with open(os.path.join(dirpath, file), 'r', errors='ignore') as handle:
                raw = handle.read()
            # Remove header by splitting at the first blank line; when there
            # is no blank line the whole text is kept.
            email = raw.split('\n\n', 1)[-1]

            # Add words to vocabulary
            word_list = processEmail(email, regexes)
            for word in word_list:
                word_counts[word] += 1
            # Store the tokens as a set for fast membership tests later on
            records.append({'contents': email,
                            'sets': set(word_list),
                            'spam': 'spam' in dirname,
                            'category': dirname})

    # Convert to DataFrames
    vocab = pd.DataFrame(list(word_counts.items()), columns=['words', 'frequency'])
    vocab = vocab.sort_values(by='frequency', ascending=False)
    vocab = vocab[vocab['frequency'] >= min_frequency]  # drop rare words
    emails = pd.DataFrame(records, columns=['contents', 'sets', 'spam', 'category'])
    return vocab, emails

In [576]:
# Map email contents to indices and vocabulary
def emailFeatures(word_set, vocab_list):
    """Encode one email as a binary vector aligned with vocab_list.

    Position i of the result is 1 when vocab_list[i] occurs in word_set
    and 0 otherwise.
    """
    features = []
    for term in vocab_list:
        features.append(int(term in word_set))
    return features

In [633]:
def randomEmail(emails, model):
    """Print one randomly drawn email next to its true and predicted label.

    A single row is sampled from ``emails`` (which must have the columns
    'contents', 'spam', 'category' and 'X'); ``model.predict`` is called on
    that row's feature vector. At most the first 750 characters of the
    message are shown, followed by the true label, the prediction and the
    category. Nothing is returned.
    """
    row = emails.sample()
    body = row['contents'].item()
    actual = row['spam'].item()
    folder = row['category'].item()
    features = row['X'].tolist()  # a one-element list holding the feature vector
    predicted = model.predict(features).item()

    template = '{0:.750}...\n\nCorrect: {1}\nPredicted: {2}\nCategory: {3}'
    print(template.format(body, actual, predicted, folder))

In [583]:
# The corpus is looked up in a 'datasets' folder inside the current working
# directory, so the notebook must be started from the directory containing it.
cwd = os.getcwd()
datasets_dir = os.path.join(cwd, 'datasets')

In [584]:
# Pre-compiled patterns consumed by processEmail, which replaces each match
# with a space ('html') or with the placeholder words 'number', 'httpaddr',
# 'emailaddr' and 'dollar'.
# Raw strings keep sequences such as \s from being parsed as (invalid)
# string escapes before the regex engine sees them.
regexes = {
    'html': re.compile(r'<[^<>]+>'),            # any <...> tag
    'num': re.compile(r'[0-9]+'),               # runs of digits
    'url': re.compile(r'(http|https)://[^\s]*'),  # http:// or https:// up to whitespace
    'email': re.compile(r'[^\s]+@[^\s]+'),      # non-space text around an '@'
    'dollar': re.compile(r'[$]+'),              # one or more dollar signs
}

In [585]:
# Walk the datasets directory once, producing the word-frequency table
# (vocab) and the per-email table (emails).
vocab, emails = makeVocabulary(datasets_dir, regexes)

Training file cmds in spam_22mm22

In [586]:
# Preview the first rows of the vocabulary table (most frequent tokens first).
vocab.head()

Unnamed: 0,words,frequency
2,number,109802
27,the,68770
111,to,48595
78,and,36129
105,a,33696


In [593]:
# Preview the last rows of the email table.
# NOTE(review): the saved output below includes the 'X' feature column, which
# is only added by a later cell, so this cell was last executed out of order;
# on a fresh top-to-bottom run that column will not be present here.
emails.tail()

Unnamed: 0,contents,sets,spam,category,X
4412,Dear zzzz =2C\n\n=3CBODY bgColor=3D#ffccff=3E\...,"{ffnumber, want, disconnect, power, then, comp...",True,spam,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, ..."
954,The structure of the Internet has never\nbeen ...,"{comput, friend, indistinguish, you, see, less...",False,easy_ham,"[1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, ..."
3951,mv 00001.d4365609129eef855bd5da583c90552b 0000...,{enumberenumberfanumberdnumberdnumberanumberbn...,False,easy_ham_2,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4954,"<!doctype html public ""-//w3c//dtd html 4.0 tr...","{effect, other, target, mailt, futur, dure, mi...",True,spam_2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
569,"ok i read back, thats not a typo, you mean thr...","{mention, friend, you, get, cdale, kind, see, ...",False,easy_ham,"[1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, ..."


In [588]:
# Create features column 'X': one binary bag-of-words vector per email,
# aligned position-by-position with vocab_list.
vocab_list = vocab['words'].tolist()
emails['X'] = emails['sets'].map(lambda x: emailFeatures(x, vocab_list))

# Split data into Train and Test 70 - 30.
# The split goes into new names so `emails` keeps every row and this cell can
# be re-run without shrinking the data; the fixed seed makes the split (and
# therefore the reported accuracy) reproducible.
RANDOM_STATE = 42
emails_train, emails_test = train_test_split(
    emails, test_size=0.3, train_size=0.7, random_state=RANDOM_STATE)

# Stack the per-row feature lists into 2-D arrays once, so fit and predict
# receive the same kind of input.
X_train = np.array(emails_train['X'].tolist())
X_test = np.array(emails_test['X'].tolist())

# Linear SVM Classifier
model = svm.LinearSVC(random_state=RANDOM_STATE)
model.fit(X_train, emails_train['spam'].values)

pred = model.predict(X_test)
print('Accuracy on Test Set: {0:0.3g}'.format(np.mean(pred == emails_test['spam'].values)))

In [591]:
# Inspect weights: list the 15 vocabulary words with the largest coefficients.

# Flatten the coefficients to 1-D so positions can index vocab_list directly.
weights = model.coef_.flatten()
# Ascending argsort, reversed, then truncated -> indices of the 15 largest weights.
sorted_indices = np.argsort(weights)[::-1][:15]

top_spam = []
for index in sorted_indices:
    top_spam.append([vocab_list[index], weights[index]])

print('\n * Top predictors of spam * \n')
print('[Word, weight]')
for word, weight in top_spam:
    print('{} - {}'.format(word, weight))



 * Top predictors of spam * 

[Word, weight]
click - 0.7834592801596509
sweeti - 0.49922664489090735
sight - 0.47278284595683834
shave - 0.4519259615603336
supplement - 0.39319310605931257
tabl - 0.3892073666294188
our - 0.370187236803594
freebsd - 0.3528887629129025
z - 0.34454862025197125
v - 0.33857483753217993
exit - 0.3211763694838365
deathtospamdeathtospamdeathtospam - 0.32022500235927626
basenumb - 0.3096730961328863
pleas - 0.2988402617288223
remov - 0.29253102607091813


In [661]:
# Select a random email and print the start of its text together with its
# true label, the model's prediction and its category.
randomEmail(emails, model)

Dear fellow eBay user,

I listed this CD on eBay a few months ago and here's 
what happened.  I got an email from Safeharbor saying 
that all my auctions had been cancelled and that the CD 
was permanently "banned" from being sold on eBay.  From 
then on, I called it the "Banned CD"!

So why did eBay ban it?  Maybe they figured you shouldn't 
have access to this type of information, or maybe they 
didn't think we could cram all of these programs onto 
one CD Rom.  I'll let you decide.

This CD will teach you things that eBay, Uncle Sam, and 
others just don't want you to know.  I am not responsible 
for how you use some of this information and it is 
provided for educational purposes only.  Here are just a 
few of the things you will learn ...

Correct: True
Predicted: True
Category: spam_2
