In [389]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

import scipy.io as sio
from sklearn import svm, datasets

import re
import pprint
from stemming.porter2 import stem

# Global display configuration for the whole notebook.
sns.set_context('notebook')
sns.set_style('white')
#plt.style.use('fivethirtyeight')
plt.xkcd() # because why not?
# Print numpy arrays with 3 digits of precision.
np.set_printoptions(precision=3)

# ANSI escape sequences for styled terminal text ('end' resets the styling).
ansi = {'underline': '\033[4m', 'bold': '\033[1m', 'end':'\033[0m'}

In [390]:
def processEmail(email, vocab, verbose=True):
    """Normalise a raw email and map its stemmed words to vocabulary indices.

    The text is lower-cased, HTML tags are stripped, and digits, URLs,
    email addresses and dollar signs are replaced by the placeholder tokens
    'number', 'httpaddr', 'emailaddr' and 'dollar'. Punctuation is then
    removed, the text is split on whitespace and every token is stemmed.

    Parameters
    ----------
    email : str
        Raw text of the email.
    vocab : pandas.DataFrame
        Vocabulary table with a 'words' column; the index label of the first
        row matching a word is used as that word's index.
    verbose : bool, optional
        When True (the default) the processed words are echoed to stdout,
        wrapped at 78 columns.

    Returns
    -------
    list
        Index labels of the words found in ``vocab``, in order of appearance
        (repeats included). Words missing from the vocabulary are skipped.
    """
    # Raw strings keep escapes such as \s intact for the regex engine instead
    # of relying on Python passing unknown string escapes through (which warns).
    html_regex = re.compile(r'<[^<>]+>')  # strip html tags
    num_regex = re.compile(r'[0-9]+')
    url_regex = re.compile(r'(http|https)://[^\s]*')
    email_regex = re.compile(r'[^\s]+@[^\s]+')
    dollar_regex = re.compile(r'[$]+')

    email = email.lower()
    email = email.replace('\n', ' ')

    # Order matters: digits are replaced before URLs and addresses are matched.
    email = html_regex.sub(' ', email)
    email = num_regex.sub('number', email)
    email = url_regex.sub('httpaddr', email)
    email = email_regex.sub('emailaddr', email)
    email = dollar_regex.sub('dollar', email)

    # Remove punctuation
    # NOTE(review): the trailing '√' may be a mis-encoded '%' -- confirm
    # against the intended punctuation set before changing it.
    for c in r"""@$/#.-:&*+=[]?!(){},''">_<;√""":
        email = email.replace(c, ' ')

    # split() collapses whitespace runs and yields no tokens for an empty or
    # all-whitespace email (split(' ') would produce a phantom '' token).
    words = [stem(x) for x in email.split()]

    # Build the word -> index mapping once instead of scanning the whole
    # DataFrame for every word. setdefault keeps the FIRST occurrence, which
    # matches taking element 0 of the matching index labels.
    first_index = {}
    for idx, known_word in zip(vocab.index.tolist(), vocab['words'].tolist()):
        first_index.setdefault(known_word, idx)

    word_indices = []
    line_length = 0  # width of the current printed line
    for word in words:
        if word in first_index:
            word_indices.append(first_index[word])

        if verbose:
            # Wrap before the line would exceed 78 characters.
            if line_length + len(word) + 1 > 78:
                print()
                line_length = 0
            print('{} '.format(word), end='')
            line_length += len(word) + 1
    if verbose:
        print('\n')
    return word_indices

In [427]:
def emailFeatures(word_indices, vocab):
    """Build a binary bag-of-words feature vector for one email.

    Parameters
    ----------
    word_indices : iterable of int
        Vocabulary indices of the words present in the email (repeats allowed).
    vocab : sized container
        The vocabulary; only ``len(vocab)`` is used, to fix the vector length.

    Returns
    -------
    list of int
        A list of length ``len(vocab)`` whose i-th entry is 1 if ``i`` occurs
        in ``word_indices`` and 0 otherwise.
    """
    # A set gives O(1) membership tests; testing against the list made the
    # comprehension O(len(vocab) * len(word_indices)).
    present = set(word_indices)
    return [1 if index in present else 0 for index in range(len(vocab))]

In [392]:
# Read the sample email; the context manager closes the file handle as soon
# as the text has been read instead of leaving it to the garbage collector.
with open('emailSample1.txt', 'r') as email_file:
    email_1 = email_file.read()

# Only the second column of vocab.txt (the words) is kept.
# keep_default_na=False stops pandas from converting literal words such as
# "null" or "na" into NaN -- every entry in a word list is a real token.
vocab = pd.read_table('vocab.txt', usecols=[1], names=['words'], keep_default_na=False)
pp = pprint.PrettyPrinter(compact=True)
vocab.head()

Unnamed: 0,words
0,aa
1,ab
2,abil
3,abl
4,about


In [428]:
# Tokenise the sample email (processEmail also echoes the processed words),
# turn the resulting indices into a 0/1 feature vector, and show the indices.
index_list = processEmail(email_1, vocab)
features = emailFeatures(index_list, vocab)
pp.pprint(index_list)

anyon know how much it cost to host a web portal well it depend on how mani 
visitor you re expect this can be anywher from less than number buck a month 
to a coupl of dollarnumb you should checkout httpaddr or perhap amazon ecnumb 
if your run someth big to unsubscrib yourself from this mail list send an 
email to emailaddr 

[85, 915, 793, 1076, 882, 369, 1698, 789, 1821, 1830, 882, 430, 1170, 793, 1001,
 1892, 1363, 591, 237, 161, 88, 687, 944, 1662, 1119, 1061, 1698, 374, 1161,
 478, 1892, 1509, 798, 1181, 1236, 809, 1894, 1439, 1546, 180, 1698, 1757, 1895,
 687, 991, 960, 1476, 70, 529, 1698, 530]


In [394]:
# Linear SVM for spam classification
# Load the training set from the MATLAB file and list the variables it holds.
data = sio.loadmat('spamTrain.mat')
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [395]:
# Pull the training arrays out of the loaded .mat dictionary.
X = data['X']
y = data['y'].flatten()  # flattened to 1-D for use as the label vector

# Regularisation parameter handed to the LinearSVC constructor below.
C = 0.1

In [396]:
# Fit the linear SVM on the training set. LinearSVC's solver draws on a random
# number generator, so random_state is pinned to make the fit (and the learned
# weights inspected later) reproducible across kernel restarts.
model = svm.LinearSVC(C=C, random_state=0)
model.fit(X, y)  # y is already 1-D; no further flatten needed
pred = model.predict(X)

# Percentage of training examples whose predicted label matches the truth.
accuracy = np.mean(pred == y) * 100
print('Training accuracy: {0:0.3g}'.format(accuracy))

Training accuracy: 100


In [397]:
# Load the held-out test set; note this rebinds `data`, replacing the
# training dictionary loaded earlier.
data = sio.loadmat('spamTest.mat')
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Xtest', 'ytest'])

In [398]:
# Pull the test arrays out of the loaded .mat dictionary.
Xtest = data['Xtest']
ytest = data['ytest'].flatten()  # flattened to 1-D to compare against predictions

In [399]:
# Score the fitted model on the test set; `pred` and `accuracy` are rebound
# here, overwriting the training-set values.
pred = model.predict(Xtest)
accuracy = np.mean(pred == ytest) * 100
print('Testing accuracy: {0:0.3g}'.format(accuracy))

Testing accuracy: 99.2


In [426]:
# Rank the vocabulary positions by learned weight, largest first, and keep
# the 15 with the highest weights.
weights = model.coef_.flatten()
sorted_indices = weights.argsort()[::-1][:15]

# Pair each selected position's word with its weight.
top_spam = [
    [vocab['words'][index], weights[index]]
    for index in sorted_indices
]

print('\n * Top predictors of spam * \n')
print('[Word, weight]')
for item in top_spam:
    print(item)


 * Top predictors of spam * 

[Word, weight]
['our', 0.42166503378416087]
['remov', 0.38717331460155796]
['click', 0.38705975111005148]
['basenumb', 0.34661730599105123]
['guarante', 0.34168564839618792]
['visit', 0.30302776748431892]
['bodi', 0.26352346102140922]
['will', 0.24439387202385113]
['numberb', 0.2387949575971757]
['price', 0.23419901984690505]
['dollar', 0.23231488916063356]
['nbsp', 0.22708090209722948]
['below', 0.22319913379805367]
['lo', 0.21999374211135345]
['most', 0.21454830433389704]
