In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from string import lower
import re
from porterStemmer import porterStemmer
from sklearn import svm
%matplotlib inline

In [2]:
# Read the sample email as a list of lines. The context manager closes the
# handle as soon as the lines are loaded, and the handle no longer reuses the
# name of the `file` builtin.
with open('data/emailSample1.txt', 'r') as email_file:
    file_contents = email_file.readlines()
file_contents

['> Anyone knows how much it costs to host a web portal ?\n',
 '>\n',
 "Well, it depends on how many visitors you're expecting.\n",
 'This can be anywhere from less than 10 bucks a month to a couple of $100. \n',
 'You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \n',
 'if youre running something big..\n',
 '\n',
 'To unsubscribe yourself from this mailing list, send an email to:\n',
 'groupname-unsubscribe@egroups.com\n',
 '\n']

In [3]:
def getVocabList(vocab_path='data/vocab.txt'):
    """Read the vocabulary file and return its words in file order.

    Every non-blank line must hold exactly two whitespace-separated fields,
    an index followed by a word. Only the word is kept, so a word's position
    in the returned list is its zero-based vocabulary index.

    Parameters
    ----------
    vocab_path : str, optional
        Location of the vocabulary file. Defaults to ``'data/vocab.txt'``.

    Returns
    -------
    list of str
        The vocabulary words, one per non-blank line.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly two fields.
    """
    vocabList = []
    with open(vocab_path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no entry and would otherwise break the unpacking below.
                continue
            idx, w = fields
            vocabList.append(w)
    return vocabList
# Sanity check: display the first five vocabulary words.
getVocabList()[:5]

['aa', 'ab', 'abil', 'abl', 'about']

In [4]:
def processEmail(email_contents, vocabList=None):
    """Convert raw email text into a column of vocabulary indices.

    The text is lower-cased and normalised: HTML tags are stripped, and URLs,
    email addresses, digit runs and dollar signs are replaced by the
    placeholder tokens ``httpaddr``, ``emailaddr``, ``number`` and ``dollar``.
    Every remaining non-alphanumeric, non-whitespace character is removed.
    Each whitespace-separated token is then stemmed with ``porterStemmer``
    and looked up in the vocabulary.

    Parameters
    ----------
    email_contents : str
        Raw text of the email.
    vocabList : list of str, optional
        Vocabulary to match against. When omitted it is loaded with
        ``getVocabList()``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(k, 1)`` holding, in order of appearance and
        with repeats, the zero-based position in the vocabulary of every
        matched token. ``k`` is 0 when nothing matches.
    """
    if vocabList is None:
        vocabList = getVocabList()

    # Map each word to every position it occupies in the vocabulary, so a
    # token is resolved with one dictionary lookup instead of a full scan.
    positions = {}
    for index, word in enumerate(vocabList):
        positions.setdefault(word, []).append(index)

    text = email_contents.lower()

    # Strip HTML tags and turn newlines into spaces.
    text = re.sub(r'<[^<>]+>|\n', ' ', text)

    # URLs and addresses are normalised before digit runs: the 'number '
    # replacement ends in a space, which would otherwise split any URL or
    # address containing digits and leave stray fragments behind.
    text = re.sub(r'(http|https)://[^\s]*', 'httpaddr ', text)
    text = re.sub(r'[^\s]+@[^\s]+', 'emailaddr ', text)
    text = re.sub(r'[0-9]+', 'number ', text)
    text = re.sub(r'[$]+', 'dollar ', text)

    # Remove punctuation but keep every kind of whitespace, so words separated
    # by tabs or carriage returns are split instead of glued together.
    tokens = re.sub(r'[^a-zA-Z0-9\s]', '', text).split()

    matched = []
    for token in tokens:
        try:
            stem = porterStemmer(token.strip())
        except Exception:
            # Best effort: a token the stemmer cannot handle is skipped.
            continue
        # Skip the word if nothing is left after stemming.
        if not stem:
            continue
        matched.extend(positions.get(stem, ()))

    # Always return a (k, 1) integer column, including for zero or one match.
    return np.array(matched, dtype=int).reshape(-1, 1)
    
# Re-join the sample email's lines into one string, map it to vocabulary
# indices, and display the first five of them.
word_indices = processEmail(''.join(file_contents))
word_indices[:5]

array([[  85],
       [ 915],
       [ 793],
       [1076],
       [ 882]])

In [5]:
def emailFeatures(word_indices, n=1899):
    """Build a binary feature vector from vocabulary indices.

    Parameters
    ----------
    word_indices : array-like of int
        Zero-based positions to switch on. Any shape is accepted, the input
        may be empty, and repeated indices have no additional effect.
    n : int, optional
        Length of the feature vector. Defaults to 1899, presumably the number
        of entries in the vocabulary file (TODO confirm against
        data/vocab.txt).

    Returns
    -------
    numpy.ndarray
        Float vector of shape ``(n,)`` with 1.0 at every listed position and
        0.0 elsewhere.

    Raises
    ------
    IndexError
        If an index is outside ``[-n, n)`` or is not of integer type.
    """
    x = np.zeros(n)
    indices = np.asarray(word_indices)
    # An empty input (list or array of any dtype) leaves the vector all-zero;
    # indexing with an empty float array would otherwise raise.
    if indices.size:
        x[indices.ravel()] = 1
    return x
# Build the feature vector for the sample email and display its first five entries.
features = emailFeatures(word_indices)
features[:5]

array([ 0.,  0.,  0.,  0.,  0.])

In [6]:
# Load the MATLAB-format file spamTrain.mat; the result is indexed by
# variable name ('X', 'y') in the next cell.
data = scipy.io.loadmat('data/spamTrain.mat')

In [7]:
# Pull the 'X' matrix and the 'y' labels (flattened to 1-D) out of the loaded
# data, then show the first five rows/labels.
X = data['X']
y = data['y'].flatten()
# Parenthesised single-argument print gives the same output under Python 2
# and is valid syntax under Python 3.
print(X[:5])
print(y[:5])

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
[1 1 0 0 0]


In [8]:
# Support vector classifier with a linear kernel and C = 0.1.
model = svm.SVC(kernel='linear', C=0.1)

In [9]:
# Fit the classifier on X and y; the value returned by fit() is the cell output.
model.fit(X, y)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# Load spamTest.mat. NOTE: this rebinds `data`, replacing the contents loaded
# from spamTrain.mat.
data = scipy.io.loadmat('data/spamTest.mat')

In [11]:
# Pull 'Xtest' and the flattened 'ytest' labels out of the loaded data.
# NOTE: X and y are rebound here, so from this cell on they no longer refer
# to the arrays the model was fitted on.
X = data['Xtest']
y = data['ytest'].flatten()

In [12]:
# Predict a label for every row of X with the fitted model.
p = model.predict(X)

In [13]:
# Accuracy in percent: number of predictions equal to their label, divided by
# the number of labels.
float(np.count_nonzero(p == y)) / len(y) * 100

98.9

In [14]:
# Classify one sample email with the fitted model. The context manager closes
# the file handle as soon as the lines are read.
with open('data/spamSample1.txt', 'r') as email_file:
    file_contents = email_file.readlines()
word_indices = processEmail(''.join(file_contents))
x = emailFeatures(word_indices)
# scikit-learn estimators expect a 2-D (n_samples, n_features) input, so the
# single feature vector is passed as one row.
p = model.predict(x.reshape(1, -1))



In [15]:
# Display the label predicted for the sample email.
p

array([1], dtype=uint8)