In [51]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import re
import nltk, nltk.stem
import sklearn.svm as svm

%matplotlib inline

In [2]:
def processEmail(email):
    '''Cleaning up text:
    • Lower-casing
    • Stripping HTML: we remove all the HTML tags, so that only the content remains.
    • Normalizing URLs: All URLs replaced with “httpaddr”.
    • Normalizing Email Addresses: All emails replaced with "emailaddr”.
    • Normalizing Numbers: All numbers replaced with “number”.
    • Normalizing Dollars: All($)replaced with “dollar”'''
    
    email = email.lower()
    #strip tags
    email = re.sub(r'<[^<>]+>', ' ', email)
    #Replace URLs with "httpaddr"
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)
    #Replace email with "emailaddr"
    email = re.sub(r'[^\s]+@[^\s]+', "emailaddr", email)
    #Replace numbers with "number"
    email = re.sub(r'[\d]+', "number ", email)
    #Replace '$' with "dollar"
    email = re.sub(r'[$]+', "dollar ", email)
    #Remove literal escape characters
    email = re.sub(r'\\[nrtfv]', " ", email)
    
    return email

In [9]:
def tokenizeAndStem(email):
    '''Process the email, tokenize it and stem it'''
    
    #instantiate the stemmer
    stemmer = nltk.stem.porter.PorterStemmer()
    
    email = processEmail(email)
    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)
    token_list = []
    
    for token in tokens:
        #remove any non alpha character still left
        token = re.sub(r'[^a-zA-Z]+', ' ', token)
        token = token.strip()
        
        #Get the stem words from the Porter algo
        stemmed = stemmer.stem(token)
        token_list.append(stemmed)
    return token_list

In [81]:
def vocabDict(reverse=False):
    '''Returns a dictionary with key as index and value as word if reverse = True
    else vice versa'''
    
    vocab = {}
    with open('../Coursera/ML_NG/machine-learning-ex6/ex6/vocab.txt') as fo:
        for line in fo:
            idx, word = line.split()
            if reverse:
                vocab[int(idx)] = word
            else:
                vocab[word] = int(idx)
    return vocab

In [43]:
def textToVec(email):
    '''Takes an email text as an input, process it stem and tokenize it and return a vector'''
    cleaned_email = processEmail(email)
    vocablist = tokenizeAndStem(cleaned_email)
    vocab_dict = vocabDict()

    vec = np.zeros((len(vocab_dict), 1))
    vec_idx = np.array([int(vocab_dict[word]) for word in vocablist if word is not '' and word in vocab_dict])
    vec[vec_idx] = 1
    return vec

In [45]:
#Read the first email file
with open('../Coursera/ML_NG/machine-learning-ex6/ex6/spamSample1.txt') as f:
    email1 = f.read()
vec_1 = textToVec(email1)
print(len(vec_1))
print(sum(vec_1==1))

1899
[48]


Read the train and test files

In [49]:
Xtest = loadmat('../Coursera/ML_NG/machine-learning-ex6/ex6/spamTest.mat')
X_test = Xtest['Xtest']
y_test = Xtest['ytest']

Xtrain = loadmat('../Coursera/ML_NG/machine-learning-ex6/ex6/spamTrain.mat')
X_train = Xtrain['X']
y_train = Xtrain['y']


In [52]:
linear_svm = svm.SVC(kernel='linear', C=0.1)
linear_svm.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [58]:
print('Test accuracy %0.3f' %linear_svm.score(X_test, y_test))
print('Train accuracy %0.3f' %linear_svm.score(X_train, y_train))

Test accuracy 0.989
Train accuracy 0.998


In [107]:
#Check the words that are more likely to be an indicator of a spam

weights_idx = np.argsort(linear_svm.coef_).reshape(linear_svm.coef_.shape[1], 1)[::-1]
vocab = vocabDict(reverse=True)

#print top 10 words
for i in weights_idx[:10]:
    print(vocab[int(i)])

otherwis
clearli
remot
gt
visa
base
doesn
wife
previous
player
