# Spam Classification with SVMs

In [1]:
import sys
# Directory holding the exercise's helper modules; it is added to the import
# path here and is also used below as the prefix for the data/vocabulary files.
path_to_modules = '../../algorithms_in_python/week_7/ex6/'
sys.path.append(path_to_modules)
import numpy as np
import scipy.io as sio
# Training uses scikit-learn's SVM, because the simple SMO algorithm is only
# meant for small datasets.
from sklearn import svm

# Helper modules resolved through path_to_modules.
from emailFeatures import email_features
from getVocabList import get_vocab_list
from processEmail import process_email
from readFile import read_file
# NOTE(review): presumably the simple SMO implementation mentioned above — confirm it is still needed.
from svmModel import SVMModel

## Part 1: Email Pre-processing
  To use an SVM to classify emails into Spam vs. Non-Spam, you first need
  to convert each email into a vector of features. In this part, you will
  implement the pre-processing steps for each email. You should
  complete the code in processEmail.py to produce a word indices vector
  for a given email.

In [2]:
print('\nPre-processing sample email (emailSample1.txt)\n')

# Read the raw sample email, then convert it into a word indices list
# using the vocabulary file that sits next to the helper modules.
sample_path = path_to_modules + 'emailSample1.txt'
file_contents = read_file(sample_path)
vocab_path = path_to_modules + 'vocab.txt'
word_indices = process_email(file_contents, vocab_path)

# Show the resulting indices beneath a heading.
print('Word Indices: \n')
print(word_indices)


Pre-processing sample email (emailSample1.txt)


==== Processed Email ====

  anyone knows how much it costs to host a web portal well it depends on how many visitors youre expectingthis can be anywhere from less than number bucks a month to a couple of dollarnumberyou should checkout httpaddr or perhaps amazon ecnumberif youre running something bigto unsubscribe yourself from this mailing list send an email toemailaddr
Word Indices: 

[86, 916, 794, 1077, 883, 370, 1699, 790, 1822, 1831, 883, 431, 1171, 794, 1002, 1895, 238, 162, 89, 688, 945, 1663, 1120, 1062, 1699, 375, 1162, 1510, 799, 1182, 1237, 1895, 1440, 1547, 1758, 1896, 688, 1676, 992, 961, 1477, 71, 530]


## Part 2: Feature Extraction
  Convert each email into a vector of features in $\mathbb{R}^n$.

In [3]:
print('\nExtracting features from sample email (emailSample1.txt)\n')

# Turn the word indices of the sample email into its feature vector.
features = email_features(word_indices)

# Summarise the vector: its length and how many entries are positive.
n_features = len(features)
n_nonzero = sum(features > 0)
print('Length of feature vector: ', n_features)
print('Number of non-zero entries: ', n_nonzero)


Extracting features from sample email (emailSample1.txt)

Length of feature vector:  1899
Number of non-zero entries:  38


## Part 3: Train Linear SVM for Spam Classification 
  Train a linear classifier to determine if an email is Spam or Not-Spam.

In [4]:
# Read the spam training set: the .mat file supplies the feature matrix 'X'
# and the label vector 'y' (flattened to a 1-D array).
train_mat = sio.loadmat(path_to_modules + 'spamTrain.mat')
X, y = train_mat['X'], train_mat['y'].flatten()

print('\nTraining Linear SVM (Spam Classification)\n')
print('(this may take 1 to 2 minutes) ...\n')

# SVM regularisation parameter.
C = 0.1
# NOTE(review): max_iter=200 is a hard cap on solver iterations, so the fit
# may stop before converging — confirm this limit is intended.
model = svm.SVC(kernel='linear', C=C, max_iter=200, tol=1e-3)
model.fit(X, y)

# Percentage of training examples whose predicted label equals the true label.
train_pred = model.predict(X)
train_accuracy = np.mean(train_pred == y) * 100
print('Training Accuracy: ', train_accuracy)


Training Linear SVM (Spam Classification)

(this may take 1 to 2 minutes) ...





Training Accuracy:  99.6


## Part 4: Test Spam Classification
  After training the classifier, we can evaluate it on a test set. We have
  included a test set in spamTest.mat.

In [5]:
# Read the test split: 'Xtest' holds the features and 'ytest' the labels
# (flattened to a 1-D array).
mat_contents = sio.loadmat(path_to_modules + 'spamTest.mat')
Xtest, ytest = mat_contents['Xtest'], mat_contents['ytest'].flatten()

print('\nEvaluating the trained Linear SVM on a test set ...\n')

# Score the already-fitted model on the test emails and report the
# percentage of labels it gets right.
p = model.predict(Xtest)
test_accuracy = np.mean(p == ytest) * 100
print('Test Accuracy: ', test_accuracy)


Evaluating the trained Linear SVM on a test set ...

Test Accuracy:  98.1


## Part 5: Top Predictors of Spam 
  Since the model we are training is a linear SVM, we can inspect the
  weights learned by the model to understand better how it is determining
  whether an email is spam or not. The following code finds the words with
  the highest weights in the classifier. Informally, the classifier
  'thinks' that these words are the most likely indicators of spam.

In [6]:
# Pair every learned weight with its feature index and order the pairs from
# the largest weight to the smallest; the result is a list of (index, value).
indexed_weights = enumerate(model.coef_[0])
weights = sorted(indexed_weights, key=lambda pair: pair[1], reverse=True)
vocabList = get_vocab_list(path_to_modules + 'vocab.txt')

# Two-column table header.
print('\nTop predictors of spam: \n')
print('%-12s%-12s' % ("Word", "Weight"))
print('%-12s%-12s\n' % ("___", "___"))

# One row per word for the 15 largest weights.
for rank in range(15):
    feature_idx, weight = weights[rank]
    print('%-12s%-12f' % (vocabList[feature_idx], weight))

print('\n')



Top predictors of spam: 

Word        Weight      
___         ___         

our         0.391337    
click       0.379293    
remov       0.365469    
visit       0.335558    
guarante    0.327508    
basenumb    0.292663    
dollar      0.258750    
bodi        0.233422    
ga          0.222452    
below       0.212933    
price       0.209824    
most        0.201052    
will        0.199862    
al          0.198814    
am          0.189730    




## Part 6: Try Your Own Emails 
  Now that we've trained the spam classifier, you can use it on your own
  emails! In the starter code, we have included spamSample1.txt,
  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
  The following code reads in each of these emails and then uses the
  learned SVM classifier to determine whether the email is Spam or
  Not Spam.

In [7]:
def classify_email(filename):
    """Classify a single email file with the trained model and print the result.

    The file is read, converted to word indices using the vocabulary at
    ``path_to_modules + 'vocab.txt'``, turned into a feature vector and passed
    to the notebook-level ``model``.

    Args:
        filename: Path of the email text file to classify.

    Returns:
        None. The prediction array is printed (1 indicates spam, 0 indicates
        not spam).
    """
    raw_email = read_file(filename)
    vocab_path = path_to_modules + 'vocab.txt'
    indices = process_email(raw_email, vocab_path)
    # reshape(1, -1) presents the single email as a one-row 2-D array.
    x = email_features(indices).reshape(1, -1)
    pred = model.predict(x)

    summary = '\nProcessed {}\n\nSpam Classification: {}\n'.format(filename, pred)
    print(summary)
    print('(1 indicates spam, 0 indicates not spam)\n\n')

In [8]:
# Classify each bundled sample in turn (by file name: two regular emails,
# then two spam emails).
for sample_name in ('emailSample1.txt', 'emailSample2.txt',
                    'spamSample1.txt', 'spamSample2.txt'):
    classify_email(path_to_modules + sample_name)


==== Processed Email ====

  anyone knows how much it costs to host a web portal well it depends on how many visitors youre expectingthis can be anywhere from less than number bucks a month to a couple of dollarnumberyou should checkout httpaddr or perhaps amazon ecnumberif youre running something bigto unsubscribe yourself from this mailing list send an email toemailaddr

Processed ../../algorithms_in_python/week_7/ex6/emailSample1.txt

Spam Classification: [0]

(1 indicates spam, 0 indicates not spam)



==== Processed Email ====

 folks my first time posting  have a bit of unix experience but am new to linux just got a new pc at home  dell box with windows xp added a second hard diskfor linux partitioned the disk and have installed suse numbernumber from cd which wentfine except it didnt pick up my monitor i have a dell branded enumberfpp number lcd flat panel monitor and a nvidia geforcenumbertinumber video card both of which are probably too new to feature in suses defaultset i d