In [6]:
%% Machine Learning Online Class
%  Exercise 6 | Spam Classification with SVMs
%
%  Instructions
%  ------------
% 
%  This file contains code that helps you get started on the
%  exercise. You will need to complete the following functions:
%
%     gaussianKernel.m
%     dataset3Params.m
%     processEmail.m
%     emailFeatures.m
%
%  For this exercise, you will not need to change any code in this file,
%  or any other files other than those mentioned above.
%

%% Initialization
%clear ; close all; clc

%% ==================== Part 1: Email Preprocessing ====================
%  To use an SVM to classify emails into Spam vs. Non-Spam, you first need
%  to convert each email into a vector of features. In this part, you will
%  implement the preprocessing steps for each email. You should
%  complete the code in processEmail.m to produce a word indices vector
%  for a given email.

% Extract Features
% Add the exercise directory to the function search path, then read the
% sample email into file_contents, which later cells pass to processEmail.
% NOTE(review): readFile is a project helper that is not shown here;
% presumably it returns the file's text as a character array -- confirm.
addpath ("exercises/ex6");
file_contents = readFile('emailSample1.txt');

In [21]:
function word_indices = processEmail(email_contents)
    %PROCESSEMAIL Preprocess the body of an email and return vocabulary indices.
    %   word_indices = PROCESSEMAIL(email_contents) normalizes the text in
    %   email_contents, splits it into stemmed tokens, echoes those tokens
    %   to the screen (wrapped at roughly 78 characters per line), and
    %   returns the vocabulary index of every token that is found in the
    %   list returned by getVocabList().
    %
    %   Input:
    %     email_contents - character array holding the email text.
    %
    %   Output:
    %     word_indices   - indices into the vocabulary list, stacked
    %                      vertically in the order the tokens occur.
    %                      Tokens that are not in the vocabulary contribute
    %                      nothing; the result is [] if no token matches.
    %

    % Load Vocabulary
    vocabList = getVocabList();

    % Init return value
    word_indices = [];

    % ========================== Preprocess Email ===========================

    % Find the Headers ( \n\n and remove )
    % Uncomment the following lines if you are working with raw emails with the
    % full headers

    % hdrstart = strfind(email_contents, ([char(10) char(10)]));
    % email_contents = email_contents(hdrstart(1):end);

    % Lower case
    email_contents = lower(email_contents);

    % Strip all HTML
    % Replace every tag that starts with <, ends with > and has no < or >
    % in between with a single space
    email_contents = regexprep(email_contents, '<[^<>]+>', ' ');

    % Handle Numbers
    % Replace each run of one or more digits with the word 'number'
    email_contents = regexprep(email_contents, '[0-9]+', 'number');

    % Handle URLS
    % Replace strings starting with http:// or https:// with 'httpaddr'
    email_contents = regexprep(email_contents, ...
                               '(http|https)://[^\s]*', 'httpaddr');

    % Handle Email Addresses
    % Replace strings with @ in the middle with 'emailaddr'
    email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');

    % Handle $ sign
    % Replace each run of one or more $ characters with 'dollar'
    email_contents = regexprep(email_contents, '[$]+', 'dollar');


    % ========================== Tokenize Email ===========================

    % Output the email to screen as well
    fprintf('\n==== Processed Email ====\n\n');

    % Characters that separate tokens: blank, punctuation, LF and CR.
    % Built once here because it does not change between iterations.
    delimiters = [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)];

    % Number of characters already printed on the current output line
    line_len = 0;

    while ~isempty(email_contents)

        % Split off the next token; the remainder is processed next time
        [str, email_contents] = strtok(email_contents, delimiters);

        % Remove any non alphanumeric characters
        str = regexprep(str, '[^a-zA-Z0-9]', '');

        % Stem the word
        % (the porterStemmer sometimes has issues, so a token it cannot
        % handle is dropped instead of aborting the whole email)
        try
            str = porterStemmer(strtrim(str));
        catch
            continue;
        end

        % Skip tokens that ended up empty after cleaning and stemming
        if isempty(str)
            continue;
        end

        % Look up the stemmed word in the vocabulary. strcmp against the
        % cell array yields a logical mask, and find turns that mask into
        % the matching position(s); the result is empty when the word is
        % not in the vocabulary.
        index = find(strcmp(vocabList, str));
        if ~isempty(index)
            word_indices = [word_indices ; index];
        end

        % Print to screen, ensuring that the output lines are not too long
        if (line_len + length(str) + 1) > 78
            fprintf('\n');
            line_len = 0;
        end
        fprintf('%s ', str);
        line_len = line_len + length(str) + 1;

    end

    % Print footer
    fprintf('\n\n=========================\n');

end

% Preprocess the sample email. file_contents is not assigned in this cell:
% it must already exist in the workspace (the Part 1 cell sets it with
% readFile), so that cell has to be run first.
word_indices  = processEmail(file_contents);

% Print Stats
% fprintf reuses the ' %d' template for every element of word_indices, so
% all indices are printed on a single line separated by spaces.
fprintf('Word Indices: \n');
fprintf(' %d', word_indices);
fprintf('\n\n');

    getVocabList at line 9 column 5
    processEmail at line 10 column 15

==== Processed Email ====

index =  86
anyon index =  916
know index =  794
how index =  1077
much index =  883
it index =  370
cost index =  1699
to index =  790
host index = [](0x1)
a index =  1822
web index = [](0x1)
portal index =  1831
well index =  883
it index =  431
depend index =  1171
on index =  794
how index =  1002
mani index = [](0x1)

visitor index =  1893
you index =  1364
re index =  592
expect index =  1676
thi index =  238
can index =  162
be index =  89
anywher index =  688
from index =  945
less index =  1663
than index =  1120
number index = [](0x1)
buck index = [](0x1)
a index =  1062
month index =  1699

to index = [](0x1)
a index =  375
coupl index =  1162
of index =  479
dollarnumb index =  1893
you index =  1510
should index = [](0x1)
checkout index =  799
httpaddr index =  1182
or index =  1237
perhap index = [](0x1)
amazon index = [](0x1)
ecnumb index =  810

if index =  1895
your in

In [2]:
%% ==================== Part 2: Feature Extraction ====================
%  Now, you will convert each email into a vector of features in R^n. 
%  You should complete the code in emailFeatures.m to produce a feature
%  vector for a given email.



fprintf('\nExtracting features from sample email (emailSample1.txt)\n');

% Extract Features
% Re-read the sample email, turn it into vocabulary indices, and pass the
% indices to emailFeatures to build the feature vector.
% NOTE(review): emailFeatures is a project function not shown here;
% presumably it returns a binary indicator vector over the vocabulary --
% confirm.
file_contents = readFile('emailSample1.txt');
word_indices  = processEmail(file_contents);
features      = emailFeatures(word_indices);

% Print Stats
% Report the length of the feature vector and how many entries are
% strictly greater than zero.
% NOTE(review): the output recorded for this cell reports 0 non-zero
% entries, which suggests emailFeatures.m may not be implemented yet --
% confirm.
fprintf('Length of feature vector: %d\n', length(features));
fprintf('Number of non-zero entries: %d\n', sum(features > 0));


Extracting features from sample email (emailSample1.txt)
    readFile at line 8 column 5
    getVocabList at line 9 column 5
    processEmail at line 10 column 11

==== Processed Email ====

anyon know how much it cost to host a web portal well it depend on how mani 
visitor you re expect thi can be anywher from less than number buck a month 
to a coupl of dollarnumb you should checkout httpaddr or perhap amazon ecnumb 
if your run someth big to unsubscrib yourself from thi mail list send an 
email to emailaddr 

Length of feature vector: 1899
Number of non-zero entries: 0


In [None]:
%% =========== Part 3: Train Linear SVM for Spam Classification ========
%  In this section, you will train a linear classifier to determine if an
%  email is Spam or Not-Spam.

% Load the Spam Email dataset
% You will have X, y in your environment
load('spamTrain.mat');

fprintf('\nTraining Linear SVM (Spam Classification)\n')
fprintf('(this may take 1 to 2 minutes) ...\n')

% Train on (X, y) with the linearKernel function handle as the kernel.
% NOTE(review): C is passed as svmTrain's third argument; presumably it is
% the SVM regularization parameter -- confirm against svmTrain.m.
C = 0.1;
model = svmTrain(X, y, C, @linearKernel);

% Predict on the same data the model was trained on
p = svmPredict(model, X);

% Percentage of training examples whose prediction equals the label
fprintf('Training Accuracy: %f\n', mean(double(p == y)) * 100);



Training Linear SVM (Spam Classification)
(this may take 1 to 2 minutes) ...



In [None]:
%% =================== Part 4: Test Spam Classification ================
%  After training the classifier, we can evaluate it on a test set. We have
%  included a test set in spamTest.mat

% Load the test dataset
% You will have Xtest, ytest in your environment
load('spamTest.mat');

fprintf('\nEvaluating the trained Linear SVM on a test set ...\n')

% Uses the `model` variable produced by the Part 3 cell, so that cell must
% have been run in this session first.
p = svmPredict(model, Xtest);

% Percentage of test examples whose prediction equals the label
fprintf('Test Accuracy: %f\n', mean(double(p == ytest)) * 100);

In [None]:
%% ================= Part 5: Top Predictors of Spam ====================
%  Since the model we are training is a linear SVM, we can inspect the
%  weights learned by the model to understand better how it is determining
%  whether an email is spam or not. The following code finds the words with
%  the highest weights in the classifier. Informally, the classifier
%  'thinks' that these words are the most likely indicators of spam.
%

% Sort the weights and obtain the vocabulary list
% `weight` holds model.w in descending order and `idx` the original
% position of each sorted weight; idx is used below to index vocabList.
% Requires `model` from the Part 3 cell.
[weight, idx] = sort(model.w, 'descend');
vocabList = getVocabList();

% Print the 15 largest weights, one per line: the word left-justified in a
% 15-character field, followed by its weight in parentheses.
fprintf('\nTop predictors of spam: \n');
for i = 1:15
    fprintf(' %-15s (%f) \n', vocabList{idx(i)}, weight(i));
end

fprintf('\n\n');

In [None]:
%% =================== Part 6: Try Your Own Emails =====================
%  Now that you've trained the spam classifier, you can use it on your own
%  emails! In the starter code, we have included spamSample1.txt,
%  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples. 
%  The following code reads in one of these emails and then uses your 
%  learned SVM classifier to determine whether the email is Spam or 
%  Not Spam

% Set the file to be read in (change this to spamSample2.txt,
% emailSample1.txt or emailSample2.txt to see different predictions on
% different emails types). Try your own emails as well!
filename = 'spamSample1.txt';

% Read and predict
% Same pipeline as Part 2 (read -> word indices -> feature vector), followed
% by a prediction with the `model` trained in the Part 3 cell.
file_contents = readFile(filename);
word_indices  = processEmail(file_contents);
x             = emailFeatures(word_indices);
p = svmPredict(model, x);

% Report the file name and the predicted label
fprintf('\nProcessed %s\n\nSpam Classification: %d\n', filename, p);
fprintf('(1 indicates spam, 0 indicates not spam)\n\n');

