In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [2]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass

# load the SPAM email training dataset

X,y = utils.load_mat('data/spamTrain.mat')
yy = np.ones(y.shape)
yy[y==0] = -1

# load the SPAM email test dataset

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()
yy_test = np.ones(y_test.shape)
yy_test[y_test == 0] = -1

##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  what should C be? What should num_iters be? Should X be scaled?               #
#  should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #
##################################################################################
Cvals = [0.01,0.03,0.1,0.3,1,3]
iterationvals = [2000,5000,10000]
learning_ratevals = [1e-3,1e-2,1e-1]

best_C = 0
best_learning_rate = 0
best_iteration = 0
best_val_acc = 0

for C in Cvals:
    for iteration in iterationvals: 
        for learning_rate in learning_ratevals:
            print "training with C: {}, num_iters: {}, learning_rate: {}".format(C, iteration, learning_rate)
            
            svm = LinearSVM_twoclass()
            svm.theta = np.zeros((X.shape[1],))
            svm.train(X, yy, learning_rate, reg=C, num_iters=iteration, verbose=False)
    
            val_acc = np.mean(svm.predict(X_test) == yy_test)
        
            print "When C = {}, num_iters = {}, learning_rate = {}, validation accuracy = {}".format(C, iteration, learning_rate, val_acc)
            print("-"*70)
            
            if val_acc >= best_val_acc:
                best_C = C
                best_iteration = iteration
                best_learning_rate = learning_rate
                
print "So the best paramater set we derive is:", best_C, best_iteration, best_learning_rate
            


training with C: 0.01, num_iters: 2000, learning_rate: 0.001
When C = 0.01, num_iters = 2000, learning_rate = 0.001, validation accuracy = 0.692
----------------------------------------------------------------------
training with C: 0.01, num_iters: 2000, learning_rate: 0.01
When C = 0.01, num_iters = 2000, learning_rate = 0.01, validation accuracy = 0.704
----------------------------------------------------------------------
training with C: 0.01, num_iters: 2000, learning_rate: 0.1
When C = 0.01, num_iters = 2000, learning_rate = 0.1, validation accuracy = 0.965
----------------------------------------------------------------------
training with C: 0.01, num_iters: 5000, learning_rate: 0.001
When C = 0.01, num_iters = 5000, learning_rate = 0.001, validation accuracy = 0.692
----------------------------------------------------------------------
training with C: 0.01, num_iters: 5000, learning_rate: 0.01
When C = 0.01, num_iters = 5000, learning_rate = 0.01, validation accuracy = 0.853

In [3]:
##################################################################################
# YOUR CODE HERE for testing your best model's perfor                            #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################

from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass

X,y = utils.load_mat('data/spamTrain.mat')
yy = np.ones(y.shape)
yy[y==0] = -1

# load the SPAM email test dataset

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()
yy_test = np.ones(y_test.shape)
yy_test[y_test == 0] = -1

svm = LinearSVM_twoclass()
svm.theta = np.zeros((X.shape[1],))
svm.train(X, yy, learning_rate=0.001, reg=10 , num_iters=20000, verbose=True, step=2000)

train_acc = np.mean(svm.predict(X) == yy)
val_acc = np.mean(svm.predict(X_test) == yy_test)

print "The accuracy for training data and validation set is, respectively, :", train_acc, val_acc

##################################################################################
# ANALYSIS OF MODEL: Print the top 15 words that are predictive of spam and for  #
# ham. Hint: use the coefficient values of the learned model                     #
##################################################################################
words, inv_words = utils.get_vocab_dict()

theta = svm.theta
np.argsort(theta)[::-1]
for i in np.argsort(theta)[::-1]:
    print(words[i + 1], theta[i])
##################################################################################
#                    END OF YOUR CODE                                            #
##################################################################################

iteration 0 / 20000: loss 10.000000
iteration 2000 / 20000: loss 0.594458
iteration 4000 / 20000: loss 0.421616
iteration 6000 / 20000: loss 0.328688
iteration 8000 / 20000: loss 0.266898
iteration 10000 / 20000: loss 0.221811
iteration 12000 / 20000: loss 0.191516
iteration 14000 / 20000: loss 0.168281
iteration 16000 / 20000: loss 0.150887
iteration 18000 / 20000: loss 0.137157
The accuracy for training data and validation set is, respectively, : 0.99475 0.992
('click', 0.55088999417872364)
('remov', 0.49984704969768418)
('our', 0.4868218907150626)
('basenumb', 0.39955397125185271)
('guarante', 0.39709629761849857)
('visit', 0.3585470533405733)
('pleas', 0.34293886968172976)
('will', 0.31763412116130324)
('dollar', 0.31396825064949996)
('nbsp', 0.2957197022739087)
('free', 0.28059292729144875)
('hour', 0.27096719349253517)
('most', 0.26910050681490127)
('enumb', 0.25995411841213795)
('price', 0.25642833971135792)
('am', 0.25574243362481092)
('below', 0.25407559356381237)
('dollarnumb