In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [1]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass

# load the SPAM email training dataset

X,y = utils.load_mat('data/spamTrain.mat')
yy = np.ones(y.shape)
yy[y==0] = -1

# load the SPAM email test dataset

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()

##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  what should C be? What should num_iters be? Should X be scaled?               #
#  should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #
##################################################################################
#X should be scaled to reduce computational time
scaler = preprocessing.StandardScaler().fit(X)
scaleX = scaler.transform(X)

one_array = np.ones(scaleX.shape[0])
XX = np.concatenate([one_array[:,np.newaxis],scaleX],axis=1)

#Seperating out a validation set for optimization of C, num_iters, and learning rates
scaler = preprocessing.StandardScaler().fit(X)
scaleX = scaler.transform(X)

one_array = np.ones(scaleX.shape[0])
XX = np.concatenate([one_array[:,np.newaxis],scaleX],axis=1)

#Seperating out a validation set for optimization of C, num_iters, and learning rates
training_set_size = int(XX.shape[0]*0.8)
random_sampling = np.random.randint(XX.shape[0],size = XX.shape[0])
scrambled_data = XX[random_sampling,:]
scrambled_labels = yy[random_sampling]
XX_train = scrambled_data[:training_set_size,:]
XX_val = scrambled_data[training_set_size:,:]
yy_train = scrambled_labels[:training_set_size]
yy_val = scrambled_labels[training_set_size:]

Cvals = [0.01,0.03,0.1,0.3,1,3,10,30]
#Testing out different C_vals
best_accuracy = 0
best_C = 0
for C in Cvals:
    svm = LinearSVM_twoclass()
    svm.theta = np.zeros((XX_train.shape[1],))
    svm.train(XX_train,yy_train,learning_rate=1e-4,reg=C,num_iters=1000,verbose=False,batch_size=XX_train.shape[0])

    y_pred = svm.predict(XX_val)
    accuracy = metrics.accuracy_score(yy_val,y_pred)
    print "Accuracy on validation data = ", metrics.accuracy_score(yy_val,y_pred)
    if accuracy>best_accuracy:
        best_accuracy = accuracy
        best_C = C
        best_svm = svm

#Testing different learning rates and number of iterations to find optimal one
learning_rates = [1e-2,1e-3,1e-4,1e-5]
num_iters = [1000,5000,10000]
best_accuracy = 0
best_learning_rate = 0
best_num_iter = 0
for learning_rate in learning_rates:
    for num_iter in num_iters:
        svm.theta = np.zeros((XX_train.shape[1],))
        svm.train(XX_train,yy_train,learning_rate,best_C,num_iter,verbose=False,batch_size=XX_train.shape[0])

        y_pred = svm.predict(XX_val)
        accuracy = metrics.accuracy_score(yy_val,y_pred)
        print "Accuracy on validation data = ", metrics.accuracy_score(yy_val,y_pred)
        if accuracy>best_accuracy:
            best_accuracy = accuracy
            best_num_iter = num_iter
            best_learning_rate = learning_rate

print "Best C = ", best_C
print "Best num iter = ", best_num_iter
print "Best learning rate = ", best_learning_rate

##################################################################################
# YOUR CODE HERE for testing your best model's performance                       #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################
yy_test = np.ones(y_test.shape)
yy_test[y_test==0] = -1

scaler = preprocessing.StandardScaler().fit(X_test)
scaleX = scaler.transform(X_test)

one_array = np.ones(scaleX.shape[0])
XX_test = np.concatenate([one_array[:,np.newaxis],scaleX],axis=1)

y_pred = best_svm.predict(XX_test)
print "Accuracy on testing data = ", metrics.accuracy_score(yy_test,y_pred)




Accuracy on validation data =  0.9125
Accuracy on validation data =  0.9125
Accuracy on validation data =  0.9125
Accuracy on validation data =  0.94125
Accuracy on validation data =  0.945
Accuracy on validation data =  0.955
Accuracy on validation data =  0.975
Accuracy on validation data =  0.98625
Accuracy on validation data =  0.98875
Accuracy on validation data =  0.99
Accuracy on validation data =  0.99125
Accuracy on validation data =  0.99125
Accuracy on validation data =  0.9875
Accuracy on validation data =  0.9875
Accuracy on validation data =  0.98625
Accuracy on validation data =  0.98875
Accuracy on validation data =  0.99
Accuracy on validation data =  0.955
Accuracy on validation data =  0.98
Accuracy on validation data =  0.98875
Best C =  30
Best num iter =  10000
Best learning rate =  0.01
Accuracy on testing data =  0.3


In [28]:
##################################################################################
# ANALYSIS OF MODEL: Print the top 15 words that are predictive of spam and for  #
# ham. Hint: use the coefficient values of the learned model                     #
##################################################################################
y_pred = best_svm.predict(XX_test)
print "Accuracy on testing data = ", metrics.accuracy_score(yy_test,y_pred)

y_pred = best_svm.predict(XX)
print "Accuracy on training data = ", metrics.accuracy_score(yy,y_pred)

words, inv_words = utils.get_vocab_dict()

non_one_thetas = best_svm.theta[1:]
sorted_thetas = non_one_thetas.argsort()[-15:][::-1]
for theta in sorted_thetas:
    print words[theta]
#Printed below are the words most associated with spam    
    
##################################################################################
#                    END OF YOUR CODE                                            #
##################################################################################

Accuracy on testing data =  0.977
Accuracy on training data =  0.98425
[0.13298095 0.10820128 0.08204523 0.07765171 0.07716495 0.07148931
 0.07063565 0.06857259 0.06591054 0.06246492 0.06095593 0.05679279
 0.05622434 0.04853656 0.04790067]
clearli
remot
base
otherwis
herba
young
gt
natur
player
franc
believ
york
off
creativ
fb


In [None]:
##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  what should C be? What should num_iters be? Should X be scaled?               #
#  should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #
##################################################################################
#It seems that the linear models do classify the test data very well with a testing data accuracy of 0.977 and
#training data accuracy of 0.98425

#We will test a simple Gaussian Kernel to see if we the performance improves
#Attempting a Gaussian Kernel
sigma = 1

#Utilizing the scaled value of X to avoid overflow in the Gaussian Kernel
K = np.array([utils.gaussian_kernel(x1,x2,sigma) for x1 in XX_train for x2 in XX_train]).reshape(XX_train.shape[0],XX_train.shape[0])

scaler = preprocessing.StandardScaler().fit(K)
scaleK = scaler.transform(K)

KK = np.vstack([np.ones((scaleK.shape[0],)),scaleK.T]).T
svm.theta = np.zeros((KK.shape[1],))

svm.train(KK,yy_train,learning_rate=best_learning_rate,reg=best_C,num_iters=best_num_iters,verbose=True,batch_size=KK.shape[0])

##################################################################################
# YOUR CODE HERE for testing your best model's performance                       #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################

Ktest = np.array([utils.gaussian_kernel(x1,x2,sigma) for x1 in XX_test for x2 in XX_train]).reshape(XX_test.shape[0],XX_train.shape[0])
scaler = preprocessing.StandardScaler().fit(Kval)
scaleK = scaler.transform(Kval)

KKtest = np.vstack([np.ones((scaleK.shape[0],)),scaleK.T]).T

y_pred = svm.predict(KKtest)
accuracy = metrics.accuracy_score(yy_test,y_pred)
print "Accuracy on testing data = ", metrics.accuracy_score(yy_test,y_pred)


##################################################################################
# ANALYSIS OF MODEL: Print the top 15 words that are predictive of spam and for  #
# ham. Hint: use the coefficient values of the learned model                     #
##################################################################################
words, inv_words = utils.get_vocab_dict()

non_one_thetas = best_svm.theta[1:]
sorted_thetas = non_one_thetas.argsort()[-15:][::-1]
for theta in sorted_thetas:
    print words[theta]

##################################################################################
#                    END OF YOUR CODE                                            #
##################################################################################