In [1]:
""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use an SVM to identify emails from the Enron corpus by their authors:
    Sara has label 0
    Chris has label 1

features_train and features_test are the features for the training
and testing datasets, respectively
labels_train and labels_test are the corresponding item labels.
-------------------------------------------------------------------------
"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess



In [2]:
# Load the train/test split described in the header docstring: features_* hold the
# features and labels_* the matching author labels (Sara = 0, Chris = 1).
# NOTE(review): preprocess() is imported from email_preprocess (presumably under
# ../tools/); its implementation is not visible in this notebook.
features_train, features_test, labels_train, labels_test = preprocess()

no. of Chris training emails: 7936
no. of Sara training emails: 7884


In [3]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

## Accuracy of author identification

In [4]:
clf = SVC(kernel="linear")

t0 = time()
clf.fit(features_train, labels_train)
print "training time: ", round(time()-t0, 3), "s"


t0 = time()
pred = clf.predict(features_test)
print "testing time: ", round(time()-t0, 3), "s"

accuracy = accuracy_score(labels_test, pred)
print "Accuracy of SVC classifier is : ", accuracy

training time:  215.463 s
testing time:  20.163 s
Accuracy of SVC classifier is :  0.984072810011


## A Smaller Training Set
One way to speed up an algorithm is to train it on a smaller training dataset.   
The tradeoff is that the accuracy almost always goes down when you do this.

In [5]:
clf = SVC(kernel="linear")

# Reducing size of training data to improve training time 
features_train = features_train[:len(features_train)/100] 
labels_train = labels_train[:len(labels_train)/100] 

t0 = time()
clf.fit(features_train, labels_train)
print "training time after using smaller training dataset : ", round(time()-t0, 3), "s"


t0 = time()
pred = clf.predict(features_test)
print "testing time after using smaller training dataset : ", round(time()-t0, 3), "s"

accuracy = accuracy_score(labels_test, pred)
print "Accuracy of SVC classifier after using smaller training dataset is : ", accuracy

training time after using smaller training dataset :  0.125 s
testing time after using smaller training dataset :  1.347 s
Accuracy of SVC classifier after using smaller training dataset is :  0.884527872582


__Only 1% of the training data, but still over 88% accuracy?  Not too shabby!__

## Deploying RBF kernel

In [6]:
clf = SVC(kernel="rbf")

# Reducing size of training data to improve training time 
#features_train = features_train[:len(features_train)/100] 
#labels_train = labels_train[:len(labels_train)/100] 
# Using the same reduced dataset as used above.

t0 = time()
clf.fit(features_train, labels_train)
print "training time after using smaller training dataset and rbf kernel : ", round(time()-t0, 3), "s"


t0 = time()
pred = clf.predict(features_test)
print "testing time after using smaller training dataset and rbf kernel : ", round(time()-t0, 3), "s"

accuracy = accuracy_score(labels_test, pred)
print "Accuracy of SVC classifier after using smaller training dataset and rbf kernel is : ", accuracy

training time after using smaller training dataset and rbf kernel :  0.136 s
testing time after using smaller training dataset and rbf kernel :  1.645 s
Accuracy of SVC classifier after using smaller training dataset and rbf kernel is :  0.616040955631


__So, on the same 1% training subset, switching from the linear kernel to the rbf kernel (with the default C) lowers accuracy from about 88% to about 62%.__ 

## Optimize C Parameter and find which value of C gives better accuracy 

Keeping the __rbf__ kernel and the same 1% training subset,  
only the value of C is changed.

#### C = 10.0

In [7]:
clf = SVC(kernel="rbf", C = 10.0)

t0 = time()
clf.fit(features_train, labels_train)
print "training time after using smaller training dataset and rbf kernel (and C = 10) : ", round(time()-t0, 3), "s"


t0 = time()
pred = clf.predict(features_test)
print "testing time after using smaller training dataset and rbf kernel (and C = 10) : ", round(time()-t0, 3), "s"

accuracy = accuracy_score(labels_test, pred)
print "Accuracy of SVC classifier after using smaller training dataset and rbf kernel (and C = 10) is : ", accuracy

training time after using smaller training dataset and rbf kernel (and C = 10) :  0.135 s
testing time after using smaller training dataset and rbf kernel (and C = 10) :  1.447 s
Accuracy of SVC classifier after using smaller training dataset and rbf kernel (and C = 10) is :  0.616040955631


#### C = 100.0

In [8]:
clf = SVC(kernel="rbf", C = 100.0)

t0 = time()
clf.fit(features_train, labels_train)
print "training time after using smaller training dataset and rbf kernel (and C = 100.0) : ", round(time()-t0, 3), "s"


t0 = time()
pred = clf.predict(features_test)
print "testing time after using smaller training dataset and rbf kernel (and C = 100.0) : ", round(time()-t0, 3), "s"

accuracy = accuracy_score(labels_test, pred)
print "Accuracy of SVC classifier after using smaller training dataset and rbf kernel (and C = 100.0) is : ", accuracy

training time after using smaller training dataset and rbf kernel (and C = 100.0) :  0.17 s
testing time after using smaller training dataset and rbf kernel (and C = 100.0) :  1.63 s
Accuracy of SVC classifier after using smaller training dataset and rbf kernel (and C = 100.0) is :  0.616040955631


#### C = 1000.0

In [9]:
clf = SVC(kernel="rbf", C = 1000.0)

t0 = time()
clf.fit(features_train, labels_train)
print "training time after using smaller training dataset and rbf kernel (and C = 1000.0) : ", round(time()-t0, 3), "s"


t0 = time()
pred = clf.predict(features_test)
print "testing time after using smaller training dataset and rbf kernel (and C = 1000.0) : ", round(time()-t0, 3), "s"

accuracy = accuracy_score(labels_test, pred)
print "Accuracy of SVC classifier after using smaller training dataset and rbf kernel (and C = 1000.0) is : ", accuracy

training time after using smaller training dataset and rbf kernel (and C = 1000.0) :  0.135 s
testing time after using smaller training dataset and rbf kernel (and C = 1000.0) :  1.287 s
Accuracy of SVC classifier after using smaller training dataset and rbf kernel (and C = 1000.0) is :  0.821387940842


#### C = 10000.0

In [10]:
clf = SVC(kernel="rbf", C = 10000.0)

t0 = time()
clf.fit(features_train, labels_train)
print "training time after using smaller training dataset and rbf kernel (and C = 10000.0) : ", round(time()-t0, 3), "s"


t0 = time()
pred = clf.predict(features_test)
print "testing time after using smaller training dataset and rbf kernel (and C = 10000.0) : ", round(time()-t0, 3), "s"

accuracy = accuracy_score(labels_test, pred)
print "Accuracy of SVC classifier after using smaller training dataset and rbf kernel (and C = 10000.0) is : ", accuracy

training time after using smaller training dataset and rbf kernel (and C = 10000.0) :  0.139 s
testing time after using smaller training dataset and rbf kernel (and C = 10000.0) :  1.383 s
Accuracy of SVC classifier after using smaller training dataset and rbf kernel (and C = 10000.0) is :  0.892491467577


In [11]:
# Reload the data: features_train and labels_train were rebound to a 1% slice
# earlier in this notebook, so call preprocess() again to get the full-size
# training set back (the test set is rebound as well).
features_train, features_test, labels_train, labels_test = preprocess()

no. of Chris training emails: 7936
no. of Sara training emails: 7884


In [12]:
clf = SVC(kernel="rbf", C = 10000.0)

t0 = time()
clf.fit(features_train, labels_train)
print "training time using rbf kernel (and C = 10000.0) : ", round(time()-t0, 3), "s"


t0 = time()
pred = clf.predict(features_test)
print "testing time using rbf kernel (and C = 10000.0) : ", round(time()-t0, 3), "s"

accuracy = accuracy_score(labels_test, pred)
print "Accuracy of SVC classifier after using rbf kernel (and C = 10000.0) is : ", accuracy

training time using rbf kernel (and C = 10000.0) :  140.817 s
testing time using rbf kernel (and C = 10000.0) :  15.599 s
Accuracy of SVC classifier after using rbf kernel (and C = 10000.0) is :  0.990898748578


## Extracting Predictions From An SVM

What class does your SVM (0 or 1, corresponding to Sara and Chris respectively) predict for element 10 of the test set? The 26th? The 50th? (Use the RBF kernel, C=10000, and 1% of the training set. Normally you'd get the best results using the full training set, but we found that using 1% sped up the computation considerably and did not change our results--so feel free to use that shortcut here.)

And just to be clear, the data point numbers that we give here (10, 26, 50) assume a zero-indexed list. So the correct answer for element _100_ would be found using something like __answer=pred[100]__

In [15]:
# Predicted class (0 = Sara, 1 = Chris) for zero-indexed test element 100.
# NOTE(review): `pred` is whatever the most recently executed predict cell
# produced. The nearest preceding code cell fits on the reloaded full training
# set, while the question text asks for the 1% subset, and the execution counts
# skip from 12 to 15 -- confirm which model these answers should come from.
print pred[100]

0


In [16]:
# Predicted class (0 = Sara, 1 = Chris) for zero-indexed test element 10.
print pred[10]

1


In [17]:
# Predicted class (0 = Sara, 1 = Chris) for zero-indexed test element 26.
print pred[26]

0


In [18]:
# Predicted class (0 = Sara, 1 = Chris) for zero-indexed test element 50.
print pred[50]

1


## How Many Chris Emails Predicted? 
There are over 1700 test events--how many are predicted to be in the “Chris” (1) class? (Use the RBF kernel, C=10000., and the full training set.)

In [19]:
Chris = 0
Sara = 0
for i in pred:
    if i==1:
        Chris += 1
    else:
        Sara += 1

print "Number of predicted Chris' Emails : ", Chris
print "Number of predicted Sara's Emails : ", Sara

Number of predicted Chris' Emails :  877
Number of predicted Sara's Emails :  881
