# Check Modules and Download Dataset

In [1]:
# Setup cell: verify that the required packages can be imported, then download
# and unpack the Enron e-mail corpus one directory above the notebook.
import os
import tarfile
import urllib.request

print()
print("checking for nltk")
try:
    import nltk
except ImportError:
    print("you should install nltk before continuing")

print("checking for numpy")
try:
    import numpy
except ImportError:
    print("you should install numpy before continuing")

# Catch ImportError only (as for nltk/numpy above): a bare `except:` would also
# swallow KeyboardInterrupt and hide unrelated failures behind an
# "install it" message.
print("checking for scipy")
try:
    import scipy
except ImportError:
    print("you should install scipy before continuing")

print("checking for sklearn")
try:
    import sklearn
except ImportError:
    print("you should install sklearn before continuing")

print()
print("downloading the Enron dataset (this may take a while)")
print("to check on progress, you can cd up one level, then execute <ls -lthr>")
print("Enron dataset should be last item on the list, along with its current size")
print("download will complete at about 423 MB")
url = "https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tgz"
urllib.request.urlretrieve(url, filename="../enron_mail_20150507.tgz")
print("download complete!")


print()
print("unzipping Enron dataset (this may take a while)")
# NOTE: this changes the kernel's working directory for every later cell, and
# because the path is relative, re-running this cell moves up one more level.
os.chdir("..")
# The context manager closes the archive even if extraction fails part-way.
# NOTE(review): members are extracted without filtering; on Python versions that
# support it, consider extractall(".", filter="data") for downloaded archives.
with tarfile.open("enron_mail_20150507.tgz", "r:gz") as tfile:
    tfile.extractall(".")

print("you're ready to go!")




checking for nltk
checking for numpy
checking for scipy
checking for sklearn

downloading the Enron dataset (this may take a while)
to check on progress, you can cd up one level, then execute <ls -lthr>
Enron dataset should be last item on the list, along with its current size
download will complete at about 423 MB
download complete!

unzipping Enron dataset (this may take a while)
you're ready to go!


# Prepare Dataset

In [2]:
import pickle
import numpy

# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
# Fall back to it only on installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn import cross_validation
    train_test_split = cross_validation.train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif



def preprocess(words_file = "word_data.pkl", authors_file="email_authors.pkl"):
    """ 
        Load pickled email texts and author labels and turn them into
        train/test feature matrices.

        Steps:
            -- unpickle the authors (labels) and the email texts (features)
            -- split into training/testing sets (10% testing, random_state=42)
            -- vectorize the texts into a tf-idf matrix (fit on training data only)
            -- keep the top 10% of features ranked by f_classif on the training data

        Parameters:
            words_file   -- path to the pickled list of email texts
            authors_file -- path to the pickled list of author labels

        Returns a 4-tuple:
            -- training features (dense numpy array)
            -- testing features (dense numpy array)
            -- training labels (as produced by the split, not converted)
            -- testing labels (as produced by the split, not converted)

        Side effect: prints the number of training emails per author
        (the sum of the training labels is reported as the "Chris" count).
    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    ### NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(authors_file, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(words_file, "rb") as words_file_handler:
        word_data = pickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    ### (the vocabulary is learned from the training texts only)
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    chris_count = sum(labels_train)
    print("no. of Chris training emails:", chris_count)
    print("no. of Sara training emails:", len(labels_train)-chris_count)
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test



# Naive Bayes Classifier

In [3]:
""" 
    This is the code to accompany the Lesson 1 (Naive Bayes) mini-project. 

    Use a Naive Bayes Classifier to identify emails by their authors
    
    authors and labels:
    Sara has label 0
    Chris has label 1
"""

from time import time

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

# Create classifier
clf = GaussianNB()

# Fit the classifier on the training features and labels
t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time()-t0, 3), "s")

# Make prediction - Store predictions in a list named pred
t1 = time()
pred = clf.predict(features_test)
print("prediction time:", round(time()-t1, 3), "s")

# Calculate the accuracy on the test data, two equivalent ways:
# 1) let the classifier predict and score in one call
print(clf.score(features_test, labels_test)) # or

# 2) score the stored predictions; the signature is (y_true, y_pred),
#    so the true labels go first
print(accuracy_score(labels_test, pred))

#########################################################

no. of Chris training emails: 7936
no. of Sara training emails: 7884
training time: 2.069 s
prediction time: 0.295 s
0.973265073948
0.973265073948


# SVM Classifier - Linear Kernel

In [4]:
""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.
    Use a SVM to identify emails from the Enron corpus by their authors:    
    Sara has label 0
    Chris has label 1
"""
  
from time import time

from sklearn import svm
from sklearn.metrics import accuracy_score

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

# Create classifier
clf = svm.SVC(kernel = "linear")

# Fit the classifier on the training features and labels
t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time()-t0, 3), "s")

# Make prediction - Store predictions in a list named pred
t1 = time()
pred = clf.predict(features_test)
print("prediction time:", round(time()-t1, 3), "s")

# Calculate the accuracy on the test data from the predictions already in pred.
# clf.score(features_test, labels_test) would run the (slow) SVM prediction
# over the whole test set a second time to produce the same number.
print(accuracy_score(labels_test, pred))
#########################################################

no. of Chris training emails: 7936
no. of Sara training emails: 7884
training time: 209.952 s
prediction time: 21.833 s
0.984072810011


# SVM Classifier - Linear Kernel - Smaller Training Dataset

In [5]:
""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.
    Use a SVM to identify emails from the Enron corpus by their authors:    
    Sara has label 0
    Chris has label 1
"""
  
from time import time

from sklearn import svm
from sklearn.metrics import accuracy_score

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

# Slice the training dataset down to 1% of its original size,
# tossing out 99% of the training data. One shared size keeps the
# features and labels aligned.
subset_size = len(features_train) // 100
features_train = features_train[:subset_size]
labels_train = labels_train[:subset_size]

# Create classifier
clf = svm.SVC(kernel = "linear")

# Fit the classifier on the training features and labels
t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time()-t0, 3), "s")

# Make prediction - Store predictions in a list named pred
t1 = time()
pred = clf.predict(features_test)
print("prediction time:", round(time()-t1, 3), "s")

# Calculate the accuracy on the test data from the predictions already in pred;
# clf.score(features_test, labels_test) would repeat the prediction pass.
print(accuracy_score(labels_test, pred))
#########################################################

no. of Chris training emails: 7936
no. of Sara training emails: 7884
training time: 0.124 s
prediction time: 1.313 s
0.884527872582


# SVM Classifier - RBF Kernel - Smaller Training Dataset

In [6]:
""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.
    Use a SVM to identify emails from the Enron corpus by their authors:    
    Sara has label 0
    Chris has label 1
"""
  
from time import time

from sklearn import svm
from sklearn.metrics import accuracy_score

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

# Slice the training dataset down to 1% of its original size,
# tossing out 99% of the training data. One shared size keeps the
# features and labels aligned.
subset_size = len(features_train) // 100
features_train = features_train[:subset_size]
labels_train = labels_train[:subset_size]

# Create classifier
clf = svm.SVC(kernel = "rbf", C = 10000)

# Fit the classifier on the training features and labels
t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time()-t0, 3), "s")

# Make prediction - Store predictions in a list named pred
t1 = time()
pred = clf.predict(features_test)
print("prediction time:", round(time()-t1, 3), "s")

# Calculate the accuracy on the test data from the predictions already in pred;
# clf.score(features_test, labels_test) would repeat the prediction pass.
print(accuracy_score(labels_test, pred))

# What class does your SVM (0 or 1, corresponding to Sara and Chris respectively)
# predict for element 10 of the test set? The 26th? The 50th?
print(pred[10])
print(pred[26])
print(pred[50])
#########################################################

no. of Chris training emails: 7936
no. of Sara training emails: 7884
training time: 0.138 s
prediction time: 1.195 s
0.892491467577
1
0
1


# SVM Classifier - RBF Kernel

In [7]:
""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.
    Use a SVM to identify emails from the Enron corpus by their authors:    
    Sara has label 0
    Chris has label 1
"""
  
from time import time

from sklearn import svm
from sklearn.metrics import accuracy_score

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

# Create classifier
clf = svm.SVC(kernel = "rbf", C = 10000)

# Fit the classifier on the training features and labels
t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time()-t0, 3), "s")

# Make prediction - Store predictions in a list named pred
t1 = time()
pred = clf.predict(features_test)
print("prediction time:", round(time()-t1, 3), "s")

# Calculate the accuracy on the test data from the predictions already in pred.
# clf.score(features_test, labels_test) would run the (slow) SVM prediction
# over the whole test set a second time to produce the same number.
print(accuracy_score(labels_test, pred))
#########################################################

no. of Chris training emails: 7936
no. of Sara training emails: 7884
training time: 148.786 s
prediction time: 15.764 s
0.990898748578


# SVM Classifier - RBF Kernel
## How Many Chris Emails Predicted?

In [8]:
""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.
    Use a SVM to identify emails from the Enron corpus by their authors:    
    Sara has label 0
    Chris has label 1
"""
  
from time import time

from sklearn import svm
from sklearn.metrics import accuracy_score

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

# Create classifier
clf = svm.SVC(kernel = "rbf", C = 10000)

# Fit the classifier on the training features and labels
t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time()-t0, 3), "s")

# Make prediction - Store predictions in a list named pred
t1 = time()
pred = clf.predict(features_test)
print("prediction time:", round(time()-t1, 3), "s")

# Find amount of Chris Emails predicted (Chris has label 1)
count = sum(1 for label in pred if label == 1)

print(count)

# Alternatively, because we have only 0s and 1s, the sum of pred[i] will give the answer.
print(sum(pred))

# Calculate the accuracy on the test data from the predictions already in pred.
# clf.score(features_test, labels_test) would run the (slow) SVM prediction
# over the whole test set a second time to produce the same number.
print(accuracy_score(labels_test, pred))
#########################################################

no. of Chris training emails: 7936
no. of Sara training emails: 7884
training time: 136.576 s
prediction time: 13.663 s
877
877
0.990898748578


# Decision Trees Classifier

In [9]:
""" 
    This is the code to accompany the Lesson 3 (decision tree) mini-project.
    Use a Decision Tree to identify emails from the Enron corpus by author:    
    Sara has label 0
    Chris has label 1
"""
    
from time import time

### preprocess() hands back, in order: training features, testing features,
### training labels and testing labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

from sklearn import tree

# Decision tree that only splits nodes holding at least 40 samples
clf = tree.DecisionTreeClassifier(min_samples_split = 40)

# Train on the training split and report how long it took
fit_start = time()
clf.fit(features_train, labels_train)
fit_seconds = round(time() - fit_start, 3)
print("training time:", fit_seconds, "s")

# Predict the test split (kept in pred) and report how long it took
predict_start = time()
pred = clf.predict(features_test)
predict_seconds = round(time() - predict_start, 3)
print("prediction time:", predict_seconds, "s")

# Accuracy on the test split
test_accuracy = clf.score(features_test, labels_test)
print(test_accuracy)
#########################################################

no. of Chris training emails: 7936
no. of Sara training emails: 7884
training time: 61.897 s
prediction time: 0.032 s
0.978384527873


## What's the number of features in the data? 

In [10]:
# The length of the first training row is the number of feature columns
# that preprocess() kept.
n_features = len(features_train[0])
n_features

3785