In [11]:
"""
A classic way to overfit an algorithm is by using lots of features and not a lot of training data. You can find the starter 
code in feature_selection/find_signature.py. Get a decision tree up and training on the training data, and print out the 
accuracy. How many training points are there, according to the starter code?
"""

#!/usr/bin/python

import pickle
import numpy
numpy.random.seed(42)


### The words (features) and authors (labels), already largely processed.
### These files should have been created from the previous (Lesson 10)
### mini-project.
words_file = "../tools/your_word_data.pkl" 
authors_file = "../tools/your_email_authors.pkl"
word_data = pickle.load( open(words_file, "r"))
authors = pickle.load( open(authors_file, "r") )



### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]



### your code goes here

from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)

print clf.score(features_test, labels_test)




1.0


In [24]:
"""
Take your (overfit) decision tree and use the feature_importances_ attribute to get a list of the relative importance of all 
the features being used. We suggest iterating through this list (it’s long, since this is text data) and only printing out the 
feature importance if it’s above some threshold (say, 0.2--remember, if all words were equally important, each one would give 
an importance of far less than 0.01). What’s the importance of the most important feature? What is the number of this feature?

"""

#!/usr/bin/python

import pickle
import numpy
numpy.random.seed(42)


### The words (features) and authors (labels), already largely processed.
### These files should have been created from the previous (Lesson 10)
### mini-project.
words_file = "../tools/your_word_data.pkl" 
authors_file = "../tools/your_email_authors.pkl"
word_data = pickle.load( open(words_file, "r"))
authors = pickle.load( open(authors_file, "r") )



### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]



### your code goes here

from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)

print clf.score(features_test, labels_test)

# Find the top feature in the decision tree and its relative importance
import numpy as np

importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print 'Feature Ranking: '

for i in range(10):
    print "{} feature no.{} ({})".format(i+1,indices[i],importances[indices[i]])

    
"""
In order to figure out what words are causing the problem, you need to go back to the TfIdf and use the feature numbers that 
you obtained in the previous part of the mini-project to get the associated words. You can return a list of all the words in 
the TfIdf by calling get_feature_names() on it; pull out the word that’s causing most of the discrimination of the decision 
tree. What is it? Does it make sense as a word that’s uniquely tied to either Chris Germany or Sara Shackleton, a signature of 
sorts?
"""
vectorizer.get_feature_names()[33614]




0.9476678043230944
Feature Ranking: 
1 feature no.33614 (0.764705882353)
2 feature no.33201 (0.134028294862)
3 feature no.19671 (0.0749500333111)
4 feature no.24321 (0.0263157894737)
5 feature no.12617 (0.0)
6 feature no.12623 (0.0)
7 feature no.12622 (0.0)
8 feature no.12621 (0.0)
9 feature no.12620 (0.0)
10 feature no.12619 (0.0)


u'sshacklensf'

In [31]:
"""
This word seems like an outlier in a certain sense, so let’s remove it and refit. Go back to text_learning/vectorize_text.py, 
and remove this word from the emails using the same method you used to remove “sara”, “chris”, etc. Rerun vectorize_text.py, 
and once that finishes, rerun find_signature.py. Any other outliers pop up? What word is it? Seem like a signature-type word? 
(Define an outlier as a feature with importance >0.2, as before).
"""

#!/usr/bin/python

import pickle
import numpy
numpy.random.seed(42)


### The words (features) and authors (labels), already largely processed.
### These files should have been created from the previous (Lesson 10)
### mini-project.
words_file = "../tools/your_word_data.pkl" 
authors_file = "../tools/your_email_authors.pkl"
word_data = pickle.load( open(words_file, "r"))
authors = pickle.load( open(authors_file, "r") )



### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]



### your code goes here

from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)

print clf.score(features_test, labels_test)

importances = clf.feature_importances_
import numpy as np
indices = np.argsort(importances)[::-1]
print 'Feature Ranking: '
for i in range(10):
    print "{} feature no.{} ({})".format(i+1,indices[i],importances[indices[i]])


0.9476678043230944
Feature Ranking: 
1 feature no.33614 (0.764705882353)
2 feature no.33201 (0.134028294862)
3 feature no.19671 (0.0749500333111)
4 feature no.24321 (0.0263157894737)
5 feature no.12617 (0.0)
6 feature no.12623 (0.0)
7 feature no.12622 (0.0)
8 feature no.12621 (0.0)
9 feature no.12620 (0.0)
10 feature no.12619 (0.0)


In [34]:
#!/usr/bin/python

import os
import pickle
import re
import sys

sys.path.append( "../tools/" )


"""
    Starter code to process the emails from Sara and Chris to extract
    the features and get the documents ready for classification.

    The list of all the emails from Sara are in the from_sara list
    likewise for emails from Chris (from_chris)

    The actual documents are in the Enron email dataset, which
    you downloaded/unpacked in Part 0 of the first mini-project. If you have
    not obtained the Enron email corpus, run startup.py in the tools folder.

    The data is stored in lists and packed away in pickle files at the end.
"""


from_sara  = open("from_sara.txt", "r")
from_chris = open("from_chris.txt", "r")

from_data = []
word_data = []

### temp_counter is a way to speed up the development--there are
### thousands of emails from Sara and Chris, so running over all of them
### can take a long time
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0


for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        #temp_counter += 1
        #if temp_counter < 200:
            path = os.path.join('..', path[:-1])
            #print path
            email = open(path, "r")

            ### use parseOutText to extract the text from the opened email
            email_text = parseOutText(email)
            
            ### use str.replace() to remove any instances of the words
            ### ["sara", "shackleton", "chris", "germani"]
            list_rep  = ["sara", "shackleton", "chris", "germani", "sshacklensf"]
            for e in list_rep:
                email_text = email_text.replace(e,"")
            
            ### append the text to word_data
            word_data.append(email_text)
            
            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            if name == "sara":
                from_data.append(0)
            if name == "chris":
                from_data.append(1)

            email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("your_word_data.pkl", "w") )
pickle.dump( from_data, open("your_email_authors.pkl", "w") )



print word_data[152]

### in Part 4, do TfIdf vectorization here


from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english")
vectorizer.fit_transform(word_data)

print len(vect.get_feature_names())

NameError: name 'parseOutText' is not defined