In [5]:
import pickle
import numpy
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer

### fix the global numpy seed so repeated runs are comparable
numpy.random.seed(42)


### The words (features) and authors (labels), already largely processed.
### These files should have been created from the previous (Lesson 10)
### mini-project.
words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"

### Open the pickles in context managers so the file handles are closed
### as soon as loading finishes (or fails).
### The mode stays "r" on purpose: the files are written in text mode ("w")
### further down in this notebook, so they must be read back the same way.
### NOTE: pickle.load can run arbitrary code -- only load files you created.
with open(words_file, "r") as words_in:
    word_data = pickle.load(words_in)
with open(authors_file, "r") as authors_in:
    authors = pickle.load(authors_in)


### test_size is the fraction of events assigned to the test set
### (0.1 = 10%; the remainder go into training)
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

### fit the vocabulary on the training emails only, then reuse it for the
### test emails
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]

# Overfit decision tree training and accuracy

In [6]:
from sklearn import tree
from sklearn.metrics import accuracy_score

### Train a decision tree on the (deliberately tiny) training set and
### score it on the held-out test emails.
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

### accuracy_score expects (y_true, y_pred); the printed value is the
### accuracy expressed as a percentage
print("Accuracy = %s %%" % (accuracy_score(labels_test, pred) * 100,))

Accuracy = 95.0511945392 %


# Most important feature

In [8]:
### Report every feature whose importance in the fitted tree exceeds 0.2.
### The importances are read once instead of on every loop iteration.
importances = clf.feature_importances_
for i in range(len(importances)):
    if importances[i] > 0.2:
        print("Feature importance = %s" % (importances[i],))
        print("Feature number = %s" % (i,))

### Look the top feature up from the model rather than hard-coding its
### column number: the index changes whenever the vocabulary changes.
top_feature = importances.argmax()
print("Most important word is %s" % (vectorizer.get_feature_names()[top_feature],))

Feature importance = 0.666666666667
Feature number = 14343
Most important word is cgermannsf


# Remove sshacklens and cgermannsf

In [None]:
import os
import pickle
import re
import sys
from nltk.stem.snowball import SnowballStemmer
import string
sys.path.append( "../tools/" )

def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top, strip the punctuation, stem every
        word and return a string that contains all the stemmed words
        in the email (each word followed by a single space)

        f      -- an opened, seekable file object; it is rewound to the
                  start before being read
        return -- "" when the "X-FileName:" marker is not present,
                  otherwise the stemmed words, each followed by one space

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)

        """

    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata; only the text between the first marker and the
    ### next one (if the marker occurs again) is kept
    content = all_text.split("X-FileName:")
    if len(content) < 2:
        return ""

    ### remove punctuation
    text_string = re.sub("[" + re.escape(string.punctuation) + "]", "", content[1])

    ### split the text string into individual words, stem each word and
    ### glue the stems back together in one pass, one space after each stem
    stemmer = SnowballStemmer("english")
    return "".join(stemmer.stem(word) + " " for word in text_string.split())

### strings stripped from every email (applied in this order); presumably
### they identify the author too directly -- "cgermannsf" dominated the
### first decision tree
signature_words = ["sara", "shackleton", "chris", "germani",
                   "sshacklens", "cgermannsf"]

from_data = []
word_data = []

### temp_counter is a way to speed up the development--there are
### thousands of emails from Sara and Chris, so running over all of them
### can take a long time
### re-enable the increment inside the loop to stop after the first 100
### emails; while it stays commented out every email is processed
temp_counter = 0

### the context managers close both list files even if an email fails to open
with open("../text_learning/from_sara.txt", "r") as from_sara, \
     open("../text_learning/from_chris.txt", "r") as from_chris:
    for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
        for path in from_person:
            ### only look at the first emails when developing
            ### once everything is working, leave this line disabled to run over full dataset
            ### temp_counter += 1
            if temp_counter < 100:
                ### drop only the line terminator; slicing off the last
                ### character would damage a final line that has no newline
                path = os.path.join('..', path.rstrip("\r\n"))
                print(path)

                ### use parseOutText to extract the text from the opened email
                with open(path, "r") as email:
                    text = parseOutText(email)

                ### remove any instances of the signature words
                for signature_word in signature_words:
                    text = text.replace(signature_word, "")

                ### append the text to word_data
                word_data.append(text)
                ### label the email: "0" if it is from Sara, "1" if it is from Chris
                ### (the labels are stored as strings)
                if name == "sara":
                    from_data.append("0")
                else:
                    from_data.append("1")

print("emails processed")

### write the pickles inside context managers so the data is flushed and the
### files are closed before any later cell reads them back
with open("../text_learning/your_word_data.pkl", "w") as word_data_out:
    pickle.dump(word_data, word_data_out)
with open("../text_learning/your_email_authors.pkl", "w") as from_data_out:
    pickle.dump(from_data, from_data_out)

..\maildir/bailey-s/deleted_items/101.
..\maildir/bailey-s/deleted_items/106.
..\maildir/bailey-s/deleted_items/132.
..\maildir/bailey-s/deleted_items/185.
..\maildir/bailey-s/deleted_items/186.
..\maildir/bailey-s/deleted_items/187.
..\maildir/bailey-s/deleted_items/193.
..\maildir/bailey-s/deleted_items/195.
..\maildir/bailey-s/deleted_items/214.
..\maildir/bailey-s/deleted_items/215.
..\maildir/bailey-s/deleted_items/233.
..\maildir/bailey-s/deleted_items/242.
..\maildir/bailey-s/deleted_items/243.
..\maildir/bailey-s/deleted_items/244.
..\maildir/bailey-s/deleted_items/246.
..\maildir/bailey-s/deleted_items/247.
..\maildir/bailey-s/deleted_items/254.
..\maildir/bailey-s/deleted_items/259.
..\maildir/bailey-s/deleted_items/260.
..\maildir/bailey-s/deleted_items/261.
..\maildir/bailey-s/deleted_items/263.
..\maildir/bailey-s/deleted_items/278.
..\maildir/bailey-s/deleted_items/290.
..\maildir/bailey-s/deleted_items/296.
..\maildir/bailey-s/deleted_items/302.
..\maildir/bailey-s/delet

# New decision tree accuracy and important words

In [10]:
import pickle
import numpy
from sklearn import cross_validation
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

### fix the global numpy seed so repeated runs are comparable
numpy.random.seed(42)


### The words (features) and authors (labels), already largely processed.
### These files are the ones re-created by the cleaning cell above, with
### the signature words removed.
words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"

### Context managers close the file handles as soon as loading finishes.
### The mode stays "r" because the pickles are written in text mode ("w").
### NOTE: pickle.load can run arbitrary code -- only load files you created.
with open(words_file, "r") as words_in:
    word_data = pickle.load(words_in)
with open(authors_file, "r") as authors_in:
    authors = pickle.load(authors_in)


### test_size is the fraction of events assigned to the test set
### (0.1 = 10%; the remainder go into training)
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

### fit the vocabulary on the training emails only, then reuse it for the
### test emails
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]

clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

### accuracy_score expects (y_true, y_pred); printed as a percentage
print("Accuracy = %s %%" % (accuracy_score(labels_test, pred) * 100,))

### report every feature whose importance exceeds 0.2; the importances are
### read once instead of on every loop iteration
importances = clf.feature_importances_
for i in range(len(importances)):
    if importances[i] > 0.2:
        print("Feature importance = %s" % (importances[i],))
        print("Feature number = %s" % (i,))

Accuracy = 81.6268486917 %
Feature importance = 0.216297993167
Feature number = 18849
Feature importance = 0.420772351027
Feature number = 21323


In [11]:
### Look the important words up from the fitted tree (same 0.2 threshold as
### the cell above) instead of hard-coding column numbers, which change
### whenever the vocabulary changes. The feature-name list is built once.
feature_names = vectorizer.get_feature_names()
important_words = [feature_names[i]
                   for i, importance in enumerate(clf.feature_importances_)
                   if importance > 0.2]
print("Most important words are " + " and ".join(important_words))

Most important words are fax and houectect
