In [1]:
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = (
    # Label column: has to stay in first position.
    ["poi"]
    # Features read directly from each record of the dataset.
    + ["salary", "deferral_payments", "loan_advances", "bonus"]
    + ["restricted_stock_deferred", "deferred_income", "total_stock_value"]
    + ["expenses", "exercised_stock_options", "other"]
    + ["long_term_incentive", "restricted_stock", "director_fees"]
    # Engineered e-mail ratios; these keys are created in the Task 3 cell.
    + ["from_poi_to_this_person_ratio", "from_this_person_to_poi_ratio"]
)

### Load the dictionary containing the dataset
# Pickle streams are binary data, so the file is opened in "rb" mode; text mode
# ("r") can corrupt the stream on Windows and is rejected on Python 3.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load a dataset file that comes from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
# Earlier exploration showed that data_dict contains a "TOTAL" entry stored
# alongside the per-person records (presumably the summary row of the source
# spreadsheet -- confirm against the raw data). It is not a person, so drop it.
del data_dict["TOTAL"]



In [2]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
# my_dataset is a second name for data_dict (not a copy), so the features
# added below are visible through both names.
my_dataset = data_dict


def _message_ratio(poi_messages, all_messages):
    """Return poi_messages / all_messages as a float, or "NaN" if undefined.

    Args:
        poi_messages: Number of messages exchanged with a POI, or "NaN".
        all_messages: Total number of messages in that direction, or "NaN".

    Returns:
        The float ratio, or the string "NaN" when either count is "NaN" or
        when all_messages is 0 (which would otherwise divide by zero).
    """
    if all_messages == 0 or all_messages == "NaN" or poi_messages == "NaN":
        return "NaN"
    return poi_messages / float(all_messages)


# Feature that expresses emails to/from poi as a ratio of total emails
# Where there is no feature we save "NaN" as per featureFormat
# Assumes every record carries all four e-mail count keys read below.
for person_data in my_dataset.values():
    # Share of received messages that came from a POI.
    person_data["from_poi_to_this_person_ratio"] = _message_ratio(
        person_data["from_poi_to_this_person"], person_data["to_messages"])
    # Share of sent messages that were addressed to a POI.
    person_data["from_this_person_to_poi_ratio"] = _message_ratio(
        person_data["from_this_person_to_poi"], person_data["from_messages"])

In [17]:
### Extract features and labels from dataset for local testing
# featureFormat / targetFeatureSplit come from ../tools/feature_format.py;
# presumably the first entry of features_list ("poi") ends up as the label.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Univariate feature selection: keep the k features with the best f_classif
# score. k=3 is only the starting value; the grid search cell varies select__k.
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(score_func=f_classif, k=3)

In [19]:
# Select classifier here
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
#clf = GaussianNB()
# random_state is fixed so that repeated runs (Restart & Run All) build the
# same tree; the train/test split in the tuning cell is seeded with 42 as well.
clf = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=30,
                                  random_state=42)

# Pipeline estimators - each should be ("name", estimator) format
# The step names ("select", "clf") are the prefixes of the parameter names
# used in the grid search, e.g. "select__k" and "clf__min_samples_split".
from sklearn.pipeline import Pipeline, FeatureUnion
estimators = [("select", selector),
              ("clf", clf)]
pipe = Pipeline(estimators)

In [13]:
# Names of every tunable pipeline parameter (usable as GridSearchCV grid keys).
list(pipe.get_params())

['clf__class_weight',
 'clf__splitter',
 'clf__max_leaf_nodes',
 'clf',
 'clf__max_depth',
 'select__k',
 'clf__min_samples_leaf',
 'clf__min_weight_fraction_leaf',
 'clf__presort',
 'select__score_func',
 'steps',
 'clf__min_samples_split',
 'clf__min_impurity_split',
 'clf__max_features',
 'clf__criterion',
 'select',
 'clf__random_state']

In [15]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# Both helpers are imported from sklearn.model_selection: the older
# sklearn.cross_validation module is deprecated, and this cell already relies
# on model_selection for GridSearchCV.
from sklearn.model_selection import GridSearchCV, train_test_split

# Hold out 30% of the samples for the final check; seeded for repeatability.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

# GridSearchCV
# Keys follow the "<step name>__<parameter>" convention of the pipeline.
params = {
    "select__k": [2, 3, 4, 5, 6],
    "clf__min_samples_split": [2, 10, 20, 30, 40, 50]
}
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score (accuracy for classifiers) although the stated
# goal is precision/recall -- consider scoring="f1" and confirm.
gridsearch = GridSearchCV(pipe, param_grid=params)

gridsearch.fit(features_train, labels_train)
print(gridsearch.cv_results_)

{'std_train_score': array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.02522089,  0.01845457,  0.02089446,  0.02089446,  0.02089446,
        0.01831332,  0.03885828,  0.04903801,  0.04903801,  0.04903801,
        0.03692838,  0.04189552,  0.04189552,  0.04189552,  0.04189552,
        0.03692838,  0.04189552,  0.04189552,  0.04189552,  0.04189552,
        0.03692838,  0.04189552,  0.04189552,  0.04189552,  0.04189552]), 'rank_test_score': array([26, 26, 26, 17, 22, 30, 17, 17, 17, 17, 26, 13, 14, 14, 14, 22,  1,
        1,  1,  1, 22,  1,  1,  1,  1, 22,  1,  1,  1,  1], dtype=int32), 'mean_score_time': array([ 0.00100001,  0.00060527,  0.00039864,  0.00029572,  0.00029135,
        0.00026234,  0.00025733,  0.00026107,  0.00027704,  0.00027068,
        0.00026393,  0.00027609,  0.00031002,  0.00025598,  0.00028499,
        0.00027903,  0.00032473,  0.00028634,  0.0002737 ,  0.000259  ,
        0.00028865,  0.00026631,  0.00025765,  0.00030224,  0.00025741,

In [16]:
# Show the pipeline configuration that the grid search ranked best.
print(gridsearch.best_estimator_)
# TODO: What features were selected?
# TODO: Further tuning for GaussianNB

Pipeline(steps=[('select', SelectKBest(k=3, score_func=<function f_classif at 0x1110d0a28>)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=30, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])


In [20]:
# Train the pipeline on the training split (fit returns the pipeline itself),
# then predict labels for the held-out test split.
pred = pipe.fit(features_train, labels_train).predict(features_test)

In [21]:
# Create a function to test our classifier
# More detailed stats
def clf_test(labels, pred):
    """Print accuracy, recall and precision for binary predictions.

    Args:
        labels: Sequence of true labels, expected to be 0 (negative) or
            1 (positive).
        pred: Sequence of predicted labels, same length as ``labels``.

    Returns:
        None. The three metrics are printed, one per line. A metric whose
        denominator is zero (empty input, no positive labels, or no positive
        predictions) is reported as "undefined" instead of raising
        ZeroDivisionError. If the two sequences differ in length, a message
        is printed and nothing is computed.
    """
    if len(labels) != len(pred):
        print("Lists not of same length")
        return

    tp = 0  # True positives
    tn = 0  # True negatives
    fp = 0  # False positives
    fn = 0  # False negatives
    for actual, predicted in zip(labels, pred):
        if actual == predicted:
            if actual == 1:
                tp += 1
            else:
                tn += 1
        elif actual == 0:
            fp += 1
        else:
            fn += 1

    def _ratio(numerator, denominator):
        # With a zero denominator the metric has no value; report that
        # explicitly rather than crashing the cell.
        if denominator == 0:
            return "undefined"
        return numerator / float(denominator)

    # Single-argument print(...) behaves the same on Python 2 and 3; the two
    # spaces after the colon reproduce the layout of `print "Label: ", value`.
    print("Accuracy:  %s" % _ratio(tp + tn, len(labels)))
    print("Recall:  %s" % _ratio(tp, tp + fn))
    print("Precision:  %s" % _ratio(tp, tp + fp))
# Score the held-out predictions (pred) against the true test labels.
clf_test(labels_test, pred)

Accuracy:  0.840909090909
Recall:  0.6
Precision:  0.375


In [None]:
    
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

#dump_classifier_and_data(clf, my_dataset, features_list)