In [4]:
# %load 'poi_id.py'
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### I will keep all of the features in the list
### I will use SelectKBest to identify the features


features_list = ['poi','loan_advances', 'deferred_income', 'exercised_stock_options', 
                 'long_term_incentive', 'from_poi_to_this_person']



### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

    
    
### Task 2: Remove outliers

data_dict.pop( 'TOTAL', 0 )  #remove the total line
data_dict.pop('LOCKHART EUGENE E',0) #no data




### Task 3: Create new feature(s)
# A feature called "per_stock" that calculates the ratio of total payments to total stock value
for i in data_dict:
    
    #if no stock value, can't do calculation. 0 result.
    if data_dict[i]['total_stock_value']=="NaN":
        data_dict[i]['per_stock'] = "NaN"
        
    #if no total payments, can't do calculation. 0 result.    
    elif data_dict[i]['total_payments']=="NaN":
        data_dict[i]['per_stock'] = "NaN"
    
    #else, per_stock is the total payments / total stock value
    else:
        data_dict[i]['per_stock']= (float(data_dict[i]['total_payments'] )) / (float(data_dict[i]['total_stock_value']))
        
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)


### Task 4: Try a varity of classifiers

#import various sklearn packages
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

#define parts of the pipeline I will be using
scaler = MinMaxScaler()
skb = SelectKBest()
pca = PCA(n_components=2)

#algorithm choices
dt = DecisionTreeClassifier(random_state=0)

###set parameter dictionaries for GridSearchCV

#SelectKBest__k
param_skb = [2,3,4,5,6,7,8]

parameters_dt = {'SelectKBest__k': [5],
                'Algorithm__min_samples_split' : [1, 2, 3, 4],
                'Algorithm__min_samples_leaf' : [1,2,3,4]}



### Task 5: Tune your classifier to achieve better than .3 precision and recall 

#create pipeline
pipe = Pipeline(steps = [
        ("Scaler", scaler ),
        ("SelectKBest", skb),
        ("Algorithm", dt)
    ])


#perform GridSearch on pipeline
gs = GridSearchCV(pipe, parameters_dt, scoring='f1')




from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

#fit the gridsearch on training data
gs.fit(features_train, labels_train)

#assign gridsearch result to classifier
clf = gs.best_estimator_

#get the parameters returned by GridSearch
best_parameters = gs.best_estimator_.get_params()


print " "
print "Optimal Model - by Grid Search"
print clf
print " "

features_new=[features_list[1:][i]for i in clf.named_steps['SelectKBest'].get_support(indices=True)]
print features_new
print " "


# Print Results  (will print the Grid Search score)
from sklearn.metrics import classification_report
from tester import test_classifier, dump_classifier_and_data

labels_pred = gs.predict(features)

print "Grid Search Classification report:" 
print " "
print classification_report(labels, labels_pred)
print ' ' 

# Print Results  (will print the tester.py score)
print "tester.py Classification report:" 
print " "
test_classifier(clf, my_dataset, features_list)
print " "

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)

 
Optimal Model - by Grid Search
Pipeline(steps=[('Scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('SelectKBest', SelectKBest(k=5, score_func=<function f_classif at 0x00000000148E6358>)), ('Algorithm', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            random_state=0, splitter='best'))])
 
['loan_advances', 'deferred_income', 'exercised_stock_options', 'long_term_incentive', 'from_poi_to_this_person']
 
Grid Search Classification report:
 
             precision    recall  f1-score   support

        0.0       0.97      0.92      0.95       117
        1.0       0.62      0.83      0.71        18

avg / total       0.93      0.91      0.92       135

 
tester.py Classification report:
 
Pipeline(steps=[('Scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('SelectKBest', SelectKBest(k=5, score_func=<fu