In [131]:
#!/usr/bin/python

import sys
import pickle
import pandas as pd
sys.path.append("../tools/")
import matplotlib.pyplot as plt
from tester import test_classifier
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
%matplotlib inline
from sklearn.metrics import accuracy_score

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [132]:
dataset = data_dict

In [133]:
#Here we have all the names of the people in the dataset
print dataset.keys()

['METTS MARK', 'BAXTER JOHN C', 'ELLIOTT STEVEN', 'CORDES WILLIAM R', 'HANNON KEVIN P', 'MORDAUNT KRISTINA M', 'MEYER ROCKFORD G', 'MCMAHON JEFFREY', 'HORTON STANLEY C', 'PIPER GREGORY F', 'HUMPHREY GENE E', 'UMANOFF ADAM S', 'BLACHMAN JEREMY M', 'SUNDE MARTIN', 'GIBBS DANA R', 'LOWRY CHARLES P', 'COLWELL WESLEY', 'MULLER MARK S', 'JACKSON CHARLENE R', 'WESTFAHL RICHARD K', 'WALTERS GARETH W', 'WALLS JR ROBERT H', 'KITCHEN LOUISE', 'CHAN RONNIE', 'BELFER ROBERT', 'SHANKMAN JEFFREY A', 'WODRASKA JOHN', 'BERGSIEKER RICHARD P', 'URQUHART JOHN A', 'BIBI PHILIPPE A', 'RIEKER PAULA H', 'WHALEY DAVID A', 'BECK SALLY W', 'HAUG DAVID L', 'ECHOLS JOHN B', 'MENDELSOHN JOHN', 'HICKERSON GARY J', 'CLINE KENNETH W', 'LEWIS RICHARD', 'HAYES ROBERT E', 'MCCARTY DANNY J', 'KOPPER MICHAEL J', 'LEFF DANIEL P', 'LAVORATO JOHN J', 'BERBERIAN DAVID', 'DETMERING TIMOTHY J', 'WAKEHAM JOHN', 'POWERS WILLIAM', 'GOLD JOSEPH', 'BANNANTINE JAMES M', 'DUNCAN JOHN H', 'SHAPIRO RICHARD S', 'SHERRIFF JOHN R', 'SHELBY 

In [134]:
#There are 146 total people in the dataset
len(dataset)

146

In [135]:
#Famous Enron poi
print dataset['LAY KENNETH L']

{'salary': 1072321, 'to_messages': 4273, 'deferral_payments': 202911, 'total_payments': 103559793, 'exercised_stock_options': 34348384, 'bonus': 7000000, 'restricted_stock': 14761694, 'shared_receipt_with_poi': 2411, 'restricted_stock_deferred': 'NaN', 'total_stock_value': 49110078, 'expenses': 99832, 'loan_advances': 81525000, 'from_messages': 36, 'other': 10359729, 'from_this_person_to_poi': 16, 'poi': True, 'director_fees': 'NaN', 'deferred_income': -300000, 'long_term_incentive': 3600000, 'email_address': 'kenneth.lay@enron.com', 'from_poi_to_this_person': 123}


In [136]:
#Comapared to average employee
print dataset['METTS MARK']

{'salary': 365788, 'to_messages': 807, 'deferral_payments': 'NaN', 'total_payments': 1061827, 'exercised_stock_options': 'NaN', 'bonus': 600000, 'restricted_stock': 585062, 'shared_receipt_with_poi': 702, 'restricted_stock_deferred': 'NaN', 'total_stock_value': 585062, 'expenses': 94299, 'loan_advances': 'NaN', 'from_messages': 29, 'other': 1740, 'from_this_person_to_poi': 1, 'poi': False, 'director_fees': 'NaN', 'deferred_income': 'NaN', 'long_term_incentive': 'NaN', 'email_address': 'mark.metts@enron.com', 'from_poi_to_this_person': 38}


In [137]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

features_list = ['poi','salary', 'to_messages', 'deferral_payments', 'total_payments', 'bonus']

In [138]:
enron_data = pd.DataFrame(dataset)

enron_data = enron_data.transpose()

In [139]:
### Task 2: Remove outliers
dataset.pop('TOTAL', 0)
dataset.pop('THE TRAVEL AGENCY IN THE PARK', 0)

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

In [140]:
print enron_data.columns.values

['bonus' 'deferral_payments' 'deferred_income' 'director_fees'
 'email_address' 'exercised_stock_options' 'expenses' 'from_messages'
 'from_poi_to_this_person' 'from_this_person_to_poi' 'loan_advances'
 'long_term_incentive' 'other' 'poi' 'restricted_stock'
 'restricted_stock_deferred' 'salary' 'shared_receipt_with_poi'
 'to_messages' 'total_payments' 'total_stock_value']


In [141]:
### Task 3: Create new feature(s)
#Scan of the dataset showed that poi's had a very high "total stock value"
def high_bonuses(row):
    if row['bonus'] > 8000000:
        return True
    else:
        return False


enron_data["Unusually_High_Bonus"] = enron_data.apply(high_bonuses, axis=1)


In [142]:
enron_data["Unusually_High_Bonus"].head()

ALLEN PHILLIP K       False
BADUM JAMES P          True
BANNANTINE JAMES M     True
BAXTER JOHN C         False
BAY FRANKLIN R        False
Name: Unusually_High_Bonus, dtype: bool

In [143]:
### Extract features and labels from dataset for local testing
data = featureFormat(dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)


In [144]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

In [145]:
# Example starting point. Try investigating other evaluation techniques!
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [146]:
# Provided to give you a starting point. Try a variety of classifiers.

clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.80487804878


In [147]:
print enron_data.columns.values

['bonus' 'deferral_payments' 'deferred_income' 'director_fees'
 'email_address' 'exercised_stock_options' 'expenses' 'from_messages'
 'from_poi_to_this_person' 'from_this_person_to_poi' 'loan_advances'
 'long_term_incentive' 'other' 'poi' 'restricted_stock'
 'restricted_stock_deferred' 'salary' 'shared_receipt_with_poi'
 'to_messages' 'total_payments' 'total_stock_value' 'Unusually_High_Bonus']


In [148]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

#I came to find these features achieved better than .3 precision by trial and error
features_list = ['poi', 'bonus', 'deferred_income',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
'total_payments', 'total_stock_value']

test_classifier(clf, dataset, features_list, folds = 1000);

GaussianNB(priors=None)
	Accuracy: 0.79600	Precision: 0.30741	Recall: 0.42300	F1: 0.35606	F2: 0.39342
	Total predictions: 15000	True positives:  846	False positives: 1906	False negatives: 1154	True negatives: 11094



In [149]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, dataset, features_list)