In [117]:
#!/usr/bin/python
%matplotlib inline

import sys
import pickle
import pandas as pd
sys.path.append("../tools/")
import matplotlib.pyplot as plt
from tester import test_classifier
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import numpy as np

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

<h3> Basic Analysis </h3>

In [118]:
dataset = data_dict

In [119]:
#There are 146 total people in the dataset
len(dataset)

146

In [120]:
#Famous Enron poi
print dataset['LAY KENNETH L']

{'salary': 1072321, 'to_messages': 4273, 'deferral_payments': 202911, 'total_payments': 103559793, 'exercised_stock_options': 34348384, 'bonus': 7000000, 'restricted_stock': 14761694, 'shared_receipt_with_poi': 2411, 'restricted_stock_deferred': 'NaN', 'total_stock_value': 49110078, 'expenses': 99832, 'loan_advances': 81525000, 'from_messages': 36, 'other': 10359729, 'from_this_person_to_poi': 16, 'poi': True, 'director_fees': 'NaN', 'deferred_income': -300000, 'long_term_incentive': 3600000, 'email_address': 'kenneth.lay@enron.com', 'from_poi_to_this_person': 123}


In [121]:
#Compared to average employee
print dataset['METTS MARK']

{'salary': 365788, 'to_messages': 807, 'deferral_payments': 'NaN', 'total_payments': 1061827, 'exercised_stock_options': 'NaN', 'bonus': 600000, 'restricted_stock': 585062, 'shared_receipt_with_poi': 702, 'restricted_stock_deferred': 'NaN', 'total_stock_value': 585062, 'expenses': 94299, 'loan_advances': 'NaN', 'from_messages': 29, 'other': 1740, 'from_this_person_to_poi': 1, 'poi': False, 'director_fees': 'NaN', 'deferred_income': 'NaN', 'long_term_incentive': 'NaN', 'email_address': 'mark.metts@enron.com', 'from_poi_to_this_person': 38}


In [122]:
#As a starting point we will include all of the features in our list
features_list = ['poi', 'bonus', 'deferred_income', 'director_fees',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'other', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages',
'total_payments', 'total_stock_value']

In [123]:
#Here we convert our data to a pandas dataframe, which makes it wasier ot work with
enron_data = pd.DataFrame(dataset)

#Transpose our data to turn names of employees into rows and attributes into columns
enron_data = enron_data.transpose()

In [124]:
# Remove errors, these obviously dont belong in this dataset
dataset.pop('TOTAL', 0)
dataset.pop('THE TRAVEL AGENCY IN THE PARK', 0)

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

In [125]:
print enron_data.columns.values
print "Total Number of Features: ", len(enron_data.columns.values)

['bonus' 'deferral_payments' 'deferred_income' 'director_fees'
 'email_address' 'exercised_stock_options' 'expenses' 'from_messages'
 'from_poi_to_this_person' 'from_this_person_to_poi' 'loan_advances'
 'long_term_incentive' 'other' 'poi' 'restricted_stock'
 'restricted_stock_deferred' 'salary' 'shared_receipt_with_poi'
 'to_messages' 'total_payments' 'total_stock_value']
Total Number of Features:  21


<h4> Counting NaN Values </h4>

In [126]:
columns = []

for i in enron_data.columns.values:
    columns.append(i)


for i in columns:
    print i
    print enron_data[enron_data[i]=="NaN"].count()[i]

bonus
64
deferral_payments
107
deferred_income
97
director_fees
129
email_address
35
exercised_stock_options
44
expenses
51
from_messages
60
from_poi_to_this_person
60
from_this_person_to_poi
60
loan_advances
142
long_term_incentive
80
other
53
poi
0
restricted_stock
36
restricted_stock_deferred
128
salary
51
shared_receipt_with_poi
60
to_messages
60
total_payments
21
total_stock_value
20


In [127]:
#Here we see that the names of people categorized as POI. There are only 18 POIs.
print enron_data[enron_data['poi']==True]['poi']

BELDEN TIMOTHY N        True
BOWEN JR RAYMOND M      True
CALGER CHRISTOPHER F    True
CAUSEY RICHARD A        True
COLWELL WESLEY          True
DELAINEY DAVID W        True
FASTOW ANDREW S         True
GLISAN JR BEN F         True
HANNON KEVIN P          True
HIRKO JOSEPH            True
KOENIG MARK E           True
KOPPER MICHAEL J        True
LAY KENNETH L           True
RICE KENNETH D          True
RIEKER PAULA H          True
SHELBY REX              True
SKILLING JEFFREY K      True
YEAGER F SCOTT          True
Name: poi, dtype: object


In [128]:
#Lets manually look at POIs to see if we can get any insights.
print enron_data[enron_data['poi']==True]

                        bonus deferral_payments deferred_income director_fees  \
BELDEN TIMOTHY N      5249999           2144013        -2334434           NaN   
BOWEN JR RAYMOND M    1350000               NaN            -833           NaN   
CALGER CHRISTOPHER F  1250000               NaN         -262500           NaN   
CAUSEY RICHARD A      1000000               NaN         -235000           NaN   
COLWELL WESLEY        1200000             27610         -144062           NaN   
DELAINEY DAVID W      3000000               NaN             NaN           NaN   
FASTOW ANDREW S       1300000               NaN        -1386055           NaN   
GLISAN JR BEN F        600000               NaN             NaN           NaN   
HANNON KEVIN P        1500000               NaN        -3117011           NaN   
HIRKO JOSEPH              NaN             10259             NaN           NaN   
KOENIG MARK E          700000               NaN             NaN           NaN   
KOPPER MICHAEL J       80000

<h4>Basic Observations: </h4>
<br>
None of the POIs have a director fee. Most of them have a bonus. None of them had restricted_stock_deferred, but going back to our previous cell we see that there were 128 NaN values for that column so this doesnt really tell us much. They seem to have a high shared receipt with POI. They also seem to have fairly high total_payments and total_stock_value

In [129]:
#Create new feature
for k, v in dataset.iteritems():
    if v['bonus'] > 8000000:
        dataset[k].update({"Unusually_High_Bonus": 1})
    else:
        dataset[k].update({"Unusually_High_Bonus": 0})



<h2> Testing Classifiers</h2>

In [130]:
#Testing our Classifier without the new feature
features_list = ['poi', 'bonus', 'deferred_income', 'director_fees',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'other', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages',
'total_payments', 'total_stock_value']

In [131]:
### Extract features and labels from dataset for local testing
data = featureFormat(dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)


In [132]:
# Example starting point. Try investigating other evaluation techniques!
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [147]:
#Naive Bayes Classifier
clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.840909090909


In [134]:
#SVM classifier, literally exact same accuracy as Naive Bayes
clf = svm.SVC()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.883720930233


In [135]:
#Decision Tree Classifier, Surprisingly much lower than SVM and Naive Bayes
#DTC has different accuracy after running multiple times. NB and SVM never change. 
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.837209302326


<h4>Testing New Feature</h4>

In [136]:
#Testing the same classifer with the new feature added
features_list = ['poi', 'bonus', 'deferred_income', 'director_fees',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'other', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages',
'total_payments', 'total_stock_value', 'Unusually_High_Bonus']

In [137]:
data = featureFormat(dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [138]:
#Our accuracy went down by 4%, so my hypothesis was incorrect and the new feature will not be used. 
clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.840909090909


<h4> Grid Search</h4>

In [146]:
#Grid search with basic 1 to 10 range parameters
param_grid = {'max_depth': range(1, 10)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), param_grid)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.795454545455


In [140]:
#Changing our range from 10 to 5 had no effect on the final accuracy
param_grid = {'max_depth': range(1, 5)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), param_grid)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.818181818182


In [141]:
#Increasing our range by 5 does not seem to making a impact, lets try something else. 
param_grid = {'max_depth': range(1, 15)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), param_grid)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.818181818182


In [142]:
#Changing our first value did increase our accuracy by 3%
param_grid = {'max_depth': range(5, 15)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), param_grid)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.840909090909


In [143]:
#CHanging the parameters further does not seem to make a difference, so this is probably our highest accuracy
param_grid = {'max_depth': range(10, 30)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), param_grid)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.840909090909


In [144]:
#List of all the features
print enron_data.columns.values

['bonus' 'deferral_payments' 'deferred_income' 'director_fees'
 'email_address' 'exercised_stock_options' 'expenses' 'from_messages'
 'from_poi_to_this_person' 'from_this_person_to_poi' 'loan_advances'
 'long_term_incentive' 'other' 'poi' 'restricted_stock'
 'restricted_stock_deferred' 'salary' 'shared_receipt_with_poi'
 'to_messages' 'total_payments' 'total_stock_value']


<h3> Naive Bayes Classifier</h3>

In [162]:
clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.840909090909


In [163]:
#First we start with all the features to see our precision and recall
features_list = ['poi', 'bonus', 'deferred_income', 'director_fees',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'other', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages',
'total_payments', 'total_stock_value']


#We have passed the mark for Recall but not precision
test_classifier(clf, dataset, features_list, folds = 1000);

GaussianNB(priors=None)
	Accuracy: 0.75720	Precision: 0.25652	Recall: 0.43250	F1: 0.32204	F2: 0.38032
	Total predictions: 15000	True positives:  865	False positives: 2507	False negatives: 1135	True negatives: 10493



In [164]:
#Lets try removing 1 feature to see what happens. Lets pick director_fees
features_list = ['poi', 'bonus', 'deferred_income',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'other', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages',
'total_payments', 'total_stock_value' ]


#Our Precision went up so we are on the right track
test_classifier(clf, dataset, features_list, folds = 1000);

GaussianNB(priors=None)
	Accuracy: 0.78740	Precision: 0.27973	Recall: 0.37750	F1: 0.32134	F2: 0.35284
	Total predictions: 15000	True positives:  755	False positives: 1944	False negatives: 1245	True negatives: 11056



In [165]:
#Lets try removing another feature to see what happens. Lets pick "other" it should increase our score
features_list = ['poi', 'bonus', 'deferred_income',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages',
'total_payments', 'total_stock_value' ]


#Our precision increased 2%, so close we need less than 1% to reach our mark.
test_classifier(clf, dataset, features_list, folds = 1000);

GaussianNB(priors=None)
	Accuracy: 0.78933	Precision: 0.29374	Recall: 0.41300	F1: 0.34331	F2: 0.38198
	Total predictions: 15000	True positives:  826	False positives: 1986	False negatives: 1174	True negatives: 11014



In [151]:
#Maybe salary will help? Lets find out. 
features_list = ['poi', 'bonus', 'deferred_income',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'restricted_stock',
 'restricted_stock_deferred',  'shared_receipt_with_poi', 'to_messages',
'total_payments', 'total_stock_value' ]


#Nope. Our precision went down, so we are now going in the wrong direction. lets add salary back.
test_classifier(clf, dataset, features_list, folds = 1000);

GaussianNB(priors=None)
	Accuracy: 0.78587	Precision: 0.28201	Recall: 0.39200	F1: 0.32803	F2: 0.36364
	Total predictions: 15000	True positives:  784	False positives: 1996	False negatives: 1216	True negatives: 11004



In [152]:
#Maybe to_messages? Lets see
features_list = ['poi', 'bonus', 'deferred_income',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
'total_payments', 'total_stock_value' ]


#Yes! we do achieve a greater than 30% precision
test_classifier(clf, dataset, features_list, folds = 1000);

GaussianNB(priors=None)
	Accuracy: 0.79600	Precision: 0.30741	Recall: 0.42300	F1: 0.35606	F2: 0.39342
	Total predictions: 15000	True positives:  846	False positives: 1906	False negatives: 1154	True negatives: 11094



In [153]:
#Final subset feature list for Naive Bayes
features_list = ['poi', 'bonus', 'deferred_income',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
'total_payments', 'total_stock_value' ]

test_classifier(clf, dataset, features_list, folds = 1000);

GaussianNB(priors=None)
	Accuracy: 0.79600	Precision: 0.30741	Recall: 0.42300	F1: 0.35606	F2: 0.39342
	Total predictions: 15000	True positives:  846	False positives: 1906	False negatives: 1154	True negatives: 11094



<h3>Decision Tree Classifier</h3>

In [157]:
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred, labels_test)

0.840909090909


In [158]:
#Running the same feature list for a Decision Tree Classifier, we get much different results
features_list = ['poi', 'bonus', 'deferred_income',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
'total_payments', 'total_stock_value' ]

#Now, niether our Precision nor recall are above 30%
test_classifier(clf, dataset, features_list, folds = 1000);

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.81100	Precision: 0.28244	Recall: 0.27100	F1: 0.27660	F2: 0.27321
	Total predictions: 15000	True positives:  542	False positives: 1377	False negatives: 1458	True negatives: 11623



In [159]:
#Removing bonus we have recall above 30% and Precision less than 1% away.
features_list = ['poi', 'deferred_income',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
'total_payments', 'total_stock_value' ]

test_classifier(clf, dataset, features_list, folds = 1000);

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.80787	Precision: 0.29602	Recall: 0.32000	F1: 0.30754	F2: 0.31490
	Total predictions: 15000	True positives:  640	False positives: 1522	False negatives: 1360	True negatives: 11478



In [160]:
#Now Removing restricted stock we are over 30% for both Recall and Precision
features_list = ['poi', 'deferred_income',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
'total_payments', 'total_stock_value' ]

test_classifier(clf, dataset, features_list, folds = 1000);

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.81280	Precision: 0.32265	Recall: 0.36750	F1: 0.34362	F2: 0.35756
	Total predictions: 15000	True positives:  735	False positives: 1543	False negatives: 1265	True negatives: 11457



In [161]:
#Final Feature list for decision tree
features_list = ['poi', 'deferred_income',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
'total_payments', 'total_stock_value' ]


test_classifier(clf, dataset, features_list, folds = 1000);

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.81533	Precision: 0.32950	Recall: 0.37200	F1: 0.34946	F2: 0.36264
	Total predictions: 15000	True positives:  744	False positives: 1514	False negatives: 1256	True negatives: 11486



In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, dataset, features_list)