In [1]:
#!/usr/bin/python

import sys
import pickle
import numpy
import pandas as pd
sys.path.append("../")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of feature-name strings.
### The first entry must be the label, "poi".

# Candidate pool considered before the final selection (kept for reference):
#   'poi', 'bonus', 'deferral_payments', 'deferred_income', 'director_fees',
#   'exercised_stock_options', 'expenses', 'from_messages',
#   'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
#   'long_term_incentive', 'other', 'restricted_stock',
#   'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
#   'total_payments', 'total_stock_value', 'emailwithPOI',
#   'short_term_interest', 'long_term_interest'

# Final feature list: the label followed by the five retained features.
features_list = [
    'poi',
    'bonus',
    'exercised_stock_options',
    'salary',
    'total_stock_value',
    'short_term_interest',
]

### Load the dictionary containing the dataset
# Later cells index the result as data_dict[key][feature], i.e. a dict of
# per-entry feature dicts.
# NOTE(review): the file is opened in text mode ("r"); the pickle documentation
# recommends binary mode, which is mandatory on Python 3 -- confirm how the
# .pkl was written before changing the mode.
# NOTE(review): pickle.load can execute arbitrary code, so only load a
# final_project_dataset.pkl that comes from a trusted source.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)


In [2]:
### Task 2: Remove outliers

# Entries dropped from the dataset, in removal order:
#  - 'TOTAL'
#  - 'LOCKHART EUGENE E': any data he has is NaN
#  - 'THE TRAVEL AGENCY IN THE PARK': not a person. According to the
#    enron61702insiderpay pdf document, this is a travel agency that books
#    business travels for Enron employees
outlier_keys = ['TOTAL', 'LOCKHART EUGENE E', 'THE TRAVEL AGENCY IN THE PARK']

# pop(key, 0) returns 0 instead of raising when a key is already absent, so
# the cell can be re-run safely.
removed_records = [data_dict.pop(key, 0) for key in outlier_keys]

# Leave the last removed record as the cell's value so it is still displayed.
removed_records[-1]

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

In [3]:
#replace NaN values with 0 for the features listed in NaNlist
import math

NaNlist = ['bonus','deferred_income','from_this_person_to_poi', 'from_messages', 'from_poi_to_this_person', 'to_messages',
          'salary', 'other', 'expenses', 'exercised_stock_options', 'long_term_incentive', 'restricted_stock']

# Each record is a per-entry feature dict; float() turns the 'NaN' string
# placeholder into a float nan so math.isnan can detect it.
for record in data_dict.values():
    for feature in NaNlist:
        if math.isnan(float(record[feature])):
            record[feature] = 0

#Export cleaned dataset to Excel
#import pandas as pd
#df=pd.DataFrame.from_records(data_dict)
#df.to_excel('Enron_dataset_cleaned.xlsx', sheet_name='sheet1', index=True)


In [4]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
# my_dataset is the same object as data_dict (no copy), so the features added
# below are visible through both names.
my_dataset = data_dict

for person, datapoints in my_dataset.items():
    # emailwithPOI: share of this person's sent mail that went to a POI plus the
    # share of their received mail that came from a POI, each expressed in
    # percent (so the sum ranges from 0 to 200). The idea is that a person with
    # more communication with a POI is more likely to collude in the scandal.
    sent_total = datapoints['from_messages']
    received_total = datapoints['to_messages']
    if sent_total == 0 or received_total == 0:
        # No usable e-mail counts (missing values were set to 0 earlier).
        datapoints['emailwithPOI'] = 0
    else:
        # Multiply by 100.0 so the division is true division; with integer
        # counts Python 2 would otherwise floor each share before summing.
        datapoints['emailwithPOI'] = (
            100.0 * datapoints['from_this_person_to_poi'] / sent_total
            + 100.0 * datapoints['from_poi_to_this_person'] / received_total)

    # short_term_interest combines all payments that an Enron employee can
    # receive within a year. These payments tend to incentivize the employee to
    # focus on short-term success of the company to maximize payout.
    datapoints['short_term_interest'] = (datapoints['salary'] + datapoints['bonus'] + datapoints['other']
                        + datapoints['expenses'] + datapoints['exercised_stock_options'])

    # long_term_interest forces the employee to think about long-term success of
    # the company as these incentives cannot be received right away.
    datapoints['long_term_interest'] = (datapoints['long_term_incentive'] + datapoints['restricted_stock'])
        

In [5]:
### Extract features and labels from dataset for local testing
# featureFormat (imported from feature_format) receives the dataset, the
# feature names and sort_keys=True. Presumably it returns one numeric row per
# entry with columns in features_list order -- TODO confirm in feature_format.py.
data = featureFormat(my_dataset, features_list, sort_keys = True)
# targetFeatureSplit separates that result into labels and features; presumably
# the label is the first column ('poi', the first name in features_list) --
# TODO confirm in feature_format.py.
labels, features = targetFeatureSplit(data)


In [6]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# The alternatives below are left commented out; only the last, uncommented
# block actually binds clf.

# SVC (not used, takes too long)
#from sklearn.svm import SVC
#clf = SVC(C=1, kernel="linear")

# KNeighbors (also takes too long)
#from sklearn.neighbors import KNeighborsClassifier
#clf = KNeighborsClassifier(n_neighbors=3)

# DecisionTree
#from sklearn import tree
#clf = tree.DecisionTreeClassifier(min_samples_split=5)

# Random Forest
#from sklearn.ensemble import RandomForestClassifier
#clf = RandomForestClassifier(n_estimators=20)

# Naive Bayes: the classifier that later cells fit, score and export as clf.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

In [7]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

from sklearn.cross_validation import KFold
from sklearn.feature_selection import SelectKBest, f_classif

### Use KFold to split the data for local validation.
# The earlier version first computed a train_test_split hold-out and then
# looped over every fold, re-assigning the same four variables each time, so
# only the LAST of the 3 folds was ever used. The hold-out split was dead code
# and has been removed; the last fold is now selected explicitly, which keeps
# the train/test sets (and every downstream result) unchanged.
# NOTE(review): this evaluates on a single fold only. For a fold-averaged
# estimate, fit and score inside a loop over kf, or rely on tester.py.
kf=KFold(len(labels),3)
train_indices, test_indices = list(kf)[-1]

# Make the training and testing sets from the selected fold.
features_train = [features[ii] for ii in train_indices]
features_test = [features[ii] for ii in test_indices]
labels_train = [labels[ii] for ii in train_indices]
labels_test = [labels[ii] for ii in test_indices]

# Use SelectKBest and f_classif to choose the best features. The selector is
# fitted on the training fold only and then applied to both splits.
# With the current five non-label entries in features_list, k=5 keeps every
# feature; the scores and p-values are still useful for ranking them.
selector = SelectKBest(f_classif, k=5)
selector.fit(features_train, labels_train)
new_features_train = selector.transform(features_train)
new_features_test = selector.transform(features_test)


In [8]:
# Print out the chosen features. Go back to feature_list and include only the chosen features
# Candidate names are every entry of features_list except the leading label.
testing_features = features_list[1:]
# Boolean mask from the fitted selector: True where a feature was kept.
chosen_features = selector.get_support(indices=False)

# One row per candidate feature, indexed by whether the selector kept it.
selection_table = pd.DataFrame(testing_features, chosen_features)
print(selection_table)

                            0
True                    bonus
True  exercised_stock_options
True                   salary
True        total_stock_value
True      short_term_interest


In [9]:
# Report the selector's p-values and scores, then chart the scores per feature.
import matplotlib.pyplot as plt

scores = selector.scores_

print(selector.pvalues_)
print(scores)
# Same scores again, this time as the index next to the feature names.
print(pd.DataFrame(testing_features, scores))

# One bar per candidate feature, labelled with the feature name.
tick_positions = range(len(testing_features))
plt.bar(tick_positions, scores)
plt.xticks(tick_positions, testing_features, rotation='vertical')
plt.show()

[ 0.00084106  0.00264535  0.00253519  0.00154023  0.00074865]
[ 11.92874028   9.55575731   9.64221446  10.66461269  12.17471465]
                                 0
11.928740                    bonus
9.555757   exercised_stock_options
9.642214                    salary
10.664613        total_stock_value
12.174715      short_term_interest


In [10]:
# Fit clf on the selected training features, predict on the test split, and
# report timings, accuracy, precision, recall and the confusion matrix.
from time import time
from sklearn import metrics
from sklearn.metrics import confusion_matrix


def _report(*parts):
    """Print the parts separated by single spaces, each converted with str()."""
    print(" ".join(str(part) for part in parts))


fit_started = time()
clf.fit(new_features_train, labels_train)
_report("training time:", round(time() - fit_started, 3), "s")

predict_started = time()
pred = clf.predict(new_features_test)
_report("predicting time:", round(time() - predict_started, 3), "s")

accuracy = clf.score(new_features_test, labels_test)
_report("accuracy is:", accuracy)

_report("Precision score is:", metrics.precision_score(labels_test, pred))
_report("Recall score is:", metrics.recall_score(labels_test, pred))

_report("Confusion Matrix")
# labels=[1,0] puts class 1 in the first row/column. The matrix is left as the
# final expression so the notebook displays it as the cell's value.
confusion_matrix(labels_test, pred, labels=[1,0])

training time: 0.002 s
predicting time: 0.001 s
accuracy is: 0.913043478261
Precision score is: 0.6
Recall score is: 0.6
Confusion Matrix


array([[ 3,  2],
       [ 2, 39]])

In [11]:
#Decision tree: checking feature importance and updated feature list
#import pandas as pd
#testing_features = features_list[1:]
#print pd.DataFrame(testing_features, clf.feature_importances_).sort(columns=None, axis=0, ascending=False)

In [12]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# Hands the classifier, the dataset (with the engineered features added in
# place) and the feature list to dump_classifier_and_data from tester.
# Presumably this writes the .pkl files mentioned above -- confirm in tester.py.
dump_classifier_and_data(clf, my_dataset, features_list)