In [1]:
#!/usr/bin/python

import sys
import pickle
import pandas as pd
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

# Task 1: 
Select what features you'll use.
features_list is a list of strings, each of which is a feature name.
The first feature must be "poi".

In [2]:
# The label ('poi') has to be the first entry; every entry after it is a
# candidate input feature.
features = ['poi']

# Features that are read straight from the dataset records.
features += [
    'bonus', 'deferral_payments', 'deferred_income', 'director_fees',
    'exercised_stock_options', 'expenses', 'from_messages',
    'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
    'long_term_incentive', 'other', 'restricted_stock',
    'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
    'to_messages', 'total_payments', 'total_stock_value',
]

# Engineered e-mail ratios; they are added to every record in the
# "Create new feature(s)" step of this notebook.
features += ['relative_messages_from_poi', 'relative_messages_to_poi']

# features_list is the name handed to featureFormat and to
# dump_classifier_and_data; it refers to the very same list object.
features_list = features

# Read the pickled dataset from the working directory into data_dict.
# NOTE(review): the file is opened in text mode ("r"); pickle on Python 3
# needs a binary handle, so confirm the mode before porting this notebook.
# NOTE(security): pickle.load can run arbitrary code while loading - only use
# it on files from a trusted source.
with open("final_project_dataset.pkl", "r") as dataset_handle:
    data_dict = pickle.load(dataset_handle)

# Task 2
Remove outliers / data preparation

In [3]:
# Keys flagged as outliers during data exploration (see the data exploration
# notebook); they are removed from the dataset before any feature work.
outliers = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK']

for outlier in outliers:
    # pop() with a default keeps this cell re-runnable: a plain `del` raises
    # KeyError on the second run because the key has already been removed.
    data_dict.pop(outlier, None)

# Task 3
Create new feature(s)

In [25]:
def _relative_share(part, total):
    """Return part / total as a float, or the string 'NaN' if it is undefined.

    Missing values in the dataset are stored as the string 'NaN', so the
    comparison has to happen before any conversion to float (float('NaN')
    would turn the marker into a real nan that no longer equals 'NaN').

    The ratio is reported as 'NaN' when either input is missing or when
    `total` is zero (which would otherwise raise ZeroDivisionError).
    """
    if part == 'NaN' or total == 'NaN':
        return 'NaN'
    total = float(total)
    if total == 0:
        return 'NaN'
    return float(part) / total


for employee in data_dict:
    record = data_dict[employee]

    # Each ratio is guarded by exactly the two inputs it is computed from, and
    # the two ratios are handled independently of each other, so a missing
    # input for one of them can never leak a float nan into the other.

    # Share of the messages this person received that were sent by a POI.
    record['relative_messages_from_poi'] = _relative_share(
        record['from_poi_to_this_person'], record['to_messages'])

    # Share of the messages this person sent that were addressed to a POI.
    record['relative_messages_to_poi'] = _relative_share(
        record['from_this_person_to_poi'], record['from_messages'])

In [26]:
# Store to my_dataset for easy export below.
# Note: this binds a second name to the same dict object (no copy is made), so
# any later change made through data_dict is also visible through my_dataset.
my_dataset = data_dict

In [42]:
# Build the array used for local testing from the dataset and the selected
# feature names.
# NOTE(review): the columns presumably follow the order of features_list, with
# the 'poi' label first - confirm against feature_format.py.
data = featureFormat(
    my_dataset,
    features_list,
    sort_keys=True,
)

(143L, 22L)

array([  0.00000000e+00,   4.17500000e+06,   2.86971700e+06,
        -3.08105500e+06,   0.00000000e+00,   1.72954100e+06,
         1.38680000e+04,   2.19500000e+03,   4.70000000e+01,
         6.50000000e+01,   0.00000000e+00,   3.04805000e+05,
         1.52000000e+02,   1.26027000e+05,  -1.26027000e+05,
         2.01955000e+05,   1.40700000e+03,   2.90200000e+03,
         4.48444200e+06,   1.72954100e+06,   1.61957271e-02,
         2.96127563e-02])

In [44]:
from sklearn.preprocessing import MinMaxScaler

# Split `data` into labels and input features with the project helper.
# Note: this rebinds `features`, which earlier held the list of feature
# names; features_list still refers to that list.
labels, features = targetFeatureSplit(data)

# Min-max scale the input features, fitting and transforming in two explicit
# steps on the same array.
scaler = MinMaxScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

# Task 4: 
Try a variety of classifiers
Please name your classifier clf for easy export below.
Note that if you want to do PCA or other multi-stage operations,
you'll need to use Pipelines. For more info:
http://scikit-learn.org/stable/modules/pipeline.html


In [29]:
# Provided to give you a starting point. Try a variety of classifiers.
# GaussianNB is created with its default settings; it is not fitted in this
# cell.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

# Task 5
Tune your classifier to achieve better than .3 precision and recall
using our testing script. Check the tester.py script in the final project
folder for details on the evaluation method, especially the test_classifier
function. Because of the small size of the dataset, the script uses
stratified shuffle split cross validation. For more info:
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

In [79]:
# Example starting point. Try investigating other evaluation techniques!
# sklearn.cross_validation was superseded by sklearn.model_selection and later
# removed; prefer the new module and fall back on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import f_classif, SelectKBest

# Hold-out split of the unscaled features.
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

# The same split (identical random_state) of the min-max scaled features.
# This has to start from features_scaled; splitting `features` again would
# just reproduce the unscaled split under a different name.
features_train_sc, features_test_sc, labels_train_sc, labels_test_sc = \
    train_test_split(features_scaled, labels, test_size=0.3, random_state=42)

# NB: if we only use financial features, we don't have to scale our features
# One selector per feature set: fit() returns the estimator itself, so
# re-fitting a single instance would leave both names bound to one object and
# make any scaled-vs-unscaled comparison trivially true.
selector = SelectKBest(score_func=f_classif, k=5)
selector = selector.fit(features_train, labels_train)
selector_sc = SelectKBest(score_func=f_classif, k=5)
selector_sc = selector_sc.fit(features_train_sc, labels_train_sc)

In [69]:
# Map every input feature name (features_list without the leading 'poi'
# label) to the score / p-value stored at the same position in the selector.
scores_dict = {
    name: selector.scores_[position]
    for position, name in enumerate(features_list[1:])
}
pvalues_dict = {
    name: selector.pvalues_[position]
    for position, name in enumerate(features_list[1:])
}

In [70]:
# Same name -> score mapping as above, read from the selector that belongs to
# the scaled training split.
scores_dict_sc = {
    name: selector_sc.scores_[position]
    for position, name in enumerate(features_list[1:])
}

In [71]:
# Turn each name -> value mapping into a frame indexed by feature name; the
# values end up in the column labelled 0.
scores_df, pvalues_df, scores_df_sc = (
    pd.DataFrame.from_dict(mapping, orient='index')
    for mapping in (scores_dict, pvalues_dict, scores_dict_sc)
)

In [76]:
# Features ranked from the highest to the lowest score.
# NOTE(review): sort_values returns a new frame, and only the last expression
# of a cell is rendered - confirm both tables are still displayed as intended.
scores_df.sort_values(by=0, ascending=False)
# Features ranked from the lowest to the highest p-value (ascending order is
# the default).
pvalues_df.sort_values(by=0)

Unnamed: 0,0
bonus,30.728775
salary,15.858731
relative_messages_to_poi,15.838095
shared_receipt_with_poi,10.722571
total_stock_value,10.633852
exercised_stock_options,9.680041
total_payments,8.959137
deferred_income,8.792204
restricted_stock,8.058306
long_term_incentive,7.55512


Unnamed: 0,0
bonus,2.508124e-07
salary,0.0001311455
relative_messages_to_poi,0.0001323812
shared_receipt_with_poi,0.001463355
total_stock_value,0.001527956
exercised_stock_options,0.002440498
total_payments,0.003494029
deferred_income,0.00379922
restricted_stock,0.005506929
long_term_incentive,0.007124774


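Are the scores for the scaled features the same as the scores for the non-scaled features? Apparently so. This is expected: the ANOVA F-value computed by `f_classif` does not change when a feature is rescaled linearly, which is exactly what min-max scaling does.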

In [63]:
# Compare the two score columns with a small relative tolerance rather than
# exact float equality, so rounding noise alone cannot flip the answer.
score_gap = (scores_df[0] - scores_df_sc[0]).abs()
bool((score_gap <= 1e-9 * scores_df[0].abs()).all())

True

In [80]:
# Fit the selector on the training split again, then keep only the columns it
# selected (k=5 was set when the selector was created).
selector.fit(features_train, labels_train)
selected_features = selector.transform(features_train)

(100L, 5L)

# Task 6:
Dump your classifier, dataset, and features_list so anyone can
check your results. You do not need to change anything below, but make sure
that the version of poi_id.py that you submit can be run on its own and
generates the necessary .pkl files for validating your results.

In [None]:
# Hand the classifier, the dataset and the feature-name list to the project's
# export helper.
# NOTE(review): no fit() call on clf is visible in this notebook, and
# features_list still holds every candidate feature rather than the
# SelectKBest subset - confirm that this is what should be exported.
dump_classifier_and_data(clf, my_dataset, features_list)