In [1]:
import sys
import pickle
import pandas as pd
from pprint import pprint
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.metrics import confusion_matrix, precision_score, recall_score

In [2]:
# Task 1: Select what features you'll use.
# features_list is a list of strings, each of which is a feature name.
# The first feature must be "poi".

# Reference only (never executed): the full list of candidate features that
# was kept alongside the selection below.
#   'poi', 'bonus', 'deferral_payments', 'deferred_income', 'director_fees',
#   'email_address', 'exercised_stock_options', 'expenses', 'from_messages',
#   'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
#   'long_term_incentive', 'other', 'restricted_stock',
#   'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
#   'to_messages', 'total_payments', 'total_stock_value'

# Features actually used. Compared with the reference list, 'email_address'
# and 'other' are omitted and the two 'relative_messages_*' entries are added.
features_list = ['poi',
                 'bonus',
                 'deferral_payments',
                 'deferred_income',
                 'director_fees',
                 'exercised_stock_options',
                 'expenses',
                 'from_messages',
                 'from_poi_to_this_person',
                 'from_this_person_to_poi',
                 'loan_advances',
                 'long_term_incentive',
                 'restricted_stock',
                 'restricted_stock_deferred',
                 'salary',
                 'shared_receipt_with_poi',
                 'to_messages',
                 'total_payments',
                 'total_stock_value',
                 'relative_messages_to_poi',
                 'relative_messages_from_poi']

# Load the dictionary containing the dataset.
# Pickle streams are binary data, so the file is opened in "rb" mode; text
# mode can corrupt the stream on platforms that translate line endings.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# Task 2: Remove outliers / data preparation
outliers = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK']  # See notebook data exploration

for outlier in outliers:
    del data_dict[outlier]

In [14]:
def _share_of_messages(poi_count, total_count):
    """Return poi_count / total_count as a float.

    Returns the dataset's 'NaN' placeholder when total_count is zero, so a
    person with no messages in that direction does not raise
    ZeroDivisionError.
    """
    total_count = float(total_count)
    if total_count == 0.0:
        return 'NaN'
    return float(poi_count) / total_count


# Add two engineered features to every employee record:
#   relative_messages_from_poi = from_poi_to_this_person / to_messages
#   relative_messages_to_poi   = from_this_person_to_poi / from_messages
for employee in data_dict:
    record = data_dict[employee]
    to_messages = record['to_messages']
    from_messages = record['from_messages']
    from_poi_to_this_person = record['from_poi_to_this_person']
    from_this_person_to_poi = record['from_this_person_to_poi']

    email_counts = (to_messages, from_messages,
                    from_poi_to_this_person, from_this_person_to_poi)

    if 'NaN' in email_counts:
        # Any missing e-mail count marks both ratios as missing.
        record['relative_messages_from_poi'] = 'NaN'
        record['relative_messages_to_poi'] = 'NaN'
    else:
        record['relative_messages_from_poi'] = _share_of_messages(
            from_poi_to_this_person, to_messages)
        record['relative_messages_to_poi'] = _share_of_messages(
            from_this_person_to_poi, from_messages)

**Remaining issue: because some `to_messages` entries are "NaN", the result of the division above will be a genuine float NaN rather than the string "NaN". These entries will not be converted to zero when handled by `featureFormat()`.**

In [15]:
# Spot-check one employee's engineered feature through a DataFrame view of the dataset.
employee_frame = pd.DataFrame.from_dict(data_dict, orient='index')
employee_frame.loc['BADUM JAMES P', 'relative_messages_to_poi']

'NaN'

In [282]:
# Task 3: Create new feature(s)
df_data = pd.DataFrame.from_dict(data_dict, orient='index')
numeric_columns = ['salary',
                   'to_messages',
                   'deferral_payments',
                   'total_payments',
                   'exercised_stock_options',
                   'bonus',
                   'restricted_stock',
                   'shared_receipt_with_poi',
                   'restricted_stock_deferred',
                   'total_stock_value',
                   'expenses',
                   'loan_advances',
                   'from_messages',
                   'other',
                   'from_this_person_to_poi',
                   'director_fees',
                   'deferred_income',
                   'long_term_incentive',
                   'from_poi_to_this_person']

# Missing values are stored as the string 'NaN'; casting to float turns them
# into real NaN so the arithmetic below propagates them.
df_data[numeric_columns] = df_data[numeric_columns].astype(float)

# Share of each person's sent / received mail that involves a POI.
df_data['relative_messages_to_poi'] = df_data.from_this_person_to_poi/df_data.from_messages
df_data['relative_messages_from_poi'] = df_data.from_poi_to_this_person/df_data.to_messages

# Keep only the selected features and zero-fill the gaps. A zero denominator
# produces +/-inf, which fillna() would leave untouched, so infinities are
# mapped to NaN first. Building a new frame (rather than fillna(inplace=True)
# on a column selection) avoids mutating a possible copy.
infinity = float('inf')
df_data = (df_data[features_list]
           .replace([infinity, -infinity], float('nan'))
           .fillna(0))

In [283]:
# Convert the cleaned frame back into a dictionary keyed by row index and
# store it as my_dataset for easy export below.
my_dataset = df_data.to_dict(orient='index')

# data_dict refers to the same cleaned dictionary from here on.
data_dict = my_dataset

In [284]:
# For local testing: format the dataset using the selected features,
# then split the result into labels and features.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

In [293]:
# Reduce the feature matrix to the 5 best-scoring columns, using f_classif as
# the scoring function.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split done in the following cell, so test rows influence which
# features are kept -- consider fitting it on the training portion only.
from sklearn.feature_selection import f_classif, SelectKBest
best_features = SelectKBest(f_classif, k=5).fit_transform(features, labels)

In [294]:
from sklearn.cross_validation import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    best_features, labels, test_size=0.3, random_state=42)

In [295]:
# Fit a Gaussian Naive Bayes classifier on the selected features and print its
# test-set metrics.
# NOTE(review): run_classifier is defined in a later cell of this notebook, so
# on a fresh kernel that cell has to be executed before this one.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
run_classifier(features_train, labels_train, features_test, labels_test, clf)

Accuracy: 0.860465116279
Precision: 0.5
Recall: 0.5
Confusion matrix:
array([[34,  3],
       [ 3,  3]])
Relative feature importances:
-


In [186]:
def run_classifier(features_train, labels_train, features_test, labels_test, clf):
    clf = clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    
    print 'Accuracy:', clf.score(features_test, labels_test)
    print 'Precision:', precision_score(labels_test, pred)
    print 'Recall:', recall_score(labels_test, pred)
    
    print 'Confusion matrix:'
    pprint(confusion_matrix(labels_test, pred))
    
    try:
        print 'Relative feature importances:'
        print clf.feature_importances_
    except:
        print '-'

# Naive Bayes classifier

In [187]:
# Instantiate a Gaussian Naive Bayes classifier with default settings.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

In [188]:
# Fit the estimator currently bound to clf and print its test-set metrics.
run_classifier(features_train, labels_train, features_test, labels_test, clf)

Accuracy: 0.860465116279
Precision: 0.5
Recall: 0.5
Confusion matrix:
array([[34,  3],
       [ 3,  3]])
Relative feature importances:
-


# Support Vector Machine classifier

In [301]:
# GridSearchCV is imported in this cell so it does not depend on a later
# cell having been run first.
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

# Hyper-parameter grid: three kernels, each crossed with five values of C
# and five values of gamma.
params = {'kernel':['linear', 'rbf', 'sigmoid'],
          'C': [0.01, 0.1, 1.0, 10.0, 100.0],
          'gamma': [0.01, 0.1, 1.0, 10.0, 100.0]}

clf = SVC()
clf = GridSearchCV(clf, params)

In [None]:
# Fit the estimator currently bound to clf (the SVC grid search when the cells
# are run in order) on the training split.
clf.fit(features_train, labels_train)

# Random Forest classifier

In [272]:
from sklearn.ensemble import RandomForestClassifier

In [273]:
# GridSearchCV is imported in this cell so it does not depend on a later
# cell having been run first.
from sklearn.grid_search import GridSearchCV

# Grid-search min_samples_split for a seeded random forest on the training split.
clf = RandomForestClassifier(random_state=42)
parameters = {'min_samples_split': [2, 5, 10, 20, 50, 100, 1000]}
clf = GridSearchCV(clf, parameters)
clf = clf.fit(features_train, labels_train)

In [275]:
# Display the best score recorded by the grid search.
clf.best_score_

0.88888888888888884

In [276]:
# Predict labels for the held-out test features.
pred = clf.predict(features_test)

In [278]:
print recall_score(labels_test, pred)
print precision_score(labels_test, pred)

0.0
0.0


In [191]:
# Fit the estimator currently bound to clf and print its test-set metrics.
run_classifier(features_train, labels_train, features_test, labels_test, clf)

Accuracy: 0.837209302326
Precision: 0.0
Recall: 0.0
Confusion matrix:
array([[36,  1],
       [ 6,  0]])
Relative feature importances:
[ 0.19609696  0.04574428  0.          0.01246332  0.0996377   0.00315573
  0.14885773  0.23340473  0.12481834  0.1358212 ]


# AdaBoost classifier

In [193]:
from sklearn.ensemble import AdaBoostClassifier

# Seeded (same value as the random forest in this notebook) so that repeated
# runs produce the same model and metrics.
clf = AdaBoostClassifier(random_state=42)

In [194]:
# Fit the estimator currently bound to clf and print its test-set metrics.
run_classifier(features_train, labels_train, features_test, labels_test, clf)

Accuracy: 0.837209302326
Precision: 0.333333333333
Recall: 0.166666666667
Confusion matrix:
array([[35,  2],
       [ 5,  1]])
Relative feature importances:
[ 0.12  0.22  0.    0.06  0.06  0.14  0.1   0.08  0.04  0.18]


# Decision Tree classifier

In [266]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV

In [267]:
# Grid-search min_samples_split for a default decision tree on the training split.
# NOTE(review): the tree is created without a random_state, so results may
# differ between runs.
parameters = {'min_samples_split': [2, 5, 10, 20, 50, 100, 1000]}
clf = GridSearchCV(DecisionTreeClassifier(), parameters)
clf.fit(features_train, labels_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 5, 10, 20, 50, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [268]:
# Display the parameter combination the grid search selected.
clf.best_params_

{'min_samples_split': 100}

In [269]:
# Display the best score recorded by the grid search.
clf.best_score_

0.87878787878787878

In [None]:
# predict() needs the samples to classify; use the held-out test features,
# as the random forest section of this notebook does.
pred = clf.predict(features_test)

In [None]:
from sklearn.feature_selection import 