# Identifying Fraud from Enron Financial Data and Email Metadata

In [1]:
import sys
import pickle
sys.path.append("tools/")

import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data

import pandas as pd
pd.options.mode.use_inf_as_na = True
import numpy as np
import re
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif

## Introduction

By 2002, Enron, one of America's most successful companies of the 1990s and 2000s, had been suspended from the New York Stock Exchange and was being investigated by the Department of Justice.  Once trading at a peak of \$90.75, Enron's shares plummeted to $0.26 on December 2, 2001, when it declared bankruptcy.  At the time, Enron’s bankruptcy was the largest in U.S. history.  As a result, many executives at Enron were indicted and sentenced to prison time.  Enron used high-risk accounting practices that were meant to make the organization look more successful that it actually was in order to defraud shareholders.   Enron used mark-to-market accounting which used projected future earnings to measure the value of an asset, and if its value did not meet predictions they would hide the financial losses by assigning them to a shell company.

The goal of this project is to determine from the financial and email metadata features in the data set which of the employees are persons of interest in the fraud case against Enron.  Persons of interest are defined as individuals who were indicted in the fraud, reached a settlement or plea deal with the government, or testified in exchange for immunity.

## Loading the Data

In [2]:
# Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

## Data Exploration

In [3]:
# Load the data into a dataframe for exploration
df = pd.DataFrame.from_dict(data_dict).T
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
ALLEN PHILLIP K,4175000.0,2869717.0,-3081055.0,,phillip.allen@enron.com,1729541.0,13868,2195.0,47.0,65.0,...,304805.0,152.0,False,126027.0,-126027.0,201955.0,1407.0,2902.0,4484442,1729541
BADUM JAMES P,,178980.0,,,,257817.0,3486,,,,...,,,False,,,,,,182466,257817
BANNANTINE JAMES M,,,-5104.0,,james.bannantine@enron.com,4046157.0,56301,29.0,39.0,0.0,...,,864523.0,False,1757552.0,-560222.0,477.0,465.0,566.0,916197,5243487
BAXTER JOHN C,1200000.0,1295738.0,-1386055.0,,,6680544.0,11200,,,,...,1586055.0,2660303.0,False,3942714.0,,267102.0,,,5634343,10623258
BAY FRANKLIN R,400000.0,260455.0,-201641.0,,frank.bay@enron.com,,129142,,,,...,,69.0,False,145796.0,-82782.0,239671.0,,,827696,63014


In [4]:
# Number of records
len(df)

146

In [5]:
# Replace 'NaN' in email_address field with empty string
df['email_address'].replace(to_replace='NaN', value = '', inplace=True)

# Replace all other 'NaN' with NaN 
df.replace(to_replace='NaN', value = np.nan, inplace=True)
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
ALLEN PHILLIP K,4175000.0,2869717.0,-3081055.0,,phillip.allen@enron.com,1729541.0,13868.0,2195.0,47.0,65.0,...,304805.0,152.0,False,126027.0,-126027.0,201955.0,1407.0,2902.0,4484442.0,1729541.0
BADUM JAMES P,,178980.0,,,,257817.0,3486.0,,,,...,,,False,,,,,,182466.0,257817.0
BANNANTINE JAMES M,,,-5104.0,,james.bannantine@enron.com,4046157.0,56301.0,29.0,39.0,0.0,...,,864523.0,False,1757552.0,-560222.0,477.0,465.0,566.0,916197.0,5243487.0
BAXTER JOHN C,1200000.0,1295738.0,-1386055.0,,,6680544.0,11200.0,,,,...,1586055.0,2660303.0,False,3942714.0,,267102.0,,,5634343.0,10623258.0
BAY FRANKLIN R,400000.0,260455.0,-201641.0,,frank.bay@enron.com,,129142.0,,,,...,,69.0,False,145796.0,-82782.0,239671.0,,,827696.0,63014.0


In [6]:
df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
df.dtypes

bonus                        float64
deferral_payments            float64
deferred_income              float64
director_fees                float64
email_address                 object
exercised_stock_options      float64
expenses                     float64
from_messages                float64
from_poi_to_this_person      float64
from_this_person_to_poi      float64
loan_advances                float64
long_term_incentive          float64
other                        float64
poi                             bool
restricted_stock             float64
restricted_stock_deferred    float64
salary                       float64
shared_receipt_with_poi      float64
to_messages                  float64
total_payments               float64
total_stock_value            float64
dtype: object

In [7]:
# Number of features
print(len(list(df.columns)))
target = 'poi'

21


There are a total of 21 fields, 14 financial features, 6 features concerning email metadata, and the target field poi.

**financial features**
* bonus
* deferral_payments
* deferred_income
* director_fees
* exercised_stock_options
* expenses
* loan_advances
* long_term_incentive
* other
* restricted_stock
* restricted_stock_deferred
* salary
* total_payments
* total_stock_value

**email features**
* email_address
* from_messages
* from_poi_to_this_person
* from_this_person_to_poi
* shared_receipt_with_poi
* to_messages

### Removing records

In [8]:
# Appropriate formats for names
name_fmt_1 = '\w+ \w+ \w'
name_fmt_2 = '\w+ \w+'

# Print names that do not follow either of these formats
[p for p in df.index if not (re.fullmatch(name_fmt_1, p) or re.fullmatch(name_fmt_2, p))]

['BLAKE JR. NORMAN P',
 'BOWEN JR RAYMOND M',
 'DERRICK JR. JAMES V',
 'DONAHUE JR JEFFREY M',
 'GARLAND C KEVIN',
 'GLISAN JR BEN F',
 'OVERDYKE JR JERE C',
 'PEREIRA PAULO V. FERRAZ',
 'SULLIVAN-SHAKLOVITZ COLLEEN',
 'THE TRAVEL AGENCY IN THE PARK',
 'TOTAL',
 'WALLS JR ROBERT H',
 'WHITE JR THOMAS E',
 'WINOKUR JR. HERBERT S',
 'YEAGER F SCOTT']

In [9]:
# Remove periods from names to make them consistent
df.index = df.index.map(lambda x: x.replace('.', ''))

[p for p in df.index if not (re.fullmatch(name_fmt_1, p) or re.fullmatch(name_fmt_2, p))]

['BLAKE JR NORMAN P',
 'BOWEN JR RAYMOND M',
 'DERRICK JR JAMES V',
 'DONAHUE JR JEFFREY M',
 'GARLAND C KEVIN',
 'GLISAN JR BEN F',
 'OVERDYKE JR JERE C',
 'PEREIRA PAULO V FERRAZ',
 'SULLIVAN-SHAKLOVITZ COLLEEN',
 'THE TRAVEL AGENCY IN THE PARK',
 'TOTAL',
 'WALLS JR ROBERT H',
 'WHITE JR THOMAS E',
 'WINOKUR JR HERBERT S',
 'YEAGER F SCOTT']

Apart from the names with 'JR', additional middle names, a first initial, or a hyphenated last name, which are also acceptable, there are the records 'TOTAL' and 'THE TRAVEL AGENCY IN THE PARK'.  Obviously these records were included in the data set in error since they do not represent employees.  Below, inspection of the data confirms that 'TOTAL' is an aggregate row of all employees and was erroneously included in the data.  These records will be removed from the data.

In [10]:
df['salary'].idxmax()

'TOTAL'

In [11]:
df['salary'].max() == df['salary'].sum() - df['salary'].max()

True

In [12]:
# Drop non-employee records
df.drop(['THE TRAVEL AGENCY IN THE PARK', 'TOTAL'], inplace = True)

I also chose to remove 'email_address' from the features since it cannot and will not be used to train the model.

In [13]:
# Remove email since it is basically equivalent to an ID and cannot be used in the model
df.drop('email_address', axis = 1, inplace = True)

### Missing Values

In [14]:
df.isna().sum()

bonus                         63
deferral_payments            106
deferred_income               96
director_fees                128
exercised_stock_options       43
expenses                      50
from_messages                 58
from_poi_to_this_person       58
from_this_person_to_poi       58
loan_advances                141
long_term_incentive           79
other                         53
poi                            0
restricted_stock              35
restricted_stock_deferred    127
salary                        50
shared_receipt_with_poi       58
to_messages                   58
total_payments                21
total_stock_value             19
dtype: int64

This data set has a lot of missing values, especially for many of the financial features.  Since there are so few records, it would not be appropriate to remove too many rows or columns from this data set.

In [15]:
df.isna().sum(axis = 1).sort_values(ascending = False)[:10]

LOCKHART EUGENE E         19
WROBEL BRUCE              17
WODRASKA JOHN             17
GRAMM WENDY L             17
WHALEY DAVID A            17
SCRIMSHAW MATTHEW         17
SAVAGE FRANK              16
GILLIS JOHN               16
WAKEHAM JOHN              16
CHRISTODOULOU DIOMEDES    16
dtype: int64

'LOCKHART EUGENE E' has all fields missing except for poi, so he has been removed from the data set.

In [16]:
df.drop('LOCKHART EUGENE E', inplace = True)

In [17]:
# Fill in missing values with 0
df = df.fillna(0)

Many of the financial features have a lot of missing values.  For instance, less than 15% of records have non-null values for the features ‘director_fees’, ‘loan_advances’, and ‘restricted_stock_deferred’.  Many of the nulls in the financial features are due to errors in data wrangling.  In the data source, many zero values are represented as ‘-‘, and were incorrectly interpreted as ‘NaN’.  Additionally, many of the features are aggregations of others, so using imputation to replace missing values would affect multiple features.  Hence, I chose to replace all missing values with zeros, except email addresses which were replaced with an empty string.

In [18]:
# Number of pois in the data set
df['poi'].sum()

18

There are only 18 positive instances out of 146 total records in the data set which means that this model will have to deal with class imbalance. 

## Feature Engineering

In [19]:
# These feature describes the proportion of communication that involves persons of interest
df['prop_poi_communication'] = (df['from_poi_to_this_person'] + df['from_this_person_to_poi']) / \
                               (df['to_messages'] + df['from_messages'])
df['shared_receipt_with_poi_prop'] = df['shared_receipt_with_poi'] / (df['to_messages'] + df['from_messages'])

# These features describe the proportion of total_payments that each individual feature is
df['bonus_prop'] = df['bonus'] / df['total_payments']
df['other_prop'] = df['other'] / df['total_payments']
df['expenses_prop'] = df['expenses'] / df['total_payments']

# These features describe the proportion of total_stock_value that each individual feature is
df['exercised_stock_options_prop'] = df['exercised_stock_options'] / df['total_stock_value']
df['restricted_stock_prop'] = df['restricted_stock'] / df['total_stock_value']

# Get new features list
all_features = [feat for feat in list(df.columns) if feat != 'poi']
all_features

['bonus',
 'deferral_payments',
 'deferred_income',
 'director_fees',
 'exercised_stock_options',
 'expenses',
 'from_messages',
 'from_poi_to_this_person',
 'from_this_person_to_poi',
 'loan_advances',
 'long_term_incentive',
 'other',
 'restricted_stock',
 'restricted_stock_deferred',
 'salary',
 'shared_receipt_with_poi',
 'to_messages',
 'total_payments',
 'total_stock_value',
 'prop_poi_communication',
 'shared_receipt_with_poi_prop',
 'bonus_prop',
 'other_prop',
 'expenses_prop',
 'exercised_stock_options_prop',
 'restricted_stock_prop']

In [20]:
# After possible division by zero fill inf and -inf with 0
df = df.fillna(0)

## Feature Selection

In [21]:
feature_selector = SelectKBest(f_classif, k='all').fit(df[all_features], df[target])
feat_scores = list(zip(feature_selector.scores_, df[all_features].columns[feature_selector.get_support()]))
feat_scores.sort(key=lambda tup: tup[0], reverse = True)
feat_scores

[(24.815079733218194, 'exercised_stock_options'),
 (24.182898678566879, 'total_stock_value'),
 (20.792252047181535, 'bonus'),
 (20.715596247559954, 'bonus_prop'),
 (18.289684043404513, 'salary'),
 (11.458476579280369, 'deferred_income'),
 (9.9221860131898225, 'long_term_incentive'),
 (9.736968287824979, 'shared_receipt_with_poi_prop'),
 (9.2128106219771002, 'restricted_stock'),
 (8.7727777300916756, 'total_payments'),
 (8.589420731682381, 'shared_receipt_with_poi'),
 (7.1840556582887247, 'loan_advances'),
 (6.0941733106389453, 'expenses'),
 (5.399370288094401, 'prop_poi_communication'),
 (5.2434497133749582, 'from_poi_to_this_person'),
 (4.1874775069953749, 'other'),
 (2.3826121082276739, 'from_this_person_to_poi'),
 (2.1263278020077054, 'director_fees'),
 (1.6463411294420076, 'to_messages'),
 (1.0901571696328429, 'restricted_stock_prop'),
 (1.0684733217820632, 'other_prop'),
 (0.22461127473600989, 'deferral_payments'),
 (0.16970094762175533, 'from_messages'),
 (0.065499652909942141, '

In [22]:
# List of the features in order of their importance
feats_sorted = [tup[1] for tup in feat_scores]

## Building the Model

In [23]:
# Generic classification function that returns training and validation metrics
def classify(model, data, features, target):

    skf = StratifiedShuffleSplit(n_splits=10, random_state = 42)
    
    tr_metrics = {}
    val_metrics = {}
    
    metric_names = ['acc', 'prec', 'recall', 'f1']

    for name in metric_names:
            tr_metrics[name] = []
            val_metrics[name] = []
            
    for train, test in skf.split(data[features], data[target]):
        X_tr = (data[features].iloc[train,:])
        y_tr = data[target].iloc[train]
        X_val = data[features].iloc[test,:]
        y_val = data[target].iloc[test]

        # Train the model and record metrics
        model.fit(X_tr, y_tr)
        pred_tr = model.predict(X_tr)
            
        tr_metrics['acc'].append(accuracy_score(pred_tr, y_tr))
        tr_metrics['prec'].append(precision_score(pred_tr, y_tr))
        tr_metrics['recall'].append(recall_score(pred_tr, y_tr))
        tr_metrics['f1'].append(f1_score(pred_tr, y_tr))

        # Make predictions on the validation set and record metrics
        pred = model.predict(X_val)
        
        val_metrics['acc'].append(accuracy_score(pred, y_val))
        val_metrics['prec'].append(precision_score(pred, y_val))
        val_metrics['recall'].append(recall_score(pred, y_val))
        val_metrics['f1'].append(f1_score(pred, y_val))
    
    tr_mean_metrics = []
    val_mean_metrics = []

    # Only keep averages
    for name in metric_names:
        tr_mean_metrics.append(np.mean(tr_metrics[name]))
        val_mean_metrics.append(np.mean(val_metrics[name]))
        
    return tr_mean_metrics, val_mean_metrics

In [24]:
# Run the classifier for 1:num_feat number of features
def model_by_num_feat(model, data, features, target, num_feat):
    tr_metrics_feat = {}
    val_metrics_feat = {}
    for i in range(1, num_feat + 1):
        tr_metrics, val_metrics = classify(model, data, features[:i], target)
        tr_metrics_feat[i] = tr_metrics
        val_metrics_feat[i] = val_metrics

    tr_metrics_df = pd.DataFrame(tr_metrics_feat).T
    tr_metrics_df.columns = ['tr_accuracy', 'tr_precision', 'tr_recall', 'tr_f1']

    val_metrics_df = pd.DataFrame(val_metrics_feat).T
    val_metrics_df.columns = ['cv_accuracy', 'cv_precision', 'cv_recall', 'cv_f1']
    
    metrics_df = pd.concat([tr_metrics_df, val_metrics_df], axis = 1)
    return metrics_df[['tr_accuracy', 'cv_accuracy', 'tr_precision', 'cv_precision',
                       'tr_recall', 'cv_recall', 'tr_f1', 'cv_f1']]

### Naive Bayes

In [25]:
clf_nb = GaussianNB()
metrics_nb_df = model_by_num_feat(clf_nb, df, feats_sorted, target, num_feat = 10)
metrics_nb_df

Unnamed: 0,tr_accuracy,cv_accuracy,tr_precision,cv_precision,tr_recall,cv_recall,tr_f1,cv_f1
1,0.877344,0.86,0.26875,0.35,0.518056,0.45,0.352939,0.366667
2,0.863281,0.846667,0.26875,0.35,0.425581,0.383333,0.328795,0.34
3,0.864844,0.866667,0.31875,0.45,0.442646,0.4,0.36994,0.403333
4,0.864844,0.866667,0.31875,0.45,0.442646,0.4,0.36994,0.403333
5,0.864844,0.866667,0.31875,0.45,0.445201,0.4,0.370643,0.406667
6,0.872656,0.873333,0.3875,0.45,0.48663,0.416667,0.431166,0.416667
7,0.86875,0.866667,0.39375,0.45,0.470128,0.4,0.428086,0.403333
8,0.86875,0.866667,0.39375,0.45,0.470128,0.4,0.428086,0.403333
9,0.870313,0.873333,0.41875,0.45,0.479487,0.416667,0.446233,0.413333
10,0.860156,0.84,0.38125,0.25,0.435447,0.25,0.405139,0.233333


Using the top 9 features yields a model with the highest recall and precision, with high accuracy that does not overfit.

### Random Forest

#### Feature tuning: passing in all features

In [26]:
# Passing in all features and parameter tuning
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 200, num = 11)]
criterion = ['gini', 'entropy']
max_features = [2, 4, 6, 10, 20, 'auto', 'log2', None]
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
              'criterion': criterion,
              'max_features': max_features,
              'bootstrap': bootstrap,
              'n_jobs': [-1]}

In [27]:
grid_search_all = GridSearchCV(RandomForestClassifier(), param_grid, scoring='recall_macro')
grid_search_clf_all = grid_search_all.fit(df[feats_sorted], df[target])
print("Best estimator found by grid search:")
print(grid_search_clf_all.best_estimator_)

Best estimator found by grid search:
RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=None, max_features=20,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=150, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)


In [28]:
clf_rf_all = RandomForestClassifier(**grid_search_clf_all.best_params_)
metrics_rf_df_all = classify(clf_rf_all, df, feats_sorted, target)
metrics_rf_df_all

([1.0, 1.0, 1.0, 1.0],
 [0.84000000000000008, 0.25, 0.19999999999999998, 0.22000000000000003])

#### Feature tuning: passing in top three features from SelectKBest

In [29]:
# Only passing in top 3 features from SelectKBest to parameter tuning
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 200, num = 11)]
criterion = ['gini', 'entropy']
max_features = ['auto', 'log2', None]
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
              'criterion': criterion,
              'max_features': max_features,
              'bootstrap': bootstrap,
              'n_jobs': [-1]}

In [30]:
grid_search_3 = GridSearchCV(RandomForestClassifier(), param_grid, scoring='recall_macro')
grid_search_clf_3 = grid_search_3.fit(df[feats_sorted[:3]], df[target])
print("Best estimator found by grid search:")
print(grid_search_clf_3.best_estimator_)

Best estimator found by grid search:
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [31]:
clf_rf_3 = RandomForestClassifier(**grid_search_clf_3.best_params_)
metrics_rf_df_3 = classify(clf_rf_3, df, feats_sorted[:3], target)
metrics_rf_df_3

([1.0, 1.0, 1.0, 1.0],
 [0.88000000000000012,
  0.55000000000000004,
  0.57499999999999996,
  0.53333333333333333])

Note: Although this is overfit, I played with the parameters and could not get better cross validation scores than these.

#### Feature tuning: passing in top four features from SelectKBest

In [32]:
# Only passing in top 4 features from SelectKBest to parameter tuning
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 200, num = 11)]
criterion = ['gini', 'entropy']
max_features = ['auto', 'log2', None]
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
              'criterion': criterion,
              'max_features': max_features,
              'bootstrap': bootstrap,
              'n_jobs': [-1]}

In [33]:
grid_search_4 = GridSearchCV(RandomForestClassifier(), param_grid, scoring='recall_macro')
grid_search_clf_4 = grid_search_4.fit(df[feats_sorted[:4]], df[target])
print("Best estimator found by grid search:")
print(grid_search_clf_4.best_estimator_)

Best estimator found by grid search:
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [34]:
clf_rf_4 = RandomForestClassifier(**grid_search_clf_4.best_params_)
metrics_rf_df_4 = classify(clf_rf_4, df, feats_sorted[:4], target)
metrics_rf_df_4

([1.0, 1.0, 1.0, 1.0],
 [0.87999999999999989,
  0.40000000000000002,
  0.55000000000000004,
  0.4366666666666667])

#### Feature tuning: passing in top five features from SelectKBest

In [35]:
# Only passing in top 4 features from SelectKBest to parameter tuning
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 200, num = 11)]
criterion = ['gini', 'entropy']
max_features = ['auto', 'log2', None]
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
              'criterion': criterion,
              'max_features': max_features,
              'bootstrap': bootstrap,
              'n_jobs': [-1]}

In [36]:
grid_search_5 = GridSearchCV(RandomForestClassifier(), param_grid, scoring='recall_macro')
grid_search_clf_5 = grid_search_5.fit(df[feats_sorted[:5]], df[target])
print("Best estimator found by grid search:")
print(grid_search_clf_5.best_estimator_)

Best estimator found by grid search:
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=110, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [37]:
clf_rf_5 = RandomForestClassifier(**grid_search_clf_5.best_params_)
metrics_rf_df_5 = classify(clf_rf_5, df, feats_sorted[:5], target)
metrics_rf_df_5

([1.0, 1.0, 1.0, 1.0],
 [0.87999999999999989, 0.40000000000000002, 0.5, 0.42000000000000004])

## Final Evaluation

In [38]:
# Make data into a dictionary again to pass to tester
df_dict = df.to_dict('index')

### Naive Bayes

In [39]:
# Best Naive Bayes model
chosen_feats = ['poi']
chosen_feats.extend(feats_sorted[:9]) # add poi as the first feature
test_classifier(clf_nb, df_dict, chosen_feats, folds = 1000)

GaussianNB(priors=None)
	Accuracy: 0.85393 	Precision: 0.48617 	Recall: 0.39550 	F1: 0.43617 	F2: 0.41082
	Total predictions: 14000                         	True positives:  791	False positives:  836                        	False negatives: 1209	True negatives: 11164



### Random Forest

In [40]:
# 3 features
chosen_feats = ['poi']
chosen_feats.extend(feats_sorted[:3])
test_classifier(clf_rf_3, df_dict, chosen_feats, folds = 1000)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
	Accuracy: 0.84869 	Precision: 0.50923 	Recall: 0.45500 	F1: 0.48059 	F2: 0.46490
	Total predictions: 13000                         	True positives:  910	False positives:  877                        	False negatives: 1090	True negatives: 10123



In [41]:
# 4 features
chosen_feats = ['poi']
chosen_feats.extend(feats_sorted[:4])
test_classifier(clf_rf_4, df_dict, chosen_feats, folds = 1000)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
	Accuracy: 0.84892 	Precision: 0.51200 	Recall: 0.38400 	F1: 0.43886 	F2: 0.40421
	Total predictions: 13000                         	True positives:  768	False positives:  732                        	False negatives: 1232	True negatives: 10268



In [42]:
# 5 features
chosen_feats = ['poi']
chosen_feats.extend(feats_sorted[:5])
test_classifier(clf_rf_5, df_dict, chosen_feats, folds = 1000)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=110, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
	Accuracy: 0.84554 	Precision: 0.49715 	Recall: 0.34850 	F1: 0.40976 	F2: 0.37067
	Total predictions: 13000                         	True positives:  697	False positives:  705                        	False negatives: 1303	True negatives: 10295



In [43]:
# All features
chosen_feats = ['poi']
chosen_feats.extend(feats_sorted)
test_classifier(clf_rf_all, df_dict, chosen_feats, folds = 1000)

RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=None, max_features=20,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=150, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)
	Accuracy: 0.81333 	Precision: 0.23719 	Recall: 0.18050 	F1: 0.20500 	F2: 0.18956
	Total predictions: 15000                         	True positives:  361	False positives: 1161                        	False negatives: 1639	True negatives: 11839



### Best Model

In [44]:
clf_best = clf_rf_3

In [45]:
# Feature importances
list(zip(chosen_feats[1:], clf_best.feature_importances_))

[('exercised_stock_options', 0.31165478901033927),
 ('total_stock_value', 0.33013998510720011),
 ('bonus', 0.3582052258824609)]

## Dump Classifier

In [46]:
# Dump final chosen classifier
dump_classifier_and_data(clf_best, data_dict, chosen_feats)