In [1]:
import sys
import pickle
sys.path.append("tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

import pandas as pd
import re
import operator
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Introduction

## Loading the Data

In [2]:
# Load the dictionary containing the dataset.
# The pickle maps each record name to a dict of that record's fields.
# NOTE(review): pickle.load executes arbitrary code from the file, so only
# load a dataset file obtained from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

## Data Exploration

In [3]:
# Key of one sample record, stored so later cells can inspect its fields
first_employee = 'ALLEN PHILLIP K'

In [4]:
# Number of fields stored for the sample employee
len(data_dict[first_employee])

21

In [5]:
# Field names stored for the sample employee (iterating a dict yields its keys)
list(data_dict[first_employee])

['salary',
 'to_messages',
 'deferral_payments',
 'total_payments',
 'loan_advances',
 'bonus',
 'email_address',
 'restricted_stock_deferred',
 'deferred_income',
 'total_stock_value',
 'expenses',
 'from_poi_to_this_person',
 'exercised_stock_options',
 'from_messages',
 'other',
 'from_this_person_to_poi',
 'poi',
 'long_term_incentive',
 'shared_receipt_with_poi',
 'restricted_stock',
 'director_fees']

There are a total of 21 fields: 14 financial features, 6 features concerning email metadata, and the target field `poi`.

**financial features**
* bonus
* deferral_payments
* deferred_income
* director_fees
* exercised_stock_options
* expenses
* loan_advances
* long_term_incentive
* other
* restricted_stock
* restricted_stock_deferred
* salary
* total_payments
* total_stock_value

**email features**
* email_address
* from_messages
* from_poi_to_this_person
* from_this_person_to_poi
* shared_receipt_with_poi
* to_messages

In [6]:
# Label column: whether the person is a person of interest
target = 'poi'

# Fields describing email metadata
email_features = [
    'email_address',
    'from_messages',
    'from_poi_to_this_person',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
    'to_messages',
]

# Fields describing payments and stock
financial_features = [
    'bonus', 'deferral_payments', 'deferred_income', 'director_fees',
    'exercised_stock_options', 'expenses', 'loan_advances',
    'long_term_incentive', 'other', 'restricted_stock',
    'restricted_stock_deferred', 'salary', 'total_payments',
    'total_stock_value',
]

# Every field, with the target first
all_features = [target, *email_features, *financial_features]

In [7]:
# Number of records (one per key in data_dict)
len(data_dict)

146

In [8]:
# Keys under which the records are stored; preview the first ten
people = [*data_dict]
people[:10]

['METTS MARK',
 'BAXTER JOHN C',
 'ELLIOTT STEVEN',
 'CORDES WILLIAM R',
 'HANNON KEVIN P',
 'MORDAUNT KRISTINA M',
 'MEYER ROCKFORD G',
 'MCMAHON JEFFREY',
 'HAEDICKE MARK E',
 'PIPER GREGORY F']

In [9]:
# Appropriate formats for names: two words, optionally followed by a
# single-character third token.
# Raw strings keep the regex escape \w intact; in a normal string literal
# "\w" is an invalid escape sequence, which Python warns about.
name_fmt_1 = r'\w+ \w+ \w'
name_fmt_2 = r'\w+ \w+'

# Show the names that do not follow either of these formats
[p for p in people if not (re.fullmatch(name_fmt_1, p) or re.fullmatch(name_fmt_2, p))]

['WALLS JR ROBERT H',
 'SULLIVAN-SHAKLOVITZ COLLEEN',
 'BOWEN JR RAYMOND M',
 'OVERDYKE JR JERE C',
 'PEREIRA PAULO V. FERRAZ',
 'BLAKE JR. NORMAN P',
 'THE TRAVEL AGENCY IN THE PARK',
 'TOTAL',
 'WHITE JR THOMAS E',
 'WINOKUR JR. HERBERT S',
 'GARLAND C KEVIN',
 'YEAGER F SCOTT',
 'DERRICK JR. JAMES V',
 'DONAHUE JR JEFFREY M',
 'GLISAN JR BEN F']

In [10]:
# Remove periods from names to make them consistent.
# Loop over a snapshot of the keys: popping and re-inserting entries while
# iterating the live dict view changes the keys mid-iteration, which can raise
# RuntimeError or visit entries unpredictably.
for key in list(data_dict):
    data_dict[key.replace('.', '')] = data_dict.pop(key)

# Names that still follow neither accepted format
[p for p in data_dict.keys() if not (re.fullmatch(name_fmt_1, p) or re.fullmatch(name_fmt_2, p))]

['DERRICK JR JAMES V',
 'DONAHUE JR JEFFREY M',
 'GLISAN JR BEN F',
 'WALLS JR ROBERT H',
 'SULLIVAN-SHAKLOVITZ COLLEEN',
 'BOWEN JR RAYMOND M',
 'OVERDYKE JR JERE C',
 'PEREIRA PAULO V FERRAZ',
 'BLAKE JR NORMAN P',
 'THE TRAVEL AGENCY IN THE PARK',
 'TOTAL',
 'WHITE JR THOMAS E',
 'WINOKUR JR HERBERT S',
 'GARLAND C KEVIN',
 'YEAGER F SCOTT']

Apart from the names with 'JR', additional middle names, a first initial, or a hyphenated last name, which are also acceptable, there are the records 'TOTAL' and 'THE TRAVEL AGENCY IN THE PARK'. These two records do not represent employees, so they were evidently included in the data set in error. Inspection of the data confirms that 'TOTAL' is an aggregate row of all employees.

In [52]:
# Count the records flagged as persons of interest
pois = [name for name, record in data_dict.items() if record['poi']]
len(pois)

18

There are only 18 positive instances in the data set, which means that this model will have to deal with class imbalance.

### Missing Values

In [11]:
# For every feature, count the records whose value is the 'NaN' placeholder
num_missing = {}
for feat in all_features:
    num_missing[feat] = sum(1 for record in data_dict.values() if record[feat] == 'NaN')

In [12]:
# Display the per-feature count of 'NaN' placeholders
num_missing

{'bonus': 64,
 'deferral_payments': 107,
 'deferred_income': 97,
 'director_fees': 129,
 'email_address': 35,
 'exercised_stock_options': 44,
 'expenses': 51,
 'from_messages': 60,
 'from_poi_to_this_person': 60,
 'from_this_person_to_poi': 60,
 'loan_advances': 142,
 'long_term_incentive': 80,
 'other': 53,
 'poi': 0,
 'restricted_stock': 36,
 'restricted_stock_deferred': 128,
 'salary': 51,
 'shared_receipt_with_poi': 60,
 'to_messages': 60,
 'total_payments': 21,
 'total_stock_value': 20}

In [16]:
# For every record, count how many of its fields hold the 'NaN' placeholder
num_missing_by_person = {}
for key in data_dict:
    num_missing_by_person[key] = sum(1 for value in data_dict[key].values() if value == 'NaN')

# Rank the records from most to fewest missing fields and preview the top ten
num_missing_by_person_sorted = sorted(
    num_missing_by_person.items(),
    key=lambda item: item[1],
    reverse=True,
)
num_missing_by_person_sorted[:10]

[('WHALEY DAVID A', 18),
 ('GRAMM WENDY L', 18),
 ('WROBEL BRUCE', 18),
 ('THE TRAVEL AGENCY IN THE PARK', 18),
 ('WODRASKA JOHN', 17),
 ('CLINE KENNETH W', 17),
 ('WAKEHAM JOHN', 17),
 ('SCRIMSHAW MATTHEW', 17),
 ('SAVAGE FRANK', 17),
 ('GILLIS JOHN', 17)]

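'LOCKHART EUGENE E' has all fields missing except for poi, so he has been removed from the data set. (He does not appear in the listing above because that cell was last executed after the removal below.)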

In [14]:
# Drop the record that carries no information besides the poi flag.
# The membership guard keeps the cell idempotent: re-running it after the
# record is already gone would otherwise raise KeyError.
if 'LOCKHART EUGENE E' in data_dict:
    del data_dict['LOCKHART EUGENE E']

In [None]:
# Feature selection: the target label first, then the chosen features
features_list = [
    'poi',
    'salary',
]

In [None]:
# Build a dataframe with one row per record and one column per field
df = pd.DataFrame.from_dict(data_dict).transpose()
print(len(df.columns))
df.columns.tolist()

In [None]:
# Cast every financial column to float in a single astype call
df = df.astype({column: 'float' for column in financial_features})

In [None]:
# Row labels of the dataframe; preview the first ten
people = df.index.tolist()
people[:10]

In [None]:
# Strip periods from the row labels so the names are written consistently
df.index = [name.replace('.', '') for name in df.index]

In [None]:
# Since there are a lot of nulls in the data set, check if there are employees with no data
def all_null(row, ignore=('poi',)):
    """Return True when every field of one record is missing.

    Parameters
    ----------
    row : pandas.Series
        A single row of the dataframe, as passed by ``df.apply(..., axis=1)``.
    ignore : iterable of str, optional
        Field names left out of the check. Defaults to the ``poi`` label,
        which is never missing and would otherwise make the check always fail.

    Returns
    -------
    bool
        True if each remaining field is either a real NaN or the data set's
        'NaN' string placeholder.
    """
    fields = row[[name for name in row.index if name not in ignore]]
    # Columns cast to float hold real NaN; the others still hold the 'NaN' string.
    return bool((fields.isna() | (fields == 'NaN')).all())

# axis=1 hands each row (one employee) to the function instead of each column
no_data = df.apply(all_null, axis=1)
list(no_data[no_data].index)