# Import Required Libraries

In [35]:
# import required libraries
import sys
import pickle
import pandas as pd
from time import time
from tester import dump_classifier_and_data
from feature_format import featureFormat, targetFeatureSplit

In [36]:
# import modules for feature scaling and selection
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Data Structure

In [37]:
features_list = ['poi', 'salary', 'bonus', 'long_term_incentive', 'bonus-to-salary_ratio', 'deferral_payments', 'expenses',
                 'restricted_stock_deferred', 'restricted_stock', 'deferred_income','ratio_from', 'total_payments',
                 'other', 'ratio_to', 'from_poi_to_this_person', 'from_this_person_to_poi', 'to_messages',
                 'from_messages', 'shared_receipt_with_poi', 'loan_advances', 'director_fees', 'exercised_stock_options',
                 'total_stock_value', 'restricted_stock']

As part of the preprocessing the financial data from Enron has been combined into a dictionary, where each key-value pair corresponds to one person, the dictionary key is the person's name, and the value is another dictionary, which contains the names of all the features and their values for that person.

In [38]:
# load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as pickle_file:
    data_dict = pickle.load(pickle_file)

# convert the given pickled data to a pandas dataframe
dataframe = pd.DataFrame.from_records(list(data_dict.values()))

# set the index of the dataframe to be the person name
names = pd.Series(list(data_dict.keys()))
dataframe.set_index(names, inplace = True)

In [39]:
dataframe.head(5)

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
METTS MARK,600000.0,,,,mark.metts@enron.com,,94299.0,29.0,38.0,1.0,...,,1740.0,False,585062,,365788.0,702.0,807.0,1061827.0,585062
BAXTER JOHN C,1200000.0,1295738.0,-1386055.0,,,6680544.0,11200.0,,,,...,1586055.0,2660303.0,False,3942714,,267102.0,,,5634343.0,10623258
ELLIOTT STEVEN,350000.0,,-400729.0,,steven.elliott@enron.com,4890344.0,78552.0,,,,...,,12961.0,False,1788391,,170941.0,,,211725.0,6678735
CORDES WILLIAM R,,,,,bill.cordes@enron.com,651850.0,,12.0,10.0,0.0,...,,,False,386335,,,58.0,764.0,,1038185
HANNON KEVIN P,1500000.0,,-3117011.0,,kevin.hannon@enron.com,5538001.0,34039.0,32.0,32.0,21.0,...,1617011.0,11350.0,True,853064,,243293.0,1035.0,1045.0,288682.0,6391065


In [40]:
# get total number of employees in the dataset
len(dataframe)

146

In [41]:
# get the number of POI in the dataset
dataframe.poi.sum()

18

In [42]:
# convert inputs to numerics and return NaN when the input cannot be converted to a number then change NaN to zero
final_df = dataframe.apply(lambda x : pd.to_numeric(x, errors = 'coerce')).copy().fillna(0)

In [43]:
# dropp email_address column as not required in analysis
final_df.drop('email_address', axis = 1, inplace = True)

In [44]:
# find the outlier with the highest salary
final_df['salary'].sort_values(ascending = False)[0:5]

TOTAL                 26704229.0
SKILLING JEFFREY K     1111258.0
LAY KENNETH L          1072321.0
FREVERT MARK A         1060932.0
PICKERING MARK R        655037.0
Name: salary, dtype: float64

In [45]:
# remove outliers
final_df.drop(['TOTAL', 'SKILLING JEFFREY K', 'LAY KENNETH L', 'FREVERT MARK A', 'PICKERING MARK R'], axis = 0, inplace = True)

# Feature Engineering

In [46]:
# craete new features for a richer view of the whole picture
final_df['bonus-to-salary_ratio'] = final_df['bonus']/final_df['salary']
final_df['ratio_from'] = final_df['from_poi_to_this_person']/final_df['from_messages']
final_df['ratio_to'] = final_df['from_this_person_to_poi']/final_df['to_messages']

These features will reveal the persons with higher communications with POI

In [47]:
# clean 'inf' values which we got if the person's from_messages = 0
final_df = final_df.replace('inf', 0)
final_df = final_df.fillna(0)

In [48]:
# convert the above modified dataframe to a dictionary
final_dict = final_df.to_dict('index')

In [49]:
len(final_df)

141

In [51]:
# convert the dictionary to a numpy array
data = featureFormat(final_dict, features_list, sort_keys = True)
# separate the labels from the features
labels, features = targetFeatureSplit(data)

The line above (targetFeatureSplit) assumes that the label is the first item in feature_list so it's very important for POI to be listed first!

In [52]:
# split data into training and testing datasets
features_train, features_test, labels_train, labels_test = model_selection.train_test_split(features, labels, test_size = 0.3,  random_state = 42)

The line above returns four sets of features and labels for both training and testing

In [53]:
# cross validator to return stratified randomized folds
cv_splitter = StratifiedShuffleSplit(n_splits = 100, test_size = 0.3,random_state = 42)

In [54]:
# define the function to be used via the pipeline
skb = SelectKBest(f_classif)

In [55]:
# tuning process with pipleine and GridSearchCV
pipeline = Pipeline(steps = [("SKB", skb), ("NaiveBayes", GaussianNB())])
param_grid = {"SKB__k":[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}

In [56]:
gnb_clf = GridSearchCV(pipeline, param_grid, verbose = 0, cv = cv_splitter, scoring = 'f1')

In [57]:
# show training time of the algorithm
t0 = time()
gnb_clf.fit(features, labels)
print "training time: ", round(time() - t0, 3), "seconds"

  f = msb / msw
  'precision', 'predicted', average, warn_for)


training time:  11.527 seconds


In [58]:
# best algorithm
clf = gnb_clf.best_estimator_

In [59]:
# refit the best algorithm
t0 = time()
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)

In [60]:
# show testing time of the algorithm
print "testing time: ", round(time() - t0, 3), "seconds"

testing time:  0.023 seconds


In [61]:
# print the key metrics of the algorithm
print "Accuracy of GaussianNB classifer is  : ",accuracy_score(labels_test, prediction)
print "Precision of GaussianNB classifer is : ",precision_score(prediction, labels_test)
print "Recall of GaussianNB classifer is    : ",recall_score(prediction, labels_test)
print "f1-score of GaussianNB classifer is  : ",f1_score(prediction, labels_test)

Accuracy of GaussianNB classifer is  :  0.8333333333333334
Precision of GaussianNB classifer is :  0.6
Recall of GaussianNB classifer is    :  0.375
f1-score of GaussianNB classifer is  :  0.4615384615384615
