# Enron Fraud

## Initial Load
### Import packages:

In [None]:
import sys
import pickle
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import ExtraTreesClassifier
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append("../tools/")
%matplotlib inline
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Load dataset and create dataframe

In [None]:
# email data
features_email = ['to_messages', 'from_messages', 'from_poi_to_this_person',
                  'from_this_person_to_poi', 'shared_receipt_with_poi']
# finance data
features_finance = ['salary', 'bonus', 'long_term_incentive', 'deferred_income',
                    'deferral_payments', 'loan_advances', 'other', 'expenses',
                    'director_fees', 'total_payments',
                    'exercised_stock_options', 'restricted_stock',
                    'restricted_stock_deferred', 'total_stock_value']
# all features
features_list = features_email + features_finance
# all features column names
features_column_names = ['poi'] + ['email_address'] + features_email + features_finance
# all features data type: one float per email/finance feature, derived from
# the list length so it stays in sync when features are added or removed
features_dtype = [bool] + [str] + [float] * len(features_list)

### Load the dictionary containing the dataset
# NOTE: pickle can execute arbitrary code while loading; only open dataset
# files that come from a trusted source.
# The file is opened in binary mode, which pickle expects on every platform.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# converting the data into a data frame (one row per dictionary key)
df = DataFrame.from_dict(data_dict, orient='index')

# reordering the columns
df = df.loc[:, features_column_names]

# converting the data type column by column; errors='ignore' leaves a column
# unchanged when it cannot be cast to the requested type
for column, dtype in zip(features_column_names, features_dtype):
    df[column] = df[column].astype(dtype, errors='ignore')


### Add ratios of email & receipt sharing with poi's to df
The ratios of emails sent to / received from POIs relative to all emails a person sent / received, plus the ratio of emails with a shared receipt with a POI relative to all emails the person received.

In [None]:
# labels of the ratio features and the combined feature list
features_email_new = ['recieved_from_poi_ratio', 'sent_to_poi_ratio', 'shared_receipt_with_poi_ratio']
features_all = features_list + features_email_new
# mails received from pois relative to all received mails
df['recieved_from_poi_ratio'] = df['from_poi_to_this_person'].div(df['to_messages'])
# mails sent to pois relative to all sent mails
df['sent_to_poi_ratio'] = df['from_this_person_to_poi'].div(df['from_messages'])
# receipts shared with pois relative to all received mails
df['shared_receipt_with_poi_ratio'] = df['shared_receipt_with_poi'].div(df['to_messages'])

## Descriptive analysis & wrangling of dataset

### Shape of the table:

In [None]:
# Dataset shape: (number of rows, number of columns) of the data frame
df.shape

The shape of the dataset shows that there are 146 rows (persons) and 24 columns (features), three of which are the ratios that were added in the previous section.

### Missing features

In [None]:
# number of missing values per column
null_counts = df.isnull().sum()
# share of missing values per column, most incomplete columns first
df_null_value_ratio = (null_counts / float(df.shape[0])).sort_values(ascending=False)
# absolute number of missing values per column, same ordering
df_null_values = null_counts.sort_values(ascending=False)
# show counts next to ratios (both series, not the ratio twice)
frames = [df_null_values, df_null_value_ratio]
null_summary = pd.concat(frames, axis=1, keys=['missing_count', 'missing_ratio'])
# concat may reorder the row labels, so restore the "most missing first" order
print(null_summary.reindex(df_null_values.index))

This is good news! Even though a lot of feature values are missing, there is a POI flag and an email address for each person in the dataset. There are 18 POIs, which make up about 12% of the total dataset (18 of 146).

### POIs

In [None]:
# every key of the raw dictionary is one entry (person) of the dataset
people = data_dict.keys()
print('Total persons in dataset: %d' % len(people))
# adding up the poi flags counts the pois (a True flag adds 1)
POI_count = sum(data_dict[person]['poi'] for person in people)
print('Number of POIs: %d' % POI_count)

POIs make up only about 12% of our dataset (18 of 146), a skewed class distribution that needs to be considered when evaluating the classification algorithms. If a trivial classifier that always predicts POI = False were deployed, its accuracy would already be about 88%. Developing an algorithm with an accuracy of 88% in less than 5 minutes does not sound too bad – but it doesn't help in identifying the POIs at all.

### Positional Parameters
Outliers and extreme values are defined as values above Q3 + 1.5 IQR and Q3 + 3 IQR, respectively.

In [None]:
# summary statistics of the data frame as computed by DataFrame.describe();
# parenthesised print matches the other cells and runs on Python 2 and 3
print(df.describe())

In [None]:
# Outlier Confirmation [Before data modification]
# Plots the distribution of 'from_messages'; missing values are dropped
# before plotting. The 'to_messages' variant is kept below for reference.
# sns.distplot(df['to_messages'].dropna())
sns.distplot(df['from_messages'].dropna())
# plt.hist(df['from_messages'].dropna())
# print df[df['from_messages'] > 2800]
# 
# plt.hist(df['from_poi_to_this_person'].dropna())
# print df[df['from_poi_to_this_person'] > 500]
# 
# plt.hist(df['from_this_person_to_poi'].dropna())
# print df[df['from_this_person_to_poi'] > 300]
# 
# plt.hist(df['shared_receipt_with_poi'].dropna())
# print df[df['shared_receipt_with_poi'] > 4000]
# 
# plt.hist(df['salary'])
# print df[df['salary'] > 2.4e+07]
# # -> [Modification] "TOTAL" is an invalid data point
# print df[df['salary'] > 1.0e+06]
# 
# plt.hist(df['bonus'])
# print df[df['bonus'] > 4.0e+06]
# 
# plt.hist(df['long_term_incentive'])
# print df[df['long_term_incentive'] > 2.0e+06]
# 
# plt.hist(df['deferred_income'])
# print df[df['deferred_income'] < -2.8e+06]
# print df['deferred_income'][df['deferred_income'] < -2.8e+06]
# 
# plt.hist(df['deferral_payments'])
# print df[df['deferral_payments'] < 0]
# # -> [Modification] BELFER ROBERT - miss-alignment of columns
# print df[df['deferral_payments'] > 5.7e+06]
# 
# plt.hist(df['loan_advances'])
# print df[df['loan_advances'] > 7.3e+07]
# plt.hist(df.loc[df['loan_advances'] < 7.3e+07, 'loan_advances'])
# print df[df['loan_advances'] > 4.0e+04]
# plt.hist(df.loc[df['loan_advances'] < 4.0e+04, 'loan_advances'])
# print sum(df['loan_advances'][df['loan_advances'] < 4.0e+04])
# # -> Only few people took loan advances, most of them are 0.
# 
# plt.hist(df['other'])
# print df[df['other'] > 7.0e+06]
# 
# plt.hist(df['expenses'])
# print df[df['expenses'] > 1.37e+05]
# 
# plt.hist(df['director_fees'])
# print df[df['director_fees'] > 1.0e+05]
# 
# plt.hist(df['total_payments'])
# print df[df['total_payments'] > 9.3e+07]
# 
# plt.hist(df['exercised_stock_options'])
# print df[df['exercised_stock_options'] > 2.7e+07]
# 
# plt.hist(df['restricted_stock'])
# print df[df['restricted_stock'] < 0]
# # -> [Modification] BHATNAGAR SANJAY - miss-alignment of columns
# print df[df['restricted_stock'] > 1.3e+07]
# 
# plt.hist(df['restricted_stock_deferred'])
# print df[df['restricted_stock_deferred'] < -1.5e+06]
# 
# plt.hist(df['total_stock_value'])
# print df[df['total_stock_value'] > 2.4e+07]

In [None]:
# Outlier Confirmation [Before data modification]

# distribution of 'to_messages' without the missing values
sns.distplot(df['to_messages'].dropna())
# median is kept as a reference value; it is not part of the fences
median = df['to_messages'].median()
# interquartile range: distance between the first and the third quartile
q1 = df['to_messages'].quantile(0.25)
q3 = df['to_messages'].quantile(0.75)
iqr = q3 - q1
# fences as defined in this section: Q3 + 1.5 IQR and Q3 + 3 IQR
outliers = q3 + 1.5 * iqr
extreme_values = q3 + 3 * iqr
# rows above the outlier fence, then rows above the extreme-value fence
print(df[df['to_messages'] > outliers])
print(df[df['to_messages'] > extreme_values])
# print df[df['to_messages'] > 4500]
# print df[df['to_messages'] > 10000]
# 
# plt.hist(df['from_messages'].dropna())
# print df[df['from_messages'] > 2800]
# 
# plt.hist(df['from_poi_to_this_person'].dropna())
# print df[df['from_poi_to_this_person'] > 500]
# 
# plt.hist(df['from_this_person_to_poi'].dropna())
# print df[df['from_this_person_to_poi'] > 300]
# 
# plt.hist(df['shared_receipt_with_poi'].dropna())
# print df[df['shared_receipt_with_poi'] > 4000]
# 
# plt.hist(df['salary'])
# print df[df['salary'] > 2.4e+07]
# # -> [Modification] "TOTAL" is an invalid data point
# print df[df['salary'] > 1.0e+06]
# 
# plt.hist(df['bonus'])
# print df[df['bonus'] > 4.0e+06]
# 
# plt.hist(df['long_term_incentive'])
# print df[df['long_term_incentive'] > 2.0e+06]
# 
# plt.hist(df['deferred_income'])
# print df[df['deferred_income'] < -2.8e+06]
# print df['deferred_income'][df['deferred_income'] < -2.8e+06]
# 
# plt.hist(df['deferral_payments'])
# print df[df['deferral_payments'] < 0]
# # -> [Modification] BELFER ROBERT - miss-alignment of columns
# print df[df['deferral_payments'] > 5.7e+06]
# 
# plt.hist(df['loan_advances'])
# print df[df['loan_advances'] > 7.3e+07]
# plt.hist(df.loc[df['loan_advances'] < 7.3e+07, 'loan_advances'])
# print df[df['loan_advances'] > 4.0e+04]
# plt.hist(df.loc[df['loan_advances'] < 4.0e+04, 'loan_advances'])
# print sum(df['loan_advances'][df['loan_advances'] < 4.0e+04])
# # -> Only few people took loan advances, most of them are 0.
# 
# plt.hist(df['other'])
# print df[df['other'] > 7.0e+06]
# 
# plt.hist(df['expenses'])
# print df[df['expenses'] > 1.37e+05]
# 
# plt.hist(df['director_fees'])
# print df[df['director_fees'] > 1.0e+05]
# 
# plt.hist(df['total_payments'])
# print df[df['total_payments'] > 9.3e+07]
# 
# plt.hist(df['exercised_stock_options'])
# print df[df['exercised_stock_options'] > 2.7e+07]
# 
# plt.hist(df['restricted_stock'])
# print df[df['restricted_stock'] < 0]
# # -> [Modification] BHATNAGAR SANJAY - miss-alignment of columns
# print df[df['restricted_stock'] > 1.3e+07]
# 
# plt.hist(df['restricted_stock_deferred'])
# print df[df['restricted_stock_deferred'] < -1.5e+06]
# 
# plt.hist(df['total_stock_value'])
# print df[df['total_stock_value'] > 2.4e+07]

In [None]:
# Outlier Confirmation [Before data modification]

# distribution of 'to_messages' without the missing values
sns.distplot(df['to_messages'].dropna())
# median is kept as a reference value; it is not part of the fences
median = df['to_messages'].median()
# interquartile range: distance between the first and the third quartile
q1 = df['to_messages'].quantile(0.25)
q3 = df['to_messages'].quantile(0.75)
iqr = q3 - q1
# fences as defined in this section: Q3 + 1.5 IQR and Q3 + 3 IQR
outliers = q3 + 1.5 * iqr
extreme_values = q3 + 3 * iqr
# rows above the outlier fence, then rows above the extreme-value fence
print(df[df['to_messages'] > outliers])
print(df[df['to_messages'] > extreme_values])
# print df[df['to_messages'] > 4500]
# print df[df['to_messages'] > 10000]
# 
# plt.hist(df['from_messages'].dropna())
# print df[df['from_messages'] > 2800]
# 
# plt.hist(df['from_poi_to_this_person'].dropna())
# print df[df['from_poi_to_this_person'] > 500]
# 
# plt.hist(df['from_this_person_to_poi'].dropna())
# print df[df['from_this_person_to_poi'] > 300]
# 
# plt.hist(df['shared_receipt_with_poi'].dropna())
# print df[df['shared_receipt_with_poi'] > 4000]
# 
# plt.hist(df['salary'])
# print df[df['salary'] > 2.4e+07]
# # -> [Modification] "TOTAL" is an invalid data point
# print df[df['salary'] > 1.0e+06]
# 
# plt.hist(df['bonus'])
# print df[df['bonus'] > 4.0e+06]
# 
# plt.hist(df['long_term_incentive'])
# print df[df['long_term_incentive'] > 2.0e+06]
# 
# plt.hist(df['deferred_income'])
# print df[df['deferred_income'] < -2.8e+06]
# print df['deferred_income'][df['deferred_income'] < -2.8e+06]
# 
# plt.hist(df['deferral_payments'])
# print df[df['deferral_payments'] < 0]
# # -> [Modification] BELFER ROBERT - miss-alignment of columns
# print df[df['deferral_payments'] > 5.7e+06]
# 
# plt.hist(df['loan_advances'])
# print df[df['loan_advances'] > 7.3e+07]
# plt.hist(df.loc[df['loan_advances'] < 7.3e+07, 'loan_advances'])
# print df[df['loan_advances'] > 4.0e+04]
# plt.hist(df.loc[df['loan_advances'] < 4.0e+04, 'loan_advances'])
# print sum(df['loan_advances'][df['loan_advances'] < 4.0e+04])
# # -> Only few people took loan advances, most of them are 0.
# 
# plt.hist(df['other'])
# print df[df['other'] > 7.0e+06]
# 
# plt.hist(df['expenses'])
# print df[df['expenses'] > 1.37e+05]
# 
# plt.hist(df['director_fees'])
# print df[df['director_fees'] > 1.0e+05]
# 
# plt.hist(df['total_payments'])
# print df[df['total_payments'] > 9.3e+07]
# 
# plt.hist(df['exercised_stock_options'])
# print df[df['exercised_stock_options'] > 2.7e+07]
# 
# plt.hist(df['restricted_stock'])
# print df[df['restricted_stock'] < 0]
# # -> [Modification] BHATNAGAR SANJAY - miss-alignment of columns
# print df[df['restricted_stock'] > 1.3e+07]
# 
# plt.hist(df['restricted_stock_deferred'])
# print df[df['restricted_stock_deferred'] < -1.5e+06]
# 
# plt.hist(df['total_stock_value'])
# print df[df['total_stock_value'] > 2.4e+07]

#### Correlation coefficients:

In [None]:
# pairwise correlation coefficients between the columns of the data frame;
# parenthesised print matches the other cells and runs on Python 2 and 3
print(df.corr())

## References

### Articles
* A look at those involved in the Enron scandal, USA Today - http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm
* The Immortal Life of the Enron E-mails, MIT Technology Review - https://www.technologyreview.com/s/515801/the-immortal-life-of-the-enron-e-mails/
* Implementing a Weighted Majority Rule Ensemble Classifier in scikit-learn, Sebastian Raschka - http://sebastianraschka.com/Articles/2014_ensemble_classifier.html
* Color Palettes in Seaborn, Chris Albon - http://chrisalbon.com/python/seaborn_color_palettes.html
* Random Forests, Leo Breiman and Adele Cutler - http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
* Python sklearn.feature_selection.f_classif Examples - http://www.programcreek.com/python/example/85917/sklearn.feature_selection.f_classif

### Cheatsheets
* Markdown - https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet
* Pandas - https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf
* Numpy - https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf

### Documentation
* Pipelining: chaining a PCA and a logistic regression, scikit learn - http://scikit-learn.org/stable/auto_examples/plot_digits_pipe.html
* matplotlib.axes, matplotlib - http://matplotlib.org/api/axes_api.html
* DataFrame quantiles, pandas - http://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DataFrame.quantile.html
* Visualization, pandas - https://pandas.pydata.org/pandas-docs/stable/visualization.html
* pyplot, matplotlib - https://matplotlib.org/devdocs/api/_as_gen/matplotlib.pyplot.hist.html
* sort values, pandas - https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html  

### GitHub Repositories
* EnsembleVoteClassifier, Sebastian Raschka - http://rasbt.github.io/mlxtend/user_guide/classifier/EnsembleVoteClassifier/
* Grace Pehl: Identifying Persons of Interest from the Enron Corpus - https://github.com/grace-pehl/enron
* brandjamie: Marchine Learning with the enron emails dataset - https://github.com/brandjamie/udacity_enron
* Daria Alekseeva: Enron Dataset - https://github.com/DariaAlekseeva/Enron_Dataset
* watanabe8760: uda-da-p5-enron-fraud-detection - https://github.com/watanabe8760/uda-da-p5-enron-fraud-detection
* Mayukh Sobo: Enron Fraud - https://github.com/MayukhSobo/EnronFraud

### Q&A pages
* Pandas Replacement for .ix, Stack Overflow  - https://stackoverflow.com/questions/43838999/pandas-replacement-for-ix
* Sci-kit and Regression Summary, Stack Overflow - http://stackoverflow.com/questions/26319259/sci-kit-and-regression-summary
* How to obtain True Positive, True Negative, False Positive and False Negative, Stack Overflow - https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal
* Why do we need to normalize data before analysis, Cross Validated - http://stats.stackexchange.com/questions/69157/why-do-we-need-to-normalize-data-before-analysis
* Perform feature normalization before or within model validation?, Cross Validated - http://stats.stackexchange.com/questions/77350/perform-feature-normalization-before-or-within-model-validation
* How should the interquartile range be calculated in Python?, Stack Overflow - http://stackoverflow.com/questions/27472330/how-should-the-interquartile-range-be-calculated-in-python
* scikit learn svc coef0 parameter range, Stack Overflow - http://stackoverflow.com/questions/21390570/scikit-learn-svc-coef0-parameter-range
* What is a good range of values for the svm.SVC() hyperparameters to be explored via GridSearchCV()?, Stack Overflow - http://stackoverflow.com/questions/26337403/what-is-a-good-range-of-values-for-the-svm-svc-hyperparameters-to-be-explored
* Imputation before or after splitting into train and test?, Cross Validated - http://stats.stackexchange.com/questions/95083/imputation-before-or-after-splitting-into-train-and-test
* Is there a rule-of-thumb for how to divide a dataset into training and validation sets?, Stack Overflow - http://stackoverflow.com/questions/13610074/is-there-a-rule-of-thumb-for-how-to-divide-a-dataset-into-training-and-validatio
* What is the difference between test set and validation set?, Cross Validated - http://stats.stackexchange.com/questions/19048/what-is-the-difference-between-test-set-and-validation-set
* Python - What is exactly sklearn.pipeline.Pipeline?, Stack Overflow - http://stackoverflow.com/questions/33091376/python-what-is-exactly-sklearn-pipeline-pipeline
* How can I use a custom feature selection function in scikit-learn's pipeline, Stack Overflow - http://stackoverflow.com/questions/25250654/how-can-i-use-a-custom-feature-selection-function-in-scikit-learns-pipeline
* Seaborn distplot y-axis normalisation wrong ticklabels, Stack Overflow - http://stackoverflow.com/questions/32274865/seaborn-distplot-y-axis-normalisation-wrong-ticklabels
* How to save a Seaborn plot into a file, Stack Overflow - http://stackoverflow.com/questions/32244753/how-to-save-a-seaborn-plot-into-a-file
* Seaborn plots not showing up, Stack Overflow - https://stackoverflow.com/questions/26597116/seaborn-plots-not-showing-up

### Tools
* Markdown Tables Generator - http://www.tablesgenerator.com/markdown_tables
* JSON pretty print - http://jsonprettyprint.com

### Wikipedia
* Enron scandal - https://en.wikipedia.org/wiki/Enron_scandal
* Boxplots - https://en.wikipedia.org/wiki/Box_plot
* Interquartile range - https://en.wikipedia.org/wiki/Interquartile_range
* False positive rate - https://en.wikipedia.org/wiki/False_positive_rate
* False discovery rate - https://en.wikipedia.org/wiki/False_discovery_rate
* Precision and recall - https://en.wikipedia.org/wiki/Precision_and_recall