# Enron Fraud

### Initial Load
#### Import packages:

In [37]:
import sys
import pickle
import numpy as np
from pandas import DataFrame
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import ExtraTreesClassifier
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

#### Load dataset and create dataframe

In [20]:
# Email-related feature names
features_email = ['to_messages', 'from_messages', 'from_poi_to_this_person',
                  'from_this_person_to_poi', 'shared_receipt_with_poi']
# Finance-related feature names
features_finance = ['salary', 'bonus', 'long_term_incentive', 'deferred_income',
                    'deferral_payments', 'loan_advances', 'other', 'expenses',
                    'director_fees', 'total_payments',
                    'exercised_stock_options', 'restricted_stock',
                    'restricted_stock_deferred', 'total_stock_value']
# All features: email features first, then finance features
features_list = features_email + features_finance
# Column order for the data frame: label, email address, then every feature
features_column_names = ['poi', 'email_address'] + features_list
# Target data type per column, aligned position-by-position with
# features_column_names. The number of float entries is derived from
# features_list so it stays in sync when features are added or removed.
features_dtype = [bool, str] + [float] * len(features_list)

### Load the dictionary containing the dataset
# Pickle data is a byte stream, so the file is opened in binary mode ("rb").
# NOTE: pickle.load can execute arbitrary code; only load a trusted file.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# converting the data into a data frame (dictionary keys become the index)
df = DataFrame.from_dict(data_dict, orient='index')

# reordering the columns
df = df.loc[:, features_column_names]

# converting the data type of each column to its counterpart in features_dtype;
# errors='ignore' is kept from the original so a failed cast does not raise
for column_name, column_dtype in zip(features_column_names, features_dtype):
    df[column_name] = df[column_name].astype(column_dtype, errors='ignore')

### Descriptive analysis & wrangling of dataset

#### Remove invalid datapoints

In [31]:
# Drop the index entries that were found to be invalid during the EDA in the course
invalid_rows = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK']
df = df[~df.index.isin(invalid_rows)]

#### Shape of the table:

In [32]:
# Dataset shape as a (number of rows, number of columns) tuple
df.shape

(144, 24)

#### Share of null values per feature:

In [33]:
# Share of null values per column: null count divided by the number of rows.
# Parenthesised print works on both Python 2 and Python 3.
print(df.isnull().sum() / df.shape[0])

poi                              0.000000
email_address                    0.000000
to_messages                      0.402778
from_messages                    0.402778
from_poi_to_this_person          0.402778
from_this_person_to_poi          0.402778
shared_receipt_with_poi          0.402778
salary                           0.000000
bonus                            0.000000
long_term_incentive              0.000000
deferred_income                  0.000000
deferral_payments                0.000000
loan_advances                    0.000000
other                            0.000000
expenses                         0.000000
director_fees                    0.000000
total_payments                   0.000000
exercised_stock_options          0.000000
restricted_stock                 0.000000
restricted_stock_deferred        0.000000
total_stock_value                0.000000
recieved_from_poi_ratio          0.402778
sent_to_poi_ratio                0.402778
shared_receipt_with_poi_ratio    0

#### Positional Parameters for further analysis of outliers
Outliers are defined as values above Q3 + 1.5 × IQR (mild outliers) or above Q3 + 3 × IQR (extreme outliers).

In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# Parenthesised print works on both Python 2 and Python 3.
print(df.describe())

        to_messages  from_messages  from_poi_to_this_person  \
count     86.000000      86.000000                86.000000   
mean    2073.860465     608.790698                64.895349   
std     2582.700981    1841.033949                86.979244   
min       57.000000      12.000000                 0.000000   
25%      541.250000      22.750000                10.000000   
50%     1211.000000      41.000000                35.000000   
75%     2634.750000     145.500000                72.250000   
max    15149.000000   14368.000000               528.000000   

       from_this_person_to_poi  shared_receipt_with_poi        salary  \
count                86.000000                86.000000  1.440000e+02   
mean                 41.232558              1176.465116  1.854460e+05   
std                 100.073111              1178.317641  1.970421e+05   
min                   0.000000                 2.000000  0.000000e+00   
25%                   1.000000               249.750000  0.000000e+

#### Correlation coefficients:

In [35]:
# Pairwise correlation coefficients between the columns.
# Parenthesised print works on both Python 2 and Python 3.
print(df.corr())

                                    poi  to_messages  from_messages  \
poi                            1.000000     0.058954      -0.074308   
to_messages                    0.058954     1.000000       0.475450   
from_messages                 -0.074308     0.475450       1.000000   
from_poi_to_this_person        0.167722     0.525667       0.186708   
from_this_person_to_poi        0.112940     0.568506       0.588687   
shared_receipt_with_poi        0.228313     0.847990       0.230855   
salary                         0.340120     0.299270       0.080212   
bonus                          0.359381     0.451249       0.118990   
long_term_incentive            0.257361     0.128052       0.016347   
deferred_income               -0.274762    -0.071725       0.014023   
deferral_payments             -0.039067     0.068668      -0.000588   
loan_advances                  0.220295     0.093903      -0.034957   
other                          0.170395     0.084481      -0.074146   
expens

### Feature engineering:
#### Ratio of email & receipt sharing with POIs

In [36]:
# Each entry: (new ratio column, numerator column, denominator column)
ratio_specs = [
    ('recieved_from_poi_ratio', 'from_poi_to_this_person', 'to_messages'),
    ('sent_to_poi_ratio', 'from_this_person_to_poi', 'from_messages'),
    ('shared_receipt_with_poi_ratio', 'shared_receipt_with_poi', 'to_messages'),
]
# Add one ratio column per entry, in the order listed above
for ratio_name, numerator, denominator in ratio_specs:
    df[ratio_name] = df[numerator] / df[denominator]

# Names of the engineered features, and the full feature list including them
features_email_new = [ratio_spec[0] for ratio_spec in ratio_specs]
features_all = features_list + features_email_new

## References
1. Scikit-learn: How to obtain True Positive, True Negative, False Positive and False Negative - https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal
2. GitHub: watanabe8760/uda-da-p5-enron-fraud-detection - https://github.com/watanabe8760/uda-da-p5-enron-fraud-detection
3. Interquartile range - https://en.wikipedia.org/wiki/Interquartile_range
4. stackoverflow: Pandas: Replacement for .ix  - https://stackoverflow.com/questions/43838999/pandas-replacement-for-ix
5. GitHub: Markdown Cheatsheet - https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet
