# Identify Fraud from Enron Email

In [1]:
import sys
import pickle
import numpy as np
import pandas as pd

sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data


## dataset

In [2]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

enron_data = pd.DataFrame.from_dict(data_dict)

print(enron_data)

                                     METTS MARK BAXTER JOHN C  \
bonus                                    600000       1200000   
deferral_payments                           NaN       1295738   
deferred_income                             NaN      -1386055   
director_fees                               NaN           NaN   
email_address              mark.metts@enron.com           NaN   
exercised_stock_options                     NaN       6680544   
expenses                                  94299         11200   
from_messages                                29           NaN   
from_poi_to_this_person                      38           NaN   
from_this_person_to_poi                       1           NaN   
loan_advances                               NaN           NaN   
long_term_incentive                         NaN       1586055   
other                                      1740       2660303   
poi                                       False         False   
restricted_stock         

In [3]:
print(enron_data.loc[:, enron_data.loc['poi']==True].columns)
print('number of POI: ', len(enron_data.loc[:, enron_data.loc['poi']==True].columns))

Index(['HANNON KEVIN P', 'COLWELL WESLEY', 'RIEKER PAULA H',
       'KOPPER MICHAEL J', 'SHELBY REX', 'DELAINEY DAVID W', 'LAY KENNETH L',
       'BOWEN JR RAYMOND M', 'BELDEN TIMOTHY N', 'FASTOW ANDREW S',
       'CALGER CHRISTOPHER F', 'RICE KENNETH D', 'SKILLING JEFFREY K',
       'YEAGER F SCOTT', 'HIRKO JOSEPH', 'KOENIG MARK E', 'CAUSEY RICHARD A',
       'GLISAN JR BEN F'],
      dtype='object')
number of POI:  18


In [4]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = list(data_dict['METTS MARK'].keys())
features_list.remove('poi')
features_list.remove('email_address')
features_list.remove('total_payments')
features_list.remove('total_stock_value')
features_list.remove('other')


In [5]:
### Task 2: Remove outliers
data_dict.pop('TOTAL')
data_dict.pop('THE TRAVEL AGENCY IN THE PARK')


{'salary': 'NaN',
 'to_messages': 'NaN',
 'deferral_payments': 'NaN',
 'total_payments': 362096,
 'loan_advances': 'NaN',
 'bonus': 'NaN',
 'email_address': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'deferred_income': 'NaN',
 'total_stock_value': 'NaN',
 'expenses': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'exercised_stock_options': 'NaN',
 'from_messages': 'NaN',
 'other': 362096,
 'from_this_person_to_poi': 'NaN',
 'poi': False,
 'long_term_incentive': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'restricted_stock': 'NaN',
 'director_fees': 'NaN'}

## remove NaN in the dataset (cleaning dataset is important)

In [6]:
# Remove columns with > 50% NaN's
df = pd.DataFrame(data_dict).T
df.replace(to_replace='NaN', value=np.nan, inplace=True)
for key in features_list:
    if df[key].isnull().sum() > df.shape[0] * 0.5:
        features_list.remove(key)
features_list = ['poi'] + features_list


In [7]:
### Store to my_dataset for easy export below.
my_dataset = data_dict


In [8]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)


## feature scaling

In [9]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scl_features = scaler.fit_transform(features)


## train classifier

In [10]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
# clf = GaussianNB()

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV



### useful tool to tune hyperparameters of multiple classifiers
http://www.davidsbatista.net/blog/2018/02/23/model_optimization/

https://stackoverflow.com/questions/50285973/pipeline-multiple-classifiers?noredirect=1&lq=1

In [11]:
class EstimatorSelectionHelper:
    """
    http://www.davidsbatista.net/blog/2018/02/23/model_optimization/
    https://stackoverflow.com/questions/23045318/scikit-grid-search-over-multiple-classifiers

    """    
    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
    
    def fit(self, X, y, **grid_kwargs):
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            model = self.models[key]
            params = self.params[key]
            grid_search = GridSearchCV(model, params, **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')
    
    def score_summary(self, sort_by='mean_test_score'):
        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            frame = frame.filter(regex='^(?!.*param_).*$')
            frame['estimator'] = len(frame)*[name]
            frames.append(frame)
        df = pd.concat(frames)
        
        df = df.sort_values([sort_by], ascending=False)
        df = df.reset_index()
        df = df.drop(['rank_test_score', 'index'], 1)
        
        columns = df.columns.tolist()
        columns.remove('estimator')
        columns = ['estimator']+columns
        df = df[columns]
        return df


In [12]:
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)



models1 = {
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC(),
    'MLPClassifier': MLPClassifier()
}

params1 = {
    'RandomForestClassifier': { 'n_estimators': [8, 16, 32] },
    'AdaBoostClassifier':  { 'n_estimators': [8, 16, 32, 50], 'learning_rate': [0.1, 0.5, 1.0]},
    'GradientBoostingClassifier': { 'n_estimators': [16, 32, 50], 'learning_rate': [0.8, 1.0] },
    'MLPClassifier': {'hidden_layer_sizes': (32,16), 'learning_rate_init': [0.001]},
    'SVC': [
        {'kernel': ['rbf'], 'C': [1, 10, 100]},
    ]
}

helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(features_train, labels_train, scoring='f1', n_jobs=24, cv=10)

helper1.score_summary()

# print(summary)

Running GridSearchCV for RandomForestClassifier.




Running GridSearchCV for AdaBoostClassifier.




Running GridSearchCV for GradientBoostingClassifier.




Running GridSearchCV for SVC.




Running GridSearchCV for MLPClassifier.
Done.


Unnamed: 0,estimator,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,AdaBoostClassifier,0.009302,0.001614,0.001612,0.000397,"{'learning_rate': 0.5, 'n_estimators': 8}",0.666667,0.5,0.5,0.0,...,0.956522,0.818182,0.782609,0.857143,0.952381,0.761905,0.857143,0.8,0.840577,0.064912
1,AdaBoostClassifier,0.047113,0.01084,0.005006,0.00152,"{'learning_rate': 1.0, 'n_estimators': 50}",0.8,0.5,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,AdaBoostClassifier,0.030514,0.005651,0.003221,0.000916,"{'learning_rate': 1.0, 'n_estimators': 32}",1.0,0.8,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,AdaBoostClassifier,0.006931,0.000856,0.001372,0.000244,"{'learning_rate': 1.0, 'n_estimators': 8}",0.666667,0.666667,0.0,0.0,...,1.0,0.818182,0.909091,0.909091,0.956522,0.818182,0.956522,0.736842,0.878221,0.081706
4,AdaBoostClassifier,0.03819,0.003475,0.003815,0.000358,"{'learning_rate': 0.5, 'n_estimators': 50}",0.666667,0.8,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,GradientBoostingClassifier,0.006345,0.001125,0.000917,0.000221,"{'learning_rate': 1.0, 'n_estimators': 16}",0.0,0.4,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,GradientBoostingClassifier,0.015898,0.003089,0.001065,0.000237,"{'learning_rate': 0.8, 'n_estimators': 50}",0.0,0.4,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,GradientBoostingClassifier,0.008656,0.001192,0.000875,0.000197,"{'learning_rate': 1.0, 'n_estimators': 32}",0.0,0.5,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,AdaBoostClassifier,0.028422,0.002188,0.003181,0.000624,"{'learning_rate': 0.5, 'n_estimators': 32}",0.666667,0.8,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
9,MLPClassifier,0.023756,0.018484,0.000819,0.000129,"{'hidden_layer_sizes': 32, 'learning_rate_init...",0.333333,0.5,0.666667,0.0,...,0.232558,0.205128,0.142857,0.6,0.235294,0.222222,0.25,0.232558,0.25073,0.120316


In [20]:
#### select the best
clf = AdaBoostClassifier(n_estimators=32, learning_rate=1.0)

clf.fit(features_train, labels_train)


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=32, random_state=None)

In [21]:
dump_classifier_and_data(clf, my_dataset, features_list)

## run tester

In [22]:
%run -i 'tester.py'

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=32, random_state=None)
	Accuracy: 0.85207	Precision: 0.43135	Recall: 0.34400	F1: 0.38275	F2: 0.35852
	Total predictions: 15000	True positives:  688	False positives:  907	False negatives: 1312	True negatives: 12093

