In [1]:
#!/usr/bin/python


import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### Load the dictionary containing the dataset
enron_dict = pickle.load(open("final_project_dataset.pkl", "r") )

### Explore the data
## what are the features?
features_explore = enron_dict[enron_dict.keys()[0]].keys()
#print features_explore 

## How many features?
print len(features_explore) # 21 features

## How many data points, NaNs, POIs etc?



21




In [2]:
# Create eda data frame
import pandas as pd
import numpy as np

### The first feature must be "poi".
features_list = ['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', \
'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', \
'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', \
'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', \
'from_poi_to_this_person'] 
#, 'email_address' removed

### Task 2: Remove outliers
### Outliers

## Remove Outliers entry from dictionary

outlier_keys = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E']
for key in outlier_keys:
        enron_dict.pop(key, 0)




In [3]:
### Task 3: Create new feature(s)


# add two new features

# poi_interaction == total number of emails to and from a POI / 
#the total number of emails sent or received. 
#def add_financial_aggregate(my_dataset, my_feature_list)

# define email_fields
email_fields = ['to_messages', 'from_messages',
          'from_poi_to_this_person', 'from_this_person_to_poi']
# loop through data dict
for record in enron_dict:
    # set person as a value of a record 
    person = enron_dict[record]
    # new variable
    is_valid = True
    # loop through email fields
    for field in email_fields:
        # check if field value in enron_dict is NaN
        if person[field] == 'NaN':
            # set is_valid as false
            is_valid = False
    # check for when is_valid is true        
    if is_valid:
        # calculate field value
        total_messages = person['to_messages'] +\
                         person['from_messages']
        poi_messages = person['from_poi_to_this_person'] +\
                       person['from_this_person_to_poi']
        person['poi_interaction'] = float(poi_messages) / total_messages
    else:
        # set value as NaN
        person['poi_interaction'] = 'NaN'
# add to dictionary        
features_list += ['poi_interaction']

# remuneration == total payments + total stock value 
# to capture total wealth
#def add_financial_aggregate(data_dict, features_list):
    
financial_fields = ['total_payments', 'total_stock_value']
for record in enron_dict:
    person = enron_dict[record]
    is_valid = True
    for field in financial_fields:
        if person[field] == 'NaN':
            is_valid = False
    if is_valid:
        # sum every field defined in financial_fields
        person['remuneration'] = sum([person[field] for field in financial_fields])
    else:
        person['remuneration'] = 'NaN'
features_list += ['remuneration']

print features_list

print len(features_list)

['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person', 'poi_interaction', 'remuneration']
22


In [4]:
### Store to my_dataset for easy export below.
my_dataset = enron_dict
my_feature_list = features_list

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, my_feature_list)
labels, features = targetFeatureSplit(data)
#print labels, features


In [6]:

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# parameter optimisation of algorithm using GridSearchCV

#trials = 1

# create score function
from sklearn.metrics import recall_score, precision_score, f1_score


from numpy import mean
import numpy as np
from sklearn import model_selection
from sklearn import metrics

def scorer(estimator, features_test, labels_test):
    labels_pred = estimator.predict(features_test)
    pre= precision_score(labels_test, labels_pred, average='micro')
    rec = recall_score(labels_test, labels_pred, average='micro')
    if pre>0.3 and rec>0.3:
        return f1_score(labels_test, labels_pred, average='macro')
    elif  pre>0.3 and rec<0.3:
        return 0.3
    elif rec >0.3 and pre<0.3:
        return 0.3
    return 0

In [7]:
## Create Cross Validation object for use in GridSearchCV  
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
sss = model_selection.StratifiedShuffleSplit(30, random_state = 42)

## State and Fit the model
#param_grid = [  {'n_estimators': [50,80,100]
#                 , 'learning_rate': [0.5,1,1.5,2]#,3,4,5]
#                 ,'base_estimator':[DecisionTreeClassifier(min_samples_split=2)
#                                    , DecisionTreeClassifier(min_samples_split=3)
#                                    , DecisionTreeClassifier(min_samples_split=4)]
#                }]

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectKBest,chi2,f_classif
ada_clf = AdaBoostClassifier()
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
kBest = SelectKBest()

pipe = Pipeline([('scaler', min_max_scaler),
                ('KBest', kBest ),
                ('ada_clf', ada_clf)]
                )
param_grid = {'KBest__k':range(4,11),'KBest__score_func':[chi2,f_classif]\
              ,'ada_clf__n_estimators':[50,80,100], 'ada_clf__learning_rate':[0.5,1.0,1.5,2]\
                     ,'ada_clf__base_estimator':[DecisionTreeClassifier(min_samples_split=2)\
                                                , DecisionTreeClassifier(min_samples_split=3)\
                                                , DecisionTreeClassifier(min_samples_split=4)]\
             }



my_clf = model_selection.GridSearchCV(pipe, param_grid, scoring = scorer, cv = sss, verbose=1, n_jobs=-1)
my_clf.fit(features, labels) 
print my_clf.best_estimator_

Fitting 30 folds for each of 504 candidates, totalling 15120 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 675 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 1041 tasks      | elapsed:   51.6s
[Parallel(n_jobs=-1)]: Done 1832 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 3016 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 4108 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 5499 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 7327 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 8725 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 10759 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 13283 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done 15120 out of 15120 | elapsed: 13.5min finished


Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('KBest', SelectKBest(k=4, score_func=<function chi2 at 0x11303bf50>)), ('ada_clf', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
         ...om_state=None, splitter='best'),
          learning_rate=1.5, n_estimators=100, random_state=None))])


In [8]:
 my_clf.best_estimator_.steps

[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('KBest', SelectKBest(k=4, score_func=<function chi2 at 0x11303bf50>)),
 ('ada_clf', AdaBoostClassifier(algorithm='SAMME.R',
            base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=3, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best'),
            learning_rate=1.5, n_estimators=100, random_state=None))]

In [9]:
my_clf.best_estimator_.steps[1][1].scores_


array([  3.05278674e+00,   4.36397769e-01,   6.06966069e-02,
         2.78477884e+00,   6.84550934e+00,   5.12075414e+00,
         5.89535349e-01,   2.43221987e+00,   3.50676503e-03,
         5.47661010e+00,   1.48610337e+00,   6.68878174e+00,
         6.87385422e-02,   1.71595053e+00,   1.00080764e+00,
         1.50113085e+00,   3.40099218e-01,   2.53848503e+00,
         1.37005929e+00,   1.49803296e+00,   4.03774956e+00])

In [13]:
scores = my_clf.best_estimator_.steps[1][1].scores_
unsorted_pairs = zip(my_feature_list[1:], scores)
sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
sorted_pairs

[('exercised_stock_options', 6.8455093350345679),
 ('loan_advances', 6.6887817383422217),
 ('total_stock_value', 5.4766100992860336),
 ('bonus', 5.1207541370868066),
 ('remuneration', 4.0377495619033237),
 ('salary', 3.0527867447897892),
 ('total_payments', 2.7847788396505124),
 ('long_term_incentive', 2.5384850330808857),
 ('shared_receipt_with_poi', 2.4322198651432254),
 ('other', 1.7159505307994585),
 ('director_fees', 1.5011308535948469),
 ('poi_interaction', 1.4980329632584271),
 ('expenses', 1.486103366663613),
 ('from_poi_to_this_person', 1.3700592922294679),
 ('from_this_person_to_poi', 1.0008076418017084),
 ('restricted_stock', 0.58953534948658115),
 ('to_messages', 0.43639776880238534),
 ('deferred_income', 0.34009921840595764),
 ('from_messages', 0.068738542151315835),
 ('deferral_payments', 0.060696606931362404),
 ('restricted_stock_deferred', 0.0035067650332055236)]

In [12]:
my_feature_list[1:]

['salary',
 'to_messages',
 'deferral_payments',
 'total_payments',
 'exercised_stock_options',
 'bonus',
 'restricted_stock',
 'shared_receipt_with_poi',
 'restricted_stock_deferred',
 'total_stock_value',
 'expenses',
 'loan_advances',
 'from_messages',
 'other',
 'from_this_person_to_poi',
 'director_fees',
 'deferred_income',
 'long_term_incentive',
 'from_poi_to_this_person',
 'poi_interaction',
 'remuneration']

In [14]:

import tester
from tester import test_classifier
from sklearn.ensemble import AdaBoostClassifier
#ada_clf = AdaBoostClassifier()
#test_clf = AdaBoostClassifier(learning_rate=1.5, n_estimators=50
                             # ,base_estimator=DecisionTreeClassifier(min_samples_split=3,class_weight='balanced') )

tester.test_classifier(my_clf.best_estimator_, my_dataset, my_feature_list)


Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('KBest', SelectKBest(k=4, score_func=<function chi2 at 0x11303bf50>)), ('ada_clf', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
         ...om_state=None, splitter='best'),
          learning_rate=1.5, n_estimators=100, random_state=None))])
	Accuracy: 0.84707	Precision: 0.41985	Recall: 0.38500	F1: 0.40167	F2: 0.39150
	Total predictions: 15000	True positives:  770	False positives: 1064	False negatives: 1230	True negatives: 11936



In [32]:
my_clf.grid_scores_



[mean: 0.59776, std: 0.14872, params: {'ada_clf__base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'), 'KBest__k': 4, 'ada_clf__learning_rate': 0.5, 'ada_clf__n_estimators': 50, 'KBest__score_func': <function chi2 at 0x113049050>},
 mean: 0.57640, std: 0.14685, params: {'ada_clf__base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'), 'KBest__k': 4, 'ada_clf__learning_rate': 0.5, 'ada_clf__n_estimators': 80, 'KBest__score_func': <function chi2 at

In [70]:
from tester import dump_classifier_and_data

### Final and parameter tuned algorithm
#clf = my_clf.best_estimator_
clf = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.5, n_estimators=100, random_state=None)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, my_feature_list)


# https://www.kaggle.com/forums/f/15/kaggle-forum/t/4092/how-to-tune-rf-parameters-in-practice
# /Users/andreasvoniatismbp10/Documents/Dropbox/ARTIOS/Training/git_github/version-control/ud120-projects/final_project
# https://discussions.udacity.com/t/showing-the-best-estimator-from-grid-search-cv/194547/5

In [71]:
my_feature_list

['poi',
 'salary',
 'total_payments',
 'bonus',
 'shared_receipt_with_poi',
 'exercised_stock_options',
 'from_messages',
 'other',
 'from_this_person_to_poi',
 'long_term_incentive',
 'expenses',
 'restricted_stock']

In [73]:
len(my_dataset)

143