In [54]:
#!/usr/bin/python


import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### Load the dictionary containing the dataset
enron_dict = pickle.load(open("final_project_dataset.pkl", "r") )

### Explore the data
## what are the features?
features_explore = enron_dict[enron_dict.keys()[0]].keys()
#print features_explore 

## How many features?
print len(features_explore) # 21 features

## How many data points, NaNs, POIs etc?



21


In [55]:
# Create eda data frame
import pandas as pd
import numpy as np

### The first feature must be "poi".
features_list = ['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', \
'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', \
'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', \
'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', \
'from_poi_to_this_person'] 
#, 'email_address' removed

### Task 2: Remove outliers
### Outliers

## Remove Outliers entry from dictionary

outlier_keys = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E']
for key in outlier_keys:
        enron_dict.pop(key, 0)




In [56]:
### Task 3: Create new feature(s)


# add two new features

# poi_interaction == total number of emails to and from a POI / 
#the total number of emails sent or received. 
#def add_financial_aggregate(my_dataset, my_feature_list)

# define email_fields
email_fields = ['to_messages', 'from_messages',
          'from_poi_to_this_person', 'from_this_person_to_poi']
# loop through data dict
for record in enron_dict:
    # set person as a value of a record 
    person = enron_dict[record]
    # new variable
    is_valid = True
    # loop through email fields
    for field in email_fields:
        # check if field value in enron_dict is NaN
        if person[field] == 'NaN':
            # set is_valid as false
            is_valid = False
    # check for when is_valid is true        
    if is_valid:
        # calculate field value
        total_messages = person['to_messages'] +\
                         person['from_messages']
        poi_messages = person['from_poi_to_this_person'] +\
                       person['from_this_person_to_poi']
        person['poi_interaction'] = float(poi_messages) / total_messages
    else:
        # set value as NaN
        person['poi_interaction'] = 'NaN'
# add to dictionary        
features_list += ['poi_interaction']

# remuneration == total payments + total stock value 
# to capture total wealth
#def add_financial_aggregate(data_dict, features_list):
    
financial_fields = ['total_payments', 'total_stock_value']
for record in enron_dict:
    person = enron_dict[record]
    is_valid = True
    for field in financial_fields:
        if person[field] == 'NaN':
            is_valid = False
    if is_valid:
        # sum every field defined in financial_fields
        person['remuneration'] = sum([person[field] for field in financial_fields])
    else:
        person['remuneration'] = 'NaN'
features_list += ['remuneration']

print features_list

print len(features_list)

['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person', 'poi_interaction', 'remuneration']
22


In [57]:
#Prior to training the machine learning algorithm classifiers, 
#I scaled all features using a min-max scaler. 
#This was vitally important, as the features had different units 
#(e.g. # of email messages and USD) and varied significantly by several orders of magnitude. 
#Feature-scaling ensured that for the applicable classifiers, the features would be weighted evenly.

### Store to my_dataset for easy export below.
my_dataset = enron_dict
my_feature_list = features_list

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, my_feature_list)
labels, features = targetFeatureSplit(data)
#print labels, features

# Apply min-max feature scaling
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)


## Apply PCA to pre-process features to be simplified
#from sklearn import decomposition
#
##n_components = 21
#perc_var = .95
#pca = decomposition.PCA(n_components = perc_var, whiten = True).fit(features)
#features = pca.transform(features)
##print {"Explained Variance Ratio": pca.explained_variance_ratio_}



In [58]:
# Use Decision Tree for feature selection and update features
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf.fit(features, labels)
tree_scores = zip(features_list[1:], clf.feature_importances_)
tree_dict = {}
i = 0
for line in tree_scores:
    feat_, score_ = tree_scores[i]
    tree_dict[feat_] = score_
    i += 1
    if score_ == 0.0:
        print feat_, score_
        del tree_dict[feat_]
print tree_dict

# Update feature list
target_label = 'poi'
features_list = [target_label] + tree_dict.keys()
print features_list




to_messages 0.0
deferral_payments 0.0
restricted_stock_deferred 0.0
total_stock_value 0.0
loan_advances 0.0
director_fees 0.0
deferred_income 0.0
from_poi_to_this_person 0.0
poi_interaction 0.0
remuneration 0.0
{'salary': 0.052962962962962962, 'total_payments': 0.14476543209876541, 'bonus': 0.1091318137409264, 'shared_receipt_with_poi': 0.057777777777777775, 'exercised_stock_options': 0.19984012789768188, 'from_messages': 0.10375925925925925, 'other': 0.030023449025545469, 'from_this_person_to_poi': 0.0073448637316560501, 'long_term_incentive': 0.10895238095238094, 'expenses': 0.064896989949621606, 'restricted_stock': 0.12054494260342233}
['poi', 'salary', 'total_payments', 'bonus', 'shared_receipt_with_poi', 'exercised_stock_options', 'from_messages', 'other', 'from_this_person_to_poi', 'long_term_incentive', 'expenses', 'restricted_stock']


In [59]:
### Store to my_dataset for easy export below.
my_dataset = enron_dict
my_feature_list = features_list

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, my_feature_list)
labels, features = targetFeatureSplit(data)
#print labels, features


In [60]:
features_list

['poi',
 'salary',
 'total_payments',
 'bonus',
 'shared_receipt_with_poi',
 'exercised_stock_options',
 'from_messages',
 'other',
 'from_this_person_to_poi',
 'long_term_incentive',
 'expenses',
 'restricted_stock']

In [None]:

'''
# get K-best features
from sklearn import feature_selection

def get_k_best(data_dict, features_list, k):
    # runs scikit-learn's SelectKBest feature selection
    # returns dict where keys=features, values=scores
    data = featureFormat(enron_dict, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = feature_selection.SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    k_best_features = dict(sorted_pairs[:k])

    return k_best_features

num_features = len(tree_dict)
best_features = get_k_best(my_dataset, my_feature_list, num_features)
print best_features
my_feature_list = [target_label] + best_features.keys()

#print my_feature_list
'''



In [28]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html


### Logistic Regression Classifier
from sklearn import linear_model
log_clf = linear_model.LogisticRegression()

### Stochastic Gradient Descent - Logistic Regression
sgd_clf = linear_model.SGDClassifier(loss='log')

### Random Forest
from sklearn import ensemble
rf_clf = ensemble.RandomForestClassifier()

### Decision Tree
dt_clf = tree.DecisionTreeClassifier()

### Adaboost
#ada_clf = ensemble.AdaBoostClassifier(learning_rate=5)


from numpy import mean
import numpy as np
from sklearn import model_selection
from sklearn import metrics


def evaluate_clf(clf, features, labels, trials = 500, test_size = 0.4, random_state = 42):
    print clf
    accuracy = []
    precision = []
    recall = []
    first = True
    for trial in range(trials):
        features_train, features_test, labels_train, labels_test = \
            model_selection.train_test_split(features, labels, test_size = test_size, random_state = random_state)
        # fit the data
        clf.fit(features_train, labels_train)

        predictions = clf.predict(features_test)
        accuracy.append(metrics.accuracy_score(predictions,labels_test))
        precision.append(metrics.precision_score(predictions, labels_test))
        recall.append(metrics.recall_score(predictions, labels_test))
        if (trial % 10 == 0):
            if first:
                sys.stdout.write('\nProcessing')
            sys.stdout.write('.')
            sys.stdout.flush()
            first = False    
    print "complete.\n"
    print "accuracy: {}".format(np.mean(accuracy))
    print "precision: {}".format(np.mean(precision))
    print "recall:    {}".format(np.mean(recall))    
    return (np.mean(accuracy), np.mean(precision), np.mean(recall))

    

#evaluate_clf(log_clf, features, labels)
#evaluate_clf(sgd_clf, features, labels)
#evaluate_clf(rf_clf, features, labels)
#evaluate_clf(dt_clf, features, labels)
evaluate_clf(ada_clf, features, labels)



AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=5,
          n_estimators=50, random_state=None)

Processing..................................................complete.

accuracy: 0.120689655172
precision: 1.0
recall:    0.120689655172


(0.12068965517241378, 1.0, 0.12068965517241378)

In [61]:

import tester
from tester import test_classifier
ada_clf = ensemble.AdaBoostClassifier()

#tester.test_classifier(sgd_clf, my_dataset, features_list)
#tester.test_classifier(log_clf, my_dataset, features_list)
#tester.test_classifier(rf_clf, my_dataset, features_list)
#tester.test_classifier(dt_clf, my_dataset, features_list)
tester.test_classifier(ada_clf, my_dataset, features_list)


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
	Accuracy: 0.83967	Precision: 0.36563	Recall: 0.27550	F1: 0.31423	F2: 0.28979
	Total predictions: 15000	True positives:  551	False positives:  956	False negatives: 1449	True negatives: 12044



In [63]:

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# parameter optimisation of algorithm using GridSearchCV

#trials = 1

# create score function
from sklearn.metrics import recall_score, precision_score, f1_score
def scorer_r_p(estimator, X_test, y_test):
    y_pred = estimator.predict(X_test)
    r_score = recall_score(y_test, y_pred)
    p_score = precision_score(y_test, y_pred)
#    f1_score = f1_score(y_test, y_pred)
    if r_score<0.3 or p_score<0.3:
        return 0
    return  f1_score(y_test, y_pred)



## Create Cross Validation object for use in GridSearchCV  
sss = model_selection.StratifiedShuffleSplit(50, random_state = 42)

## State and Fit the model
param_grid = [  {'n_estimators': [50,80,100], 'learning_rate': [0.5,1,1.5,2,3,4,5]}]


my_clf = model_selection.GridSearchCV(ada_clf, param_grid, scoring = scorer_r_p, cv = sss, verbose=1, n_jobs=-1)
my_clf.fit(features, labels) 
print my_clf.best_estimator_




Fitting 50 folds for each of 21 candidates, totalling 1050 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.3s
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.5min


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.5, n_estimators=50, random_state=None)


[Parallel(n_jobs=-1)]: Done 1050 out of 1050 | elapsed:  2.0min finished


In [64]:
from tester import dump_classifier_and_data

### Final and parameter tuned algorithm
clf = my_clf.best_estimator_


### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, my_feature_list)


# https://www.kaggle.com/forums/f/15/kaggle-forum/t/4092/how-to-tune-rf-parameters-in-practice
# /Users/andreasvoniatismbp10/Documents/Dropbox/ARTIOS/Training/git_github/version-control/ud120-projects/final_project
# https://discussions.udacity.com/t/showing-the-best-estimator-from-grid-search-cv/194547/5