In [1]:
## Import all necessary libraries for manipulating data and building the ML models 

#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

In [2]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi". Added is one new "bonus_salary_multiple"

features_list = ['poi','salary', 'deferral_payments', 'total_payments', 
                 'loan_advances', 'bonus', 'restricted_stock_deferred', 
                 'deferred_income', 'total_stock_value', 'expenses', 
                 'exercised_stock_options', 'other', 'long_term_incentive', 
                 'restricted_stock', 'director_fees','to_messages', 
                  'from_poi_to_this_person', 'from_messages',
                 'from_this_person_to_poi', 'shared_receipt_with_poi', 'bonus_salary_multiple']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
    


In [3]:
### Investigate dataset and count number of NaN entries

print 'Total Number of records in dataset:' 
print len(data_dict)

print 'Total Number of columns in dataset:' 
print len(data_dict['METTS MARK'])


count = 0
for employee in data_dict:
    if data_dict[employee]['poi'] == True:
        count += 1
print 'Total Number of Poi\'s in dataset:'
print count

nan_features = features_list
nan_features.remove('bonus_salary_multiple')
nan_count = 0
for employee in data_dict:
    for feature in nan_features:
        if data_dict[employee][feature] == 'NaN':
         nan_count += 1

print 'Total Number of NaN entries in dataset:'
print nan_count

Total Number of records in dataset:
146
Total Number of columns in dataset:
21
Total Number of Poi's in dataset:
18
Total Number of NaN entries in dataset:
1323


In [4]:
### Task 2: Remove outliers. From viewing the relationship between Salary and Bonus "Total" identified and removed.
### Visual inspection of the dataset furthermore revealed a non-person "The Travel Agency in the Park" also removed.

features = ["salary", "bonus"]
data = featureFormat(data_dict, features)
for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter( salary, bonus )
matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()

data_dict.pop('TOTAL',0)
data_dict.pop('The Travel Agency In the Park',0)

0

In [5]:
### Task 3: Create new feature(s) New feature "bonus_salary_multiple" added to dataset and feature list.
### Store to my_dataset for easy export below.


for employee in data_dict:
    if data_dict[employee]['salary'] > 0 and data_dict[employee]['bonus'] > 0:
        data_dict[employee]['bonus_salary_multiple'] = round(float(data_dict[employee]['bonus'])
                                                             /float(data_dict[employee]['salary']),2)
    else:
        data_dict[employee]['bonus_salary_multiple'] = float(0.0)
        
for employee in data_dict:
    for i in features_list:
        if data_dict[employee][i] == 'NaN' or data_dict[employee][i] == 'nan':
            data_dict[employee][i] = 0.0


my_dataset = data_dict

In [6]:
### Features reduced via SelectKBest printed including scoring per feature
### For reasons I have not been able to identify the new Feature was needed to be run twice (therefore duplicated here)
### In order for SelectKBest to run successfully

for employee in data_dict:
    if data_dict[employee]['salary'] > 0 and data_dict[employee]['bonus'] > 0:
        data_dict[employee]['bonus_salary_multiple'] = round(float(data_dict[employee]['bonus'])
                                                             /float(data_dict[employee]['salary']),2)
    else:
        data_dict[employee]['bonus_salary_multiple'] = float(0.0)


for employee in data_dict:
    for i in features_list:
        if data_dict[employee][i] == 'NaN' or data_dict[employee][i] == 'nan':
            data_dict[employee][i] = 0.0


my_dataset = data_dict


data = featureFormat(data_dict, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

kbest = SelectKBest()
kbest = SelectKBest(k='all')
selected_features = kbest.fit_transform(features,labels)


features_selected=[features_list[i+1] for i in kbest.get_support(indices=True)]
print 'Features selected by SelectKBest:'
### print features_selected

### print kbest.scores_

kbest_lst = []
for i in kbest.scores_:
    kbest_lst.append(i)

kbest_dict = dict(zip(features_selected, kbest_lst))

kbest_lst = sorted(kbest_lst, reverse=True)

import operator
sorted_kbest_dict = sorted(kbest_dict.items(), key=operator.itemgetter(1), reverse=True)


matplotlib.pyplot.plot(kbest_lst)
matplotlib.pyplot.xlabel("Feature")
matplotlib.pyplot.ylabel("kScore")
matplotlib.pyplot.show()


print sorted_kbest_dict

Features selected by SelectKBest:
[('exercised_stock_options', 25.097541528735491), ('total_stock_value', 24.467654047526398), ('bonus', 21.060001707536571), ('salary', 18.575703268041785), ('deferred_income', 11.595547659730601), ('long_term_incentive', 10.072454529369441), ('restricted_stock', 9.3467007910514877), ('total_payments', 8.8667215371077717), ('shared_receipt_with_poi', 8.7464855321290802), ('loan_advances', 7.2427303965360181), ('expenses', 6.2342011405067401), ('from_poi_to_this_person', 5.3449415231473374), ('other', 4.204970858301416), ('from_this_person_to_poi', 2.4265081272428781), ('director_fees', 2.1076559432760908), ('to_messages', 1.6988243485808501), ('deferral_payments', 0.2170589303395084), ('from_messages', 0.16416449823428736), ('restricted_stock_deferred', 0.06498431172371151)]


In [7]:
### From analysis of kBest the 4 highest scoring features are chosen going forward
### Features list updated with the SelectKBest and 'poi' added as first element

kbest = SelectKBest(k=6)
selected_features = kbest.fit_transform(features,labels)


features_selected=[features_list[i+1] for i in kbest.get_support(indices=True)]
print 'Features selected by SelectKBest:'
print features_selected



poi = 'poi'
features_list = [poi] + features_selected
print features_list

Features selected by SelectKBest:
['salary', 'bonus', 'deferred_income', 'total_stock_value', 'exercised_stock_options', 'long_term_incentive']
['poi', 'salary', 'bonus', 'deferred_income', 'total_stock_value', 'exercised_stock_options', 'long_term_incentive']


In [8]:
### Extract features and labels from dataset for local testing
### data = featureFormat(my_dataset, features_list, sort_keys = True)
### labels, features = targetFeatureSplit(data)
### features scaled to between 0-1 via the MinMaxScaler


data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)


from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)


### For the initial test of chosen ML Algorithms features/labels split into train/test sets

labels_train, labels_test, features_train, features_test = train_test_split(labels, features, test_size = 0.3, random_state = 42)


In [9]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# GaussianNB classifier
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
gaus_sc = clf.score(features_test, labels_test)

print 'GaussianNB Validation'
print "Score: ", gaus_sc
pre_gaus = precision_score(labels_test, pred, labels=None, pos_label=1, average='binary', sample_weight=None)
print "Presicion: ",pre_gaus
rec_gaus = recall_score(labels_test, pred, labels=None, pos_label=1, average='binary', sample_weight=None)
print "Recall: ",rec_gaus, '\n'


# Support Vector Machine classifier
from sklearn.svm import SVC
clf = SVC()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
svm_sc = clf.score(features_test, labels_test)

print 'SVM Validation'
print "Score: ", svm_sc
pre_svm = precision_score(labels_test, pred, labels=None, pos_label=1, average='binary', sample_weight=None)
print "Presicion: ",pre_svm
rec_svm = recall_score(labels_test, pred, labels=None, pos_label=1, average='binary', sample_weight=None)
print "Recall: ",rec_svm, '\n'


# RandomForest classifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state = 0)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
rf_sc = clf.score(features_test, labels_test)

print 'RandomForrest Validation'
print "Score: ", rf_sc
pre_rf = precision_score(labels_test, pred, labels=None, pos_label=1, average='binary', sample_weight=None)
print "Presicion: ",pre_rf
rec_rf = recall_score(labels_test, pred, labels=None, pos_label=1, average='binary', sample_weight=None)
print "Recall: ",rec_rf, '\n'


GaussianNB Validation
Score:  0.928571428571
Presicion:  0.5
Recall:  0.666666666667 

SVM Validation
Score:  0.928571428571
Presicion:  0.0
Recall:  0.0 

RandomForrest Validation
Score:  0.904761904762
Presicion:  0.428571428571
Recall:  1.0 



  'precision', 'predicted', average, warn_for)


In [10]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Further libraries loaded to do Pipeline, FeatureUnion, Gridsearch, PCA and StratifiedShufflesplit

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.decomposition import PCA

# Validation of the "Tuned" Classifiers done via a StratifiedShuffleSplit  
ss_split = StratifiedShuffleSplit(labels, 100, test_size = 0.5, random_state = 42)

for train_idx, test_idx in ss_split: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )    




In [11]:
### RandomForest with Parameter Tuning

clf = RandomForestClassifier(criterion = 'gini', min_samples_split = 2, max_leaf_nodes = None, max_depth = 5, 
                             min_samples_leaf = 2, random_state = 0, n_estimators = 200)

parameters = {'max_features': [1,2,3,4]}

grid_search_rf = GridSearchCV(clf, parameters, cv = ss_split)
grid_search_rf.fit(features, labels)

clf = grid_search_rf.best_estimator_

clf.fit(features, labels)

print 'RandomForest Validation with Parameter Tuning'
print ('Best Score: {}'.format(grid_search_rf.best_score_))
print('Best parameters: {}'.format(grid_search_rf.best_params_))
print clf

pred_rf= clf.predict(features_test)

pre_rf = precision_score(labels_test, pred_rf, labels=None, pos_label=1, average='binary', sample_weight=None)
print "Precision: ",pre_rf
rec_rf = recall_score(labels_test, pred_rf, labels=None, pos_label=1, average='binary', sample_weight=None)
print "Recall: ",rec_rf


RandomForest Validation with Parameter Tuning
Best Score: 0.877428571429
Best parameters: {'max_features': 1}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=1, max_leaf_nodes=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
Precision:  0.8
Recall:  0.444444444444


In [12]:
### Support Vector Machine with Parameter Tuning


from sklearn.decomposition import PCA


pipeline = Pipeline([('scaling',scaler),('reduce_dim', PCA()), ("SupportVectorClassifier", SVC(random_state=42))])


param_grid = {
          'SupportVectorClassifier__C': [1, 10, 100, 1000], 
          'SupportVectorClassifier__gamma': [0.001, 0.0001], 
          'SupportVectorClassifier__kernel': ['rbf', 'linear'],
    'reduce_dim__n_components':[1, 2, 3, 4, 5, 6]
}


# pa = dict(reduce_dim__n_components=[1, 2, 3, 4, 5, 6])


gs = GridSearchCV(pipeline, param_grid = param_grid, scoring = 'f1', cv = ss_split)
gs.fit(features, labels)

clf = gs.best_estimator_

clf.fit(features, labels)
pred = clf.predict(features_test)

accuracy = clf.score(features_test, labels_test)
print "accuracy SVC",accuracy

pre = precision_score(labels_test, pred, labels=None, pos_label=1, average='binary', sample_weight=None)
print "pre",pre
rec = recall_score(labels_test, pred, labels=None, pos_label=1, average='binary', sample_weight=None)
print "rec",rec

print clf


accuracy SVC 0.9
pre 1.0
rec 0.222222222222
Pipeline(steps=[('scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reduce_dim', PCA(copy=True, n_components=6, whiten=False)), ('SupportVectorClassifier', SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.001, kernel='linear', max_iter=-1, probability=False,
  random_state=42, shrinking=True, tol=0.001, verbose=False))])


  'precision', 'predicted', average, warn_for)


In [13]:
### GaussianNB with Parameter Tuning

clf = GaussianNB()

from sklearn.decomposition import PCA

pipeline = Pipeline([('scaling',scaler),('reduce_dim', PCA()),("NaiveBayes", GaussianNB())])

pa = dict(reduce_dim__n_components=[1, 2, 3, 4, 5, 6])

gs = GridSearchCV(pipeline, param_grid = pa, scoring = 'f1', cv = ss_split)
gs.fit(features, labels)

clf = gs.best_estimator_

clf.fit(features, labels)
pred = clf.predict(features_test)

accuracy = clf.score(features_test, labels_test)
print "accuracy GaussianNB",accuracy

pre = precision_score(labels_test, pred, labels=None, pos_label=1, average='binary', sample_weight=None)
print "pre",pre
rec = recall_score(labels_test, pred, labels=None, pos_label=1, average='binary', sample_weight=None)
print "rec",rec

print clf


accuracy GaussianNB 0.871428571429
pre 0.5
rec 0.444444444444
Pipeline(steps=[('scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reduce_dim', PCA(copy=True, n_components=3, whiten=False)), ('NaiveBayes', GaussianNB())])


In [14]:
    
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)