In [1]:
# %load poi_id.py
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

import pandas as pd

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, main

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# director_fees and loan_advances, stocks
# NOTE(review): the line above presumably names features left out on purpose -- confirm.
features_list = [
    # label column -- has to be the first entry
    'poi',
    # payment-related columns, plus the derived salary_bonus_ratio
    'salary', 'bonus', 'long_term_incentive', 'deferred_income',
    'deferral_payments', 'other', 'expenses', 'salary_bonus_ratio',
    'total_payments',
    # stock-related columns
    'exercised_stock_options', 'restricted_stock',
    'restricted_stock_deferred', 'total_stock_value',
    # message-count columns
    'to_messages', 'from_poi_to_this_person', 'from_messages',
    'from_this_person_to_poi', 'shared_receipt_with_poi',
]

### Load the dictionary containing the dataset
# NOTE(review): pickle.load can execute arbitrary code, so only point this at
# a trusted copy of the project dataset. The file mode is left as "r" on
# purpose; TODO confirm whether binary mode is safe for the file on disk.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
# Three records are dropped by key; pop()'s default means a missing key is
# silently ignored instead of raising KeyError.
for dropped_key in ('TOTAL', "THE TRAVEL AGENCY IN THE PARK", "LOCKHART EUGENE E"):
    data_dict.pop(dropped_key, 0)

# Switch to a DataFrame (one row per top-level key of data_dict) for the
# column-wise steps that follow.
dataDF = pd.DataFrame.from_dict(data_dict, orient='index', dtype=float)

### Task 3: Create new feature(s)
# Ratio of bonus to salary, computed per row.
dataDF['salary_bonus_ratio'] = dataDF['bonus'] / dataDF['salary']

# Zero out the 'NaN' / 'inf' markers.
# NOTE(review): whether these string markers also match float nan/inf cells
# depends on the installed pandas version -- confirm on upgrade.
dataDF = dataDF.replace(['NaN', 'inf'], 0)

# Second outlier filter: keep only rows whose restricted_stock_deferred
# is zero or negative.
keep_row = dataDF['restricted_stock_deferred'] <= 0
dataDF = dataDF[keep_row]

### Store to my_dataset for easy export below.
data_dict = dataDF.to_dict(orient='index')
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
#features_train, features_test, labels_train, labels_test = \
#    train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.



In [2]:
# Experiment: SelectKBest -> PCA -> Gaussian Naive Bayes, tuned by grid search.
skb = SelectKBest(f_classif)
pca = PCA()
nb = GaussianNB()
pipeline = Pipeline(steps=[
    ("SKB", skb),
    ("PCA", pca),
    ("NB", nb),
])

# Search space: how many features SelectKBest keeps (2..17) and how many
# principal components PCA retains (1..2).
params = {
    'SKB__k': range(2, 18),
    'PCA__n_components': range(1, 3),
}

# Stratified shuffle splits over the labels, 30% held out, fixed seed so the
# search is repeatable.
split = StratifiedShuffleSplit(labels, test_size=0.3, random_state=42)

# Candidates are ranked by F1 score.
gs = GridSearchCV(pipeline, params, cv=split, scoring='f1')
gs.fit(features, labels)

clf = gs.best_estimator_
# Single-argument print with parentheses prints the same text as the
# Python 2 print statement.
print(clf)

# Hand the winning pipeline to the helpers imported from tester.
# NOTE(review): presumably these persist clf/dataset/features and then
# re-evaluate them -- see tester.py to confirm.
dump_classifier_and_data(clf, my_dataset, features_list)
main()

Pipeline(steps=[('SKB', SelectKBest(k=8, score_func=<function f_classif at 0x000000000AF6A908>)), ('PCA', PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('NB', GaussianNB(priors=None))])
Pipeline(steps=[('SKB', SelectKBest(k=8, score_func=<function f_classif at 0x000000000AF6A908>)), ('PCA', PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('NB', GaussianNB(priors=None))])
	Accuracy: 0.87840	Precision: 0.60654	Recall: 0.25050	F1: 0.35456	F2: 0.28382
	Total predictions: 15000	True positives:  501	False positives:  325	False negatives: 1499	True negatives: 12675



In [3]:
# Experiment: SelectKBest -> PCA -> Decision Tree, tuned by grid search.
# A MinMaxScaler -> PCA -> Tree variant (no SelectKBest) was tried as well
# and is left disabled here.
skb = SelectKBest(f_classif)
pca = PCA()
dtc = DecisionTreeClassifier(random_state=100)  # fixed seed for repeatable trees
pipeline = Pipeline(steps=[
    ("SKB", skb),
    ("PCA", pca),
    ("Tree", dtc),
])

# Search space: features kept (3..9), principal components (1..3),
# minimum samples needed to split a node (5..19) and the split criterion.
params = {
    'SKB__k': range(3, 10),
    'PCA__n_components': range(1, 4),
    'Tree__min_samples_split': range(5, 20),
    'Tree__criterion': ['gini', 'entropy'],
}

# Stratified shuffle splits over the labels, 30% held out, fixed seed so the
# search is repeatable.
split = StratifiedShuffleSplit(labels, test_size=0.3, random_state=42)

# Candidates are ranked by F1 score.
gs = GridSearchCV(pipeline, params, cv=split, scoring='f1')
gs.fit(features, labels)

clf = gs.best_estimator_
# Single-argument print with parentheses prints the same text as the
# Python 2 print statement.
print(clf)

# Hand the winning pipeline to the helpers imported from tester.
# NOTE(review): presumably these persist clf/dataset/features and then
# re-evaluate them -- see tester.py to confirm.
dump_classifier_and_data(clf, my_dataset, features_list)
main()

  'precision', 'predicted', average, warn_for)


Pipeline(steps=[('SKB', SelectKBest(k=5, score_func=<function f_classif at 0x000000000AF6A908>)), ('PCA', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('Tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,...it=12, min_weight_fraction_leaf=0.0,
            presort=False, random_state=100, splitter='best'))])
Pipeline(steps=[('SKB', SelectKBest(k=5, score_func=<function f_classif at 0x000000000AF6A908>)), ('PCA', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('Tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,...it=12, min_weight_fraction_leaf=0.0,
            presort=False, random_state=100, splitter='best'))])
	Accuracy: 0.84980	Precision: 0.42572	Recall: 0.36250	F1: 0.39157	F2: 0.37360
	Total predictions: 15000	True positives:  725	False positives:  978	False negatives: 1275	Tru

In [4]:
# Report which features the SelectKBest step of the most recently fitted
# grid search (`gs`) kept, with their F-scores and p-values.
# Despite its name, X_new holds the fitted selector, not transformed data.
X_new = gs.best_estimator_.named_steps['SKB']

# Display strings, indexed like the feature columns (which exclude 'poi').
feature_scores = ['%.2f' % elem for elem in X_new.scores_]
feature_scores_pvalues = ['%.3f' % elem for elem in X_new.pvalues_]

# Rank the kept columns on the raw scores. Sorting on the 2-decimal display
# strings would tie features whose scores differ only past the second
# decimal and leave them in column order instead of score order.
selected_indices = sorted(X_new.get_support(indices=True),
                          key=lambda i: X_new.scores_[i],
                          reverse=True)

# features_list[0] is 'poi', so feature column i is named features_list[i + 1].
features_selected_tuple = [
    (features_list[i + 1], feature_scores[i], feature_scores_pvalues[i])
    for i in selected_indices
]

# Single-argument print with parentheses prints the same text as the
# Python 2 print statement.
print(' ')
print('Selected Features, Scores, P-Values')
print(features_selected_tuple)

 
Selected Features, Scores, P-Values
[('exercised_stock_options', '24.43', '0.000'), ('total_stock_value', '23.61', '0.000'), ('bonus', '20.26', '0.000'), ('salary', '17.72', '0.000'), ('deferred_income', '11.18', '0.001')]
