In [4]:
import sys
import pickle
import pandas
import numpy as np
sys.path.append("../tools/")

from sklearn.preprocessing import MinMaxScaler as MMS
from sklearn import feature_selection
from sklearn.feature_selection import chi2, f_classif, SelectKBest
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, StratifiedShuffleSplit
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from pprint import pprint
import timeit
import numpy as np





from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
    
       
#convert from dict to df
df = pandas.DataFrame.from_dict(list(data_dict.values()),dtype="float64")
employees = pandas.Series(list(data_dict.keys()))
df.set_index(employees, inplace=True)        
df2 = df.sort_values(by='salary', ascending=0)       


#remove outliers
df2 = df2.drop({"LOCKHART EUGENE E"})
df2 = df2.drop(["TOTAL"])
df2 = df2.drop(["THE TRAVEL AGENCY IN THE PARK"])        
        
        
#create new features       
df2["Fromfract"] = df2.from_poi_to_this_person.divide(df2.from_messages, axis="index").fillna(0)
df2["Tofract"] = df2.from_this_person_to_poi.divide(df2.to_messages, axis="index").fillna(0)
df2["SaltoPay"] = df2.salary.divide(df2.total_payments, axis="index").fillna(0)
df2["ESVtoTSV"] = df2.exercised_stock_options.divide(df2.total_stock_value, axis="index").fillna(0)
df2["RStoTSV"] = df2.restricted_stock.divide(df2.total_stock_value, axis="index").fillna(0)


#conver float nan to string NaN, otherwise the featureformat function won't deal with it.
df2 = df2.replace(np.nan, "NaN", regex=True)

# create a dictionary from the dataframe
data_dict = df2.to_dict('index')

my_dataset = data_dict

#this is my new featurelist including the 5 new features I made
features_list = ['poi','salary', 'bonus', 'expenses', "total_payments", "exercised_stock_options",
"restricted_stock", "long_term_incentive", "deferral_payments", "deferred_income", "director_fees",
"other", "shared_receipt_with_poi", "total_payments", "total_stock_value", "Fromfract", "Tofract",
"SaltoPay", "ESVtoTSV", "RStoTSV"]

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)



###
#create default featurelist, datadict for comparison
default_features_list = ['poi','salary', 'bonus', 'expenses', "total_payments", "exercised_stock_options",
"restricted_stock", "long_term_incentive", "deferral_payments", "deferred_income", "director_fees",
"other", "shared_receipt_with_poi", "total_payments", "total_stock_value"]
#passing in the above list means that the new features are ignored
default_data =featureFormat(my_dataset, default_features_list, sort_keys = True)
#default features, default labels
dlabels, dfeatures = targetFeatureSplit(default_data)
dX_train, dX_test, dy_train, dy_test = train_test_split(dfeatures, dlabels, test_size=0.3, random_state=42)

#Investigate most important features using SSS to ensure robust importance results.
#Compare the results for both the default feature list and the feature list I created with my new ratios.
#repurposed some of the code from tester.py 
from sklearn.cross_validation import StratifiedShuffleSplit

#Cross validate and access Feature Importances on decisiontreeclassifier with 100 folds
clf = DecisionTreeClassifier(random_state=42)

cv = StratifiedShuffleSplit(labels, 100, random_state = 42)

FeatureValue = np.zeros(len(features_list)-1)
featrscore = np.zeros(len(features_list)-1)
featpscore = np.zeros(len(features_list)-1)

for train_idx, test_idx in cv: 
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    for ii in train_idx:
        features_train.append( features[ii] )
        labels_train.append( labels[ii] )
    for jj in test_idx:
        features_test.append( features[jj] )
        labels_test.append( labels[jj] )

    fit1 = clf.fit(features_train, labels_train)
        #add values to list
    FeatureValue += fit1.feature_importances_  
    pred = clf.predict(features_test)
    featrscore += recall_score(pred, labels_test)  
    featpscore += precision_score(pred, labels_test)
        
clf2 = DecisionTreeClassifier(random_state=42)       


FeatureValueDefault = np.zeros(len(default_features_list)-1)
featrscored = np.zeros(len(features_list)-1)
featpscored = np.zeros(len(features_list)-1)


for train_idx, test_idx in cv: 
    dfeatures_train = []
    dfeatures_test  = []
    dlabels_train   = []
    dlabels_test    = []
    for ii in train_idx:
        dfeatures_train.append( dfeatures[ii] )
        dlabels_train.append( dlabels[ii] )
    for jj in test_idx:
        dfeatures_test.append( dfeatures[jj] )
        dlabels_test.append( dlabels[jj] )

    fit2 = clf2.fit(dfeatures_train, dlabels_train)
 
    FeatureValueDefault += fit2.feature_importances_
    pred2 = clf2.predict(dfeatures_test)
    featrscored += recall_score(pred2, dlabels_test)  
    featpscored += precision_score(pred2, dlabels_test)
    
    
    
#average over the 100 folds
FeatureValue = FeatureValue/100
featrscore = featrscore/100
featpscore = featpscore/100

FeatureValueDefault = FeatureValueDefault/100
featrscored = featrscored/100
featpscored = featpscored/100


print "New Featureset:"
print "{} Recall & {} Precision".format(featrscore[1], featpscore[1])


print "Old Featureset:"
print "{} Recall & {} Precision".format(featrscored[1], featpscored[1])

ranks1 = np.argsort(FeatureValue)[::-1]
print "All features for Updated Featurelist"
for n in range(len(features_list)-1):
    print "{} feature: {} ({})".format(n, features_list[n+1], FeatureValue[ranks1[n]])

    ranks2 = np.argsort(FeatureValueDefault)[::-1]
print "All features for Default Featurelist"
for n in range(len(default_features_list)-1):
    print "{} feature: {} ({})".format(n, default_features_list[n+1], FeatureValueDefault[ranks2[n]])  

       
        

New Featureset:
0.2525 Recall & 0.26 Precision
Old Featureset:
0.2075 Recall & 0.22 Precision
All features for Updated Featurelist
0 feature: salary (0.15157943172)
1 feature: bonus (0.132620172679)
2 feature: expenses (0.107681830387)
3 feature: total_payments (0.086944516033)
4 feature: exercised_stock_options (0.0850770882743)
5 feature: restricted_stock (0.0768341892247)
6 feature: long_term_incentive (0.0578261679248)
7 feature: deferral_payments (0.0528884929341)
8 feature: deferred_income (0.0491795048977)
9 feature: director_fees (0.0465893504185)
10 feature: other (0.032468398451)
11 feature: shared_receipt_with_poi (0.027776646776)
12 feature: total_payments (0.0264091310324)
13 feature: total_stock_value (0.0198253318306)
14 feature: Fromfract (0.0197769107643)
15 feature: Tofract (0.0134790405578)
16 feature: SaltoPay (0.00990093895183)
17 feature: ESVtoTSV (0.00314285714286)
18 feature: RStoTSV (0.0)
All features for Default Featurelist
0 feature: salary (0.175544188366)
1

In [None]:
#create scaler
MMSs = MMS()
#Create Transformers
SKBt = SelectKBest(k=6)
PCAt = PCA(random_state=42)
FUt = FeatureUnion([("kbest", SKBt), ("pca", PCAt)])
#Create Classifiers
GNBc = GaussianNB()
SVCc = SVC(kernel="linear", random_state=42)
DTCc = DecisionTreeClassifier(random_state=42)
RFc = RandomForestClassifier(random_state=42)

#create pipelines, one for each object. Note MMS included.
Pipe = Pipeline([("MMS", MMSs),
                ("SKB", SKBt),
                 ("clf", GNBc)])

Pipe2 = Pipeline([("MMS", MMSs),
                ("SKB", SKBt),
                 ("clf", SVCc)])

Pipe3 = Pipeline([("MMS", MMSs),
                ("SKB", SKBt),
                 ("clf", DTCc)])

Pipe4 = Pipeline([("MMS", MMSs),
                ("SKB", SKBt),
                 ("clf", RFc)])

#Test Grid goes here
prm_grid = dict(
                ) 
#####

cv_sss = StratifiedShuffleSplit(labels, 10, test_size=0.3, random_state=42)


#create a gridsearch object for each algorithim, using a different pipe
grid = GridSearchCV(Pipe, prm_grid, cv = cv_sss, scoring ="f1")
grid2 = GridSearchCV(Pipe2, prm_grid, cv = cv_sss, scoring ="f1")
grid3 = GridSearchCV(Pipe3, prm_grid, cv = cv_sss, scoring ="f1")
grid4 = GridSearchCV(Pipe4, prm_grid, cv = cv_sss, scoring ="f1")


#apply the grid to the data
start_time = timeit.default_timer()
grid.fit(features, labels)
print("with n_splits=10 done in  %r" % (timeit.default_timer() - start_time))
#print "Best Estimator:"
#print(grid.best_estimator_)
print "Best F1 Score:"
print(grid.best_score_)
print "Best parameters:"
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(prm_grid.keys()):
       print '\t%s: %r' % (param_name, best_parameters[param_name])
        
#from pprint import pprint
#pprint(grid4.cv_results_)


In [None]:
clf = grid.best_estimator_
#import test_classifier
from tester import test_classifier
print "Tester Classification Report"
test_classifier(clf, my_dataset, features_list)