In [1]:
#!/usr/bin/python

%matplotlib notebook
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import my_classifier_utils
import my_data_utils

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary','bonus', 'total_payments','exercised_stock_options','shared_receipt_with_poi','expenses',
                'email_to_poi_ratio', 'email_from_poi_ratio', 'exer_stock_ratio'] 
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)



In [2]:
### Task 2: Remove outliers
import my_data_utils

In [3]:
my_data_utils.maxPrinter(data_dict, 'salary')
print 'remove max'
my_data_utils.removeMax(data_dict, 'salary')
my_data_utils.maxPrinter(data_dict, 'salary')

Max salary TOTAL 26704229
remove max
Max salary SKILLING JEFFREY K 1111258


In [4]:
### Task 3: Create new feature(s)
#email_to_poi_ratio
#email_from_poi_ratio
for key in data_dict.keys():
    email_to = data_dict[key]['to_messages']
    to_poi = data_dict[key]['from_this_person_to_poi']
    email_from = data_dict[key]['from_messages']
    from_poi = data_dict[key]['from_poi_to_this_person']
    exer_stock_opt = data_dict[key]['exercised_stock_options']
    total_stock = data_dict[key]['total_stock_value']
    
    if 'NaN' not in (email_to, to_poi):
        data_dict[key]['email_to_poi_ratio'] = float(to_poi)/float(email_to)
    else:
        data_dict[key]['email_to_poi_ratio'] = 'NaN'
    if 'NaN' not in (email_from, from_poi):
        data_dict[key]['email_from_poi_ratio'] = float(from_poi)/float(email_from)
    else:
        data_dict[key]['email_from_poi_ratio'] = 'NaN'
    if 'NaN' not in (exer_stock_opt, total_stock):
        data_dict[key]['exer_stock_ratio'] = float(exer_stock_opt)/float(total_stock)
    else:
        data_dict[key]['exer_stock_ratio'] = 'NaN'

In [5]:

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html


In [6]:
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
print "Train:"
my_classifier_utils.count_true(labels_train)
my_classifier_utils.count_false(labels_train)
print "Test:"
my_classifier_utils.count_true(labels_test)
my_classifier_utils.count_false(labels_test)


Train:
True Count:14
False Count:85
Test:
True Count:4
False Count:39


39

In [7]:
from sklearn import tree
dtClf = tree.DecisionTreeClassifier()
dt_pred = my_classifier_utils.trainAndTestClassifier (dtClf, features_train, features_test, labels_train, labels_test)
print dt_pred

99 Training Points
training time: 0.0 s
predict time: 0.0 s
accuracy: 1.0
Train Precision: 1.0
Train Recall: 1.0

43 Test Points
predict time: 0.0 s
accuracy: 0.767441860465
10 Wrong ones: [(4, 0.0), (8, 0.0), (11, 1.0), (19, 0.0), (21, 0.0), (25, 0.0), (27, 0.0), (35, 1.0), (36, 1.0), (42, 0.0)]
1 True Positives: [22]
32 True Negatives: [0, 1, 2, 3, 5, 6, 7, 9, 10, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 26, 28, 29, 30, 31, 32, 33, 34, 37, 38, 39, 40, 41]
7 False Positives: [4, 8, 19, 21, 25, 27, 42]
3 False Negatives: [11, 35, 36]
Precision: 0.125
Recall: 0.25

[ 0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  1.  1.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  1.]


In [8]:
print 'DT Importance:'
for i in range (0,len(features_list)-1):
    print features_list[i+1], ":", dtClf.feature_importances_[i]

DT Importance:
salary : 0.0266068589598
bonus : 0.281368547419
total_payments : 0.0
exercised_stock_options : 0.17774771447
shared_receipt_with_poi : 0.0
expenses : 0.352600424785
email_to_poi_ratio : 0.0245335972227
email_from_poi_ratio : 0.0816806722689
exer_stock_ratio : 0.0554621848739


In [9]:
from sklearn.ensemble import AdaBoostClassifier
abcDT = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME")
abcDT_pred = my_classifier_utils.trainAndTestClassifier (abcDT, features_train, features_test, labels_train, labels_test)
print abcDT_pred

99 Training Points
training time: 0.09 s
predict time: 0.0 s
accuracy: 1.0
Train Precision: 1.0
Train Recall: 1.0

43 Test Points
predict time: 0.01 s
accuracy: 0.953488372093
2 Wrong ones: [(35, 1.0), (36, 1.0)]
2 True Positives: [11, 22]
39 True Negatives: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 37, 38, 39, 40, 41, 42]
0 False Positives: []
2 False Negatives: [35, 36]
Precision: 1.0
Recall: 0.5

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.]


In [10]:
print 'AB Importance:'
for i in range (0,len(features_list)-1):
    print features_list[i+1], ":", abcDT.feature_importances_[i]

AB Importance:
salary : 0.140475207904
bonus : 0.114773504593
total_payments : 0.107482507215
exercised_stock_options : 0.164734472594
shared_receipt_with_poi : 0.028200531679
expenses : 0.247680005047
email_to_poi_ratio : 0.153334408414
email_from_poi_ratio : 0.0227344350073
exer_stock_ratio : 0.0205849275454


In [11]:
parameters = {'max_depth':[2,3,5,8,10,15], 'min_samples_split':[2,3,5], 'criterion' : ['gini','entropy']}
tempDTClf = tree.DecisionTreeClassifier()
from sklearn.model_selection import GridSearchCV
gridClf = GridSearchCV(tempDTClf, parameters, scoring='f1')
grid_pred = my_classifier_utils.trainAndTestClassifier (gridClf, features_train, features_test, labels_train, labels_test)
print grid_pred
print gridClf.best_params_

99 Training Points


  'precision', 'predicted', average, warn_for)


training time: 0.25 s
predict time: 0.0 s
accuracy: 1.0
Train Precision: 1.0
Train Recall: 1.0

43 Test Points
predict time: 0.0 s
accuracy: 0.790697674419
9 Wrong ones: [(4, 0.0), (11, 1.0), (19, 0.0), (21, 0.0), (25, 0.0), (27, 0.0), (35, 1.0), (36, 1.0), (42, 0.0)]
1 True Positives: [22]
33 True Negatives: [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 26, 28, 29, 30, 31, 32, 33, 34, 37, 38, 39, 40, 41]
6 False Positives: [4, 19, 21, 25, 27, 42]
3 False Negatives: [11, 35, 36]
Precision: 0.142857142857
Recall: 0.25

[ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  1.  1.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  1.]
{'min_samples_split': 3, 'criterion': 'gini', 'max_depth': 10}


In [12]:
print 'Grid DT Importance:'
for i in range (0,len(features_list)-1):
    print features_list[i+1], ":", gridClf.best_estimator_.feature_importances_[i]

Grid DT Importance:
salary : 0.0266068589598
bonus : 0.225906362545
total_payments : 0.0
exercised_stock_options : 0.159596453966
shared_receipt_with_poi : 0.0554621848739
expenses : 0.408062609659
email_to_poi_ratio : 0.0245335972227
email_from_poi_ratio : 0.0998319327731
exer_stock_ratio : 0.0


In [13]:
my_classifier_utils.my_k_fold_test(gridClf, features, labels)

K  0 ABC
94 Training Points
training time: 0.25 s
predict time: 0.0 s
accuracy: 0.978723404255
Train Precision: 0.909090909091
Train Recall: 0.909090909091

48 Test Points
predict time: 0.0 s
accuracy: 0.770833333333
11 Wrong ones: [(7, 1.0), (15, 1.0), (20, 1.0), (22, 1.0), (24, 1.0), (26, 0.0), (29, 1.0), (30, 0.0), (33, 0.0), (41, 1.0), (45, 0.0)]
0 True Positives: []
37 True Negatives: [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 21, 23, 25, 27, 28, 31, 32, 34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 46, 47]
4 False Positives: [26, 30, 33, 45]
7 False Negatives: [7, 15, 20, 22, 24, 29, 41]
Precision: 0.0
Recall: 0.0

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
K  1 ABC
95 Training Points
training time: 0.24 s
predict time: 0.0 s
accuracy: 0.989473684211
Train Precision: 1.0
Train Recall: 0.916666666667

47 Test Points

In [14]:
dump_classifier_and_data(gridClf, my_dataset, features_list)