In [1]:
#!/usr/bin/python

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
%matplotlib notebook
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import my_data_utils

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# NOTE(review): this two-feature list is not referenced again in the visible
# cells; the grid search further down is called with its own feature list.
features_list = ['poi','salary'] 
### Load the dictionary containing the dataset
# The cells below index it as data_dict[key][feature_name] and treat the
# string 'NaN' as the marker for a missing value.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load a dataset file that comes from a trusted source.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)



In [3]:
def maxPrinter(data_dict, sub_field):
    sub_dict = {key:data_dict[key][sub_field] for key in data_dict if data_dict[key][sub_field] != 'NaN'}
    max_key = max(sub_dict, key=sub_dict.get)
    print "Max", sub_field, max_key, sub_dict[ max_key ]

def removeMax(data_dict, sub_field):
    """Delete, in place, the record with the largest `sub_field` value.

    Records whose `sub_field` equals the string 'NaN' are never candidates.
    Raises ValueError (from max) when no record has a value for the field.
    """
    known_values = {}
    for name in data_dict:
        value = data_dict[name][sub_field]
        if value != 'NaN':
            known_values[name] = value
    # max over a dict yields a key, so this is the record name, not the value.
    top_key = max(known_values, key=known_values.get)
    data_dict.pop(top_key, 0)

maxPrinter(data_dict, 'salary')
print 'remove max'
removeMax(data_dict, 'salary')
maxPrinter(data_dict, 'salary')

Max salary TOTAL 26704229
remove max
Max salary SKILLING JEFFREY K 1111258


In [4]:
### Task 3: Create new feature(s)
# email_to_poi_ratio   : from_this_person_to_poi / from_messages
# email_from_poi_ratio : from_poi_to_this_person / to_messages
# exer_stock_ratio     : exercised_stock_options / total_stock_value
#
# NOTE(review): the denominators assume the usual meaning of the e-mail
# counters -- 'from_messages' = messages this person sent, 'to_messages' =
# messages this person received. The earlier version divided the "sent to POI"
# count by the received total (and vice versa), which mixes unrelated
# quantities and can exceed 1. Confirm against the dataset description.
def _safe_ratio(numerator, denominator):
    """Return numerator/denominator as a float, or 'NaN' if it is undefined.

    The ratio is undefined when either input is the missing-value marker
    'NaN' or when the denominator is zero (which used to raise
    ZeroDivisionError and abort the whole loop).
    """
    if 'NaN' in (numerator, denominator):
        return 'NaN'
    denominator = float(denominator)
    if denominator == 0.0:
        return 'NaN'
    return float(numerator) / denominator

for key in data_dict:
    record = data_dict[key]
    record['email_to_poi_ratio'] = _safe_ratio(
        record['from_this_person_to_poi'], record['from_messages'])
    record['email_from_poi_ratio'] = _safe_ratio(
        record['from_poi_to_this_person'], record['to_messages'])
    record['exer_stock_ratio'] = _safe_ratio(
        record['exercised_stock_options'], record['total_stock_value'])

In [5]:
def my_k_fold_test_short (classifier, features, labels, kval=10):
    from sklearn.model_selection import KFold
    k_fold = KFold(kval)

    #print "Iteration: precision, recall, f1"
    precision = []
    recall = []
    f1 = []

    for k, (train, test) in enumerate(k_fold.split(features, labels)):
        scores = scoreClassifier (classifier,
                                     [features[ii] for ii in train], [features[ii] for ii in test],
                                     [labels[ii] for ii in train], [labels[ii] for ii in test]) 
        #print "#", k,":", scores
        precision.append(scores[0])
        recall.append(scores[1])
        f1.append(scores[2])
    avPrecision = sum(precision)/kval
    avRecall = sum(recall)/kval
    avF1 = sum(f1)/kval
    print 'Precision', avPrecision, ', Recall', avRecall, ', F1', avF1

def scoreClassifier(classifier, features_train, features_test, labels_train, labels_test):
    """Fit on the training data and score predictions on the test data.

    The classifier is fitted in place. Returns a (precision, recall, f1)
    tuple computed by sklearn.metrics from the test labels and predictions.
    """
    from sklearn.metrics import precision_score, recall_score, f1_score
    classifier.fit(features_train, labels_train)
    predicted = classifier.predict(features_test)
    return (precision_score(labels_test, predicted),
            recall_score(labels_test, predicted),
            f1_score(labels_test, predicted))

In [9]:
warnings.filterwarnings("ignore", category=DeprecationWarning) 
def get_best_decision_tree(data_dict, features_list):
    print "+++++++++++++++++++++++++++++++++++++"
    print "Returning best Grid Search Decision Tree Classifier"
    print 'Features:', features_list
    my_dataset = data_dict
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)    
    ### base test/train split for simple 
    from sklearn.cross_validation import train_test_split
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42, stratify=labels)

    # Set up cross validator (will be used for tuning all classifiers)
    from sklearn import cross_validation
    cv = cross_validation.StratifiedShuffleSplit(labels_train, test_size=.1, random_state = 42)

    ### Test Decision Trees 
    from sklearn import tree
    parameters = {'max_depth':[2,3,5,8,10,15], 'min_samples_split':[2,3,5], 'criterion' : ['gini','entropy'],
                  'class_weight': [{True: 12, False: 1}, {True: 4, False: 1}, 'auto', None]}
    tempDTClf = tree.DecisionTreeClassifier()
    from sklearn.model_selection import GridSearchCV
    gridClf = GridSearchCV(tempDTClf, parameters, scoring='f1', cv=cv)
    best_dt = gridClf.fit(features_train, labels_train).best_estimator_
    my_k_fold_test_short(best_dt, features, labels)
    return best_dt

# Features handed to the decision-tree grid search; 'poi' comes first.
features_list_importance = [
    'poi',
    'other',
    'expenses',
    'total_stock_value',
    'exercised_stock_options',
    'long_term_incentive',
    'from_this_person_to_poi',
    'from_messages',
    'restricted_stock',
]
best_dt = get_best_decision_tree(data_dict, features_list_importance)

+++++++++++++++++++++++++++++++++++++
Returning best Grid Search Decision Tree Classifier
Features: ['poi', 'other', 'expenses', 'total_stock_value', 'exercised_stock_options', 'long_term_incentive', 'from_this_person_to_poi', 'from_messages', 'restricted_stock']
Precision 0.341666666667 , Recall 0.825 , F1 0.444047619048


In [10]:
# Hand the tuned classifier, the dataset (by now without the removed outlier
# and with the ratio features added above) and the feature list to the helper
# imported from `tester`. Presumably this pickles them for the grading
# script -- confirm against tester.py.
dump_classifier_and_data(best_dt, data_dict, features_list_importance)

In [None]:
#!/usr/bin/python

""" a basic script for importing student's POI identifier,
    and checking the results that they get from it 
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py
"""

import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# Template for the tab-separated metrics line printed by test_classifier:
# accuracy, precision, recall, F1 and F2, in that positional order. The caller
# passes `display_precision` as a keyword to set the number of decimals.
PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
# Template for the raw confusion-matrix counts, each formatted as a
# width-4 integer: total, true positives, false positives, false negatives,
# true negatives.
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

def test_classifier(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predicitons."

# File names shared by the dump and load helpers below; they are relative
# paths, so the files live in the current working directory.
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"

def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, dataset and feature list to their fixed files.

    Files are opened in binary mode: pickle output is a byte stream, and text
    mode either corrupts it through newline translation or is rejected
    outright, depending on platform and Python version.
    """
    with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)

def load_classifier_and_data():
    """Load what dump_classifier_and_data wrote.

    Returns:
        A (clf, dataset, feature_list) tuple.

    Files are read in binary mode to match how they are written.
    NOTE: pickle.load can execute arbitrary code while deserializing; only
    call this on files produced by a trusted run of the dump helper.
    """
    with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list

def main():
    """Load the pickled classifier, dataset and feature list, then evaluate them."""
    # Read back the three artifacts written by the dump helper ...
    classifier, data, features = load_classifier_and_data()
    # ... and run the evaluation with its default number of folds.
    test_classifier(classifier, data, features)

if __name__ == "__main__":
    main()
