In [54]:
#!/usr/bin/python

""" A basic script for importing a student's POI identifier
    and checking the results that they get from it.
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py
"""

import pickle
import sys
import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
    
# Template for one line of summary metrics. Takes five positional floats
# (accuracy, precision, recall, F1, F2) plus a `display_precision` keyword
# that sets the number of decimals shown for each of them.
PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)

# Template for the raw confusion-matrix counts. Takes five positional ints,
# each rendered right-aligned in a field of width 4.
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)

def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Fit ``clf`` on one train/test split and print its held-out metrics.

    The dataset is converted to label/feature lists with the project helpers
    ``featureFormat`` and ``targetFeatureSplit``, shuffled (seed 1), and split
    70/30 with ``train_test_split`` (seed 0). ``clf`` is then fitted on the
    training part and accuracy, recall, precision and F1 are printed for the
    test part.

    Parameters
    ----------
    clf : estimator exposing ``fit``, ``predict`` and ``score``.
        It is fitted in place by this call.
    dataset : passed straight through to ``featureFormat``.
    feature_list : passed straight through to ``featureFormat``.
    folds : int
        Not used by this implementation; kept so existing callers that pass
        it keep working.

    Returns
    -------
    None. The metrics are only printed.

    NOTE(review): the split is not stratified (no ``stratify=`` argument), so
    the class balance of the test part depends entirely on the two seeds.
    """
    from sklearn.metrics import f1_score, precision_score, recall_score

    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    features, labels = shuffle(features, labels, random_state=1)

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.3, random_state=0)
    features_train = np.array(features_train)
    features_test = np.array(features_test)

    clf.fit(features_train, labels_train)
    predictions = clf.predict(features_test)

    # Single-argument print(...) is valid on both Python 2 and Python 3; the
    # two spaces after the colon keep the layout of the old
    # `print "label: ", value` statements.
    print("accuracy:  {}".format(clf.score(features_test, labels_test)))
    print("recall_score:  {}".format(recall_score(labels_test, predictions)))
    print("precision_score:  {}".format(precision_score(labels_test, predictions)))
    print("f1_score:  {}".format(f1_score(labels_test, predictions)))
  

# Pickle files exchanged between poi_id.py and this tester. The paths are
# relative, so they resolve against the current working directory.
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"

def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, dataset and feature list to their three files.

    Each object is written to its own file (see the *_FILENAME constants),
    overwriting any existing file of that name.
    """
    # Pickle streams are bytes: binary mode is required on Python 3 and avoids
    # newline translation on Windows under Python 2.
    with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)

def load_classifier_and_data():
    """Load and return ``(clf, dataset, feature_list)`` from the pickle files.

    Raises whatever ``open`` / ``pickle.load`` raise when a file is missing
    or cannot be unpickled.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
    file. Only load pickle files that come from a trusted source.

    NOTE(review): files produced by an older text-mode dump on Windows may
    contain translated newlines and need to be regenerated.
    """
    # Binary mode mirrors dump_classifier_and_data.
    with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list

def main():
    """Load the pickled classifier, dataset and feature list, then evaluate.

    Takes no arguments and returns None; the metrics are printed by
    ``test_classifier``.
    """
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    ### Run testing script
    test_classifier(clf, dataset, feature_list)

# Runs when executed as a script; a notebook kernel also reports __name__ as
# '__main__', so executing this cell triggers the evaluation as well.
if __name__ == '__main__':
    main()


accuracy:  0.794871794872
recall_score:  0.166666666667
precision_score:  0.25
f1_score:  0.2


In [32]:
### Load the student's classifier, dataset, and feature_list from their pickle files
### (uses load_classifier_and_data(), which must already be defined in this kernel)
clf, dataset, feature_list = load_classifier_and_data()


In [34]:
# Inspect the (name, estimator) stages of the loaded classifier pipeline.
clf.steps

[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('selector',
  SelectFromModel(estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_jobs=1, oob_score=False, random_state=32,
             verbose=0, warm_start=False),
          prefit=False, threshold='mean')),
 ('AdaBoostClassifier',
  AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
            learning_rate=1.0, n_estimators=50, random_state=None))]

In [35]:
# Display the feature names loaded from my_feature_list.pkl.
feature_list

['poi',
 'salary',
 'total_payments',
 'exercised_stock_options',
 'bonus',
 'restricted_stock',
 'shared_receipt_with_poi',
 'total_stock_value',
 'other',
 'from_this_person_to_poi',
 'deferred_income',
 'from_poi_to_this_person']

In [None]:

### Run testing script: fits clf on the training split (mutating clf) and prints
### accuracy, recall, precision and F1 for the held-out split.
test_classifier(clf, dataset, feature_list)

In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score

def scorer_r_p(estimator, X_test, y_test, min_recall=0.3, min_precision=0.3):
    """Score an estimator by F1, but only when recall and precision clear a floor.

    The first three parameters follow the ``scorer(estimator, X, y)`` callable
    form, so the function can be called positionally with just those three.

    Parameters
    ----------
    estimator : fitted object exposing ``predict``.
    X_test : features handed to ``estimator.predict``.
    y_test : true labels for ``X_test``.
    min_recall : float
        Recall below this value forces a score of 0 (default 0.3).
    min_precision : float
        Precision below this value forces a score of 0 (default 0.3).

    Returns
    -------
    float
        0.0 when either recall or precision is below its floor, otherwise
        the F1 score of the predictions.
    """
    y_pred = estimator.predict(X_test)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    if recall < min_recall or precision < min_precision:
        return 0.0
    # Local names deliberately differ from the imported metric functions so
    # that f1_score still refers to the sklearn function here.
    return f1_score(y_test, y_pred)