## Project Overview
In 2000, Enron was one of the largest companies in the United States. By 2002, it had collapsed into bankruptcy due to widespread corporate fraud.  
In the resulting Federal investigation, a significant amount of typically confidential information entered into the public record, including tens of thousands of emails and detailed financial data for top executives.   
   
In this project, I build a person-of-interest (POI) identifier based on the financial and email data made public as a result of the Enron scandal.

Model performance: Accuracy: 0.85173; Precision: 0.43736; Recall: 0.39100; F1: 0.41288

In [1]:
import os
# Show the working directory so the relative paths used below can be checked.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(os.getcwd())

/Users/weidian1/Documents/GitHub/Udacity-DAND/P5-Intro to ML/Project5/final_project


In [2]:
import sys
import pickle
import pandas as pd
import numpy as np

sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Load the dictionary containing the dataset
# Pickle streams are bytes, so the file is opened in binary mode ("rb").
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# dataset files that come from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)




## Data Exploration

In [4]:
# Outliers: remove 'TOTAL' and any other listed keys.
def remove_outerliers(data_dict, names):
    """Delete the given keys from ``data_dict`` in place and return it.

    Args:
        data_dict: dictionary to prune (modified in place).
        names: iterable of keys to drop; keys that are absent are ignored.

    Returns:
        The same ``data_dict`` object, without the listed keys.
    """
    for outlier_key in names:
        if outlier_key in data_dict:
            del data_dict[outlier_key]
    return data_dict

def dateset_summary(data_dic):
    """Print the record count, the POI / non-POI split and the feature count.

    Args:
        data_dic: dict mapping a record name to that record's feature dict.
            Every feature dict must contain a 'poi' entry.
    """
    # Use the argument rather than a module-level global, so the summary
    # describes whatever dictionary the caller passes in.
    n_poi = sum(1 for record in data_dic.values() if record['poi'] == 1)
    n_non_poi = len(data_dic) - n_poi
    # The double space after the colon reproduces the output of the former
    # Python 2 print statement ("label ", value).
    print("Total number of data points:  %d" % len(data_dic))
    print("Number of POI: %d, no. of non-POI: %d" % (n_poi, n_non_poi))
    # Read the feature count from any one record instead of relying on a
    # specific, hard-coded name being present in the dictionary.
    n_features = len(next(iter(data_dic.values()))) if data_dic else 0
    print("Number of features used:  %d" % n_features)


outliers_names = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E']

data_dict = remove_outerliers(data_dict, outliers_names)
# Pass the dataset itself (the original call passed the function object).
dateset_summary(data_dict)

Total number of data points:  143
Number of POI: 18, no. of non-POI: 125
Number of features used:  21


In [5]:
# Inspect missing values; convert 'poi' to int.
data_df = pd.DataFrame.from_dict(data_dict, orient='index')

# Names of the financial columns.
financial_features = [
    'salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus',
    'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses',
    'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock',
    'director_fees',
]

# Names of the email columns; 'email_address' is deliberately left out.
email_fatures = [
    'to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
    'shared_receipt_with_poi',
]
def NaNs_to_0s(col):
    """Return a list copy of ``col`` with every 'NaN' string replaced by 0."""
    cleaned = []
    for value in col:
        cleaned.append(0 if value == 'NaN' else value)
    return cleaned

def NaNs_to_None(col):
    """Return a list copy of ``col`` with every 'NaN' string replaced by None."""
    cleaned = []
    for value in col:
        cleaned.append(None if value == 'NaN' else value)
    return cleaned

def count_NAN(col):
    """Count how many elements of ``col`` are equal to the string 'NaN'."""
    n_missing = 0
    for value in col:
        if value == 'NaN':
            n_missing += 1
    return n_missing

# Replace the 'NaN' placeholder with 0 in every financial column.
data_df[financial_features] = data_df[financial_features].apply(NaNs_to_0s, axis=0)
# Store the label as an integer.
data_df['poi'] = data_df['poi'].astype(int)
# Number of 'NaN' placeholders remaining in each column. The parenthesised
# single-argument print behaves the same on Python 2 and Python 3.
print(data_df.apply(count_NAN, axis=0))

salary                        0
to_messages                  57
deferral_payments             0
total_payments                0
exercised_stock_options       0
bonus                         0
restricted_stock              0
shared_receipt_with_poi      57
restricted_stock_deferred     0
total_stock_value             0
expenses                      0
loan_advances                 0
from_messages                57
other                         0
from_this_person_to_poi      57
poi                           0
director_fees                 0
deferred_income               0
long_term_incentive           0
email_address                32
from_poi_to_this_person      57
dtype: int64


## Add New Features

In [6]:
"""
make new feature 
1. 'email_features_miss': 1 if the email features are missing, otherwise 0. In a later step the missing
email features are transformed to 0; this flag marks them, since they should be treated differently from missing financial features.
2. poi_rate_to_messages = from_this_person_to_poi/to_messages  
3. poi_rate_from_messages = from_poi_to_this_person/from_messages
"""

def _poi_message_rate(row, poi_count_col, total_col):
    """Return row[poi_count_col] / row[total_col] as a float, or 0 if undefined.

    The ratio is treated as undefined (0) when either value is the 'NaN'
    placeholder or when the denominator is zero, so a record with partial
    email data cannot raise while the features are being built.
    """
    poi_count, total = row[poi_count_col], row[total_col]
    if poi_count == 'NaN' or total == 'NaN' or total == 0:
        return 0
    return poi_count * 1.0 / total


# 1 when the record has no email statistics ('to_messages' is 'NaN'), else 0.
data_df['email_features_miss'] = data_df.apply(
    lambda row: 1 if row["to_messages"] == 'NaN' else 0, axis=1)

# NOTE(review): 'from_this_person_to_poi' is divided by 'to_messages' and
# 'from_poi_to_this_person' by 'from_messages'. Judging by the column names the
# denominators may be crossed -- confirm which message total each count
# belongs to. The computation is kept unchanged so reported results still apply.
data_df['poi_rate_to_messages'] = data_df.apply(
    lambda row: _poi_message_rate(row, 'from_this_person_to_poi', 'to_messages'), axis=1)

data_df['poi_rate_from_messages'] = data_df.apply(
    lambda row: _poi_message_rate(row, 'from_poi_to_this_person', 'from_messages'), axis=1)

In [7]:
# Frame shape after adding the engineered features: 24 variables (columns).
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(data_df.shape)
print(data_df.columns.values)

(143, 24)
['salary' 'to_messages' 'deferral_payments' 'total_payments'
 'exercised_stock_options' 'bonus' 'restricted_stock'
 'shared_receipt_with_poi' 'restricted_stock_deferred' 'total_stock_value'
 'expenses' 'loan_advances' 'from_messages' 'other'
 'from_this_person_to_poi' 'poi' 'director_fees' 'deferred_income'
 'long_term_incentive' 'email_address' 'from_poi_to_this_person'
 'email_features_miss' 'poi_rate_to_messages' 'poi_rate_from_messages']


## Store to my_dataset, my_feature_list

In [8]:
# Feature names for modelling: the 'poi' label first, then every other column
# except 'email_address'.
feature_columns = data_df.columns.tolist()
for excluded in ('email_address', 'poi'):
    feature_columns.remove(excluded)
features_full_list = ['poi'] + feature_columns

# Back to a {row index: {column: value}} dictionary.
data_full_dic = data_df.to_dict(orient='index')
print ("%d full_columns_name: %s" % (len(features_full_list), features_full_list))
    
def get_features_labels(my_dataset, features_list, random_state=1):
    """Extract shuffled feature and label arrays from the dataset for local testing.

    Args:
        my_dataset: dataset dictionary, passed unchanged to ``featureFormat``.
        features_list: feature names, passed unchanged to ``featureFormat``;
            ``targetFeatureSplit`` then separates labels from features.
        random_state: seed forwarded to ``sklearn.utils.shuffle``. Defaults to
            1, the value that used to be hard-coded.

    Returns:
        Tuple ``(features, labels)`` of numpy arrays, shuffled in unison.
    """
    from sklearn.utils import shuffle

    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # Shuffle features and labels together so rows stay aligned.
    features, labels = shuffle(features, labels, random_state=random_state)
    return np.array(features), np.array(labels)

# Feature matrix and label vector built from the full feature list.
features, labels = get_features_labels(my_dataset=data_full_dic,
                                       features_list=features_full_list)

23 full_columns_name: ['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person', 'email_features_miss', 'poi_rate_to_messages', 'poi_rate_from_messages']


In [9]:
### Aliases used by the tester calls and the export step below.
my_dataset, my_feature_list = data_full_dic, features_full_list

## Try Several Algorithms

In [12]:
import tester
from tester import test_classifier
from sklearn.tree import DecisionTreeClassifier
# Baseline: decision tree with default hyper-parameters.
tree_clf = DecisionTreeClassifier()
tester.test_classifier(tree_clf, my_dataset, my_feature_list, folds=1000)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.79713	Precision: 0.23781	Recall: 0.23650	F1: 0.23715	F2: 0.23676
	Total predictions: 15000	True positives:  473	False positives: 1516	False negatives: 1527	True negatives: 11484



In [15]:
from sklearn.naive_bayes import GaussianNB
# Baseline: Gaussian naive Bayes with default settings.
naive_bayes_clf = GaussianNB()
tester.test_classifier(naive_bayes_clf, my_dataset, my_feature_list, folds=1000)

GaussianNB(priors=None)
	Accuracy: 0.73900	Precision: 0.22604	Recall: 0.39500	F1: 0.28753	F2: 0.34363
	Total predictions: 15000	True positives:  790	False positives: 2705	False negatives: 1210	True negatives: 10295



In [16]:
from sklearn.ensemble import AdaBoostClassifier
# Baseline: AdaBoost with default hyper-parameters.
adaboost_clf = AdaBoostClassifier()
tester.test_classifier(adaboost_clf, my_dataset, my_feature_list, folds=1000)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
	Accuracy: 0.83400	Precision: 0.34572	Recall: 0.27450	F1: 0.30602	F2: 0.28630
	Total predictions: 15000	True positives:  549	False positives: 1039	False negatives: 1451	True negatives: 11961



## Train classifier model and tune parameters
Using AdaBoost, GridSearchCV, Pipeline and StratifiedShuffleSplit.    
Define a custom scoring function, `scorer_r_p`.

In [67]:
from sklearn.ensemble import AdaBoostClassifier  
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest,chi2,f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import recall_score, precision_score, f1_score, fbeta_score
from sklearn.tree import DecisionTreeClassifier

def scorer_r_p(estimator, features_test, labels_test):
    """Scoring callable for the grid search: F1 of the estimator's predictions.

    An earlier variant that scored with F-beta (beta=1.1) only when precision
    and recall both exceeded 0.3 was dropped in favour of plain F1.

    Args:
        estimator: fitted object exposing ``predict``.
        features_test: feature rows to predict on.
        labels_test: true labels for ``features_test``.

    Returns:
        ``f1_score(labels_test, predictions)``.
    """
    predictions = estimator.predict(features_test)
    score = f1_score(labels_test, predictions)
    return score

# 50 stratified shuffle splits, each holding out 20% of the samples.
sss = StratifiedShuffleSplit(labels, n_iter=50, test_size=0.2, random_state=42)

# scale -> select k best features -> boosted classifier
pipe = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('selector', SelectKBest()),
    ('classifier', AdaBoostClassifier(random_state=32)),
])

# Hyper-parameter grid, keyed by "<step name>__<parameter>".
param_grid = {
    'selector__score_func': [chi2, f_classif],
    'selector__k': range(4, 6),
    'classifier__n_estimators': [50, 80, 100],
    'classifier__learning_rate': [1, 1.5, 2],
    'classifier__base_estimator': [
        DecisionTreeClassifier(min_samples_split=2),
        DecisionTreeClassifier(min_samples_split=3),
        DecisionTreeClassifier(min_samples_split=4),
    ],
}

# Grid search scored with scorer_r_p (F1), using every available core.
clf_grid = GridSearchCV(pipe, param_grid, scoring=scorer_r_p, cv=sss, n_jobs=-1)
clf_grid.fit(features, labels)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=StratifiedShuffleSplit(labels=[ 0.  0. ...,  0.  0.], n_iter=50, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selector', SelectKBest(k=10, score_func=<function f_classif at 0x113c272a8>)), ('classifier', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=32))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'classifier__learning_rate': [1, 1.5, 2], 'classifier__base_estimator': [DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split...100], 'selector__score_func': [<function chi2 at 0x113c27398>, <function f_classif at 0x113c272a8>]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function scorer_r_p at 0x11

In [68]:
# Steps (with their selected parameters) of the best pipeline found by the grid search.
clf_grid.best_estimator_.steps

[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('selector', SelectKBest(k=4, score_func=<function chi2 at 0x113c27398>)),
 ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
            base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=4, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best'),
            learning_rate=1.5, n_estimators=50, random_state=32))]

In [69]:
# Best score found by the grid search; the scorer (scorer_r_p) returns F1.
clf_grid.best_score_ #f1

0.40955067155067165

In [46]:
# NOTE(review): same expression as the In [68] cell; the lower execution count
# (In [46]) suggests a leftover from an earlier run -- confirm it can be removed.
clf_grid.best_estimator_.steps

[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('selector', SelectKBest(k=4, score_func=<function chi2 at 0x113c27398>)),
 ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
            base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=4, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best'),
            learning_rate=1.5, n_estimators=50, random_state=32))]

In [47]:
# NOTE(review): same expression as the In [69] cell but with a different
# recorded value; presumably output of an earlier run -- confirm which applies.
clf_grid.best_score_

0.39107475361810556

## Use Given TestSet

In [27]:
import tester
from tester import test_classifier
from sklearn.ensemble import AdaBoostClassifier

# Score the tuned pipeline with the tester's default number of folds.
best_pipeline = clf_grid.best_estimator_
tester.test_classifier(best_pipeline, my_dataset, my_feature_list)

Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selector', SelectKBest(k=5, score_func=<function chi2 at 0x113c27398>)), ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
   ...ndom_state=None, splitter='best'),
          learning_rate=1.5, n_estimators=100, random_state=32))])
	Accuracy: 0.85800	Precision: 0.45638	Recall: 0.34000	F1: 0.38968	F2: 0.35827
	Total predictions: 15000	True positives:  680	False positives:  810	False negatives: 1320	True negatives: 12190



In [44]:
# Score the tuned pipeline again, this time with 1000 folds.
best_pipeline = clf_grid.best_estimator_
tester.test_classifier(best_pipeline, my_dataset, my_feature_list, folds=1000)

Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selector', SelectKBest(k=4, score_func=<function chi2 at 0x113c27398>)), ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
   ...andom_state=None, splitter='best'),
          learning_rate=1.5, n_estimators=50, random_state=32))])
	Accuracy: 0.85187	Precision: 0.43771	Recall: 0.39000	F1: 0.41248	F2: 0.39869
	Total predictions: 15000	True positives:  780	False positives: 1002	False negatives: 1220	True negatives: 11998



In [62]:
# Evaluate the tuned scaler + classifier (no feature selector) on an explicit
# feature list that includes the three engineered features.
from sklearn.base import clone

# Clone the tuned steps so that fitting this pipeline does not re-fit the step
# objects still owned by clf_grid.best_estimator_.
tuned_steps = clf_grid.best_estimator_.named_steps
clf_addNew = Pipeline(steps=[('scaler', clone(tuned_steps['scaler'])),
                             ('classifier', clone(tuned_steps['classifier']))])
my_feature_list_addNew = ['poi', 'exercised_stock_options', 'loan_advances', 'total_stock_value',
                          'bonus', 'email_features_miss', 'poi_rate_to_messages', 'poi_rate_from_messages']
tester.test_classifier(clf_addNew, my_dataset, my_feature_list_addNew)

Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_s...andom_state=None, splitter='best'),
          learning_rate=1.5, n_estimators=50, random_state=32))])
	Accuracy: 0.87327	Precision: 0.53263	Recall: 0.40400	F1: 0.45948	F2: 0.42450
	Total predictions: 15000	True positives:  808	False positives:  709	False negatives: 1192	True negatives: 12291



In [64]:
# Same scaler + classifier (no feature selector), evaluated with all features.
from sklearn.base import clone

# Clone the tuned steps so that fitting this pipeline does not re-fit the step
# objects still owned by clf_grid.best_estimator_.
tuned_steps = clf_grid.best_estimator_.named_steps
clf_addNew = Pipeline(steps=[('scaler', clone(tuned_steps['scaler'])),
                             ('classifier', clone(tuned_steps['classifier']))])
tester.test_classifier(clf_addNew, my_dataset, my_feature_list)

Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_s...andom_state=None, splitter='best'),
          learning_rate=1.5, n_estimators=50, random_state=32))])
	Accuracy: 0.83367	Precision: 0.30435	Recall: 0.19250	F1: 0.23583	F2: 0.20777
	Total predictions: 15000	True positives:  385	False positives:  880	False negatives: 1615	True negatives: 12120



## Feature Scores
The final classifier uses the top 4 features; the cells below print the score of every feature.

In [28]:
# Rescale every feature to the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
features_scaled = scaler.fit(features, labels).transform(features)

In [29]:
# Chi-squared score of every scaled feature (k=4 matches the tuned selector).
from sklearn.feature_selection import SelectKBest,chi2
kb = SelectKBest(score_func=chi2, k=4)
# Only the scores are needed here, so fit() is enough; the transformed matrix
# that fit_transform() returned was being discarded.
kb.fit(features_scaled, labels)
kb.scores_

array([  3.05278674e+00,   4.36397769e-01,   6.06966069e-02,
         2.78477884e+00,   6.84550934e+00,   5.12075414e+00,
         5.89535349e-01,   2.43221987e+00,   3.50676503e-03,
         5.47661010e+00,   1.48610337e+00,   6.68878174e+00,
         6.87385422e-02,   1.71595053e+00,   1.00080764e+00,
         1.50113085e+00,   3.40099218e-01,   2.53848503e+00,
         1.37005929e+00,   1.60714230e+00,   1.26978364e+00,
         1.78530514e+00])

In [30]:
# Rank the features from highest to lowest selector score.
importances = kb.scores_
indices = np.argsort(importances)[::-1]
# Scores line up with the feature columns, i.e. the list without the leading 'poi'.
feature_names = features_full_list[1:]
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, 1):
    print("%d. feature %d %s (%f)" % (rank, feature_idx, feature_names[feature_idx], importances[feature_idx]))


Feature ranking:
1. feature 4 exercised_stock_options (6.845509)
2. feature 11 loan_advances (6.688782)
3. feature 9 total_stock_value (5.476610)
4. feature 5 bonus (5.120754)
5. feature 0 salary (3.052787)
6. feature 3 total_payments (2.784779)
7. feature 17 long_term_incentive (2.538485)
8. feature 7 shared_receipt_with_poi (2.432220)
9. feature 21 poi_rate_from_messages (1.785305)
10. feature 13 other (1.715951)
11. feature 19 email_features_miss (1.607142)
12. feature 15 director_fees (1.501131)
13. feature 10 expenses (1.486103)
14. feature 18 from_poi_to_this_person (1.370059)
15. feature 20 poi_rate_to_messages (1.269784)
16. feature 14 from_this_person_to_poi (1.000808)
17. feature 6 restricted_stock (0.589535)
18. feature 1 to_messages (0.436398)
19. feature 16 deferred_income (0.340099)
20. feature 12 from_messages (0.068739)
21. feature 2 deferral_payments (0.060697)
22. feature 8 restricted_stock_deferred (0.003507)


In [None]:
# NOTE(review): repeats the "#KBest" cell above and was never executed
# (In [None]) -- confirm whether this cell can be removed.
from sklearn.feature_selection import SelectKBest,chi2
kb = SelectKBest(score_func=chi2, k=4)
# Only the scores are needed here, so fit() is enough; the transformed matrix
# that fit_transform() returned was being discarded.
kb.fit(features_scaled, labels)
kb.scores_

## Dump classifier, dataset, and features_list


In [None]:
from tester import dump_classifier_and_data
# Export the tuned pipeline together with the dataset and the feature list.
final_clf = clf_grid.best_estimator_
dump_classifier_and_data(final_clf, my_dataset, my_feature_list)