## Project Overview
In 2000, Enron was one of the largest companies in the United States. By 2002, it had collapsed into bankruptcy due to widespread corporate fraud.  
In the resulting Federal investigation, a significant amount of typically confidential information entered into the public record, including tens of thousands of emails and detailed financial data for top executives.   
   
In this project,I'm going to build a person of interest identifier based on financial and email data made public as a result of the Enron scandal. 

Model performance：Accuracy: 0.85173; Precision: 0.43736; Recall: 0.39100;F1: 0.41288

In [1]:
import os
print os.getcwd()

/Users/weidian1/Documents/GitHub/Udacity-DAND/P5-Intro to ML/Project5/final_project


In [2]:
import sys
import pickle
import pandas as pd
import numpy as np

sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)




## Data Explore

In [4]:
# 异常值， 去掉total
def remove_outerliers(data_dict,names):
    for name in names:
        data_dict.pop(name, 0)
    return data_dict

def dateset_summary(data_dic):
    n_poi,n_non_poi = 0,0
    for key in data_dict.keys():
        if data_dict[key]['poi']==1:
            n_poi+=1
        else:
            n_non_poi+=1        
    print "Total number of data points: ", len(data_dict)
    print "Number of POI: %d, no. of non-POI: %d"% (n_poi, n_non_poi)
    print "Number of features used: ",len(data_dict['METTS MARK']) # randomly pick one name, get the number of features.

outliers_names =  ['TOTAL','THE TRAVEL AGENCY IN THE PARK']#, 'LOCKHART EUGENE E']#['TOTAL']#,'LAVORATO JOHN J'

data_dict = remove_outerliers(data_dict,outliers_names)
dateset_summary(dateset_summary)

Total number of data points:  144
Number of POI: 18, no. of non-POI: 126
Number of features used:  21


In [5]:
# 查看缺失情况；poi转为int型
data_df = pd.DataFrame.from_dict(data_dict,orient='index')
financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus'
                 , 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses'
                 , 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock'
                 , 'director_fees']
email_fatures = ['to_messages','from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi'
                 , 'shared_receipt_with_poi']  # remove 'email_address'
def NaNs_to_0s(col):
    return [0 if ele=='NaN' else ele for ele in col]

def NaNs_to_None(col):
    return [None if ele=='NaN' else ele for ele in col]

def count_NAN(col):
    return sum(1 for ele in col if ele == 'NaN')

data_df[financial_features] = data_df[financial_features].apply(NaNs_to_0s, axis=0)
data_df['poi'] = data_df['poi'].astype(int)
print data_df.apply(count_NAN, axis=0)   

salary                        0
to_messages                  58
deferral_payments             0
total_payments                0
exercised_stock_options       0
bonus                         0
restricted_stock              0
shared_receipt_with_poi      58
restricted_stock_deferred     0
total_stock_value             0
expenses                      0
loan_advances                 0
from_messages                58
other                         0
from_this_person_to_poi      58
poi                           0
director_fees                 0
deferred_income               0
long_term_incentive           0
email_address                33
from_poi_to_this_person      58
dtype: int64


## Add New Features

In [7]:
"""
make new feature 
1. 'email_features_miss': if email_features is missing. next step, missing email_features will be
transformed to 0, which should by different from missing financial features.   
2. poi_rate_to_messages = from_this_person_to_poi/to_messages  
3. poi_rate_from_messages = from_poi_to_this_person/from_messages
"""

data_df['email_features_miss'] = data_df.apply(lambda x: 1 if x["to_messages"]=='NaN' else 0, axis = 1)

data_df['poi_rate_to_messages'] = data_df.apply(lambda x: 0 if x['from_this_person_to_poi']=='NaN' \
                                                else x['from_this_person_to_poi']*1.0/x['to_messages'], axis=1)

data_df['poi_rate_from_messages'] = data_df.apply(lambda x: 0 if x['from_poi_to_this_person']=='NaN'\
                                                  else x['from_poi_to_this_person']*1.0/x['from_messages'], axis =1)    

In [8]:
print data_df.shape#columns.values; 24个变量
print data_df.columns.values

(144, 24)
['salary' 'to_messages' 'deferral_payments' 'total_payments'
 'exercised_stock_options' 'bonus' 'restricted_stock'
 'shared_receipt_with_poi' 'restricted_stock_deferred' 'total_stock_value'
 'expenses' 'loan_advances' 'from_messages' 'other'
 'from_this_person_to_poi' 'poi' 'director_fees' 'deferred_income'
 'long_term_incentive' 'email_address' 'from_poi_to_this_person'
 'email_features_miss' 'poi_rate_to_messages' 'poi_rate_from_messages']


## Store to my_dataset, my_feature_list

In [10]:
features_full_list = list(data_df.columns.values)
features_full_list.remove('email_address')
features_full_list.remove('poi')
features_full_list = ['poi'] +features_full_list
data_full_dic = data_df.to_dict(orient='index')
print ("%d full_columns_name: %s" % (len(features_full_list), features_full_list))
    
def get_features_labels(my_dataset,features_list):
    #Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # TODO: Shuffle the data
    from sklearn.utils import shuffle
    features, labels = shuffle(features, labels, random_state=1)
    features = np.array(features)
    labels = np.array(labels)
    return features, labels

features, labels = get_features_labels(data_full_dic,features_full_list)

23 full_columns_name: ['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person', 'email_features_miss', 'poi_rate_to_messages', 'poi_rate_from_messages']


In [11]:
### Store to my_dataset for easy export below.
my_dataset = data_full_dic
my_feature_list = features_full_list

## Try Several Algorithms

In [42]:
import tester
from tester import test_classifier
tester.test_classifier(tree.DecisionTreeClassifier(), my_dataset, my_feature_list,folds = 1000)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.79540	Precision: 0.23760	Recall: 0.24200	F1: 0.23978	F2: 0.24111
	Total predictions: 15000	True positives:  484	False positives: 1553	False negatives: 1516	True negatives: 11447



In [43]:
tester.test_classifier(GaussianNB(), my_dataset, my_feature_list,folds = 1000)

GaussianNB(priors=None)
	Accuracy: 0.74607	Precision: 0.23467	Recall: 0.40000	F1: 0.29580	F2: 0.35060
	Total predictions: 15000	True positives:  800	False positives: 2609	False negatives: 1200	True negatives: 10391



In [45]:
tester.test_classifier(AdaBoostClassifier(), my_dataset, my_feature_list,folds = 1000)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
	Accuracy: 0.83660	Precision: 0.35536	Recall: 0.27700	F1: 0.31132	F2: 0.28978
	Total predictions: 15000	True positives:  554	False positives: 1005	False negatives: 1446	True negatives: 11995



## Train classifier model and tune parameters
Using AdaBoost, GridSearchCv, Pipeline, StratifiedShuffleSplit.    
define a new metric function 'scorer_r_p' .

In [9]:
from sklearn.ensemble import AdaBoostClassifier  
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest,chi2,f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier

def scorer_r_p(estimator, features_test, labels_test):
    labels_pred = estimator.predict(features_test)
    pre= precision_score(labels_test, labels_pred, average='micro')
    rec = recall_score(labels_test, labels_pred, average='micro')
    if pre>0.3 and rec>0.3:
        return f1_score(labels_test, labels_pred, average='macro')
    elif  pre>0.3 and rec<0.3:
        return 0.3
    elif rec >0.3 and pre<0.3:
        return 0.3
    return 0

sss = StratifiedShuffleSplit(labels, n_iter=50, test_size = 0.2, random_state=42)
pipe = Pipeline([('scaler',MinMaxScaler(feature_range=(0,1))), 
                ('selector', SelectKBest()),
                  ('classifier', AdaBoostClassifier(random_state=32))])
param_grid = dict( selector__score_func=[chi2,f_classif],
                  selector__k=range(4,11),
                 classifier__n_estimators=[50,80,100]
                 ,classifier__learning_rate=[0.5,1,1.5,2]
                ,classifier__base_estimator=[DecisionTreeClassifier(min_samples_split=2)\
                                                , DecisionTreeClassifier(min_samples_split=3)\
                                                , DecisionTreeClassifier(min_samples_split=4)]\
                 )
clf_grid = GridSearchCV(pipe, param_grid, scoring=scorer_r_p, cv=sss, n_jobs=-1)
clf_grid.fit(features, labels)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=StratifiedShuffleSplit(labels=[ 0.  0. ...,  0.  0.], n_iter=50, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selector', SelectKBest(k=10, score_func=<function f_classif at 0x112ef5de8>)), ('classifier', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=32))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'classifier__learning_rate': [0.5, 1, 1.5, 2], 'classifier__base_estimator': [DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_...100], 'selector__score_func': [<function chi2 at 0x112ef5ed8>, <function f_classif at 0x112ef5de8>]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function scorer_r_p at 0x11

## Use Given TestSet

In [12]:
import tester
from tester import test_classifier
from sklearn.ensemble import AdaBoostClassifier

tester.test_classifier(clf_grid.best_estimator_, my_dataset, my_feature_list)

Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selector', SelectKBest(k=4, score_func=<function chi2 at 0x112ef5ed8>)), ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
   ...ndom_state=None, splitter='best'),
          learning_rate=1.5, n_estimators=100, random_state=32))])
	Accuracy: 0.85173	Precision: 0.43736	Recall: 0.39100	F1: 0.41288	F2: 0.39947
	Total predictions: 15000	True positives:  782	False positives: 1006	False negatives: 1218	True negatives: 11994



## Feature Scores
Final classifier uses top 4 features, print out the feature scores.

In [None]:
#Scale
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
features_scaled = scaler.fit_transform(X=features, y=labels)

In [None]:
#KBest
from sklearn.feature_selection import SelectKBest,chi2
kb = SelectKBest(score_func=chi2,k=4)
kb.fit_transform(features_scaled,labels)
kb.scores_

In [27]:
importances = kb.scores_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(features_scaled .shape[1]):
    print("%d. feature %d %s (%f)" % (f + 1, indices[f],features_full_list[1:][indices[f]],importances[indices[f]]))


Feature ranking:
1. feature 4 exercised_stock_options (6.927477)
2. feature 11 loan_advances (6.742748)
3. feature 9 total_stock_value (5.544840)
4. feature 5 bonus (5.193349)
5. feature 0 salary (3.116644)
6. feature 3 total_payments (2.817146)
7. feature 17 long_term_incentive (2.579690)
8. feature 7 shared_receipt_with_poi (2.482307)
9. feature 21 poi_rate_from_messages (1.816426)
10. feature 13 other (1.740311)
11. feature 19 email_features_miss (1.665025)
12. feature 10 expenses (1.525657)
13. feature 15 director_fees (1.489217)
14. feature 18 from_poi_to_this_person (1.398512)
15. feature 20 poi_rate_to_messages (1.293839)
16. feature 14 from_this_person_to_poi (1.019639)
17. feature 6 restricted_stock (0.595438)
18. feature 1 to_messages (0.451111)
19. feature 16 deferred_income (0.341726)
20. feature 12 from_messages (0.066522)
21. feature 2 deferral_payments (0.058561)
22. feature 8 restricted_stock_deferred (0.003456)


In [26]:
from sklearn.feature_selection import SelectKBest,chi2
kb = SelectKBest(score_func=chi2,k=4)
kb.fit_transform(features_scaled,labels)
kb.scores_

array([  3.11664385e+00,   4.51110661e-01,   5.85608784e-02,
         2.81714599e+00,   6.92747711e+00,   5.19334926e+00,
         5.95438049e-01,   2.48230712e+00,   3.45572751e-03,
         5.54483965e+00,   1.52565657e+00,   6.74274761e+00,
         6.65219934e-02,   1.74031060e+00,   1.01963921e+00,
         1.48921712e+00,   3.41725794e-01,   2.57968986e+00,
         1.39851239e+00,   1.66502463e+00,   1.29383938e+00,
         1.81642606e+00])

## Dump 
classifier, dataset, and features_list. 


In [14]:
from tester import dump_classifier_and_data
dump_classifier_and_data(clf_grid.best_estimator_, my_dataset, my_feature_list)