In [274]:
%matplotlib inline
import pandas as pd
import ggplot as gg
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix,classification_report,precision_score,recall_score

In [275]:
df=pd.read_csv("./data/final_project_dataset.csv")

In [276]:
ndf=df.drop(["Unnamed: 0","email_address","poi"],axis=1)
#exclude_features=["director_fees","loan_advances","restricted_stock_deferred"]
exclude_features=[]
ndf=ndf.drop(exclude_features,axis=1)
dfmtx=ndf.values
dfmtx.astype(float)
label=df["poi"]
# Fill in NaN
imp=Imputer(axis=0,strategy="median")
ndfmtx=imp.fit_transform(dfmtx)

## Use random forest to select feature

In [277]:
from sklearn.ensemble import RandomForestClassifier
train_X=ndfmtx
train_y=label
rf=RandomForestClassifier()
rf.fit(train_X,train_y)
rfi=rf.feature_importances_

def list_feature_imp(name,score):
    sorted_rfi_idx=np.argsort(score)
    for r in sorted_rfi_idx:
        print(name[r]+":"+str(score[r]))


list_feature_imp(ndf.columns,rfi)

loan_advances:0.0
director_fees:0.0
restricted_stock_deferred:0.0
from_messages:0.0193703803198
from_poi_to_this_person:0.0262332615685
deferral_payments:0.0334919791576
shared_receipt_with_poi:0.0367151734131
to_messages:0.0371500297645
restricted_stock:0.0440677076815
deferred_income:0.0531884885931
total_stock_value:0.0605316094354
other:0.0629335781067
from_this_person_to_poi:0.0652249456732
long_term_incentive:0.0670334409956
expenses:0.0755206952672
total_payments:0.0775628552243
salary:0.0884215997875
bonus:0.11066196095
exercised_stock_options:0.141892294062


From the results of randomforest, the least three important features are ```loan_advances```,```director_fees```,```from_messages```.

## Use LassoCV to select the least important features

In [278]:
from sklearn.linear_model import LassoCV

In [279]:
lcv=LassoCV(max_iter=10000)
lcv.fit(train_X,train_y)



LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=10000, n_alphas=100, n_jobs=1, normalize=False,
    positive=False, precompute='auto', tol=0.0001, verbose=False)

Lasso does note converge!

## Use selectKbest algorithm

In [280]:
from sklearn.feature_selection import SelectKBest
skb=SelectKBest(k=10)
skb.fit(train_X,train_y)
skb.scores_

array([  2.11573384e-02,   2.43871926e-01,   1.55092051e-01,
         8.05412022e-02,   2.42580942e-01,   8.18545601e-02,
         1.74121536e-01,   4.40304395e+00,   2.29677193e+00,
         2.19138564e+00,   2.41834456e-03,   5.93989753e-02,
         2.16056434e-02,   1.46327389e-02,   2.71977079e-02,
         7.66263750e+00,   9.26681010e-01,   3.27068297e-01,
         1.56174205e-01])

In [281]:
list_feature_imp(ndf.columns,skb.scores_)

long_term_incentive:0.00241834456347
restricted_stock_deferred:0.0146327389051
bonus:0.0211573383798
restricted_stock:0.02160564342
salary:0.0271977079099
other:0.0593989753315
director_fees:0.0805412022285
expenses:0.0818545600866
deferred_income:0.15509205058
total_stock_value:0.156174205499
from_messages:0.174121536018
exercised_stock_options:0.242580941876
deferral_payments:0.243871926357
total_payments:0.327068297008
to_messages:0.926681009753
loan_advances:2.19138563992
from_this_person_to_poi:2.29677192902
from_poi_to_this_person:4.40304395123
shared_receipt_with_poi:7.66263749628


## Use PCA to select features 

In [282]:
from sklearn.decomposition import PCA

In [283]:
pca=PCA(n_components=8,whiten=True)
pca_train_X=pca.fit_transform(train_X,train_y)

In [284]:
pca.explained_variance_ratio_

array([  9.75338190e-01,   1.52253022e-02,   6.88808231e-03,
         1.06117972e-03,   5.99022949e-04,   4.94046016e-04,
         1.62503676e-04,   9.87211109e-05])

## Try another fit

In [285]:
sort_idx=np.argsort(skb.scores_)[::-1]
chosen_idx=sort_idx[0:10]
#nndfmtx=np.hstack((ndfmtx[:,chosen_idx],pca_train_X))
#nndfmtx=pca_train_X
nndfmtx=ndfmtx[:,chosen_idx]
ndf.columns[chosen_idx]

Index(['shared_receipt_with_poi', 'from_poi_to_this_person',
       'from_this_person_to_poi', 'loan_advances', 'to_messages',
       'total_payments', 'deferral_payments', 'exercised_stock_options',
       'from_messages', 'total_stock_value'],
      dtype='object')

In [286]:
def print_result(test_y,pred_y):
    result=classification_report(test_y,pred_y)
    print(result)
    print("precision: %s"%precision_score(test_y,pred_y))
    print("recall: %s"%recall_score(test_y,pred_y))

In [290]:
def try_rf_clf(train_X,train_y,test_X,test_y):
    rf=RandomForestClassifier()
    rf.fit(train_X,train_y)
    pred_y=rf.predict(test_X)
    #print_result(test_y,pred_y)
    return pred_y

def try_nb_clf(train_X,train_y,test_X,test_y):
    lcv=GaussianNB()
    lcv.fit(train_X,train_y)
    pred_y=lcv.predict(test_X)
    #print_result(test_y,pred_y)
    return pred_y
    
def try_lvc_clf(train_X,train_y,test_X,test_y):
    lvc=LinearSVC(C=0.01)
    lvc.fit(train_X,train_y)
    
    dec_y=lvc.decision_function(train_X)
    
    #choose the smallest 90%
    num_sel=int(len(dec_y)*0.9)
    assert len(dec_y)==train_X.shape[0]
    assert num_sel<train_X.shape[0]
    
    print(num_sel)
    s_idx=np.argsort(np.abs(dec_y))
    
    assert len(s_idx)==train_X.shape[0]
    
    print(s_idx)
    print(train_y.shape)
    n_train_X=train_X[s_idx,:]
    n_train_y=train_y[s_idx]
    
    lvc.fit(n_train_X,n_train_y)
    
    pred_y=lvc.predict(test_X)
    return pred_y

Do the validation

In [294]:
sss=StratifiedShuffleSplit(label,n_iter=100,test_size=0.1)
true_negatives = 0
false_negatives = 0
true_positives = 0
false_positives = 0
for train_idx,test_idx in sss:

    train_X=ndfmtx[train_idx,:]
    test_X=ndfmtx[test_idx,:]
    train_y=label[train_idx]
    test_y=label[test_idx]
    pred_y=try_lvc_clf(train_X,train_y,test_X,test_y)
    
    for prediction, truth in zip(pred_y, test_y):
        if prediction == 0 and truth == 0:
            true_negatives += 1
        elif prediction == 0 and truth == 1:
            false_negatives += 1
        elif prediction == 1 and truth == 0:
            false_positives += 1
        elif prediction == 1 and truth == 1:
            true_positives += 1
        else:
            print("Warning: Found a predicted label not == 0 or 1.")
            print("All predictions should take value 0 or 1.")
            print("Evaluating performance for processed predictions:")
            break

total_predictions = true_negatives + false_negatives + false_positives + true_positives
accuracy = 1.0*(true_positives + true_negatives)/total_predictions
precision = 1.0*true_positives/(true_positives+false_positives)
recall = 1.0*true_positives/(true_positives+false_negatives)
f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)

print("precision: %s"%precision)
print("recall: %s"%recall)
print("f1: %s"%f1)

117
[ 33  13  12  93  81  53  10  80  38  86  85 120 124 110  16  51  62  61
 100  82  23  17  14   1 118  18 127 106   2  25  15  34  22  39   6  56
 123 107  54   8  95  71   5  44  84  55 130  66  29  68  94  99  89  32
  72 117  59  98   4  73  26  60  43  76  31  64  28  77   9 122  11   7
  57  87  36 128  52 108 121 126  41 116 111  20  67  27  63 105   0  75
  50 112 103  91  48  30  90  47  24  83  35 109 129 113  92 115  78   3
  96  46  74  58  69  65 119 114  49  88  45 102  21  40  19 125  79 104
  70  37  42 101  97]
(131,)
All predictions should take value 0 or 1.
Evaluating performance for processed predictions:
117
[  1  60 121  39  10  62  63 108  70  23  16 127 112  19  30 123 109  28
  76 129  13 101 102   0  67  59  47  53 114 103   7  58  64  86  98  74
  20  89  73   2  46 105  44  11  48 106 110  71 126  79  33  54  40 113
  66 100   6 122  91  15  24  32   3  94  29  43  57  42 107  18  65  78
   9  56 128  80  82 120  14 118  45 104 125 115  51  96  77  69  81

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
