In [1]:
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, test_classifier
from sklearn import model_selection
from time import time
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from sklearn.cross_validation import cross_val_score, StratifiedShuffleSplit
    
# Preprocessing
from sklearn.preprocessing import  MaxAbsScaler, StandardScaler, MinMaxScaler


#Models
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
#from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

#Feature selection
from sklearn.decomposition import PCA,RandomizedPCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest,SelectPercentile

#Metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score,classification_report

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV




In [2]:
target_label = 'poi'                
email_features_list = [
    # 'email_address', # remit email address; informational label
    'from_messages',
    'from_poi_to_this_person',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
    'to_messages',
    ]

financial_features_list = [
    'bonus',
    'deferral_payments',
    'deferred_income',
    'director_fees',
    'exercised_stock_options',
    'expenses',
    'loan_advances',
    'long_term_incentive',
    'other',
    'restricted_stock',
    'restricted_stock_deferred',
    'salary'
    #,
    #'total_payments',
    #'total_stock_value',
]
features_list = [target_label] + financial_features_list  
#features_list = [target_label] + financial_features_list  + email_features_list 

In [3]:
with open("P:/Nanodegree/ML/ud120-projects/tools/final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers

outliers = ["TOTAL", "THE TRAVEL AGENCY IN THE PARK", "LOCKHART EUGENE E", "CHAN RONNIE"]
for outlier in outliers :
    data_dict.pop(outlier, 0)


def update_dict_value(key, items, values, dict_obj):
    index = 0
    for item in items:     
        dict_obj[key][item] = values[index]
        index += 1
    return dict_obj
        

    
data_dict = update_dict_value(
              'BELFER ROBERT',
              ['deferred_income','deferral_payments', 'expenses', 
               'director_fees', 'total_payments', 'exercised_stock_options',
               'restricted_stock','restricted_stock_deferred',
               'total_stock_value'], 
              [-102500,'NaN',3285,102500, 3285,'NaN', 44093,-44093,'NaN'],
              data_dict)


data_dict = update_dict_value(
              'BHATNAGAR SANJAY',
              ['other', 'expenses', 'director_fees', 'total_payments',
               'exercised_stock_options','restricted_stock',
               'restricted_stock_deferred','total_stock_value'],
              ['NaN',137864, 'NaN', 137864, 15456290, 
               2604490, -2604490, 15456290],
               data_dict)


In [4]:
for key in data_dict:
    key_values = data_dict[key]

    total_msg = (data_dict[key]['to_messages'] + 
                 data_dict[key]['from_messages'])
    
    total_poi_msg = (data_dict[key]['from_poi_to_this_person'] +
                     data_dict[key]['from_this_person_to_poi'] + 
                     data_dict[key]['shared_receipt_with_poi'])     
        
    try:
        data_dict[key]['message_poi_ratio'] = (float(total_poi_msg) / 
                                           float(total_msg))
    except:
        data_dict[key]['message_poi_ratio'] = "NaN"
        
    try:
        data_dict[key]['message_others_ratio'] = ((float(total_msg) - float(total_poi_msg)) / 
                                          float(total_msg))
    except:
        data_dict[key]['message_others_ratio'] = "NaN"

features_list = features_list + ['message_poi_ratio','message_others_ratio'] 


In [5]:
#features_list = features_list + email_features_list + ['message_poi_ratio','message_others_ratio'] 
features_list = features_list + ['message_poi_ratio','message_others_ratio'] 
# Store to my_dataset for easy export below.
my_dataset = data_dict

# Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, remove_NaN=True, sort_keys = True)
labels, features = targetFeatureSplit(data)


In [6]:

# transformed version of X
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(features)

# Split the dataset into train and test
features_train, features_test, labels_train, labels_test = model_selection.train_test_split(features, 
                                                                                            labels,  
                                                                                            test_size=0.3, 
                                                                                            random_state=42)

X_train, X_test, y_train, y_test = model_selection.train_test_split(X_scaled, 
                                                                    labels,  
                                                                    test_size=0.3, 
                                                                    random_state=42)

Since tje features have quite differente value ranges and some of them are discrete and some of them take continous values, I need to scale them first. Removing mean and dividing the standard deviation o features respectively, this one of the most commonly used preprocessing step.

# Selecting features

In [7]:
from sklearn.decomposition import PCA,RandomizedPCA, TruncatedSVD
TruncatedSVD(n_components=10).fit(features_train).explained_variance_ratio_.sum()

0.99997223540161428

In [149]:
"""
from sklearn.metrics import classification_report
#target_names = ["campo1"]
print(classification_report(y_test, preds))
"""

'\nfrom sklearn.metrics import classification_report\n#target_names = ["campo1"]\nprint(classification_report(y_test, preds))\n'

In [8]:
pca = PCA(n_components=14)
pca.fit_transform(X_train)
pca_df = pd.DataFrame(zip(features_list[1:],
                          np.round(pca.explained_variance_ratio_, decimals=7)*100), 
                      columns=['feature','variance_ratio'])
pca_df.sort_values(by='variance_ratio',ascending = False)

Unnamed: 0,feature,variance_ratio
0,bonus,39.25279
1,deferral_payments,21.03506
2,deferred_income,11.3632
3,director_fees,6.63091
4,exercised_stock_options,5.1012
5,expenses,4.43445
6,loan_advances,3.20898
7,long_term_incentive,1.80918
8,other,1.63966
9,restricted_stock,1.23159


In [9]:
for i in range(1,15):
    pca = PCA(svd_solver='auto', n_components=i)
    x = pca.fit(features_train).explained_variance_ratio_.sum()
    print i,":", x    

1 : 0.861553701723
2 : 0.950131933471
3 : 0.972382524142
4 : 0.984291137831
5 : 0.992901577692
6 : 0.997163161022
7 : 0.998629826373
8 : 0.999743007011
9 : 0.999879791085
10 : 0.99997474527
11 : 0.999991327915
12 : 0.999999941617
13 : 0.999999986864
14 : 0.999999999037


In this decomposition, the vector array provided by ratio indicates that most of the information is concentrated into the first compontent(x%). You saw this same sort of result after the factor analysis. it's therefore possible to reduce the entire dataset to just two componentes, providing a reduction of noise and redundant information from the original dataset.

In [16]:
from sklearn.feature_selection import chi2, f_regression
selector = SelectKBest(chi2, k='all').fit(X_train, y_train)

# ANOVA F-value between label/feature for classification tasks
k_best = SelectKBest(f_regression,k='all').fit(features_train, labels_train)
k_best_scaled = SelectKBest(k='all').fit(X_train, y_train)


# Format values
kbest_pd = pd.DataFrame(zip(features_list[1:], 
                            k_best_scaled.scores_, # scaled
                            k_best.scores_, # K best score                            
                            selector.scores_), # chi2
                        columns = ['feature','anova_scaled','anova', 'chi2'])

kbest_pd.sort_values(by='anova_scaled',ascending = False)

Unnamed: 0,feature,anova_scaled,anova,chi2
0,bonus,36.7789,36.7789,6.683411
12,message_poi_ratio,16.448332,16.448332,5.439881
11,salary,16.279541,16.279541,2.92378
4,exercised_stock_options,7.836464,7.836464,2.236481
6,loan_advances,6.954889,6.954889,6.369138
7,long_term_incentive,6.468488,6.468488,1.797035
2,deferred_income,6.298692,6.298692,0.149035
5,expenses,5.562326,5.562326,1.251584
9,restricted_stock,4.920009,4.920009,1.721534
8,other,2.716646,2.716646,1.243399


# Scaling data

# pipeline

In [12]:
select = SelectKBest(k=10)
clf = RandomForestClassifier()

steps = [('feature_selection', select),
        ('random_forest', clf)]

pipeline = Pipeline(steps)

#X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(features, 
#                                                                             labels, test_size=0.33, random_state=42)

### fit your pipeline on X_train and y_train
pipeline.fit( X_train, y_train )
### call pipeline.predict() on your X_test data to make a set of test predictions
y_prediction = pipeline.predict( X_test )
### test your predictions using sklearn.classification_report()
report = classification_report( y_test, y_prediction )
### and print the report
print(report)

             precision    recall  f1-score   support

        0.0       0.90      0.95      0.92        38
        1.0       0.33      0.20      0.25         5

avg / total       0.83      0.86      0.84        43



# Testing models

In [13]:
classifier = [GaussianNB(), 
              DecisionTreeClassifier(), 
              RandomForestClassifier(), 
              KNeighborsClassifier(n_neighbors=4),
              AdaBoostClassifier(),
              LogisticRegression()]
classifier_name = ['Naive Bayes', 
                   'Decision Tree', 
                   'Random Forest',
                   'KNeighbors',
                   'AdaBoost',
                   'Logistic Regression']

accuracy_model = []
for clf, name in zip(classifier, classifier_name):    
    #score = cross_val_score(clf, X_train, y_train)
    score = cross_val_score(clf, features, labels, scoring='accuracy')
    accuracy_model.append([name,score[0],score[1],score[2],score.mean()])
  
pd.DataFrame(accuracy_model,columns=('Model', 
                                     'Score1', 
                                     'Score2',
                                     'Score3',
                                     'Mean')).sort_values(by='Mean',ascending = False)

Unnamed: 0,Model,Score1,Score2,Score3,Mean
3,KNeighbors,0.875,0.87234,0.87234,0.873227
4,AdaBoost,0.770833,0.914894,0.914894,0.866874
2,Random Forest,0.8125,0.851064,0.893617,0.852394
5,Logistic Regression,0.770833,0.765957,0.808511,0.781767
1,Decision Tree,0.75,0.744681,0.829787,0.774823
0,Naive Bayes,0.875,0.765957,0.382979,0.674645


I put the Logistic Regression classifier even knowing it applys in for a continuous out which is not our case. As expected this classifier don't have the best performance.

In [25]:
#FeatureUnion([("pca", pca),
pca = PCA(n_components=10)

selection = SelectKBest(k=10)

combined_features = FeatureUnion([("pca", pca),
                                  ("univ_select",selection)])

pipeline =   Pipeline([('features', combined_features),   
                       ('rfr', RandomForestClassifier())
                 ])
"""
pipeline =   Pipeline([('features', SelectKBest()),   
                       ('rfr', RandomForestClassifier())
                 ])
"""
pipeline.fit_transform(features_train, labels_train)
pred = pipeline.predict(features_test)
print 'Accuracy:', accuracy_score(labels_test, pred)
print 'Recall:', recall_score(labels_test, pred)
print 'Precision:', precision_score(labels_test, pred)
#test_classifier(grid.best_estimator_, my_dataset, features_list)


             precision    recall  f1-score   support

        0.0       0.90      0.97      0.94        38
        1.0       0.50      0.20      0.29         5

avg / total       0.86      0.88      0.86        43





# Tuning RandomForestClassifier()

In [12]:
pca = PCA(n_components=10)

selection = SelectKBest(k=10)

"""
combined_features = FeatureUnion([("pca", pca),
                                  ("univ_select",selection)])

pipeline =   Pipeline([('features', combined_features),   
                       ('classify', RandomForestClassifier())
                 ])
"""
pipeline =   Pipeline([('scale', MinMaxScaler(feature_range=(0, 1))),
                       ('features', selection),   
                       ('classify', RandomForestClassifier())
                      ])

param_grid = {'scale': [None, MaxAbsScaler()],
              "classify__max_depth": [5, 3, 1],
              "classify__max_features": [2,1],
              "classify__min_samples_leaf": [1, 3, 10,15],
              "classify__bootstrap": [True, False],
              "classify__criterion": ["gini", "entropy"]}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=10, scoring='recall')

grid.fit(X_train, y_train)
grid.best_score_
clf =  grid.best_estimator_

In [13]:
test_classifier(clf, my_dataset, features_list)

  f = msb / msw


Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('features', SelectKBest(k=10, score_func=<function f_classif at 0x000000000C1F3668>)), ('classify', RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=5, max_features=2,
  ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])
	Accuracy: 0.85227	Precision: 0.34164	Recall: 0.11650	F1: 0.17375	F2: 0.13419
	Total predictions: 15000	True positives:  233	False positives:  449	False negatives: 1767	True negatives: 12551



# Tuning GradientBoostingClassifier

In [9]:
pipeline = Pipeline([('scale', MinMaxScaler(feature_range=(0, 1))),
                     ('selection', SelectKBest()),
                     ('classify', GradientBoostingClassifier(random_state=42))])

param_grid = {
        'scale': [MinMaxScaler(feature_range=(0, 1))],
        'selection__k': [10],
        'classify__criterion': ['mae', 'friedman_mse', 'mse'],
        'classify__learning_rate':[1.0, 0.1],
        'classify__min_samples_leaf': [3],
        'classify__max_leaf_nodes': [100]
        #'classify__loss' : ['deviance']
    
        #'classify__max_leaf_nodes': [100, 150],
        #'classify__max_features': ['sqrt', 0.50, 0.80] 
        #'classify__subsample': [0.8, 1.0]
        #'classify__loss' : ['exponential']
        
    }
grid = GridSearchCV(
    pipeline, param_grid=param_grid, cv=10, scoring='f1')

grid.fit(features_train, labels_train)
predicted = grid.predict(features_test)
test_classifier(grid.best_estimator_, my_dataset, features_list)

Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selection', SelectKBest(k=10, score_func=<function f_classif at 0x000000000C23C668>)), ('classify', GradientBoostingClassifier(criterion='mae', init=None, learning_rate=1.0,
              loss='deviance', max_depth=3, max_features=No...        presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False))])
	Accuracy: 0.80667	Precision: 0.29282	Recall: 0.31800	F1: 0.30489	F2: 0.31262
	Total predictions: 15000	True positives:  636	False positives: 1536	False negatives: 1364	True negatives: 11464



# Tuning KNeighborsClassifier

In [66]:
pipeline = Pipeline(steps=[('minmaxer', StandardScaler(with_mean=False)),
                           #('reduce_dim', PCA(copy=True, random_state=42)),
                           #('selection', SelectKBest()),
                           ('classifier', KNeighborsClassifier())
                          ])

param_grid = {'minmaxer' : [None, StandardScaler(with_mean=False)],
              #'selection__k': [6],
              #'reduce_dim__n_components': [6],
              #'classifier__metric': ["euclidean", "cityblock", 'minkowski'],
              'classifier__n_neighbors' : [4,6,10,14],
              'classifier__weights' : ['uniform','distance']
             }

grid = GridSearchCV(
    pipeline, param_grid=param_grid, cv=10, scoring='f1')

grid.fit(features_train, labels_train)

clf = grid.best_estimator_
test_classifier(clf, my_dataset, features_list)

Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=False, with_std=True)), ('classifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='distance'))])
	Accuracy: 0.84100	Precision: 0.31220	Recall: 0.16000	F1: 0.21157	F2: 0.17729
	Total predictions: 15000	True positives:  320	False positives:  705	False negatives: 1680	True negatives: 12295



# Tuning NearestCentroid

In [109]:
pipeline = Pipeline(steps=[('minmaxer', StandardScaler()),
                           ('reduce_dim', PCA(copy=True, random_state=42)),
                           ('selection', SelectKBest()),
                           ('classifier', NearestCentroid())
                          ])

param_grid = {'minmaxer' : [None, StandardScaler()],
              'selection__k': [6],
              'reduce_dim__n_components': [6],
              'classifier__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan','correlation', 'minkowski'],
              'classifier__shrink_threshold'  : [None, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,1000.0]
         }
#scv = StratifiedShuffleSplit(labels_train, 1000, random_state = 42)

grid = GridSearchCV(
    pipeline, param_grid=param_grid, cv=10, scoring='f1', n_jobs=-1)

grid.fit(features_train, labels_train)

clf = grid.best_estimator_
test_classifier(clf, my_dataset, features_list)

Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=6, random_state=42,
  svd_solver='auto', tol=0.0, whiten=False)), ('selection', SelectKBest(k=6, score_func=<function f_classif at 0x000000000C695668>)), ('classifier', NearestCentroid(metric='euclidean', shrink_threshold=None))])
	Accuracy: 0.80133	Precision: 0.35250	Recall: 0.58550	F1: 0.44006	F2: 0.51713
	Total predictions: 15000	True positives: 1171	False positives: 2151	False negatives:  829	True negatives: 10849



# Tuning AdaBoostClassifier

In [113]:
pipeline = Pipeline(steps=[('minmaxer', StandardScaler(with_mean=False)),
                           #('selection', SelectKBest()),
                           #('reduce',PCA(n_components=6, random_state=42)),
                           ('classifier', AdaBoostClassifier(random_state=42))
                               ])
params = {
          #'selection__k': [6],
          'classifier__base_estimator' : [DecisionTreeClassifier(max_features=2, criterion="gini", max_leaf_nodes=100)], 
          'classifier__n_estimators': [150,200],
          'classifier__learning_rate' :[0.1, 1.0],
          'classifier__algorithm' : ['SAMME.R', 'SAMME']
               }

#scv = StratifiedShuffleSplit(features_train, 1000, random_state = 42)

# set up gridsearch
grid = GridSearchCV(pipeline, param_grid = params,scoring = 'accuracy', cv=10)
grid.fit(features_train, labels_train)

clf = grid.best_estimator_

test_classifier(clf, my_dataset, features_list)

Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=False, with_std=True)), ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=2, max_leaf_nodes=100, min_impurity_spl...one,
            splitter='best'),
          learning_rate=0.1, n_estimators=150, random_state=42))])
	Accuracy: 0.82233	Precision: 0.32509	Recall: 0.30900	F1: 0.31684	F2: 0.31209
	Total predictions: 15000	True positives:  618	False positives: 1283	False negatives: 1382	True negatives: 11717



# Tuning RandomForestClassifier

In [48]:
pipeline = Pipeline(steps=[('minmaxer', MinMaxScaler(feature_range=(0, 1))),
                           #('reduce_dim', PCA(copy=True, random_state=42)),
                           ('selection', SelectKBest()),
                           ('classifier', RandomForestClassifier())
                          ])

param_grid = {'minmaxer' : [None, MinMaxScaler(feature_range=(0, 1))],
              'selection__k': [6, 10, 'all'],
              #'reduce_dim__n_components': [6],
              'classifier__n_estimators': [100, 150, 200],
              'classifier__criterion'          : ["gini", "entropy"],
              'classifier__max_depth'          : [5,10],
              'classifier__min_samples_split'  : [3,2] ,
              'classifier__bootstrap'          : [True, False]
             }

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=10, scoring='accuracy')

grid.fit(features_train, labels_train)
clf = grid.best_estimator_
test_classifier(clf, my_dataset, features_list)

KeyboardInterrupt: 

# Tuning LogisticRegression

In [97]:
pipeline = Pipeline(steps=[('minmaxer', MinMaxScaler(feature_range=(0, 1))),
                           #('minmaxer', StandardScaler(with_mean=False)),
                           #('reduce_dim', PCA(copy=True, random_state=42)),
                           #('selection', SelectKBest()),
                           ('classifier', LogisticRegression(random_state=42))
                          ])

param_grid = {#'minmaxer' : [StandardScaler(with_mean=False), None],
          #'selection__k': [10, 'all'],
          'classifier__C': [0.05, 0.5, 1, 10, 100, 500, 1000],
          'classifier__solver': ['liblinear'],
          'classifier__penalty': ['l2'], 
          'classifier__class_weight': ['balanced']
         }

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=10, scoring='accuracy')

grid.fit(features_train, labels_train)
clf = grid.best_estimator_
test_classifier(clf, my_dataset, features_list)

Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('classifier', LogisticRegression(C=500, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])
	Accuracy: 0.76653	Precision: 0.30195	Recall: 0.57250	F1: 0.39537	F2: 0.48550
	Total predictions: 15000	True positives: 1145	False positives: 2647	False negatives:  855	True negatives: 10353



In [99]:
test_classifier(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None), my_dataset, features_list, folds = 1000)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None)
	Accuracy: 0.85053	Precision: 0.42234	Recall: 0.32900	F1: 0.36987	F2: 0.34421
	Total predictions: 15000	True positives:  658	False positives:  900	False negatives: 1342	True negatives: 12100



In [31]:
test_classifier(daBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None), my_dataset, features_list, folds = 1000)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None)
	Accuracy: 0.85060	Precision: 0.42261	Recall: 0.32900	F1: 0.36997	F2: 0.34425
	Total predictions: 15000	True positives:  658	False positives:  899	False negatives: 1342	True negatives: 12101



# Tunning parameters

https://www.cs.cornell.edu/~caruana/ctp/ct.papers/caruana.icml06.pdf
https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/