In [1]:
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, test_classifier
from sklearn import model_selection
from time import time
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from sklearn.cross_validation import cross_val_score, StratifiedShuffleSplit
    
# Preprocessing
from sklearn.preprocessing import  MaxAbsScaler, StandardScaler, MinMaxScaler

#Models
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
#from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression,RANSACRegressor, LinearRegression
from sklearn.ensemble import GradientBoostingClassifier

#Feature selection
from sklearn.decomposition import PCA,RandomizedPCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest,chi2, f_regression

#Metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score,classification_report

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV




In [2]:
from sklearn.feature_selection import SelectKBest,chi2, f_classif
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

In [3]:
target_label = 'poi'                
email_features_list = [
    # 'email_address', # remit email address; informational label
    'from_messages',
    'from_poi_to_this_person',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
    'to_messages',
    ]

financial_features_list = [
    'bonus',
    'deferral_payments',
    'deferred_income',
    'director_fees',
    'exercised_stock_options',
    'expenses',
    'loan_advances',
    'long_term_incentive',
    'other',
    'restricted_stock',
    'restricted_stock_deferred',
    'salary'
    #,
    #'total_payments',
    #'total_stock_value',
]
features_list = [target_label] + financial_features_list  
#features_list = [target_label] + financial_features_list  + email_features_list 

In [4]:
with open("P:/Nanodegree/ML/ud120-projects/tools/final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers

outliers = ["TOTAL", "THE TRAVEL AGENCY IN THE PARK", "LOCKHART EUGENE E", "CHAN RONNIE"]
for outlier in outliers :
    data_dict.pop(outlier, 0)


def update_dict_value(key, items, values, dict_obj):
    index = 0
    for item in items:     
        dict_obj[key][item] = values[index]
        index += 1
    return dict_obj
        

    
data_dict = update_dict_value(
              'BELFER ROBERT',
              ['deferred_income','deferral_payments', 'expenses', 
               'director_fees', 'total_payments', 'exercised_stock_options',
               'restricted_stock','restricted_stock_deferred',
               'total_stock_value'], 
              [-102500,'NaN',3285,102500, 3285,'NaN', 44093,-44093,'NaN'],
              data_dict)


data_dict = update_dict_value(
              'BHATNAGAR SANJAY',
              ['other', 'expenses', 'director_fees', 'total_payments',
               'exercised_stock_options','restricted_stock',
               'restricted_stock_deferred','total_stock_value'],
              ['NaN',137864, 'NaN', 137864, 15456290, 
               2604490, -2604490, 15456290],
               data_dict)


In [5]:
for key in data_dict:
    key_values = data_dict[key]

    total_msg = (data_dict[key]['to_messages'] + 
                 data_dict[key]['from_messages'])
    
    total_poi_msg = (data_dict[key]['from_poi_to_this_person'] +
                     data_dict[key]['from_this_person_to_poi'] + 
                     data_dict[key]['shared_receipt_with_poi'])     
        
    try:
        data_dict[key]['message_poi_ratio'] = (float(total_poi_msg) / 
                                           float(total_msg))
    except:
        data_dict[key]['message_poi_ratio'] = "NaN"
        
    try:
        data_dict[key]['message_others_ratio'] = ((float(total_msg) - float(total_poi_msg)) / 
                                          float(total_msg))
    except:
        data_dict[key]['message_others_ratio'] = "NaN"

features_list = features_list + ['message_poi_ratio','message_others_ratio'] 


In [6]:
#features_list = features_list + email_features_list + ['message_poi_ratio','message_others_ratio'] 
features_list = features_list + ['message_poi_ratio','message_others_ratio'] 
# Store to my_dataset for easy export below.
my_dataset = data_dict

# Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, remove_NaN=True, sort_keys = True)
labels, features = targetFeatureSplit(data)


In [7]:

# transformed version of X
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(features)

# Split the dataset into train and test
features_train, features_test, labels_train, labels_test = model_selection.train_test_split(features, 
                                                                                            labels,  
                                                                                            test_size=0.3, 
                                                                                            random_state=42)

Since tje features have quite differente value ranges and some of them are discrete and some of them take continous values, I need to scale them first. Removing mean and dividing the standard deviation o features respectively, this one of the most commonly used preprocessing step.

# Selecting features

For now I don't know exactly the number of features I want to use, so I decided to use PCA for identify the most important features and explain the variance.

In [34]:
for i in range(1,15):
    pca = PCA(svd_solver='auto', n_components=i)
    x = pca.fit(features_train).explained_variance_ratio_.sum()
    print i,":", x    

1 : 0.861553776249
2 : 0.950132015173
3 : 0.972382607444
4 : 0.984291217765
5 : 0.992901638092
6 : 0.997163221659
7 : 0.998629887029
8 : 0.999743067191
9 : 0.99987985018
10 : 0.999974804369
11 : 0.999991386498
12 : 1.0
13 : 1.0
14 : 1.0


In this decomposition, the vector array provided by ratio indicates that most of the information is concentrated into the first 2 compontents . You saw this same sort of result after the factor analysis. it's therefore possible to reduce the entire dataset to just two componentes, providing a reduction of noise and redundant information from the original dataset.

Let's see which variables are the most important:

In [35]:
pca = PCA(n_components=14)
pca.fit_transform(X_train)
pca_df = pd.DataFrame(zip(features_list[1:],
                          np.round(pca.explained_variance_ratio_, decimals=5)*100), 
                      columns=['feature','variance_ratio'])
pca_df.sort_values(by='variance_ratio',ascending = False)

Unnamed: 0,feature,variance_ratio
0,bonus,41.347
1,deferral_payments,22.313
2,deferred_income,12.831
3,director_fees,7.052
4,exercised_stock_options,5.041
5,expenses,4.009
6,loan_advances,2.084
7,long_term_incentive,1.738
8,other,1.27
9,restricted_stock,0.985


I really don't wanna that the features negatively impact my model performance. I'll spend more time on this and try SelecKbest for in this task. 

First of all, I scaled the data with MinMaxScaler (range 0, 1), then I applyied SelectKBest chi squared statistical test, cause this test need non-negative features. After that I used SelectKBest ANOVA F-value statistical test on the raw data, in a way I could compare the results.

In [41]:
scaler = MinMaxScaler(feature_range=(0, 1))
features_scaled = scaler.fit_transform(features)

selector = SelectKBest(chi2, k='all').fit(features_scaled, labels)
k_best = SelectKBest(f_classif,k='all').fit(features, labels)    

# Format values
kbest_pd = pd.DataFrame(zip(features_list[1:],
                            k_best.scores_, # K best score                            
                            selector.scores_), # chi2
                        columns = ['feature','anova', 'chi2'])
print "SelectKBest: "
print kbest_pd.sort_values(by='anova',ascending = False)

SelectKBest: 
                      feature      anova      chi2
4     exercised_stock_options  22.087532  6.246159
0                       bonus  20.524645  5.048256
11                     salary  18.003740  2.989183
2             deferred_income  11.320185  0.338413
12          message_poi_ratio   9.816852  3.371674
14          message_poi_ratio   9.816852  3.371674
7         long_term_incentive   9.772104  2.497366
9            restricted_stock   8.694888  2.463442
6               loan_advances   7.125382  6.634816
5                    expenses   5.287549  1.293600
8                       other   4.143788  1.703679
3               director_fees   1.972788  1.508957
13       message_others_ratio   1.473822  0.392669
15       message_others_ratio   1.473822  0.392669
10  restricted_stock_deferred   0.767702  0.008756
1           deferral_payments   0.236711  0.094344


# Testing models

In [11]:
models = []
# naive_bayes
models.append(('Naive Bayes', GaussianNB()))
# tree
models.append(('DecisionTree', DecisionTreeClassifier(random_state=42)))
# ensemble
models.append(('RandomForest', RandomForestClassifier(n_estimators=100, random_state=42)))
models.append(('GradientBoostingClassifier', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0)))

models.append(('AdaBoost', AdaBoostClassifier(n_estimators=100, random_state=42)))
models.append(('ExtraTreesClassifier', ExtraTreesClassifier(n_estimators=100, random_state=42)))
# linear_model
models.append(('LogisticRegression', LogisticRegression(random_state=42)))
# neighbors
models.append(('KNeighbors', KNeighborsClassifier(n_neighbors=5)))
models.append(('NearestCentroid', NearestCentroid()))

accuracy_model = []
score = []
for name, clf in models:  
    tm = time()
    #print "Start testing classifier:", name

    score = test_classifier(clf, my_dataset, features_list)
    #print score
    accuracy_model.append([name,score[2], score[3], score[1]])
    #print "Successfully completed classifier test."
    #print "Processing time: {0}".format(round(time()-tm, 3))

scores = pd.DataFrame(accuracy_model,
                      columns=('Model', 
                               'Precision', 
                               'Recall',
                               'Accuracy')).sort_values(by='Accuracy',
                                                    ascending = False)

print scores

Start testing classifier: Naive Bayes
Start testing classifier: DecisionTree
Start testing classifier: RandomForest
Start testing classifier: GradientBoostingClassifier
Start testing classifier: AdaBoost
Start testing classifier: ExtraTreesClassifier
Start testing classifier: LogisticRegression
Start testing classifier: KNeighbors
Start testing classifier: NearestCentroid
                        Model  Precision  Recall  Accuracy
7                  KNeighbors   0.705556  0.1270  0.876533
2                RandomForest   0.401084  0.1480  0.856933
4                    AdaBoost   0.418319  0.3060  0.850733
5        ExtraTreesClassifier   0.362011  0.1620  0.850200
8             NearestCentroid   0.376596  0.2655  0.843467
3  GradientBoostingClassifier   0.263037  0.2295  0.811533
1                DecisionTree   0.251683  0.2430  0.802733
6          LogisticRegression   0.164218  0.1900  0.763067
0                 Naive Bayes   0.219135  0.6230  0.653733


As I'm interested in the three metrics recall, precision and accuracy in the test with basic parameters, the Models NearestCentroid and GradientBoostingClassifier had a score close to the expected of 0.3. Even without tuning the AdaBoost reached the metric with a precision of 0.41, recall of 0.30 and a good percentage of acuracia of 85%.

One of the models has reached the goal, but what if we can increase the precision and recall of the other models? Sounds like a good mission, does not it? We, will try.

# Tuning Algorithm Parameters

## 1. GradientBoostingClassifier

In [19]:
pipeline = Pipeline([('scale', MinMaxScaler(feature_range=(0, 1))),
                     ('selection', SelectKBest()),
                     ('classify', GradientBoostingClassifier(random_state=42))])

param_grid = {
        'selection__k': [6],
        'classify__criterion': ['mae', 'friedman_mse', 'mse'],
        'classify__learning_rate':[1.0, 0.1],
        'classify__min_samples_leaf': [3],
        'classify__max_leaf_nodes': [100]
        #'classify__loss' : ['deviance']
    
        #'classify__max_leaf_nodes': [100, 150],
        #'classify__max_features': ['sqrt', 0.50, 0.80] 
        #'classify__subsample': [0.8, 1.0]
        #'classify__loss' : ['exponential']
        
    }
grid = GridSearchCV(
    pipeline, param_grid=param_grid, cv=10, scoring='f1')

grid.fit(features_train, labels_train)
predicted = grid.predict(features_test)
test_classifier(grid.best_estimator_, my_dataset, features_list, print_result=True)

Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selection', SelectKBest(k=6, score_func=<function f_classif at 0x000000000C0835F8>)), ('classify', GradientBoostingClassifier(criterion='mae', init=None, learning_rate=0.1,
              loss='deviance', max_depth=3, max_features=Non...        presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False))])
	Accuracy: 0.84187	Precision: 0.33029	Recall: 0.18100	F1: 0.23385	F2: 0.19899
	Total predictions: 15000	True positives:  362	False positives:  734	False negatives: 1638	True negatives: 12266



(Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selection', SelectKBest(k=6, score_func=<function f_classif at 0x000000000C0835F8>)), ('classify', GradientBoostingClassifier(criterion='mae', init=None, learning_rate=0.1,
               loss='deviance', max_depth=3, max_features=Non...        presort='auto', random_state=42, subsample=1.0, verbose=0,
               warm_start=False))]),
 0.8418666666666667,
 0.3302919708029197,
 0.181,
 0.23385012919896642,
 0.1989885664028144)

## 2. NearestCentroid

In [22]:
pipeline = Pipeline(steps=[('minmaxer', StandardScaler()),
                           ('reduce_dim', PCA(copy=True, random_state=42)),
                           ('selection', SelectKBest()),
                           ('classifier', NearestCentroid())
                          ])

param_grid = {'minmaxer' : [None, StandardScaler()],
              'selection__k': [6],
              'reduce_dim__n_components': [6],
              'classifier__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan','correlation', 'minkowski'],
              'classifier__shrink_threshold'  : [None, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,1000.0]
         }
#scv = StratifiedShuffleSplit(labels_train, 1000, random_state = 42)

grid = GridSearchCV(
    pipeline, param_grid=param_grid, cv=10, scoring='f1', n_jobs=-1)

grid.fit(features_train, labels_train)

clf = grid.best_estimator_
test_classifier(clf, my_dataset, features_list, print_result=True)

Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=6, random_state=42,
  svd_solver='auto', tol=0.0, whiten=False)), ('selection', SelectKBest(k=6, score_func=<function f_classif at 0x000000000C0835F8>)), ('classifier', NearestCentroid(metric='euclidean', shrink_threshold=None))])
	Accuracy: 0.80133	Precision: 0.35250	Recall: 0.58550	F1: 0.44006	F2: 0.51713
	Total predictions: 15000	True positives: 1171	False positives: 2151	False negatives:  829	True negatives: 10849



(Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=6, random_state=42,
   svd_solver='auto', tol=0.0, whiten=False)), ('selection', SelectKBest(k=6, score_func=<function f_classif at 0x000000000C0835F8>)), ('classifier', NearestCentroid(metric='euclidean', shrink_threshold=None))]),
 0.8013333333333333,
 0.3524984948826008,
 0.5855,
 0.4400601277715145,
 0.5171347818406643)

## 3. Last but not least...  AdaBoostClassifier

In [23]:
pipeline = Pipeline(steps=[('minmaxer', StandardScaler(with_mean=False)),
                           #('selection', SelectKBest()),
                           #('reduce',PCA(n_components=6, random_state=42)),
                           ('classifier', AdaBoostClassifier(random_state=42))
                               ])
params = {
          #'selection__k': [6],
          'classifier__base_estimator' : [DecisionTreeClassifier(max_features=2, criterion="gini", max_leaf_nodes=100)], 
          'classifier__n_estimators': [150,200],
          'classifier__learning_rate' :[0.1, 1.0],
          'classifier__algorithm' : ['SAMME.R', 'SAMME']
               }

#scv = StratifiedShuffleSplit(features_train, 1000, random_state = 42)

# set up gridsearch
grid = GridSearchCV(pipeline, param_grid = params,scoring = 'accuracy', cv=10)
grid.fit(features_train, labels_train)

clf = grid.best_estimator_

test_classifier(clf, my_dataset, features_list, print_result=True)

Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=False, with_std=True)), ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=2, max_leaf_nodes=100, min_impurity_spl...one,
            splitter='best'),
          learning_rate=0.1, n_estimators=150, random_state=42))])
	Accuracy: 0.82233	Precision: 0.32509	Recall: 0.30900	F1: 0.31684	F2: 0.31209
	Total predictions: 15000	True positives:  618	False positives: 1283	False negatives: 1382	True negatives: 11717



(Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=False, with_std=True)), ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
           base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=2, max_leaf_nodes=100, min_impurity_spl...one,
             splitter='best'),
           learning_rate=0.1, n_estimators=150, random_state=42))]),
 0.8223333333333334,
 0.3250920568122041,
 0.309,
 0.31684183542681366,
 0.3120896879103121)

## Now I'm a bit curious...
about how much other models can be improved. Let's continue the tests.

To facilitate the tuning task, I preset 3 estimator with a list of transformer objects.

In [55]:
scale_pca_kbest = FeatureUnion([('scale', MinMaxScaler(feature_range=(0, 1))),
                                ("pca", PCA()),
                                ("selection",SelectKBest())
                               ])

scale_kbest = FeatureUnion([('scale', MinMaxScaler(feature_range=(0, 1))),
                            ("selection",SelectKBest())
                           ])

scale_pca = FeatureUnion([('scale', MinMaxScaler(feature_range=(0, 1))),
                          ("pca", PCA())
                         ])

## 4. RandomForestClassifier

In [14]:
pipeline =   Pipeline([('scale', MinMaxScaler(feature_range=(0, 1))),
                       ('select', SelectKBest()),   
                       ('classify', RandomForestClassifier())
                      ])

param_grid = {'scale': [None, MaxAbsScaler()],
              'select__k': [6, 10],
              "classify__max_depth": [5, 3, 1],
              "classify__max_features": [2,1],
              "classify__min_samples_leaf": [1, 3, 10,15],
              "classify__bootstrap": [True, False],
              "classify__criterion": ["gini", "entropy"]}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=10, scoring='recall')

grid.fit(features_train, labels_train)

clf =  grid.best_estimator_
test_classifier(clf, my_dataset, features_list)

  f = msb / msw


(Pipeline(steps=[('scale', None), ('select', SelectKBest(k=6, score_func=<function f_classif at 0x000000000C0835F8>)), ('classify', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=5, max_features=2, max_leaf_nodes=None,
             min_impurity_split=1e-07,...imators=10, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=False))]),
 0.8510666666666666,
 0.30236486486486486,
 0.0895,
 0.1381172839506173,
 0.10416666666666667)

## 5. KNeighborsClassifier

In [16]:
pipeline = Pipeline(steps=[('minmaxer', StandardScaler(with_mean=False)),
                           #('reduce_dim', PCA(copy=True, random_state=42)),
                           #('selection', SelectKBest()),
                           ('classifier', KNeighborsClassifier())
                          ])

param_grid = {'minmaxer' : [None, StandardScaler(with_mean=False)],
              #'selection__k': [6],
              #'reduce_dim__n_components': [6],
              #'classifier__metric': ["euclidean", "cityblock", 'minkowski'],
              'classifier__n_neighbors' : [4,6,10,14],
              'classifier__weights' : ['uniform','distance']
             }

grid = GridSearchCV(
    pipeline, param_grid=param_grid, cv=10, scoring='f1')

grid.fit(features_train, labels_train)

clf = grid.best_estimator_
test_classifier(clf, my_dataset, features_list, print_result=True)

Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=False, with_std=True)), ('classifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='distance'))])
	Accuracy: 0.84100	Precision: 0.31220	Recall: 0.16000	F1: 0.21157	F2: 0.17729
	Total predictions: 15000	True positives:  320	False positives:  705	False negatives: 1680	True negatives: 12295



(Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=False, with_std=True)), ('classifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=4, p=2,
            weights='distance'))]),
 0.841,
 0.3121951219512195,
 0.16,
 0.2115702479338843,
 0.17728531855955681)

Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=6, random_state=42,
  svd_solver='auto', tol=0.0, whiten=False)), ('selection', SelectKBest(k=6, score_func=<function f_classif at 0x000000000C695668>)), ('classifier', NearestCentroid(metric='euclidean', shrink_threshold=None))])
	Accuracy: 0.80133	Precision: 0.35250	Recall: 0.58550	F1: 0.44006	F2: 0.51713
	Total predictions: 15000	True positives: 1171	False positives: 2151	False negatives:  829	True negatives: 10849



Pipeline(steps=[('minmaxer', StandardScaler(copy=True, with_mean=False, with_std=True)), ('classifier', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=2, max_leaf_nodes=100, min_impurity_spl...one,
            splitter='best'),
          learning_rate=0.1, n_estimators=150, random_state=42))])
	Accuracy: 0.82233	Precision: 0.32509	Recall: 0.30900	F1: 0.31684	F2: 0.31209
	Total predictions: 15000	True positives:  618	False positives: 1283	False negatives: 1382	True negatives: 11717



# Tuning RandomForestClassifier

In [48]:
pipeline = Pipeline(steps=[('minmaxer', MinMaxScaler(feature_range=(0, 1))),
                           #('reduce_dim', PCA(copy=True, random_state=42)),
                           ('selection', SelectKBest()),
                           ('classifier', RandomForestClassifier())
                          ])

param_grid = {'minmaxer' : [None, MinMaxScaler(feature_range=(0, 1))],
              'selection__k': [6, 10, 'all'],
              #'reduce_dim__n_components': [6],
              'classifier__n_estimators': [100, 150, 200],
              'classifier__criterion'          : ["gini", "entropy"],
              'classifier__max_depth'          : [5,10],
              'classifier__min_samples_split'  : [3,2] ,
              'classifier__bootstrap'          : [True, False]
             }

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=10, scoring='accuracy')

grid.fit(features_train, labels_train)
clf = grid.best_estimator_
test_classifier(clf, my_dataset, features_list)

KeyboardInterrupt: 

# 6. LogisticRegression

In [18]:
pipeline = Pipeline(steps=[('minmaxer', MinMaxScaler(feature_range=(0, 1))),
                           #('minmaxer', StandardScaler(with_mean=False)),
                           #('reduce_dim', PCA(copy=True, random_state=42)),
                           #('selection', SelectKBest()),
                           ('classifier', LogisticRegression(random_state=42))
                          ])

param_grid = {#'minmaxer' : [StandardScaler(with_mean=False), None],
          #'selection__k': [10, 'all'],
          'classifier__C': [0.05, 0.5, 1, 10, 100, 500, 1000],
          'classifier__solver': ['liblinear'],
          'classifier__penalty': ['l2'], 
          'classifier__class_weight': ['balanced']
         }

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=10, scoring='accuracy')

grid.fit(features_train, labels_train)
clf = grid.best_estimator_
test_classifier(clf, my_dataset, features_list, print_result=True)

Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('classifier', LogisticRegression(C=500, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])
	Accuracy: 0.76653	Precision: 0.30195	Recall: 0.57250	F1: 0.39537	F2: 0.48550
	Total predictions: 15000	True positives: 1145	False positives: 2647	False negatives:  855	True negatives: 10353



(Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('classifier', LogisticRegression(C=500, class_weight='balanced', dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
 0.7665333333333333,
 0.30195147679324896,
 0.5725,
 0.3953729281767956,
 0.48549864314789687)

In [99]:
test_classifier(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None), my_dataset, features_list, folds = 1000)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None)
	Accuracy: 0.85053	Precision: 0.42234	Recall: 0.32900	F1: 0.36987	F2: 0.34421
	Total predictions: 15000	True positives:  658	False positives:  900	False negatives: 1342	True negatives: 12100



In [31]:
test_classifier(daBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None), my_dataset, features_list, folds = 1000)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None)
	Accuracy: 0.85060	Precision: 0.42261	Recall: 0.32900	F1: 0.36997	F2: 0.34425
	Total predictions: 15000	True positives:  658	False positives:  899	False negatives: 1342	True negatives: 12101



# Tunning parameters

https://www.cs.cornell.edu/~caruana/ctp/ct.papers/caruana.icml06.pdf
https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/