# **Setup**

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.metrics import plot_roc_curve
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit

import pandas as pd
import numpy as np

In [59]:
df = pd.read_csv('preprocessed_spam_ham_phishing.csv')

**Remove phishing emails, only consider ham and spam:**

In [60]:
df = df[df['label'] != 2]
print(df.shape)

(75419, 95)


In [62]:
df['label'].value_counts()

1    50199
0    25220
Name: label, dtype: int64

In [63]:
df_Y = df['label']
df_X = df.drop('label', axis=1)

In [64]:
feature_list = df_X.columns

In [65]:
df_X.shape

(75419, 94)

**Apply a standard scaler to the full data set:**

In [66]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(df_X)
df_X = scaler.transform(df_X)
df_X = pd.DataFrame(df_X, columns=feature_list)

# **Hyperparameter Tuning:**

Feature selection is done using PCA. PCA is a feature transformation process that builds new features from projections (linear combinations) of the old ones.

In [67]:
def get_best_params(pipe, param_grid_list):
    
  grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=5, n_jobs=-1)

  # Passes the entire data set (it creates its own train and test set while doing CV)
  grid.fit(df_X, df_Y)

  return grid

**Use just a portion of the data to speed things up:**

In [68]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

In [69]:
for train_index, test_index in sss.split(df_X, df_Y):
    df_X = df_X.iloc[test_index]
    df_Y = df_Y.iloc[test_index]

In [70]:
print(df_X.shape)
print(df_Y.shape)

(15084, 94)
(15084,)


In [71]:
df_Y.value_counts()

1    10040
0     5044
Name: label, dtype: int64

**Random Forest:**

In [72]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA()),
                ("rf", RandomForestClassifier())
                ])

param_grid_list = {'reduce_dims__n_components': [40], 
                  'rf__n_estimators': [100, 150, 200],
                  'rf__criterion': ['entropy', 'gini'],
                  'rf__min_samples_split': [2, 3, 4],
                  'rf__min_samples_leaf': [1, 2, 3],
                  'rf__max_features': ['auto', 'sqrt', 'log2']}

grid = get_best_params(pipe, param_grid_list)
rf_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)
rf_df[rf_df['rank_test_score'] <= 5].head(5)

{'reduce_dims__n_components': 40, 'rf__criterion': 'entropy', 'rf__max_features': 'auto', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 100} 

Pipeline(steps=[('scale', StandardScaler()),
                ('reduce_dims', PCA(n_components=40)),
                ('rf', RandomForestClassifier(criterion='entropy'))])
Wall time: 19min 11s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reduce_dims__n_components,param_rf__criterion,param_rf__max_features,param_rf__min_samples_leaf,param_rf__min_samples_split,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,9.493832,0.197424,0.066022,0.003958,40,entropy,auto,1,2,100,"{'reduce_dims__n_components': 40, 'rf__criteri...",0.99768,0.995691,0.995691,0.997348,0.996353,0.996553,0.000828,1
27,8.480269,0.149338,0.05625,0.002326,40,entropy,sqrt,1,2,100,"{'reduce_dims__n_components': 40, 'rf__criteri...",0.998343,0.994697,0.99536,0.997017,0.996021,0.996287,0.001282,3
28,12.526362,0.073442,0.077992,0.001323,40,entropy,sqrt,1,2,150,"{'reduce_dims__n_components': 40, 'rf__criteri...",0.998343,0.995028,0.994697,0.997348,0.996021,0.996287,0.001381,3
33,8.423822,0.136497,0.056753,0.002155,40,entropy,sqrt,1,4,100,"{'reduce_dims__n_components': 40, 'rf__criteri...",0.997348,0.99536,0.995691,0.997017,0.996021,0.996287,0.000767,3
60,7.520952,0.186798,0.056355,0.000997,40,entropy,log2,1,4,100,"{'reduce_dims__n_components': 40, 'rf__criteri...",0.997348,0.99536,0.996023,0.997348,0.996021,0.99642,0.000796,2


**MLP:**

In [73]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA()),
                ("mlp", MLPClassifier())
                ])

param_grid_list = {'reduce_dims__n_components': [40], 
                  'mlp__hidden_layer_sizes': [(20,), (20,20), (40,), (40,40)],
                   'mlp__activation': ['tanh', 'relu'],
                   'mlp__learning_rate': ['constant', 'adaptive'],
                   'mlp__solver': ['adam', 'sgd'],
                   'mlp__alpha': [0.0001, 0.001, 0.01]}

grid = get_best_params(pipe, param_grid_list)
res = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)
res[res['rank_test_score'] <= 5].head(5)

{'mlp__activation': 'tanh', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (20,), 'mlp__learning_rate': 'constant', 'mlp__solver': 'adam', 'reduce_dims__n_components': 40} 

Pipeline(steps=[('scale', StandardScaler()),
                ('reduce_dims', PCA(n_components=40)),
                ('mlp',
                 MLPClassifier(activation='tanh', alpha=0.01,
                               hidden_layer_sizes=(20,)))])
Wall time: 9min 7s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__activation,param_mlp__alpha,param_mlp__hidden_layer_sizes,param_mlp__learning_rate,param_mlp__solver,param_reduce_dims__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,6.918882,0.52217,0.015558,0.000797,tanh,0.0001,"(40,)",adaptive,adam,40,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",0.996354,0.996023,0.997348,0.99768,0.996021,0.996685,0.000696,3
24,6.625355,0.498495,0.015358,0.000796,tanh,0.001,"(40,)",constant,adam,40,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",0.996685,0.996354,0.997017,0.997348,0.996353,0.996751,0.000387,2
32,5.771469,0.452068,0.012779,0.000407,tanh,0.01,"(20,)",constant,adam,40,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.01...",0.997017,0.996685,0.996354,0.997348,0.996353,0.996751,0.000387,1
40,7.111079,0.444515,0.014561,0.000488,tanh,0.01,"(40,)",constant,adam,40,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.01...",0.997348,0.99536,0.997348,0.996685,0.996353,0.996619,0.000738,4
42,6.141589,0.724573,0.015757,0.001322,tanh,0.01,"(40,)",adaptive,adam,40,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.01...",0.997017,0.995691,0.997017,0.996685,0.996353,0.996553,0.000496,5


**Logistic Regression:**

In [74]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA()),
                ("lr", LogisticRegression())
                ])

param_grid_list = {'reduce_dims__n_components': [40], 
                  'lr__max_iter': [500],
                  'lr__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                  'lr__fit_intercept': [True, False],
                  'lr__tol': [0.0001, 0.001],
                  'lr__penalty': ['l1', 'l2', 'elasticnet'],
                  'lr__C': [0.1, 1, 10]}

grid = get_best_params(pipe, param_grid_list)
print(grid.best_params_, '\n')
print(grid.best_estimator_)
lr_df = pd.DataFrame(grid.cv_results_)
lr_df[(lr_df['rank_test_score'] < 5)].head(5)

 0.95246624 0.95220108 0.95419001 0.95392482 0.95399109 0.9539248
 0.95399111 0.9539248  0.95392482 0.95399109        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.95259882 0.95246624 0.95392482 0.95412378 0.9540574  0.95379224
 0.95399109 0.95385851 0.9539248  0.95392482        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.95518468 0.9549194  0.95491938 0.95485309 0.95511834 0.9547868
 0.95505196 0.95491938 0.95498567 0.95485309        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.95465415 0.95538344 0.95491938 0.95491938 0.9547868  0.9547868
 0.95485309 0.95485309 0.95491938 0.95478678        nan        nan
        nan        nan        nan        nan        nan        na

{'lr__C': 1, 'lr__fit_intercept': False, 'lr__max_iter': 500, 'lr__penalty': 'l2', 'lr__solver': 'saga', 'lr__tol': 0.001, 'reduce_dims__n_components': 40} 

Pipeline(steps=[('scale', StandardScaler()),
                ('reduce_dims', PCA(n_components=40)),
                ('lr',
                 LogisticRegression(C=1, fit_intercept=False, max_iter=500,
                                    solver='saga', tol=0.001))])
Wall time: 2min 39s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lr__C,param_lr__fit_intercept,param_lr__max_iter,param_lr__penalty,param_lr__solver,param_lr__tol,param_reduce_dims__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
79,0.898731,0.21355,0.011569,0.001954,0.1,False,500,l1,saga,0.001,40,"{'lr__C': 0.1, 'lr__fit_intercept': False, 'lr...",0.94995,0.955585,0.954591,0.957574,0.959218,0.955383,0.003152,2
178,0.851902,0.070288,0.01875,0.005367,1.0,False,500,l2,lbfgs,0.0001,40,"{'lr__C': 1, 'lr__fit_intercept': False, 'lr__...",0.948956,0.955916,0.954922,0.958568,0.958554,0.955383,0.003522,3
183,1.428609,0.237449,0.013165,0.007501,1.0,False,500,l2,saga,0.001,40,"{'lr__C': 1, 'lr__fit_intercept': False, 'lr__...",0.949619,0.956248,0.954922,0.958568,0.958554,0.955582,0.003292,1
204,1.558232,0.16234,0.014364,0.003711,10.0,True,100,l2,sag,0.0001,40,"{'lr__C': 10, 'lr__fit_intercept': True, 'lr__...",0.950945,0.955585,0.954259,0.957574,0.958554,0.955383,0.002678,4


**SVM:**

In [75]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA()),
                ("svc", SVC())
                ])

param_grid_list = {'reduce_dims__n_components': [40], 
                  'svc__C': [0.1, 1, 10],
                  'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                  'svc__degree': [3, 4, 5],
                  'svc__tol': [0.001, 0.0001, 0.01]}

grid = get_best_params(pipe, param_grid_list)
print(grid.best_params_, '\n')
print(grid.best_estimator_)
svm_df = pd.DataFrame(grid.cv_results_)
svm_df[(svm_df['rank_test_score'] < 5)].head(5)

{'reduce_dims__n_components': 40, 'svc__C': 10, 'svc__degree': 4, 'svc__kernel': 'rbf', 'svc__tol': 0.001} 

Pipeline(steps=[('scale', StandardScaler()),
                ('reduce_dims', PCA(n_components=40)),
                ('svc', SVC(C=10, degree=4))])
Wall time: 6min 33s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reduce_dims__n_components,param_svc__C,param_svc__degree,param_svc__kernel,param_svc__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
78,1.470495,0.070018,0.395293,0.018982,40,10,3,rbf,0.001,"{'reduce_dims__n_components': 40, 'svc__C': 10...",0.995028,0.993702,0.995028,0.997017,0.994695,0.995094,0.001077,3
79,1.494684,0.057404,0.395252,0.008569,40,10,3,rbf,0.0001,"{'reduce_dims__n_components': 40, 'svc__C': 10...",0.995028,0.993702,0.995028,0.997017,0.994695,0.995094,0.001077,3
80,1.50929,0.04485,0.378294,0.009862,40,10,3,rbf,0.01,"{'reduce_dims__n_components': 40, 'svc__C': 10...",0.995028,0.993702,0.995028,0.997017,0.994695,0.995094,0.001077,3
90,1.480424,0.057797,0.394944,0.030597,40,10,4,rbf,0.001,"{'reduce_dims__n_components': 40, 'svc__C': 10...",0.995028,0.993702,0.99536,0.997017,0.994695,0.99516,0.001081,1
91,1.465797,0.045985,0.400036,0.027837,40,10,4,rbf,0.0001,"{'reduce_dims__n_components': 40, 'svc__C': 10...",0.995028,0.993702,0.99536,0.997017,0.994695,0.99516,0.001081,1


**KNN:**

In [76]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA()),
                ("knn", KNeighborsClassifier())
                ])

param_grid_list = {'reduce_dims__n_components': [40], 
                  'knn__n_neighbors': [1, 2, 5, 10, 20],
                  'knn__weights': ['uniform', 'distance'],
                  'knn__p': [1, 2],
                  'knn__algorithm': ['ball_tree', 'kd_tree', 'brute'],
                  'knn__leaf_size': [15, 30, 45]}

grid = get_best_params(pipe, param_grid_list)
knn_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)
knn_df[(knn_df['rank_test_score'] < 5)].head(5)

{'knn__algorithm': 'ball_tree', 'knn__leaf_size': 30, 'knn__n_neighbors': 1, 'knn__p': 1, 'knn__weights': 'uniform', 'reduce_dims__n_components': 40} 

Pipeline(steps=[('scale', StandardScaler()),
                ('reduce_dims', PCA(n_components=40)),
                ('knn',
                 KNeighborsClassifier(algorithm='ball_tree', n_neighbors=1,
                                      p=1))])
Wall time: 6min 26s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__algorithm,param_knn__leaf_size,param_knn__n_neighbors,param_knn__p,param_knn__weights,param_reduce_dims__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,0.497887,0.016867,4.92535,0.167815,ball_tree,30,1,1,uniform,40,"{'knn__algorithm': 'ball_tree', 'knn__leaf_siz...",0.997017,0.994365,0.996354,0.997017,0.998342,0.996619,0.001299,1
21,0.499345,0.015271,4.81777,0.185845,ball_tree,30,1,1,distance,40,"{'knn__algorithm': 'ball_tree', 'knn__leaf_siz...",0.996685,0.994365,0.996354,0.997017,0.998342,0.996553,0.001285,3
25,0.496105,0.01703,5.038324,0.168992,ball_tree,30,2,1,distance,40,"{'knn__algorithm': 'ball_tree', 'knn__leaf_siz...",0.996685,0.994365,0.996354,0.997017,0.998342,0.996553,0.001285,3
41,0.494241,0.013142,4.808166,0.187206,ball_tree,45,1,1,distance,40,"{'knn__algorithm': 'ball_tree', 'knn__leaf_siz...",0.996685,0.994365,0.996354,0.997017,0.998342,0.996553,0.001285,3
45,0.502106,0.016983,5.037473,0.161937,ball_tree,45,2,1,distance,40,"{'knn__algorithm': 'ball_tree', 'knn__leaf_siz...",0.996685,0.994365,0.996354,0.997017,0.998342,0.996553,0.001285,3


**Decision Tree:**

In [77]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA()),
                ("dt", DecisionTreeClassifier())
                ])

param_grid_list = {'reduce_dims__n_components': [40], 
                  'dt__criterion': ['entropy', 'gini'],
                  'dt__min_samples_split': [2, 3, 4],
                  'dt__min_samples_leaf': [1, 2, 3],
                  'dt__ccp_alpha': [0, 0.005, 0.01, 0.025, 0.05, 0.1]}

grid = get_best_params(pipe, param_grid_list)
print(grid.best_params_, '\n')
print(grid.best_estimator_)
dt_df = pd.DataFrame(grid.cv_results_)
dt_df[dt_df['rank_test_score'] <= 5].head(5)

{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy', 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'reduce_dims__n_components': 40} 

Pipeline(steps=[('scale', StandardScaler()),
                ('reduce_dims', PCA(n_components=40)),
                ('dt',
                 DecisionTreeClassifier(ccp_alpha=0, criterion='entropy'))])
Wall time: 1min 6s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dt__ccp_alpha,param_dt__criterion,param_dt__min_samples_leaf,param_dt__min_samples_split,param_reduce_dims__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.178051,0.018695,0.010772,0.002129,0,entropy,1,2,40,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.993039,0.989062,0.991051,0.990719,0.990385,0.990851,0.001286,1
1,1.160915,0.034575,0.010772,0.002309,0,entropy,1,3,40,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.989062,0.988399,0.989725,0.992708,0.990053,0.989989,0.001473,3
2,1.14339,0.019874,0.012567,0.002646,0,entropy,1,4,40,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.992377,0.989393,0.990388,0.992045,0.989721,0.990785,0.001212,2
3,1.094682,0.023315,0.009376,0.001352,0,entropy,2,2,40,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.992045,0.987073,0.990388,0.989393,0.987732,0.989326,0.001797,4
11,1.068743,0.200625,0.010373,0.001017,0,gini,1,4,40,"{'dt__ccp_alpha': 0, 'dt__criterion': 'gini', ...",0.990719,0.987736,0.991714,0.991714,0.984748,0.989326,0.002714,5


**Naive Bayes (Gaussian):**

In [78]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA()),
                ("gnb", GaussianNB())
                ])

param_grid_list = {'reduce_dims__n_components': [40], 
                  'gnb__var_smoothing': [1E-9, 1E-10, 1E-8]}

grid = get_best_params(pipe, param_grid_list)
print(grid.best_params_, '\n')
print(grid.best_estimator_)
gnb_df = pd.DataFrame(grid.cv_results_)
gnb_df[gnb_df['rank_test_score'] <= 5].head(5)

{'gnb__var_smoothing': 1e-08, 'reduce_dims__n_components': 40} 

Pipeline(steps=[('scale', StandardScaler()),
                ('reduce_dims', PCA(n_components=40)),
                ('gnb', GaussianNB(var_smoothing=1e-08))])
Wall time: 1.57 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gnb__var_smoothing,param_reduce_dims__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.59102,0.00687,0.016755,0.001163,1e-09,40,"{'gnb__var_smoothing': 1e-09, 'reduce_dims__n_...",0.876036,0.87471,0.880013,0.876699,0.875,0.876492,0.0019,2
1,0.573466,0.020167,0.017952,0.003154,1e-10,40,"{'gnb__var_smoothing': 1e-10, 'reduce_dims__n_...",0.875704,0.875373,0.880013,0.876699,0.874668,0.876492,0.001878,3
2,0.553918,0.007068,0.01137,0.003128,1e-08,40,"{'gnb__var_smoothing': 1e-08, 'reduce_dims__n_...",0.876036,0.875041,0.879682,0.87703,0.875,0.876558,0.001731,1


**AdaBoost:**

In [79]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA()),
                ("ab", AdaBoostClassifier())
                ])

param_grid_list = {'reduce_dims__n_components': [40], 
                  'ab__n_estimators': [50, 100, 150, 200],
                  'ab__learning_rate': [0.95, 1, 1.05, 1.25, 1.5, 1.75, 2],
                  'ab__algorithm': ['SAMME', 'SAMME.R']}

grid = get_best_params(pipe, param_grid_list)
print(grid.best_params_, '\n')
print(grid.best_estimator_)
ab_df = pd.DataFrame(grid.cv_results_)
ab_df[ab_df['rank_test_score'] <= 5].head(5)

{'ab__algorithm': 'SAMME.R', 'ab__learning_rate': 1.5, 'ab__n_estimators': 200, 'reduce_dims__n_components': 40} 

Pipeline(steps=[('scale', StandardScaler()),
                ('reduce_dims', PCA(n_components=40)),
                ('ab',
                 AdaBoostClassifier(learning_rate=1.5, n_estimators=200))])
Wall time: 6min 34s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ab__algorithm,param_ab__learning_rate,param_ab__n_estimators,param_reduce_dims__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
31,16.931281,0.069902,0.203656,0.010159,SAMME.R,0.95,200,40,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.989393,0.990056,0.991051,0.990719,0.9937,0.990984,0.001473,3
35,16.977513,0.096812,0.206449,0.018118,SAMME.R,1.0,200,40,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.989062,0.988731,0.991051,0.992377,0.993369,0.990918,0.00181,4
39,16.877358,0.08095,0.196779,0.002273,SAMME.R,1.05,200,40,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.990056,0.989725,0.990388,0.991714,0.992042,0.990785,0.000923,5
43,16.945272,0.071996,0.205051,0.0085,SAMME.R,1.25,200,40,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.992708,0.990719,0.991714,0.994697,0.991379,0.992243,0.001385,2
47,16.945281,0.111864,0.207045,0.019042,SAMME.R,1.5,200,40,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.992708,0.992045,0.992377,0.992708,0.992706,0.992509,0.000265,1


**GradientBoostingClassifier:**

In [80]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA()),
                ("gbc", GradientBoostingClassifier())
                ])

param_grid_list = {'reduce_dims__n_components': [40],
                   'gbc__max_features': ['auto', 'sqrt', 'log2'],
                   'gbc__learning_rate': [0.05, 0.1, 0.2, 0.25, 0.30, 0.35, 0.40, 0.5, 0.6, 0.7, 0.9],
                   'gbc__n_estimators': [100, 200]
                  }

grid = get_best_params(pipe, param_grid_list)
print(grid.best_params_, '\n')
print(grid.best_estimator_)
gbc_df = pd.DataFrame(grid.cv_results_)
gbc_df[gbc_df['rank_test_score'] <= 5].head(5)

{'gbc__learning_rate': 0.4, 'gbc__max_features': 'log2', 'gbc__n_estimators': 200, 'reduce_dims__n_components': 40} 

Pipeline(steps=[('scale', StandardScaler()),
                ('reduce_dims', PCA(n_components=40)),
                ('gbc',
                 GradientBoostingClassifier(learning_rate=0.4,
                                            max_features='log2',
                                            n_estimators=200))])
Wall time: 9min 27s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gbc__learning_rate,param_gbc__max_features,param_gbc__n_estimators,param_reduce_dims__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
27,6.970852,0.254637,0.023737,0.005101,0.3,sqrt,200,40,"{'gbc__learning_rate': 0.3, 'gbc__max_features...",0.996354,0.995028,0.995691,0.998011,0.995358,0.996089,0.001057,2
29,6.116166,0.174998,0.023937,0.0026,0.3,log2,200,40,"{'gbc__learning_rate': 0.3, 'gbc__max_features...",0.996023,0.99536,0.996023,0.996354,0.995358,0.995823,0.000398,5
41,6.745234,0.450648,0.024336,0.003,0.4,log2,200,40,"{'gbc__learning_rate': 0.4, 'gbc__max_features...",0.996685,0.99536,0.995691,0.997348,0.996021,0.996221,0.000714,1
45,6.831288,0.056587,0.021753,0.000735,0.5,sqrt,200,40,"{'gbc__learning_rate': 0.5, 'gbc__max_features...",0.994697,0.993702,0.996354,0.998674,0.99569,0.995823,0.001685,4
53,5.883035,0.023411,0.021343,0.000489,0.6,log2,200,40,"{'gbc__learning_rate': 0.6, 'gbc__max_features...",0.996023,0.99536,0.995691,0.996354,0.996021,0.99589,0.000338,3
