In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

from utils import graph_overfit

import numpy as np
import pandas as pd
import dill as pkl
import os

In [None]:
# Load the pre-split train/test data and the saved cross-validation folds from ./data.
# allow_pickle=True lets np.load unpickle arbitrary objects, so only point this
# at files you trust.
X_train, y_train, X_test, y_test = (
    np.load(os.path.join('data', file_name), allow_pickle=True)
    for file_name in (
        'X_train_transformed.npy',
        'y_train.pkl',
        'X_test_transformed.npy',
        'y_test.pkl',
    )
)

X_folds, y_folds = (
    np.load(os.path.join('data', file_name), allow_pickle=True)
    for file_name in ('X_folds_tuple.npy', 'y_folds_tuple.npy')
)

# One array per fold, built from that fold's `.index`.
# NOTE(review): these arrays are later handed to the search as CV indices into
# X_train -- confirm the fold indices are row positions of X_train, not labels.
fold_ids = [np.array(fold.index) for fold in X_folds]

In [None]:
def custom_cv_folds(fold_ids):
    """Yield leave-one-fold-out ``(train_ids, test_ids)`` pairs from predefined folds.

    Parameters
    ----------
    fold_ids : sequence of array-like
        One collection of sample indices per fold.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        For each fold ``n`` in order: the concatenation of every other fold's
        indices (kept in fold order) as the training indices, and fold ``n``'s
        own indices as the test indices.

    Raises
    ------
    ValueError
        Raised by ``np.concatenate`` when only one fold is supplied, because
        there is nothing left to train on.
    """
    # Keep the folds as a plain list of arrays. Packing them into one
    # object-dtype array would collapse equal-length folds into a 2-D object
    # matrix and make the yielded indices dtype=object instead of integer.
    folds = [np.asarray(ids) for ids in fold_ids]
    for n, test_ids in enumerate(folds):
        # Everything except fold n, preserving the original fold order.
        train_ids = np.concatenate(folds[:n] + folds[n + 1:])
        yield train_ids, test_ids

In [None]:
# Randomized hyper-parameter search for AdaBoost over the predefined folds.
# The splits are materialised into a list so `custom_cv` can be consumed by
# more than one fit; a bare generator is exhausted after a single pass.
custom_cv = list(custom_cv_folds(fold_ids))
# Seed the estimator and the search so the sampled candidates and the fitted
# ensembles are repeatable on Restart & Run All.
ada_clf = AdaBoostClassifier(random_state=42)

ada_grid = {'n_estimators': [100, 500, 1000],
            'learning_rate': [0.001, 0.01, 0.1, 1]}

# n_iter is left at its default, so a random subset of the 12 grid
# combinations is evaluated rather than the full grid.
clf = RandomizedSearchCV(ada_clf, ada_grid, n_jobs=-1, cv=custom_cv,
                         return_train_score=True, random_state=42)
search = clf.fit(X_train, y_train)

In [None]:
# Tabulate the AdaBoost search results: one row per sampled parameter setting.
# NOTE(review): the saved output lists n_estimators values (50, 200) that are not
# in ada_grid, so it appears to come from an earlier grid -- re-run to refresh it.
pd.DataFrame(search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,...,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,mean_train_score,std_train_score
0,5.352843,0.123897,0.079994,0.014729,200,0.01,"{'n_estimators': 200, 'learning_rate': 0.01}",0.4375,0.42623,0.453488,...,0.460043,0.466761,0.465263,0.470671,0.470546,0.467138,0.467569,0.475867,0.467982,0.004312
1,26.437584,1.121902,0.3656,0.03687,1000,0.001,"{'n_estimators': 1000, 'learning_rate': 0.001}",0.4375,0.415301,0.430233,...,0.454284,0.447666,0.444211,0.455124,0.461207,0.448057,0.45474,0.451735,0.452128,0.005027
2,1.284093,0.049551,0.013251,0.001488,50,0.01,"{'n_estimators': 50, 'learning_rate': 0.01}",0.413462,0.377049,0.406977,...,0.438445,0.445545,0.432281,0.436042,0.451868,0.436042,0.447612,0.42911,0.439618,0.007419
3,1.275668,0.036113,0.016206,0.010238,50,0.001,"{'n_estimators': 50, 'learning_rate': 0.001}",0.307692,0.355191,0.360465,...,0.37581,0.4314,0.407719,0.40636,0.374282,0.44311,0.453314,0.399698,0.411462,0.027346
4,1.28361,0.050313,0.019977,0.011147,50,1.0,"{'n_estimators': 50, 'learning_rate': 1}",0.475962,0.453552,0.453488,...,0.538517,0.548798,0.527719,0.560424,0.5625,0.560424,0.546686,0.541478,0.548318,0.011534
5,26.276761,0.998001,0.340093,0.030653,1000,1.0,"{'n_estimators': 1000, 'learning_rate': 1}",0.543269,0.513661,0.424419,...,0.724982,0.731259,0.727719,0.732862,0.756466,0.762544,0.747684,0.742836,0.740794,0.012967
6,5.101278,0.101098,0.057493,0.016526,200,1.0,"{'n_estimators': 200, 'learning_rate': 1}",0.514423,0.52459,0.552326,...,0.640029,0.652758,0.632982,0.654417,0.673132,0.686219,0.657876,0.628959,0.653297,0.018306
7,2.576093,0.051278,0.029237,0.010031,100,0.1,"{'n_estimators': 100, 'learning_rate': 0.1}",0.485577,0.415301,0.5,...,0.533477,0.536068,0.542456,0.530742,0.532328,0.530742,0.518175,0.521116,0.530638,0.007298
8,2.544835,0.06227,0.037155,0.013622,100,1.0,"{'n_estimators': 100, 'learning_rate': 1}",0.456731,0.486339,0.488372,...,0.597552,0.609618,0.581754,0.590106,0.623563,0.616254,0.610121,0.588235,0.602151,0.013931
9,1.243934,0.128209,0.013899,0.000828,50,0.1,"{'n_estimators': 50, 'learning_rate': 0.1}",0.471154,0.420765,0.476744,...,0.50252,0.501414,0.50807,0.508834,0.503592,0.49258,0.495367,0.494721,0.500887,0.005728


In [None]:
# Mean cross-validated score of the best parameter setting found by the AdaBoost search.
search.best_score_

0.5043617252654662

In [None]:
# Persist the raw AdaBoost cv_results_ dict with dill (imported as `pkl`) so the
# search does not have to be re-run to inspect it.
with open('data/adaboost_search_results.pkl', mode='wb') as results_file:
    pkl.dump(search.cv_results_, results_file)

In [None]:
# Randomized hyper-parameter search for gradient boosting over the predefined folds.
# The splits are materialised into a list so `custom_cv` can be consumed by
# more than one fit; a bare generator is exhausted after a single pass.
custom_cv = list(custom_cv_folds(fold_ids))
# Seeded for repeatability: feature sub-sampling (max_features) makes the fitted
# trees depend on the random state.
gr_clf = GradientBoostingClassifier(random_state=42)

# max_features no longer offers 'auto': for GradientBoostingClassifier it was only
# an alias of 'sqrt', was deprecated in scikit-learn 1.1 and is rejected from 1.3
# on. None (consider all features) takes its place so the three options differ.
gr_params = {'learning_rate': [0.001, 0.01, 0.1, 1],
             'n_estimators': [100, 1000, 10000],
             'max_depth': [3, 7, 10],
             'max_features': [None, 'sqrt', 'log2'],
             'ccp_alpha': [0, 0.1, 1, 10]}

clf = RandomizedSearchCV(gr_clf, gr_params, n_jobs=-1, cv=custom_cv, random_state=42)
search_gr = clf.fit(X_train, y_train)

In [None]:
# Sweep n_estimators for AdaBoost through the project's graph_overfit helper.
# NOTE(review): graph_overfit lives in utils and is not shown here; presumably it
# plots train vs. test scores per value and the dict holds fixed estimator
# parameters -- confirm against its signature.
model = AdaBoostClassifier()
graph_overfit(
    X_train, y_train, X_test, y_test,
    model,
    {'algorithm': 'SAMME.R'},
    param='n_estimators',
    param_vals=[100, 1000, 10000],
    scale='log',
)

In [None]:
# Sweep n_estimators for AdaBoost over a wider range through graph_overfit
# (helper from utils, not shown here).
# NOTE(review): the last value is 1000000, which skips 10000 and 100000 -- confirm
# it is intentional and not a typo, since a run that large would be very slow.
model = AdaBoostClassifier()
graph_overfit(
    X_train, y_train, X_test, y_test,
    model,
    {'algorithm': 'SAMME.R'},
    param='n_estimators',
    param_vals=[10, 100, 1000, 1000000],
    scale='log',
)

In [None]:
# Test-set score of the tuned gradient-boosting model. RandomizedSearchCV fits
# clones of the estimator it is given, so `gr_clf` itself is never fitted by the
# search and gr_clf.score fails on a fresh kernel. Score through the search
# object instead, which delegates to the refitted best estimator.
search_gr.score(X_test, y_test)

0.5401459854014599

In [None]:
# Parameter setting with the best mean cross-validated score in the gradient-boosting search.
search_gr.best_params_

{'n_estimators': 10000,
 'max_features': 'auto',
 'max_depth': 7,
 'learning_rate': 0.1,
 'ccp_alpha': 0}

In [None]:
# List the result columns recorded by the gradient-boosting search.
search_gr.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_n_estimators', 'param_max_features', 'param_max_depth', 'param_learning_rate', 'param_ccp_alpha', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [None]:
# Show the parameter settings that the gradient-boosting search sampled.
search_gr.cv_results_['params']

[{'n_estimators': 10000,
  'max_features': 'auto',
  'max_depth': 3,
  'learning_rate': 1,
  'ccp_alpha': 10},
 {'n_estimators': 1000,
  'max_features': 'sqrt',
  'max_depth': 7,
  'learning_rate': 0.1,
  'ccp_alpha': 0},
 {'n_estimators': 1000,
  'max_features': 'auto',
  'max_depth': 3,
  'learning_rate': 1,
  'ccp_alpha': 10},
 {'n_estimators': 1000,
  'max_features': 'log2',
  'max_depth': 10,
  'learning_rate': 0.01,
  'ccp_alpha': 0},
 {'n_estimators': 10000,
  'max_features': 'log2',
  'max_depth': 3,
  'learning_rate': 0.001,
  'ccp_alpha': 0.1},
 {'n_estimators': 10000,
  'max_features': 'auto',
  'max_depth': 7,
  'learning_rate': 0.1,
  'ccp_alpha': 0},
 {'n_estimators': 100,
  'max_features': 'auto',
  'max_depth': 10,
  'learning_rate': 1,
  'ccp_alpha': 1},
 {'n_estimators': 100,
  'max_features': 'log2',
  'max_depth': 7,
  'learning_rate': 0.001,
  'ccp_alpha': 10},
 {'n_estimators': 1000,
  'max_features': 'log2',
  'max_depth': 10,
  'learning_rate': 0.1,
  'ccp_alpha

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e2905743-bdaf-45dd-a896-9824e6125426' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>