In [14]:
# example of grid searching key hyperparameters for adaboost on a classification dataset
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split


In [15]:
# Load the preprocessed feature table (path is relative to this notebook's working directory)
data_path = '../../data/JMpreprocessed_UnivariateSelecton_<5.csv'
df = pd.read_csv(data_path)

In [16]:
# Separate the 'Winner' target from the remaining feature columns, then hold
# out 5% of the rows as a test set. The fixed random_state keeps the split
# repeatable between runs.
target = df['Winner']
features = df.drop(columns='Winner')

X, X_test, y, y_test = train_test_split(
    features,
    target,
    test_size=0.05,
    random_state=43,
)

In [4]:
# Base estimator for the search: default hyperparameters, but with a fixed
# random_state so that fitting the boosted model is repeatable between runs.
model = AdaBoostClassifier(random_state=1)
# Candidate values explored by the grid search (3 x 3 = 9 combinations).
grid = {
    'n_estimators': [250, 350, 500],
    'learning_rate': [0.001, 0.01, 0.1],
}

In [5]:
# Evaluation protocol: stratified 8-fold cross-validation, repeated 3 times
# with a fixed seed so the fold assignment is repeatable.
cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=3, random_state=1)
# Search every combination in `grid`, scoring each candidate by ROC AUC;
# n_jobs=-1 asks scikit-learn to run the fits in parallel.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
# Run the search on the training split and keep the fitted search object.
grid_result = grid_search.fit(X, y)

In [6]:
# Report the top mean cross-validated score and the parameters that produced it
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")

Best: 0.688100 using {'learning_rate': 0.1, 'n_estimators': 250}


In [7]:
# Print one line per candidate: mean test score, its standard deviation
# across the CV folds, and the parameter combination that was evaluated.
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for idx, param in enumerate(params):
    print(f"{means[idx]:f} ({stds[idx]:f}) with: {param!r}")

0.615747 (0.017139) with: {'learning_rate': 0.001, 'n_estimators': 250}
0.622733 (0.014450) with: {'learning_rate': 0.001, 'n_estimators': 350}
0.629816 (0.015568) with: {'learning_rate': 0.001, 'n_estimators': 500}
0.656477 (0.015236) with: {'learning_rate': 0.01, 'n_estimators': 250}
0.661172 (0.015286) with: {'learning_rate': 0.01, 'n_estimators': 350}
0.669164 (0.015565) with: {'learning_rate': 0.01, 'n_estimators': 500}
0.688100 (0.017962) with: {'learning_rate': 0.1, 'n_estimators': 250}
0.687078 (0.017140) with: {'learning_rate': 0.1, 'n_estimators': 350}
0.686267 (0.016713) with: {'learning_rate': 0.1, 'n_estimators': 500}


In [17]:
# Final model, configured with the best combination reported by the grid
# search (learning_rate=0.1, n_estimators=250). The values are hard-coded so
# this cell can run without repeating the search; random_state is fixed so
# the fitted model and its test score are repeatable between runs.
model = AdaBoostClassifier(n_estimators=250, learning_rate=.1, random_state=1)

In [18]:
# Fit on the training split, then evaluate on the held-out test split.
# NOTE: for classifiers, score() reports mean accuracy, whereas the grid
# search above ranked candidates by ROC AUC, so the two numbers are not
# directly comparable.
model.fit(X, y).score(X_test, y_test)

0.702054794520548

In [13]:
# Persist the current `model` object to disk with pickle.
model_path = Path('./Models/adaboostmarch7')
# Create the output directory first: opening a file for writing inside a
# missing directory raises FileNotFoundError and the model would not be saved.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(model, f)