In [1]:
import hashlib
import time
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from tqdm.notebook import tqdm

from ml313.auto_ml import get_pipeline
from ml313.auto_ml import get_param_dist

In [2]:
# Load the iris dataset and turn it into a binary task: class 1 vs. the rest.
data_iris = load_iris()
X = pd.DataFrame(data=data_iris.data)
y = (data_iris.target == 1)

# Experiment grid: candidate limits on the number of model features and
# candidate numbers of hyperparameter samples per randomized search.
max_model_dims = [2, 4]
hyperparam_samples = [10, 30]

In [3]:
# Pipeline templates to compare; each one is a list of step names that is
# handed to get_pipeline.
templates = [
    ['decorr', 'standard_scaler', 'sfm_lr', 'clf_lr'],
    ['decorr', 'standard_scaler', 'sfm_gb', 'clf_et'],
]

# One seed for both the CV splitter and the randomized search so that a
# Restart & Run All reproduces the same scores.
SEED = 313
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=SEED)

result_columns = [
    'pipeline', 'max_features', 'hparam_samples', 'cv',
    'auc', 'auc_sd', 'exec_time', 'time_stamp',
]

# Collect one record per (template, max_features, n_iter) run, keyed by
# model id, and build the frame once at the end. Growing a DataFrame cell
# by cell via .loc is slow and silently upcasts the integer columns to float.
records = {}
for template in tqdm(templates, desc='template', leave=True):
    pipeline = get_pipeline(template)
    for max_features in tqdm(max_model_dims, desc='max features', leave=False):
        param_dist = get_param_dist(pipeline, max_features=max_features)
        # Use the grid from the configuration cell instead of a hard-coded copy.
        for n_iter in tqdm(hyperparam_samples, desc='hyperparameter samples', leave=False):
            # Short, deterministic id derived from the run configuration.
            config_key = '{}-{}-{}'.format(template, max_features, n_iter)
            model_id = hashlib.md5(config_key.encode('utf-8')).hexdigest()[:8]

            # train the model
            model = RandomizedSearchCV(
                pipeline,
                param_distributions=param_dist,
                scoring='roc_auc',
                n_iter=n_iter,
                n_jobs=-2,
                cv=cv,
                random_state=SEED,
                error_score=np.nan,
                verbose=0,
            )
            start_time = time.perf_counter()
            model.fit(X, y)
            # Time only the search itself, not the bookkeeping below.
            exec_time = time.perf_counter() - start_time

            # Mean and standard deviation of the test AUC across CV splits
            # for the best hyperparameter candidate.
            best = model.best_index_
            auc = model.cv_results_['mean_test_score'][best]
            auc_sd = model.cv_results_['std_test_score'][best]

            # save the results
            records[model_id] = {
                'pipeline': '-'.join(template),
                'max_features': max_features,
                'hparam_samples': n_iter,
                'cv': cv,
                'auc': auc,
                'auc_sd': auc_sd,
                'exec_time': exec_time,
                'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            }

# Explicit columns keep the layout stable even when no run was executed.
df = pd.DataFrame(
    list(records.values()),
    index=list(records.keys()),
    columns=result_columns,
)

HBox(children=(HTML(value='template'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='max features'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='hyperparameter samples'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='hyperparameter samples'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='max features'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='hyperparameter samples'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='hyperparameter samples'), FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [4]:
# Show all runs ordered from lowest to highest cross-validated AUC.
df.sort_values(by='auc', ascending=True)

Unnamed: 0,pipeline,max_features,hparam_samples,cv,auc,auc_sd,exec_time,time_stamp
3478cc3f,decorr-standard_scaler-sfm_lr-clf_lr,2.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.79,0.080172,2.797559,2021-01-19 15:40:13
79eb8d03,decorr-standard_scaler-sfm_lr-clf_lr,2.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.79,0.080172,1.295428,2021-01-19 15:40:14
51ba4c31,decorr-standard_scaler-sfm_lr-clf_lr,4.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.79,0.080172,0.455641,2021-01-19 15:40:15
c1801022,decorr-standard_scaler-sfm_lr-clf_lr,4.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.79,0.080172,1.235412,2021-01-19 15:40:16
6edf4d47,decorr-standard_scaler-sfm_gb-clf_et,4.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.9855,0.013314,13.229991,2021-01-19 15:41:17
2200a66d,decorr-standard_scaler-sfm_gb-clf_et,2.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.98875,0.008004,13.641399,2021-01-19 15:40:29
db452999,decorr-standard_scaler-sfm_gb-clf_et,4.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.9905,0.012639,35.225111,2021-01-19 15:41:52
bd9c7d58,decorr-standard_scaler-sfm_gb-clf_et,2.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.99175,0.009223,33.873405,2021-01-19 15:41:03
