In [1]:
import hashlib
import time
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from tqdm.notebook import tqdm

from ml313.auto_ml import get_pipeline
from ml313.auto_ml import get_param_dist

In [2]:
# Load the iris dataset and pose it as a binary problem: the positive class
# is every sample whose target label equals 1, everything else is negative.
data_iris = load_iris()
X = pd.DataFrame(data_iris.data)
y = (data_iris.target == 1)

In [3]:
# Benchmark every (pipeline template, max_features, n_iter) combination with a
# randomized hyperparameter search and collect one summary row per run in `df`.
# X and y come from the data-loading cell; they are not rebuilt here.

# Each template is an ordered list of step names handed to get_pipeline().
templates = [
    ['decorr', 'standard_scaler', 'power_transformer', 'sfm_lr', 'clf_lr'],
    ['decorr', 'standard_scaler', 'power_transformer', 'sfm_gb', 'clf_lr'],
    ['decorr', 'standard_scaler', 'power_transformer', 'sfm_gb', 'clf_dt'],
    ]

# The same seeded splitter is shared by every search so scores are comparable.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=313)

# Rows are gathered in a dict keyed by model_id and turned into a DataFrame
# once at the end. Keying by id keeps "last write wins" semantics if two runs
# ever produce the same id, and building the frame in one go keeps integer
# columns as integers instead of upcasting them to float.
results = {}
for template in tqdm(templates, desc='template', leave=True):
    pipeline = get_pipeline(template)
    for max_features in tqdm([2, 4], desc='max features', leave=False):
        param_dist = get_param_dist(pipeline, max_features=max_features)
        for n_iter in tqdm([10, 30], desc='hyperparameter samples', leave=False):
            # Short, deterministic identifier derived from the run settings.
            run_key = '{}-{}-{}'.format(template, max_features, n_iter)
            model_id = hashlib.md5(run_key.encode('utf-8')).hexdigest()[:8]

            # train the model
            model = RandomizedSearchCV(pipeline, param_distributions=param_dist, scoring='roc_auc',
                                       n_iter=n_iter, n_jobs=-2, cv=cv, random_state=313,
                                       error_score=np.nan, verbose=0)
            # Time only the fit itself, using a monotonic clock.
            start_time = time.perf_counter()
            model.fit(X, y)
            exec_time = time.perf_counter() - start_time

            # Read the best candidate's scores straight from cv_results_
            # instead of materialising a DataFrame for two scalars.
            best = model.best_index_
            auc = model.cv_results_['mean_test_score'][best]
            auc_sd = model.cv_results_['std_test_score'][best]

            # save the results
            results[model_id] = {
                'pipeline': '-'.join(template),
                'max_features': max_features,
                'hparam_samples': n_iter,
                'cv': cv,
                'auc': auc,
                'auc_sd': auc_sd,
                'exec_time': exec_time,
                'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            }

# One row per model_id, columns in the order the keys were written above.
df = pd.DataFrame(list(results.values()), index=list(results.keys()))

HBox(children=(FloatProgress(value=0.0, description='template', max=3.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='max features', max=2.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='max features', max=2.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='max features', max=2.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…




In [4]:
# Show the benchmark summary: one row per model_id with its pipeline,
# search settings, best cross-validated AUC (mean and std), fit time and timestamp.
df

Unnamed: 0,pipeline,max_features,hparam_samples,cv,auc,auc_sd,exec_time,time_stamp
2afdeca0,decorr-standard_scaler-power_transformer-sfm_l...,2.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.79,0.080172,1.518283,2020-04-20 23:04:13
5118a544,decorr-standard_scaler-power_transformer-sfm_l...,2.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.79,0.080172,1.008866,2020-04-20 23:04:14
b995aeb1,decorr-standard_scaler-power_transformer-sfm_l...,4.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.79,0.080172,0.344487,2020-04-20 23:04:14
6c1c205a,decorr-standard_scaler-power_transformer-sfm_l...,4.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.8265,0.0745,0.808317,2020-04-20 23:04:15
b9671c97,decorr-standard_scaler-power_transformer-sfm_g...,2.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.77625,0.060604,4.085796,2020-04-20 23:04:19
32bef9e1,decorr-standard_scaler-power_transformer-sfm_g...,2.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.79,0.080172,9.796276,2020-04-20 23:04:29
46895cb3,decorr-standard_scaler-power_transformer-sfm_g...,4.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.77625,0.060604,2.960023,2020-04-20 23:04:32
1c98fe00,decorr-standard_scaler-power_transformer-sfm_g...,4.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.79,0.080172,5.567027,2020-04-20 23:04:38
8d8fe09b,decorr-standard_scaler-power_transformer-sfm_g...,2.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.975,0.026149,4.967484,2020-04-20 23:04:43
3e52ee3e,decorr-standard_scaler-power_transformer-sfm_g...,2.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=...",0.975,0.026149,9.695922,2020-04-20 23:04:52
