In [1]:
import hashlib
import time
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from tqdm.notebook import tqdm

from ml313.auto_ml import get_pipeline
from ml313.auto_ml import get_param_dist

In [2]:
# Load the iris dataset and recast it as a binary task: class 1 vs. every other class.
data_iris = load_iris()
# No column labels are supplied, so pandas assigns its default labels.
X = pd.DataFrame(data=data_iris['data'])
# Boolean target: True where the original label equals 1.
y = np.equal(data_iris['target'], 1)

In [3]:
# Benchmark every pipeline template over a small grid of feature budgets and
# hyperparameter-search sizes, recording the best cross-validated ROC AUC of
# each run. X and y are defined in the data-loading cell above.

# Candidate pipelines: each entry is the ordered list of step names passed to
# get_pipeline (step semantics live in ml313.auto_ml -- not visible here).
templates = [
    ['decorr', 'standard_scaler', 'power_transformer', 'sfm_lr', 'clf_lr'],
    ['decorr', 'standard_scaler', 'power_transformer', 'sfm_gb', 'clf_lr'],
    ['decorr', 'standard_scaler', 'power_transformer', 'sfm_gb', 'clf_dt'],
    ['decorr', 'standard_scaler', 'power_transformer', 'sfm_gb', 'clf_et'],
    ['decorr', 'standard_scaler', 'power_transformer', 'sfm_gb', 'clf_gb'],
    ]

# The same splitter (fixed seed) is shared by every search so that scores are
# computed on identical folds and are directly comparable.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=313)

# Column order of the final result table.
result_columns = ['pipeline', 'max_features', 'hparam_samples', 'cv',
                  'auc', 'auc_sd', 'exec_time', 'time_stamp']

# Rows are collected in a dict keyed by model id and turned into a DataFrame
# once at the end. Growing a frame cell-by-cell with df.loc[...] reallocates on
# every write and NaN-pads new rows, which silently turns integer columns
# (max_features, hparam_samples) into floats.
results = {}
for template in tqdm(templates, desc='template', leave=True):
    pipeline = get_pipeline(template)
    for max_features in tqdm([2, 4], desc='max features', leave=False):
        # presumably the search space for this pipeline under the given
        # feature budget -- confirm against ml313.auto_ml.get_param_dist
        param_dist = get_param_dist(pipeline, max_features=max_features)
        for n_iter in tqdm([10, 30], desc='hyperparameter samples', leave=False):
            # Short, deterministic identifier of the configuration (md5 is used
            # only as a fingerprint here, not for security).
            model_id = hashlib.md5('{}-{}-{}'.format(template, max_features, n_iter).encode('utf-8')).hexdigest()[:8]
            # train the model
            model = RandomizedSearchCV(pipeline, param_distributions=param_dist, scoring='roc_auc',
                                       n_iter=n_iter, n_jobs=-2, cv=cv, random_state=313,
                                       error_score=np.nan, verbose=0)
            # Time the fit only; result extraction below is excluded.
            start_time = time.perf_counter()
            model.fit(X, y)
            exec_time = time.perf_counter() - start_time
            # Read the best candidate's scores straight from cv_results_
            # instead of materialising a DataFrame twice per run.
            best_index = model.best_index_
            auc = model.cv_results_['mean_test_score'][best_index]
            auc_sd = model.cv_results_['std_test_score'][best_index]
            # save the results (a repeated model_id overwrites the earlier row,
            # matching label-based assignment semantics)
            results[model_id] = {
                'pipeline': '-'.join(template),
                'max_features': max_features,
                'hparam_samples': n_iter,
                'cv': cv,
                'auc': auc,
                'auc_sd': auc_sd,
                'exec_time': exec_time,
                'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            }

# One row per configuration, indexed by model id.
df = pd.DataFrame(list(results.values()), index=list(results), columns=result_columns)

HBox(children=(FloatProgress(value=0.0, description='template', max=5.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='max features', max=2.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='max features', max=2.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='max features', max=2.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='max features', max=2.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='max features', max=2.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='hyperparameter samples', max=2.0, style=ProgressStyle(des…




In [4]:
# Allow up to 60 characters per cell so the joined pipeline names are shown in full.
# The fully qualified option name is used: pandas resolves shorthand keys such as
# 'max_colwidth' by pattern matching, which can break if a similarly named
# option is added in a later release.
pd.set_option('display.max_colwidth', 60)
# Rank the benchmarked configurations, highest mean cross-validated ROC AUC first.
df.sort_values('auc', ascending=False)

Unnamed: 0,pipeline,max_features,hparam_samples,cv,auc,auc_sd,exec_time,time_stamp
c1c3b095,decorr-standard_scaler-power_transformer-sfm_gb-clf_et,2.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_...",0.99125,0.010323,37.356362,2020-06-06 16:05:33
5a9d01a4,decorr-standard_scaler-power_transformer-sfm_gb-clf_et,4.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_...",0.9905,0.012639,35.84824,2020-06-06 16:06:22
5b4d324a,decorr-standard_scaler-power_transformer-sfm_gb-clf_et,2.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_...",0.99025,0.008097,14.750679,2020-06-06 16:04:56
289ba6c6,decorr-standard_scaler-power_transformer-sfm_gb-clf_et,4.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_...",0.9845,0.014569,13.212121,2020-06-06 16:05:46
4a3afb35,decorr-standard_scaler-power_transformer-sfm_gb-clf_dt,4.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_...",0.97575,0.030415,8.56664,2020-06-06 16:04:20
798bf402,decorr-standard_scaler-power_transformer-sfm_gb-clf_dt,4.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_...",0.97575,0.030415,20.588879,2020-06-06 16:04:41
3e52ee3e,decorr-standard_scaler-power_transformer-sfm_gb-clf_dt,2.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_...",0.97525,0.030051,12.325019,2020-06-06 16:04:12
8d8fe09b,decorr-standard_scaler-power_transformer-sfm_gb-clf_dt,2.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_...",0.97525,0.030051,6.139858,2020-06-06 16:03:59
e328a4a9,decorr-standard_scaler-power_transformer-sfm_gb-clf_gb,4.0,10.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_...",0.97375,0.020894,9.611211,2020-06-06 16:07:05
3fbd5ca7,decorr-standard_scaler-power_transformer-sfm_gb-clf_gb,4.0,30.0,"RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_...",0.97375,0.020894,23.287587,2020-06-06 16:07:28
