# Experiments with HistGradientBoostingClassifier

## Import modules
   

In [1]:
# import openml
import dabl
import numpy as np
import pandas as pd
import sklearn
from time import time
import openml
import random
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Bare reference to the imported name: it has no runtime effect of its own,
# it only marks the experimental-feature import above as used.
enable_hist_gradient_boosting

# set api key: read it from the first line of the local '.key' file and
# close the file again right away
with open('.key', 'r') as key_file:
    openml.config.apikey = key_file.readline().strip('\n')

# list the supervised classification tasks (task_type_id=1) that carry the
# 'OpenML-CC18' tag, as a dataframe
tasks_all = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe', tag='OpenML-CC18')

# set openml cache
# openml.config.cache_directory = os.path.expanduser('/scratch/hp2500/cache')

## Iterate over tasks with successive halving

In [4]:
# The search space does not depend on the sampled task, so it is built once
# here instead of being rebuilt on every loop iteration.
learning_rate = np.logspace(-3, 0, 4)
max_iter = list(range(50, 501, 50))
max_leaf_nodes = np.exp2(list(range(2, 8))).astype(int)
# depths 2..20 plus None
max_depth = list(range(2, 21)) + [None]
min_samples_leaf = np.linspace(1, 50, 50).astype(int)
l2_regularization = np.logspace(-10, 10, 21)
# NOTE(review): this grid goes up to 256 bins; newer scikit-learn releases
# cap max_bins at 255 -- confirm against the installed version.
max_bins = np.exp2(list(range(1, 9))).astype(int)

# define parameter grid; the 'histgradientboostingclassifier__' prefix routes
# each entry to the classifier step of the pipeline built by make_pipeline
# NOTE(review): validation_fraction presumably only matters when early
# stopping is active -- confirm for the installed scikit-learn version.
param_grid = {'histgradientboostingclassifier__learning_rate': learning_rate,
              'histgradientboostingclassifier__max_iter': max_iter,
              'histgradientboostingclassifier__max_leaf_nodes': max_leaf_nodes,
              'histgradientboostingclassifier__max_depth': max_depth,
              'histgradientboostingclassifier__min_samples_leaf': min_samples_leaf,
              'histgradientboostingclassifier__l2_regularization': l2_regularization,
              'histgradientboostingclassifier__max_bins': max_bins,
              'histgradientboostingclassifier__validation_fraction': [0.1, 0.2]
              }

# randomly pick tasks in an infinite loop (stop it with a keyboard interrupt)
while True:

    # sample one task id at random
    i = tasks_all['tid'].sample(1).iloc[0]

    # print feedback
    print('Try task', i, '...')

    try:
        # measure runtime t0
        t0 = time()

        # get task
        task = openml.tasks.get_task(i)

        # get dataset object
        data = openml.datasets.get_dataset(task.dataset_id)

        # get features and target from the dataset object
        X, y, categorical_indicator, attribute_names = data.get_data(
            dataset_format='array',
            target=data.default_target_attribute)

        # define classifier
        clf = HistGradientBoostingClassifier()

        # create pipeline: fill missing values with the most frequent value,
        # then classify
        pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), clf)

        # print feedback
        print('Run successive halving...')

        # instantiate successive halving with the number of samples as budget
        sh = dabl.search.RandomSuccessiveHalving(pipe,
                                                 param_distributions=param_grid,
                                                 budget_on='n_samples',
                                                 aggressive_elimination=True)

        # fit model
        sh_fit = sh.fit(X, y)

        # print feedback
        print('Create openml run...')

        # build a pipeline around a genuinely new classifier (not the `clf`
        # instance that was handed to the search) and apply the best parameters
        pipe_best = make_pipeline(SimpleImputer(strategy='most_frequent'),
                                  HistGradientBoostingClassifier())
        pipe_best.set_params(**(sh_fit.best_params_))

        # run best model on the task
        run = openml.runs.run_model_on_task(pipe_best, task)

        # print feedback
        print('Publish openml run...')

        # publish the run
        run.publish()

        # measure runtime t1
        t1 = time()

        # print feedback
        print('Runtime:', t1-t0)

    except Exception as e:
        # best effort: report the failure and move on to another random task
        print('An error occurred...')
        print(e)

    else:
        # print feedback (only reached when the run was published)
        print('View run online: https://www.openml.org/r/' + str(run.run_id))
        print()

Try task 14965 ...
Run successive halving...
Create openml run...
Publish openml run...
Runtime: 157.56405401229858
View run online: https://www.openml.org/r/10245000

Try task 146824 ...
Run successive halving...


KeyboardInterrupt: 