# Experiments SVC (support vector classifier)

## Import modules
   

In [21]:
import openml
import dabl
import numpy as np
import pandas as pd
import sklearn
import io
import sys
import os
import json
import re
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn_extra.fast_kernel import FKCEigenPro
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from time import time
from datetime import datetime

# set api key: read the first line of the local '.key' file; the context
# manager closes the file handle as soon as the key has been read
with open('.key', 'r') as key_file:
    openml.config.apikey = key_file.readline().strip('\n')

# get all supervised classification tasks tagged 'OpenML-CC18' as a dataframe
tasks_all = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe', tag='OpenML-CC18')
# drop four rows by index label (the reason for excluding them is not recorded here)
tasks_all = tasks_all.drop([3573, 146825, 167121, 167124])

# set openml cache (optional; uncomment to redirect the cache directory)
# openml.config.cache_directory = os.path.expanduser('/scratch/hp2500/cache')

In [25]:
# record the local wall-clock time at which this cell was executed
cell_started_at = datetime.now()
print(cell_started_at)

2019-08-25 07:19:20.808705


## Iterate over tasks with successive halving

In [None]:
# Endless random search over the task list: every pass draws one task at
# random, tunes an SVC pipeline on its dataset with successive halving, runs
# the tuned pipeline on the task and publishes the run. Any exception raised
# while processing a task is printed and the loop moves on to a new draw.
while True:

    # draw a single task id at random
    i = tasks_all['tid'].sample(n=1).iloc[0]

    # print feedback
    print('Try task', i, '...')

    try:
        # start of the runtime measurement
        t0 = time()
        print('Time:', t0)

        # fetch the task and the dataset it refers to
        task = openml.tasks.get_task(i)
        data = openml.datasets.get_dataset(task.dataset_id)

        # pull features, target, per-column categorical flags and column names
        X, y, categorical_indicator, attribute_names = data.get_data(
            dataset_format='array',
            target=data.default_target_attribute,
        )

        # wrap the arrays in pandas containers
        X = pd.DataFrame(X, columns=attribute_names)
        y = pd.Series(y)

        # boolean column masks: categorical columns and their complement
        cat = categorical_indicator
        num = [not is_categorical for is_categorical in categorical_indicator]

        # scale the numeric columns, one-hot encode the categorical ones
        numeric_transformer = make_pipeline(StandardScaler())
        categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
        preprocessor = ColumnTransformer(transformers=[
            ('num', numeric_transformer, num),
            ('cat', categorical_transformer, cat),
        ])

        # define classifier
        clf = SVC()

        # choose the preprocessing steps from the mix of column types
        has_categorical = any(categorical_indicator)
        has_numeric = not all(categorical_indicator)
        if has_categorical and has_numeric:
            # mixed columns: impute everything, then transform per column type
            steps = [SimpleImputer(strategy='most_frequent'), preprocessor]
        elif has_categorical:
            # categorical columns only
            steps = [SimpleImputer(strategy='most_frequent'),
                     OneHotEncoder(handle_unknown='ignore')]
        else:
            # no categorical columns
            steps = [SimpleImputer(strategy='median'), StandardScaler()]
        pipe = make_pipeline(*steps, clf)

        # define parameter grid for the 'svc' step of the pipeline
        param_grid = {
            'svc__gamma': np.logspace(-10, 10, 21),
            'svc__C': np.logspace(-10, 10, 21),
            'svc__degree': [2, 3, 4],
            'svc__kernel': ["linear", "poly", "rbf", "sigmoid"],
        }

        # print feedback
        print('Run successive halving...')

        # instantiate successive halving budgeted on the number of samples
        sh = dabl.search.RandomSuccessiveHalving(
            pipe,
            param_distributions=param_grid,
            budget_on='n_samples',
            aggressive_elimination=True,
        )

        # fit model
        sh_fit = sh.fit(X, y)

        # print feedback
        print('Create openml run...')

        # configure the pipeline with the best parameters found by the search
        pipe.set_params(**sh_fit.best_params_)

        # run best model on the task
        run = openml.runs.run_model_on_task(pipe, task)

        # print feedback
        print('Publish openml run...')

        # publish the run
        run.publish()

        # end of the runtime measurement
        t1 = time()

        # print feedback
        print('Runtime:', t1-t0)

    except Exception as e:
        # report the failure; the loop then proceeds to the next random task
        print('An error occurred...')
        print(e)

    else:
        # print feedback (reached only when the whole try body succeeded)
        print('View run online: https://www.openml.org/r/' + str(run.run_id))
        print()

Try task 9971 ...
Time: 1566684882.486909
Run successive halving...
