# Best runs on all tasks

## Import modules

In [2]:
import openml
from openml import tasks, flows, runs
import sklearn
from sklearn import feature_selection
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import pprint
from collections import OrderedDict, Counter
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
import re
import random
import numpy as np
from datetime import datetime
import sys
import json
from itertools import combinations
import signal

import importlib
import utils.functions_analyze_runs
# re-execute the local helper module (presumably so that edits to it are picked up
# without restarting the kernel — confirm)
importlib.reload(utils.functions_analyze_runs)
# NOTE(review): importlib.reload() re-executes only the module object it is given,
# not submodules that are already imported — confirm this sklearn reload is still needed
importlib.reload(sklearn)
# bind the helper functions from the freshly reloaded module
from utils.functions_analyze_runs import get_run_info_rf, get_run_info

# set api key: read it from the first line of the local `.key` file.
# The context manager closes the file handle right away, and strip() removes
# any surrounding whitespace (including a Windows '\r\n' line ending) so the
# key is sent to the server exactly as issued.
with open('.key', 'r') as key_file:
    openml.config.apikey = key_file.readline().strip()


In [3]:
import sklearn

In [4]:
# show the version of the scikit-learn package loaded in this kernel
sklearn.__version__

'0.21.2'

In [5]:
# fetch every supervised classification task (task type 1) carrying the
# 'OpenML-CC18' tag, returned as a DataFrame
tasks_all = openml.tasks.list_tasks(
    task_type_id=1,
    tag='OpenML-CC18',
    output_format='dataframe',
)
# remove the problematic tasks; the rows are selected by index label
tasks_all = tasks_all.drop(index=[3573, 146825, 167121, 167124])


## Get OpenML runs for RF flows

In [6]:
# fetch the area-under-ROC-curve evaluations of the selected flows on all kept tasks
# (presumably these flow ids are the random-forest flows this section is about — TODO confirm)
good_flows = [5804, 8365, 5909, 8918, 6969, 8315, 8351]
evals = openml.evaluations.list_evaluations(
    'area_under_roc_curve',
    flow=good_flows,
    task=list(tasks_all['tid']),
    output_format='dataframe',
)

# rank the evaluations within each task, highest value first;
# method='first' breaks ties by order of appearance, so ranks are unique per task
evals['rank'] = evals.groupby('task_id')['value'].rank(method='first', ascending=False)

# keep only the five top-ranked evaluations of every task
best_evals = evals[evals['rank'].le(5)]

In [7]:
# (number of rows, number of columns) of the top-ranked evaluations table
best_evals.shape

(335, 15)

## Check categorical / numerical / mixed features

In [8]:
# Label every task by the kind of features its dataset contains:
# 'numeric' (no categorical column), 'categorical' (only categorical columns)
# or 'mixed' (both kinds).

# one (task id, feature type) pair is appended per task
types = []

for i in tasks_all.tid:
    # progress indicator: print the task ids on a single line
    print(i, '', end='')

    # get task
    task = openml.tasks.get_task(i)

    # get dataset object
    data = openml.datasets.get_dataset(task.dataset_id)

    # get relevant info from dataset object; only the per-column categorical
    # flags are used below
    X, y, categorical_indicator, attribute_names = data.get_data(
        dataset_format='array',
        target=data.default_target_attribute)

    if not any(categorical_indicator):
        types.append((i, 'numeric'))
    elif all(categorical_indicator):
        types.append((i, 'categorical'))
    else:
        types.append((i, 'mixed'))

# lookup table: one row per task with its feature type (built exactly once)
cat_num = pd.DataFrame(types, columns=['tid', 'cat_num'])

3 6 11 12 14 15 16 18 22 23 28 29 31 32 37 43 45 49 53 219 2074 2079 3021 3022 3481 3549 3560 3902 3903 3904 3913 3917 3918 7592 9910 9946 9952 9957 9960 9964 9971 9976 9977 9978 9981 9985 10093 10101 14952 14954 14965 14969 14970 125920 125922 146195 146800 146817 146819 146820 146821 146822 146824 167119 167120 167125 167140 167141 

In [9]:
# display the task id / feature type table
cat_num

Unnamed: 0,tid,cat_num
0,3,categorical
1,6,numeric
2,11,numeric
3,12,numeric
4,14,numeric
...,...,...
63,167119,numeric
64,167120,numeric
65,167125,mixed
66,167140,categorical


In [10]:
# how many tasks fall into each feature-type category
cat_num.cat_num.value_counts()

numeric        45
mixed          15
categorical     8
Name: cat_num, dtype: int64

In [11]:
# list the ids of the tasks whose datasets mix numeric and categorical features
list(cat_num.loc[cat_num['cat_num'] == 'mixed', 'tid'])

[23,
 29,
 31,
 219,
 2079,
 3021,
 3022,
 7592,
 9971,
 9977,
 14954,
 14965,
 125920,
 167125,
 167141]

In [12]:
# ids of the tasks that are purely numeric or purely categorical (everything except 'mixed')
task_ids = cat_num.loc[cat_num['cat_num'].ne('mixed'), 'tid']

In [13]:
# preview the run ids of the best evaluations in random order
# (frac=1 samples every row; the result is only displayed, not stored)
best_evals.run_id.sample(frac=1)

211821    10035743
13697      5772820
142160     9092193
33203      5944902
196962     9199065
            ...   
195799     9196781
7          1860316
208546    10029594
160148     9128553
99943      9001525
Name: run_id, Length: 335, dtype: int64

## Loop over all tasks

In [14]:
# define timeout handler
class RunTimeoutError(Exception):
    """Raised by ``handler`` when the alarm set with ``signal.alarm`` goes off."""


def handler(signum, frame):
    """Signal handler that aborts the interrupted code by raising an exception.

    A dedicated ``Exception`` subclass is raised instead of a bare ``Exception``
    so a timeout can be told apart from other failures; code that catches
    ``Exception`` keeps working and the message is unchanged.

    Parameters
    ----------
    signum : int
        Number of the delivered signal (not used).
    frame : frame object or None
        Stack frame that was active when the signal arrived (not used).

    Raises
    ------
    RunTimeoutError
        Always, with the message ``"Timeout!"``.
    """
    raise RunTimeoutError("Timeout!")

# Register the signal function handler
signal.signal(signal.SIGALRM, handler)

<Handlers.SIG_DFL: 0>

In [1]:
# infinite loop: keep drawing a task and re-running the best RF configurations
# on it until the kernel is interrupted
while True:

    # randomly sample a task; .iloc[0] yields a numpy integer, so cast it to a plain int
    i = int(tasks_all.tid.sample().iloc[0]) # this samples from all tasks
    #i = task_ids.sample() # sample from numeric or categorical only
    #i = 3481 # pin one specific task while debugging

    # get task
    task = openml.tasks.get_task(i)

    # get dataset object
    data = openml.datasets.get_dataset(task.dataset_id)

    # get relevant info from dataset object
    X, y, categorical_indicator, attribute_names = data.get_data(
        dataset_format='array',
        target=data.default_target_attribute)

    # mask with feature types
    cat = categorical_indicator
    num = [not k for k in categorical_indicator]

    # the feature-type flags depend only on the task, so compute them once
    only_numeric = not any(categorical_indicator)
    only_categorical = all(categorical_indicator)

    # create column transformers (missing values are imputed by the
    # SimpleImputer step placed in front of the preprocessor below)
    numeric_transformer = make_pipeline(StandardScaler())

    categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num),
            ('cat', categorical_transformer, cat)])

    # loop over runs in random order
    for k in best_evals.run_id.sample(frac=1):

        print('Run', k, 'on task', i)
        print(datetime.now())

        try:
            # set time limit: SIGALRM is delivered after 3600 seconds (1 hour)
            signal.alarm(3600)

            # get params
            params = get_run_info_rf(k)

            # define classifier
            clf = RandomForestClassifier(**params)

            # pick pipeline according to feature types
            if only_numeric:
                pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), clf)
            elif only_categorical:
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'), clf)
            else:
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), preprocessor, clf)

            # run best model on the task
            run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=True)

            # print feedback
            print('Publish openml run...')

            # push tag (currently disabled)
            # run.push_tag('best_models')

            # publish the run
            run.publish()

            # print feedback
            print('View run online: https://www.openml.org/r/' + str(run.run_id))

            # fetch the published run once and reuse it for both fields
            published_run = openml.runs.get_run(run.run_id)
            print('Setup', published_run.setup_id)
            print('Flow', published_run.flow_id)
            print()

        except Exception as e:
            # best effort: report the failure (a timeout included) and move on to the next run
            print(e)

        finally:
            # cancel a still-pending alarm so it cannot fire later, outside the
            # try block, and terminate the whole loop
            signal.alarm(0)


NameError: name 'tasks_all' is not defined