# Best runs on all tasks

## Import modules

In [1]:
# import modules
import openml
from openml import tasks, flows, runs
import sklearn
from sklearn import feature_selection
from sklearn.svm import SVC
import pandas as pd
import pprint
from collections import OrderedDict, Counter
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
import re
import random
import numpy as np
from datetime import datetime
import sys
import matplotlib.pyplot as plt
import json
from itertools import combinations
from utils.functions_analyze_runs import get_run_info_svc

# set api key: read it from the first line of the local '.key' file and
# close the file handle as soon as the key has been read
with open('.key', 'r') as key_file:
    openml.config.apikey = key_file.readline().strip('\n')


  return f(*args, **kwds)


In [102]:
# fetch the supervised classification tasks (task type 1) tagged 'OpenML-CC18'
tasks_all = openml.tasks.list_tasks(
    task_type_id=1,
    tag='OpenML-CC18',
    output_format='dataframe',
)
# remove the tasks known to be problematic (addressed by row label)
problematic_tasks = [3573, 146825, 167121, 167124]
tasks_all = tasks_all.drop(index=problematic_tasks)


## Get OpenML runs for SVC flows

In [80]:
# area-under-ROC evaluations of the selected SVC flows on the retained tasks
good_flows = [6246, 6952, 8330, 6954, 7756, 5499, 8317, 7223, 6009, 7707, 6269, 5983, 16374, 16347, 16345]
evals = openml.evaluations.list_evaluations(
    'area_under_roc_curve',
    flow=good_flows,
    task=list(tasks_all.tid),
    output_format='dataframe',
)

# rank the evaluations within each task, highest value first;
# method='first' resolves ties by order of appearance
per_task_values = evals.groupby('task_id')['value']
evals['rank'] = per_task_values.rank(method='first', ascending=False)

# keep the five top-ranked evaluations of every task
best_evals = evals[evals['rank'] <= 5]

In [83]:
# exclude the runs known to be problematic in a single filter
problematic_runs = [6148258, 8231647]
best_evals = best_evals[~best_evals.run_id.isin(problematic_runs)]

In [103]:
# dimensions (rows, columns) of the retained best evaluations
best_evals.shape

(336, 13)

## Check categorical / numerical / mixed features

In [85]:
# (task id, feature-type label) pairs, one per task
types = []

for i in tasks_all.tid:
    # progress output: task ids separated by spaces on a single line
    print(i, '', end = '')

    # get task
    task = openml.tasks.get_task(i)

    # get dataset object
    data = openml.datasets.get_dataset(task.dataset_id)

    # get relevant info from dataset object; only the per-column categorical
    # flags are used below
    X, y, categorical_indicator, attribute_names = data.get_data(dataset_format='array',
                                                                target=data.default_target_attribute)

    # label the task by the kinds of features its dataset contains
    if not any(categorical_indicator):
        types.append((i, 'numeric'))
    elif all(categorical_indicator):
        types.append((i, 'categorical'))
    else:
        types.append((i, 'mixed'))

# one row per task: task id and its feature-type label
cat_num = pd.DataFrame(types, columns=['tid', 'cat_num'])

3 6 11 12 14 15 16 18 22 23 28 29 31 32 37 43 45 49 53 219 2074 2079 3021 3022 3481 3549 3560 3902 3903 3904 3913 3917 3918 7592 9910 9946 9952 9957 9960 9964 9971 9976 9977 9978 9981 9985 10093 10101 14952 14954 14965 14969 14970 125920 125922 146195 146800 146817 146819 146820 146821 146822 146824 167119 167120 167125 167140 167141 

In [109]:
# display the task-id / feature-type table
cat_num

Unnamed: 0,tid,cat_num
0,3,categorical
1,6,numeric
2,11,numeric
3,12,numeric
4,14,numeric
5,15,numeric
6,16,numeric
7,18,numeric
8,22,numeric
9,23,mixed


In [86]:
# check distribution: number of tasks per feature-type label
# ('numeric', 'mixed', 'categorical')
cat_num['cat_num'].value_counts()


numeric        45
mixed          15
categorical     8
Name: cat_num, dtype: int64

In [89]:
# check ids of mixed feature tasks, i.e. tasks labelled 'mixed' because their
# datasets contain both categorical and non-categorical columns
list(cat_num.tid.loc[cat_num.cat_num == 'mixed'])

[23,
 29,
 31,
 219,
 2079,
 3021,
 3022,
 7592,
 9971,
 9977,
 14954,
 14965,
 125920,
 167125,
 167141]

## Loop over all tasks

In [121]:
# Endless loop: keep drawing random tasks and re-running every best SVC
# configuration on them until the kernel is interrupted by hand.
while True:

    # randomly sample a task
    i = tasks_all.tid.sample().iloc[0]

    # get task
    task = openml.tasks.get_task(i)

    # get dataset object
    data = openml.datasets.get_dataset(task.dataset_id)

    # get relevant info from dataset object
    X, y, categorical_indicator, attribute_names = data.get_data(
        dataset_format='array',
        target=data.default_target_attribute)

    # boolean column masks for the two feature types
    cat = categorical_indicator
    num = [not k for k in categorical_indicator]

    # per-type transformers; imputation is not done here but by the
    # SimpleImputer placed in front of the preprocessor in the mixed branch
    numeric_transformer = make_pipeline(StandardScaler())
    categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num),
            ('cat', categorical_transformer, cat)])

    # loop over runs in random order
    for k in best_evals.run_id.sample(frac=1):

        print('Run', k, 'on task', i)
        print(datetime.now())

        try:
            # hyperparameters of the reference run, used as SVC keyword arguments
            params = get_run_info_svc(k)

            # define classifier
            clf = SVC(**params)

            # pick pipeline according to feature types
            if not any(categorical_indicator):
                pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), clf)
            elif all(categorical_indicator):
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'), clf)
            else:
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), preprocessor, clf)

            # run best model on the task; with avoid_duplicate_runs=True an
            # already performed setup raises and is reported by the handler below
            run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=True)

            # print feedback
            print('Publish openml run...')

            # NOTE: tagging is currently disabled: run.push_tag('best_models')

            # publish the run
            run.publish()

            # print feedback
            print('View run online: https://www.openml.org/r/' + str(run.run_id))

            # fetch the published run once and reuse it for both fields
            published_run = openml.runs.get_run(run.run_id)
            print('Setup', published_run.setup_id)
            print('Flow', published_run.flow_id)
            print()

        except Exception as e:
            # best effort: report the problem (e.g. duplicate setup, unusable
            # parameters) and continue with the next run
            print(e)


Run 5939306 on task 49
2019-09-02 16:41:03.355763
One or more runs of this setup were already performed on the task.
Run 8726998 on task 49
2019-09-02 16:41:06.244442
Publish openml run...
View run online: https://www.openml.org/r/10397846
Setup 8235794
Flow 16366

Run 6005547 on task 49
2019-09-02 16:41:12.333071
Publish openml run...
View run online: https://www.openml.org/r/10397847
Setup 8235939
Flow 16366

Run 6060855 on task 49
2019-09-02 16:41:18.588755
Publish openml run...
View run online: https://www.openml.org/r/10397848
Setup 8235853
Flow 16366

Run 8983496 on task 49
2019-09-02 16:41:24.475719
Publish openml run...
View run online: https://www.openml.org/r/10397849
Setup 8235952
Flow 16366

Run 9202797 on task 49
2019-09-02 16:41:31.049296
Publish openml run...
View run online: https://www.openml.org/r/10397850
Setup 8235850
Flow 16366

Run 6046419 on task 49
2019-09-02 16:41:38.544964
One or more runs of this setup were already performed on the task.
Run 9202653 on task 4

Publish openml run...
View run online: https://www.openml.org/r/10397896
Setup 8235662
Flow 16366

Run 9200440 on task 49
2019-09-02 16:46:59.008069
name 'auto' is not defined
Run 8848691 on task 49
2019-09-02 16:46:59.049932
Publish openml run...
View run online: https://www.openml.org/r/10397897
Setup 8235722
Flow 16366

Run 8843257 on task 49
2019-09-02 16:47:05.492419
Publish openml run...
View run online: https://www.openml.org/r/10397898
Setup 8235765
Flow 16366

Run 9202994 on task 49
2019-09-02 16:47:16.103735
Publish openml run...
View run online: https://www.openml.org/r/10397899
Setup 8235698
Flow 16366

Run 9201957 on task 49
2019-09-02 16:47:23.369506
Publish openml run...
View run online: https://www.openml.org/r/10397900
Setup 8235878
Flow 16366

Run 8812964 on task 49
2019-09-02 16:47:29.850710
One or more runs of this setup were already performed on the task.
Run 8697848 on task 49
2019-09-02 16:47:33.406947
Publish openml run...
View run online: https://www.openml.org

Publish openml run...
View run online: https://www.openml.org/r/10397945
Setup 8235656
Flow 16366

Run 9202708 on task 49
2019-09-02 16:53:01.527852
Publish openml run...
View run online: https://www.openml.org/r/10397946
Setup 8235817
Flow 16366

Run 8816158 on task 49
2019-09-02 16:53:08.724594
Publish openml run...
View run online: https://www.openml.org/r/10397947
Setup 8235973
Flow 16366

Run 8813676 on task 49
2019-09-02 16:53:15.558937
Publish openml run...
View run online: https://www.openml.org/r/10397948
Setup 8235826
Flow 16366

Run 6220541 on task 49
2019-09-02 16:53:21.875623
One or more runs of this setup were already performed on the task.
Run 5257991 on task 49
2019-09-02 16:53:24.693092
Publish openml run...
View run online: https://www.openml.org/r/10397949
Setup 8235959
Flow 16366

Run 9200901 on task 49
2019-09-02 16:53:31.696388
name 'auto' is not defined
Run 8814784 on task 49
2019-09-02 16:53:31.725063
Publish openml run...
View run online: https://www.openml.org

Publish openml run...
View run online: https://www.openml.org/r/10397993
Setup 8235977
Flow 16366

Run 5945675 on task 49
2019-09-02 16:59:05.835881
Publish openml run...
View run online: https://www.openml.org/r/10397994
Setup 8235834
Flow 16366

Run 7346175 on task 49
2019-09-02 16:59:11.830889
One or more runs of this setup were already performed on the task.
Run 8741229 on task 49
2019-09-02 16:59:14.898387
Publish openml run...
View run online: https://www.openml.org/r/10397995
Setup 8235944
Flow 16366

Run 8739373 on task 49
2019-09-02 16:59:20.597990
Publish openml run...
View run online: https://www.openml.org/r/10397996
Setup 8235704
Flow 16366

Run 10369671 on task 49
2019-09-02 16:59:26.887896
Publish openml run...
View run online: https://www.openml.org/r/10397997
Setup 8235945
Flow 16366

Run 9202392 on task 49
2019-09-02 16:59:35.671761
Publish openml run...
View run online: https://www.openml.org/r/10397998
Setup 8235861
Flow 16366

Run 8839285 on task 49
2019-09-02 16:5

View run online: https://www.openml.org/r/10398035
Setup 8235689
Flow 16366

Run 5942023 on task 49
2019-09-02 17:04:24.368480
Publish openml run...
View run online: https://www.openml.org/r/10398036
Setup 8235762
Flow 16366

Run 6055588 on task 49
2019-09-02 17:04:30.389322
Publish openml run...
View run online: https://www.openml.org/r/10398037
Setup 8235927
Flow 16366

Run 9202483 on task 49
2019-09-02 17:05:01.430777
Publish openml run...
View run online: https://www.openml.org/r/10398038
Setup 8235711
Flow 16366

Run 7451303 on task 49
2019-09-02 17:05:09.807810
Publish openml run...
View run online: https://www.openml.org/r/10398039
Setup 8235883
Flow 16366

Run 8983669 on task 49
2019-09-02 17:05:15.867436
One or more runs of this setup were already performed on the task.
Run 5929020 on task 49
2019-09-02 17:05:20.164311
Publish openml run...
View run online: https://www.openml.org/r/10398040
Setup 8235682
Flow 16366

Run 6055928 on task 49
2019-09-02 17:05:26.003966
Publish ope

View run online: https://www.openml.org/r/10398083
Setup 8235962
Flow 16366

Run 5351969 on task 49
2019-09-02 17:10:59.601071
Publish openml run...
View run online: https://www.openml.org/r/10398084
Setup 8235868
Flow 16366

Run 9202394 on task 49
2019-09-02 17:11:06.045697
Publish openml run...
View run online: https://www.openml.org/r/10398085
Setup 8235699
Flow 16366

Run 8705233 on task 49
2019-09-02 17:11:12.865164
One or more runs of this setup were already performed on the task.
Run 6176713 on task 49
2019-09-02 17:11:15.899924
Publish openml run...
View run online: https://www.openml.org/r/10398086
Setup 8235882
Flow 16366

Run 8699932 on task 49
2019-09-02 17:11:22.163142
Publish openml run...
View run online: https://www.openml.org/r/10398087
Setup 8235657
Flow 16366

Run 6045454 on task 49
2019-09-02 17:11:28.399866
Publish openml run...
View run online: https://www.openml.org/r/10398088
Setup 8235824
Flow 16366

Run 7290550 on task 49
2019-09-02 17:11:35.448903
Publish ope

One or more runs of this setup were already performed on the task.
Run 6038418 on task 49
2019-09-02 17:16:32.183990
One or more runs of this setup were already performed on the task.
Run 8820977 on task 49
2019-09-02 17:16:35.361645
One or more runs of this setup were already performed on the task.
Run 9202392 on task 49
2019-09-02 17:16:39.356873
One or more runs of this setup were already performed on the task.
Run 8734854 on task 49
2019-09-02 17:16:42.460330
One or more runs of this setup were already performed on the task.
Run 6054213 on task 49
2019-09-02 17:16:45.838093
One or more runs of this setup were already performed on the task.
Run 9202485 on task 49
2019-09-02 17:16:48.909646
One or more runs of this setup were already performed on the task.
Run 9202389 on task 49
2019-09-02 17:16:52.293030
One or more runs of this setup were already performed on the task.
Run 8729296 on task 49
2019-09-02 17:16:55.368235
One or more runs of this setup were already performed on the tas

One or more runs of this setup were already performed on the task.
Run 7328806 on task 49
2019-09-02 17:20:14.530325
One or more runs of this setup were already performed on the task.
Run 9202452 on task 49
2019-09-02 17:20:17.641449
One or more runs of this setup were already performed on the task.
Run 8705233 on task 49
2019-09-02 17:20:20.410106
One or more runs of this setup were already performed on the task.
Run 5947544 on task 49
2019-09-02 17:20:23.338389
One or more runs of this setup were already performed on the task.
Run 7290550 on task 49
2019-09-02 17:20:27.331373
One or more runs of this setup were already performed on the task.
Run 8745470 on task 49
2019-09-02 17:20:29.650263
One or more runs of this setup were already performed on the task.
Run 8850476 on task 49
2019-09-02 17:20:32.422211
One or more runs of this setup were already performed on the task.
Run 8984144 on task 49
2019-09-02 17:20:35.318991
One or more runs of this setup were already performed on the tas

One or more runs of this setup were already performed on the task.
Run 6045454 on task 49
2019-09-02 17:23:52.849912
One or more runs of this setup were already performed on the task.
Run 6057941 on task 49
2019-09-02 17:23:55.609572
One or more runs of this setup were already performed on the task.
Run 6047445 on task 49
2019-09-02 17:23:58.137516
One or more runs of this setup were already performed on the task.
Run 8821242 on task 49
2019-09-02 17:24:00.412655
One or more runs of this setup were already performed on the task.
Run 9201842 on task 49
2019-09-02 17:24:02.914499
One or more runs of this setup were already performed on the task.
Run 9202044 on task 49
2019-09-02 17:24:06.362498
One or more runs of this setup were already performed on the task.
Run 9202890 on task 49
2019-09-02 17:24:09.128303
One or more runs of this setup were already performed on the task.
Run 9201923 on task 49
2019-09-02 17:24:11.892762
One or more runs of this setup were already performed on the tas

One or more runs of this setup were already performed on the task.
Run 10376362 on task 49
2019-09-02 17:27:18.972430
One or more runs of this setup were already performed on the task.
Run 9202983 on task 49
2019-09-02 17:27:22.661200
One or more runs of this setup were already performed on the task.
Run 8737515 on task 49
2019-09-02 17:27:26.468776
One or more runs of this setup were already performed on the task.
Run 6058985 on task 49
2019-09-02 17:27:29.729271
One or more runs of this setup were already performed on the task.
Run 8697887 on task 49
2019-09-02 17:27:32.795933
One or more runs of this setup were already performed on the task.
Run 9202483 on task 49
2019-09-02 17:27:35.255723
One or more runs of this setup were already performed on the task.
Run 8837975 on task 49
2019-09-02 17:27:38.330395
One or more runs of this setup were already performed on the task.
Run 8741229 on task 49
2019-09-02 17:27:41.401021
One or more runs of this setup were already performed on the ta

One or more runs of this setup were already performed on the task.
Run 9201845 on task 49
2019-09-02 17:30:50.631356
One or more runs of this setup were already performed on the task.
Run 6063709 on task 49
2019-09-02 17:30:53.961580
One or more runs of this setup were already performed on the task.
Run 6130768 on task 49
2019-09-02 17:30:57.082558
One or more runs of this setup were already performed on the task.
Run 8992021 on task 49
2019-09-02 17:30:59.796122
One or more runs of this setup were already performed on the task.
Run 9202222 on task 49
2019-09-02 17:31:02.516143
One or more runs of this setup were already performed on the task.
Run 8698113 on task 49
2019-09-02 17:31:05.686332
One or more runs of this setup were already performed on the task.
Run 4569642 on task 49
2019-09-02 17:31:09.987910
One or more runs of this setup were already performed on the task.
Run 8750238 on task 49
2019-09-02 17:31:12.800868
One or more runs of this setup were already performed on the tas

KeyboardInterrupt: 

## Trying to make callables work

In [17]:
def cont(X):
    """Return a boolean mask over the columns of ``X`` that are not categorical.

    ``X`` must expose a ``dtypes`` attribute (e.g. a pandas DataFrame); the
    mask is True for every column whose dtype is not ``'category'``.
    """
    non_categorical = X.dtypes != 'category'
    return non_categorical

def cat(X):
    """Return a boolean mask over the columns of ``X`` that are categorical.

    ``X`` must expose a ``dtypes`` attribute (e.g. a pandas DataFrame); the
    mask is True for every column whose dtype is ``'category'``.
    """
    is_categorical = X.dtypes == 'category'
    return is_categorical

In [42]:
# get relevant info from dataset object: features and target only, the
# categorical flags and attribute names are discarded
X, y, _, _ = data.get_data(target=data.default_target_attribute)

# inspect the container type returned when no dataset_format is requested
type(X)

pandas.core.frame.DataFrame

In [132]:
# task 23 (one of the mixed-feature tasks) serves as the test case
task = openml.tasks.get_task(23)

# dataset behind the task
data = openml.datasets.get_dataset(task.dataset_id)

# features, target and per-column metadata in the default output format
X, y, categorical_indicator, attribute_names = data.get_data(
    target=data.default_target_attribute
)

# boolean column masks: categorical columns and their complement
cat = categorical_indicator
num = [not is_categorical for is_categorical in categorical_indicator]

def cat_call(A):
    """Column selector: boolean mask marking the categorical columns of ``A``.

    Reads ``A.dtypes``, so ``A`` has to provide that attribute (e.g. a pandas
    DataFrame); the mask is True where the column dtype is ``'category'``.
    """
    categorical_mask = A.dtypes == 'category'
    return categorical_mask

def num_call(A):
    """Column selector: boolean mask marking the non-categorical columns of ``A``.

    Reads ``A.dtypes``, so ``A`` has to provide that attribute (e.g. a pandas
    DataFrame); the mask is True where the column dtype is not ``'category'``.
    """
    non_categorical_mask = A.dtypes != 'category'
    return non_categorical_mask

# make columntransformer: standardise the non-categorical columns and one-hot
# encode the categorical ones, each selected through its callable
# NOTE: both callables read `.dtypes`, so the pipeline has to be fed a
# DataFrame; the recorded output of this cell shows the run failing with
# "'numpy.ndarray' object has no attribute 'dtypes'"
preprocessor = make_column_transformer((StandardScaler(), num_call),
                                       (OneHotEncoder(categories = 'auto'), cat_call))

# make pipeline
clf = SVC(gamma = 'scale', random_state=3)
pipe = make_pipeline(preprocessor, clf)

# run task (duplicate runs are explicitly allowed) and upload the result
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
run.publish()

# check setup
openml.runs.get_run(run.run_id).setup_id

  warn("Cannot convert {} to '{}'. Returning input data.".format(data_type, array_format))


PyOpenMLError: 'numpy.ndarray' object has no attribute 'dtypes'

In [133]:
%debug

> [0;32m/miniconda3/lib/python3.7/site-packages/openml/extensions/sklearn/extension.py[0m(1372)[0;36m_run_model_on_fold[0;34m()[0m
[0;32m   1370 [0;31m        [0;32mexcept[0m [0mAttributeError[0m [0;32mas[0m [0me[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1371 [0;31m            [0;31m# typically happens when training a regressor on classification task[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1372 [0;31m            [0;32mraise[0m [0mPyOpenMLError[0m[0;34m([0m[0mstr[0m[0;34m([0m[0me[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1373 [0;31m[0;34m[0m[0m
[0m[0;32m   1374 [0;31m        [0;32mif[0m [0misinstance[0m[0;34m([0m[0mtask[0m[0;34m,[0m [0;34m([0m[0mOpenMLClassificationTask[0m[0;34m,[0m [0mOpenMLLearningCurveTask[0m[0;34m)[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> up
> [0;32m/miniconda3/lib/python3.7/site-packages/openml/runs/functions.py[0m(446)[0;36m_run_t

## Trying to write my own transformer

In [49]:
from sklearn.base import BaseEstimator, TransformerMixin
class TypeSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the DataFrame columns of one dtype.

    Parameters
    ----------
    dtype : object
        Dtype specification forwarded to
        ``DataFrame.select_dtypes(include=[dtype])``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Learn nothing and return the selector itself."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose dtype matches ``self.dtype``.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        # explicit check instead of ``assert`` so it still runs under ``python -O``
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                'TypeSelector expects a pandas DataFrame, got '
                + type(X).__name__
            )
        return X.select_dtypes(include=[self.dtype])

In [125]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Feature union with one branch per dtype family: boolean columns are passed
# through, numeric columns are standardised, categorical columns are one-hot
# encoded.
transformer = Pipeline(steps=[
    ('features', FeatureUnion(
        transformer_list=[
            ('bool', Pipeline(steps=[
                ('selector', TypeSelector('bool')),
            ])),
            ('num', Pipeline(steps=[
                ('selector', TypeSelector(np.number)),
                ('scaler', StandardScaler()),
            ])),
            ('cat', Pipeline(steps=[
                ('selector', TypeSelector('category')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
            ])),
        ],
        n_jobs=1,
    )),
])

In [128]:
# get task
task = openml.tasks.get_task(3022)

# get dataset object (not referenced again in this cell)
data = openml.datasets.get_dataset(task.dataset_id)

# make pipeline: a bare SVC without any preprocessing step
clf = SVC(gamma = 'scale', random_state=3)
pipe = make_pipeline(clf)

# run task (duplicate runs are explicitly allowed) and upload the result
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
run.publish()

# check setup: read the setup id of the published run back from the server
openml.runs.get_run(run.run_id).setup_id

8234482

In [129]:
# get task
task = openml.tasks.get_task(23)

# get dataset object (not referenced again in this cell)
data = openml.datasets.get_dataset(task.dataset_id)

# make pipeline: a bare SVC without any preprocessing step
clf = SVC(gamma = 'scale', random_state=3)
pipe = make_pipeline(clf)

# run task (duplicate runs are explicitly allowed) and upload the result
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
run.publish()

# check setup: read the setup id of the published run back from the server
openml.runs.get_run(run.run_id).setup_id

8234482

In [127]:
# get task
task = openml.tasks.get_task(3022)

# get dataset object
data = openml.datasets.get_dataset(task.dataset_id)

# get relevant info from dataset object: features and target only, the
# categorical flags and attribute names are discarded
X, y, _, _ = data.get_data(target=data.default_target_attribute)

# make pipeline: the dtype-based `transformer` followed by the SVC
clf = SVC(gamma = 'scale', random_state=3)
pipe = make_pipeline(transformer, clf)

# fit locally on the loaded data instead of running the task through OpenML
pipe.fit(X, y)

  warn("Cannot convert {} to '{}'. Returning input data.".format(data_type, array_format))


Pipeline(memory=None,
         steps=[('pipeline',
                 Pipeline(memory=None,
                          steps=[('features',
                                  FeatureUnion(n_jobs=1,
                                               transformer_list=[('bool',
                                                                  Pipeline(memory=None,
                                                                           steps=[('selector',
                                                                                   TypeSelector(dtype='bool'))],
                                                                           verbose=False)),
                                                                 ('num',
                                                                  Pipeline(memory=None,
                                                                           steps=[('selector',
                                                                                   TypeSe