# Best runs on all tasks

## Import modules

In [1]:
# import modules
import openml
from openml import tasks, flows, runs
import sklearn
from sklearn import feature_selection
from sklearn.svm import SVC
import pandas as pd
import pprint
from collections import OrderedDict, Counter
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
import re
import random
import numpy as np
from datetime import datetime
import sys
import matplotlib.pyplot as plt
import json
from itertools import combinations
from utils.functions_analyze_runs import get_run_info_svc
from dabl import detect_types
import signal

# set api key
openml.config.apikey = open('.key', 'r').readline().strip('\n')


In [2]:
# get all supervised classification tasks
tasks_all = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe', tag = 'OpenML-CC18')
# drop problematic tasks
tasks_all = tasks_all.drop([3573, 146825, 167121, 167124])


## Get OpenML runs for SVC flows

In [3]:
# get SVC evals
good_flows = [6246, 6952, 8330, 6954, 7756, 5499, 8317, 7223, 6009, 7707, 6269, 5983, 16374, 16347, 16345]
evals = openml.evaluations.list_evaluations('area_under_roc_curve',
                                            flow= good_flows, 
                                            task=list(tasks_all.tid),
                                            output_format='dataframe'
                                            )

# rank evaluations
evals['rank'] = evals.groupby('task_id')['value'].rank('first', ascending=False)

# get best evaluations
best_evals = evals.loc[evals['rank'] <= 5]

In [4]:
# drop problematic runs
best_evals = best_evals[best_evals.run_id != 6148258]
best_evals = best_evals[best_evals.run_id != 8231647]

In [5]:
best_evals.shape

(336, 15)

## Check categorical / numerical / mixed features

In [6]:
# empty list to populate with feature types
types = []

for i in tasks_all.tid:
    print(i, '', end = '')
    
    # get task
    task = openml.tasks.get_task(i)

    # get dataset object 
    data = openml.datasets.get_dataset(task.dataset_id)

    # get relevant info from dataset object
    X, y, categorical_indicator, attribute_names = data.get_data(dataset_format='array',
                                                                target=data.default_target_attribute)
    
    if not any(categorical_indicator):
        types.append((i, 'numeric'))
    elif all(categorical_indicator):
        types.append((i, 'categorical'))
    else:
        types.append((i, 'mixed'))

cat_num = pd.DataFrame(types, columns = ['tid', 'cat_num'])
cat_num = pd.DataFrame(types, columns=['tid', 'cat_num'])

3 6 11 12 14 15 16 18 22 23 28 29 31 32 37 43 45 49 53 219 2074 2079 3021 3022 3481 3549 3560 3902 3903 3904 3913 3917 3918 7592 9910 9946 9952 9957 9960 9964 9971 9976 9977 9978 9981 9985 10093 10101 14952 14954 14965 14969 14970 125920 125922 146195 146800 146817 146819 146820 146821 146822 146824 167119 167120 167125 167140 167141 

In [11]:
cat_num

Unnamed: 0,tid,cat_num
0,3,categorical
1,6,numeric
2,11,numeric
3,12,numeric
4,14,numeric
...,...,...
63,167119,numeric
64,167120,numeric
65,167125,mixed
66,167140,categorical


In [8]:
# check distribution
cat_num['cat_num'].value_counts()

numeric        45
mixed          15
categorical     8
Name: cat_num, dtype: int64

In [9]:
# check ids of mixed feature tasks
list(cat_num.tid.loc[cat_num.cat_num == 'mixed'])

[23,
 29,
 31,
 219,
 2079,
 3021,
 3022,
 7592,
 9971,
 9977,
 14954,
 14965,
 125920,
 167125,
 167141]

In [10]:
task_ids = cat_num[cat_num.cat_num != 'mixed'].tid

## Loop over all tasks

In [12]:
# define timeout handler
def handler(signum, frame):
    raise Exception("Timeout!")
    
# Register the signal function handler
signal.signal(signal.SIGALRM, handler)

<Handlers.SIG_DFL: 0>

In [14]:
# infinite loop
while 1:

    # randomly sample a task
    # i = tasks_all.tid.sample().iloc[0] # this samples from all tasks
    i = task_ids.sample() # sample from numeric or categorical only
    
    # get task
    task = openml.tasks.get_task(i)
    
    # get dataset object
    data = openml.datasets.get_dataset(task.dataset_id)

    # get relevant info from dataset object
    X, y, categorical_indicator, attribute_names = data.get_data(dataset_format='array',
                                                                target=data.default_target_attribute)

    # mask with feature types
    cat = categorical_indicator
    num = [not k for k in categorical_indicator]

    # create column transformers
    numeric_transformer = make_pipeline(#SimpleImputer(strategy='median'), 
                                        StandardScaler())

    categorical_transformer = make_pipeline(#SimpleImputer(strategy='most_frequent'),
                                            OneHotEncoder(handle_unknown='ignore'))

    preprocessor = ColumnTransformer(
    transformers=[
    ('num', numeric_transformer, num),
    ('cat', categorical_transformer, cat)])
    
    # loop over runs in random order
    for k in best_evals.run_id.sample(frac=1):
        
        # set time limit
        signal.alarm(3600)
        
        print('Run', k, 'on task', i)
        print(datetime.now())
        
        try:
            # get params
            params = get_run_info_svc(k)

            # define classifier
            clf = SVC(**params)

            # pick pipeline according to feature types
            if not any(categorical_indicator):
                pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), clf)
            elif all(categorical_indicator):
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'), clf)
            else:
                print('Skip task with mixed features...')
                break
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), preprocessor, clf)
                
            # run best model on the task
            run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=True)

            # print feedbackack
            print('Publish openml run...')

            # push tag
            # run.push_tag('best_models')
            # publish the run
            
            run.publish()
            # print feedback
            print('View run online: https://www.openml.org/r/' + str(run.run_id))
            print('Setup', openml.runs.get_run(run.run_id).setup_id)
            print('Flow', openml.runs.get_run(run.run_id).flow_id)
            print()

        except Exception as e:
            print(e)


Run 10426223 on task 63    167119
Name: tid, dtype: int64
2020-01-23 22:10:44.110534


KeyboardInterrupt: 

Run 5939306 on task 3913
2019-12-03 14:20:38.101589
Publish openml run...
View run online: https://www.openml.org/r/10418929
Setup 8255506
Flow 17447

Run 5939306 on task 32
2019-12-03 14:23:50.265571
Publish openml run...
View run online: https://www.openml.org/r/10418930
Setup 8255506
Flow 17447

Run 5939306 on task 43
2019-12-03 14:25:07.758053
Publish openml run...
View run online: https://www.openml.org/r/10418931
Setup 8255506
Flow 17447



Run 5939306 on task 3
2019-12-03 14:30:01.105288
Publish openml run...
View run online: https://www.openml.org/r/10418933
Setup 8255508
Flow 17449

## Trying to make callables work

In [3]:
def cont(X):
    return X.dtypes != 'category'

def cat(X):
    return X.dtypes == 'category'

In [4]:
# get relevant info from dataset object
X, y, _, _ = data.get_data(target=data.default_target_attribute)

type(X)

NameError: name 'data' is not defined

In [13]:
cont(X)

Wifes_age                        True
Wifes_education                 False
Husbands_education              False
Number_of_children_ever_born     True
Wifes_religion                  False
Wifes_now_working%3F            False
Husbands_occupation             False
Standard-of-living_index        False
Media_exposure                  False
dtype: bool

In [5]:
# get task
task = openml.tasks.get_task(23)

# get dataset object 
data = openml.datasets.get_dataset(task.dataset_id)

# get relevant info from dataset object
X, y, categorical_indicator, attribute_names = data.get_data(target=data.default_target_attribute, )

cat = categorical_indicator
num = [not k for k in categorical_indicator]

def cat_call(A):
    return A.dtypes == 'category'

def num_call(A):
    return A.dtypes != 'category'

# make columntransformer
#numeric_transformer = make_pipeline(StandardScaler())
#categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
#preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, cont), 
#                                              ('cat', categorical_transformer, cat)])

preprocessor = make_column_transformer((StandardScaler(), cat_call),
                                      (OneHotEncoder(categories = 'auto'), num_call))

# make pipeline
clf = SVC(gamma = 'scale', random_state=3)
pipe = make_pipeline(preprocessor, clf)

# run task
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
run.publish()

# check setup 
openml.runs.get_run(run.run_id).setup_id

ValueError: Found unknown categories [16] in column 1 during transform

In [133]:
%debug

> [0;32m/miniconda3/lib/python3.7/site-packages/openml/extensions/sklearn/extension.py[0m(1372)[0;36m_run_model_on_fold[0;34m()[0m
[0;32m   1370 [0;31m        [0;32mexcept[0m [0mAttributeError[0m [0;32mas[0m [0me[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1371 [0;31m            [0;31m# typically happens when training a regressor on classification task[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1372 [0;31m            [0;32mraise[0m [0mPyOpenMLError[0m[0;34m([0m[0mstr[0m[0;34m([0m[0me[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1373 [0;31m[0;34m[0m[0m
[0m[0;32m   1374 [0;31m        [0;32mif[0m [0misinstance[0m[0;34m([0m[0mtask[0m[0;34m,[0m [0;34m([0m[0mOpenMLClassificationTask[0m[0;34m,[0m [0mOpenMLLearningCurveTask[0m[0;34m)[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> up
> [0;32m/miniconda3/lib/python3.7/site-packages/openml/runs/functions.py[0m(446)[0;36m_run_t

## Trying to write my own transformer

In [49]:
from sklearn.base import BaseEstimator, TransformerMixin
class TypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])

In [125]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler

transformer = Pipeline([('features', FeatureUnion(n_jobs=1, transformer_list=[
        ('bool', Pipeline([('selector', TypeSelector('bool')),])), 
        ('num', Pipeline([('selector', TypeSelector(np.number)),('scaler', StandardScaler()),])), 
        ('cat', Pipeline([('selector', TypeSelector('category')),('encoder', OneHotEncoder(handle_unknown='ignore')),]))  
    ])),
])

In [8]:
# get task
task = openml.tasks.get_task(3022)

# get dataset object 
data = openml.datasets.get_dataset(task.dataset_id)

# make pipeline
clf = SVC(gamma = 'scale', random_state=3, probability = True)
pipe = make_pipeline(clf)

# run task
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
run.publish()

# check setup 
openml.runs.get_run(run.run_id).setup_id

ValueError: could not convert string to float: 'Male'

In [5]:
# get task
task = openml.tasks.get_task(23)

# get dataset object 
data = openml.datasets.get_dataset(task.dataset_id)

# make pipeline
clf = SVC(gamma = 'scale', random_state=3, probability=True)
pipe = make_pipeline(clf)

# run task
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
run.publish()

# check setup 
openml.runs.get_run(run.run_id).setup_id

8254405

In [6]:
# get task
task = openml.tasks.get_task(3022)

# get dataset object 
data = openml.datasets.get_dataset(task.dataset_id)

# get relevant info from dataset object
X, y, _, _ = data.get_data(target=data.default_target_attribute)

# make pipeline
clf = SVC(gamma = 'scale', random_state=3)
pipe = make_pipeline(transformer, clf)

pipe.fit(X, y)

NameError: name 'transformer' is not defined

In [14]:
best_evals

Unnamed: 0,run_id,task_id,setup_id,flow_id,flow_name,data_id,data_name,function,upload_time,uploader,uploader_name,value,values,array_data,rank
49,6013835,3,4048524,6952,sklearn.pipeline.Pipeline(imputation=openmlstu...,3,kr-vs-kp,area_under_roc_curve,2017-07-18 02:36:44,1,janvanrijn@gmail.com,0.999875,,"[0.999875,0.999875]",5.0
110,6042216,3,4076896,6952,sklearn.pipeline.Pipeline(imputation=openmlstu...,3,kr-vs-kp,area_under_roc_curve,2017-07-22 06:33:22,1,janvanrijn@gmail.com,0.999883,,"[0.999883,0.999883]",2.0
165,6045896,3,4080554,6952,sklearn.pipeline.Pipeline(imputation=openmlstu...,3,kr-vs-kp,area_under_roc_curve,2017-07-25 11:07:05,1,janvanrijn@gmail.com,0.999897,,"[0.999897,0.999897]",1.0
166,6045925,3,4080583,6952,sklearn.pipeline.Pipeline(imputation=openmlstu...,3,kr-vs-kp,area_under_roc_curve,2017-07-25 11:11:52,1,janvanrijn@gmail.com,0.999883,,"[0.999883,0.999883]",3.0
302,6056722,3,4091371,6952,sklearn.pipeline.Pipeline(imputation=openmlstu...,3,kr-vs-kp,area_under_roc_curve,2017-07-28 13:34:05,1,janvanrijn@gmail.com,0.999876,,"[0.999876,0.999876]",4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209462,9202016,167141,7131814,8330,sklearn.pipeline.Pipeline(imputation=openmlstu...,40701,churn,area_under_roc_curve,2018-05-30 17:40:00,1,janvanrijn@gmail.com,0.903748,,"[0.903748,0.903748]",2.0
209464,9202044,167141,7131842,8330,sklearn.pipeline.Pipeline(imputation=openmlstu...,40701,churn,area_under_roc_curve,2018-05-30 17:41:17,1,janvanrijn@gmail.com,0.903111,,"[0.903111,0.903111]",3.0
209465,9202052,167141,7131850,8330,sklearn.pipeline.Pipeline(imputation=openmlstu...,40701,churn,area_under_roc_curve,2018-05-30 17:41:43,1,janvanrijn@gmail.com,0.902715,,"[0.902715,0.902715]",4.0
209468,9202337,167141,7132135,8330,sklearn.pipeline.Pipeline(imputation=openmlstu...,40701,churn,area_under_roc_curve,2018-05-30 18:02:54,1,janvanrijn@gmail.com,0.906675,,"[0.906675,0.906675]",1.0


In [17]:
# task id
i = 29

# get task
task = openml.tasks.get_task(i)

# get dataset object 
data = openml.datasets.get_dataset(task.dataset_id)

# get relevant info from dataset object
X, y, categorical_indicator, attribute_names = data.get_data(dataset_format='array',
                                                            target=data.default_target_attribute)

# mask with feature types
cat = categorical_indicator
num = [not k for k in categorical_indicator]

# create column transformers
numeric_transformer = make_pipeline(#SimpleImputer(strategy='median'), 
                                    StandardScaler())

categorical_transformer = make_pipeline(#SimpleImputer(strategy='most_frequent'),
                                        OneHotEncoder(handle_unknown='ignore'))

preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, num),
('cat', categorical_transformer, cat)])


print('Get params...')

# params 
k = 6013835

# get params
params = get_run_info_svc(k)

# define classifier
clf = SVC(**params)

# pick pipeline according to feature types
if not any(categorical_indicator):
    pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), clf)
elif all(categorical_indicator):
    pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'), clf)
else:
    pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), preprocessor, clf)

print('Fit model...')   
    
# run best model on the task
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=True)

# print feedbackack
print('Publish openml run...')

# push tag
#run.push_tag('best_models')

# publish the run 
run.publish()
# print feedback
print('View run online: https://www.openml.org/r/' + str(run.run_id))
print('Setup', openml.runs.get_run(run.run_id).setup_id)
print('Flow', openml.runs.get_run(run.run_id).flow_id)
print()

Get params...
Fit model...


OpenMLRunsExistError: One or more runs of this setup were already performed on the task.

In [None]:
Publish openml run...
View run online: https://www.openml.org/r/10417329
Setup 8254403
Flow 17337

In [None]:
Publish openml run...
View run online: https://www.openml.org/r/10417330
Setup 8254404
Flow 17337

In [6]:
X

NameError: name 'X' is not defined

In [9]:
# get task
task = openml.tasks.get_task(29)

# get dataset object 
data = openml.datasets.get_dataset(task.dataset_id)

# get relevant info from dataset object
X, y, categorical_indicator, attribute_names = data.get_data(dataset_format='dataframe',
                                                            target=data.default_target_attribute)

In [25]:
def cat_call(X):
    return detect_types(X).categorical.values

def num_call(X):
    return detect_types(X).continuous.values


In [18]:
detect_types(X)

Unnamed: 0,continuous,dirty_float,low_card_int,categorical,date,free_string,useless
A1,False,False,False,True,False,False,False
A2,True,False,False,False,False,False,False
A3,True,False,False,False,False,False,False
A4,False,False,False,True,False,False,False
A5,False,False,False,True,False,False,False
A6,False,False,False,True,False,False,False
A7,False,False,False,True,False,False,False
A8,True,False,False,False,False,False,False
A9,False,False,False,True,False,False,False
A10,False,False,False,True,False,False,False


In [10]:
# get task
task = openml.tasks.get_task(3)

def cat_call(X):
    return X.dtypes == 'category'

def num_call(X):
    return X.dtypes != 'category'


preprocessor = make_column_transformer((StandardScaler(), cat_call), 
                                       (OneHotEncoder(categories = 'auto'), num_call))

# make pipeline
clf = SVC(gamma = 'scale', random_state=3)
pipe = make_pipeline(preprocessor, clf)

# run task
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
run.publish()

# check setup 
openml.runs.get_run(run.run_id).setup_id

ValueError: could not convert string to float: 'f'

In [3]:
# get params
params = get_run_info_svc(6013835)

In [4]:
params

{'C': 0.07077231909653779,
 'coef0': 0.9005967890758899,
 'degree': 4,
 'gamma': 2.867595836610148,
 'kernel': 'poly',
 'max_iter': -1,
 'random_state': 1,
 'probability': True}