# Run missing models

## Import modules

In [1]:
import openml
import json
import collections
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.compose import make_column_transformer, ColumnTransformer
from matplotlib import pyplot as plt
from utils.functions_analyze_runs import get_run_info_svc
from datetime import datetime
import signal


# set api key
# The key is read from the first line of the local '.key' file; the context
# manager makes sure the file handle is closed as soon as the key is read.
with open('.key', 'r') as key_file:
    openml.config.apikey = key_file.readline().strip('\n')

# list the supervised classification tasks (task_type_id=1) that carry the
# 'OpenML-CC18' tag, returned as a DataFrame
tasks_all = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe', tag='OpenML-CC18')

In [2]:
# load the missing runs, keeping only the second and third CSV columns
# (selected by position while parsing rather than sliced afterwards)
missing = pd.read_csv('svc_missing.csv', usecols=[1, 2])

In [3]:
# show the table of runs read from 'svc_missing.csv' (one setup_id / task_id pair per row)
missing

Unnamed: 0,setup_id,task_id
0,8255924,15
1,8255509,18
2,8255534,18
3,8255535,18
4,8255559,18
...,...,...
1353,8255930,167120
1354,8255938,167120
1355,8255939,167120
1356,8255951,167120


In [4]:
# distinct task ids that still have at least one missing run
missing['task_id'].unique()

array([    15,     18,     22,     32,     37,     43,    219,   3903,
         3904,   3917,   7592,   9952,   9960,   9976,   9977,   9985,
        10093,  10101,  14965,  14969, 146195, 146822, 167119, 167120])

## Iterate through missing runs in random order

In [5]:
# define timeout handler
def handler(signum, frame):
    """Signal handler that aborts the interrupted code with a timeout error.

    Parameters
    ----------
    signum : int
        Number of the delivered signal (not used).
    frame : frame object or None
        Stack frame that was executing when the signal arrived (not used).

    Raises
    ------
    TimeoutError
        Always, with the message "Timeout!". TimeoutError is a subclass of
        Exception, so handlers written as `except Exception` still catch it,
        while new code can tell a timeout apart from other failures.
    """
    raise TimeoutError("Timeout!")

# Register the signal function handler
signal.signal(signal.SIGALRM, handler)

<Handlers.SIG_DFL: 0>

In [None]:
# Time limit in seconds for a single run; when it elapses, SIGALRM is delivered
# and the registered handler raises inside the `try` block below.
RUN_TIME_LIMIT = 600

# Loop once over every task that still has missing runs. The task ids are
# taken from `missing` itself (in order of first appearance) so the loop stays
# in sync with the CSV instead of relying on a hand-copied list.
for task_id in map(int, missing['task_id'].unique()):

    # get task
    task = openml.tasks.get_task(task_id)

    # get dataset object
    data = openml.datasets.get_dataset(task.dataset_id)

    # get relevant info from dataset object
    # (only `categorical_indicator` is used below)
    X, y, categorical_indicator, attribute_names = data.get_data(dataset_format='array',
                                                                target=data.default_target_attribute)

    # boolean masks with feature types: `cat` marks categorical columns,
    # `num` is its element-wise negation
    cat = categorical_indicator
    num = [not is_categorical for is_categorical in categorical_indicator]

    # create column transformers (each kept wrapped in its own pipeline)
    numeric_transformer = make_pipeline(StandardScaler())

    categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num),
            ('cat', categorical_transformer, cat)])

    # loop over the missing setups of this task in random order
    # (`sample(frac=1)` returns all rows shuffled)
    for setup_id in missing[missing['task_id'] == task_id].setup_id.sample(frac=1):

        print('Run', setup_id, 'on task', task_id)
        print(datetime.now())

        try:

            # set time limit for everything inside this `try` block
            signal.alarm(RUN_TIME_LIMIT)

            # get params of the third pipeline step of the stored setup
            params = openml.setups.initialize_model(setup_id).steps[2][1].get_params()

            # define classifier
            clf = SVC(**params)

            # pick pipeline according to feature types
            if not any(categorical_indicator):
                # numeric features only
                pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), clf)
            elif all(categorical_indicator):
                # categorical features only
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'), clf)
            else:
                # mixed feature types: impute first, then scale / encode per column
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), preprocessor, clf)

            # run the model on the task
            run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)

            # print feedback
            print('Publish openml run...')

            # publish the run
            run.publish()

            # print feedback
            print('View run online: https://www.openml.org/r/' + str(run.run_id))

            # fetch the published run once and reuse it for both fields
            published_run = openml.runs.get_run(run.run_id)
            print('Setup', published_run.setup_id)
            print('Flow', published_run.flow_id)
            print()

        except Exception as e:
            # best effort: report the failure (including timeouts) and continue
            # with the next setup
            print(e)

        finally:
            # cancel any pending alarm so it cannot fire outside the `try`
            # block (e.g. while the next task is loaded or after the loop ends)
            signal.alarm(0)


Run 8255924 on task 15
2020-08-17 13:12:57.314017
Publish openml run...
View run online: https://www.openml.org/r/10559658
Setup 8270498
Flow 17494

Run 8255559 on task 18
2020-08-17 13:13:00.006518


In [None]:
# NOTE(review): three bare integer literals; executing this cell has no effect
# beyond displaying the last value. They look like setup ids (8255559 matches a
# setup_id listed for task 18 in `missing`) — presumably scratch notes on runs
# to re-check; confirm their purpose or remove the cell.
8255692
8255559
8255736