# Run missing models

## Import modules

In [6]:
import openml
from openml import tasks, flows, runs
import sklearn
from sklearn import feature_selection
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import pprint
from collections import OrderedDict, Counter
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
import re
import random
import numpy as np
from datetime import datetime
import sys
import json
from itertools import combinations
import signal

import importlib
import utils.functions_analyze_runs
importlib.reload(utils.functions_analyze_runs)
importlib.reload(sklearn)
from utils.functions_analyze_runs import get_run_info_rf, get_run_info

# set api key
openml.config.apikey = open('.key', 'r').readline().strip('\n')


In [3]:
missing = pd.read_csv('rf_missing.csv').iloc[:,[1,2]]

In [4]:
missing

Unnamed: 0,setup_id,task_id
0,8261487,3021
1,8261494,3021
2,8261509,3021
3,8261514,3021
4,8261516,3021
...,...,...
940,8262202,167125
941,8262208,167125
942,8262236,167125
943,8262261,167125


## Iterate through missing runs in random order

In [7]:
# infinite loop
for i in [3021]:

    # get task
    task = openml.tasks.get_task(i)
    
    # get dataset object
    data = openml.datasets.get_dataset(task.dataset_id)

    # get relevant info from dataset object
    X, y, categorical_indicator, attribute_names = data.get_data(dataset_format='array',
                                                                target=data.default_target_attribute)

    # mask with feature types
    cat = categorical_indicator
    num = [not k for k in categorical_indicator]

    # create column transformers
    numeric_transformer = make_pipeline(#SimpleImputer(strategy='median'), 
                                        StandardScaler())

    categorical_transformer = make_pipeline(#SimpleImputer(strategy='most_frequent'),
                                            OneHotEncoder(handle_unknown='ignore'))

    preprocessor = ColumnTransformer(
    transformers=[
    ('num', numeric_transformer, num),
    ('cat', categorical_transformer, cat)])
    
    # loop over runs in random order
    for k in missing[missing['task_id'] == i].setup_id:
        
        print('Run', k, 'on task', i)
        print(datetime.now())
        
        try:
            # get params
            params = openml.setups.initialize_model(k).steps[2][1].get_params()

            # define classifier
            clf = RandomForestClassifier(**params)

            # pick pipeline according to feature types
            if not any(categorical_indicator):
                pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), clf)
            elif all(categorical_indicator):
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'), clf)
            else:
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), preprocessor, clf)
                
            # run best model on the task
            run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)

            # print feedbackack
            print('Publish openml run...')

            # push tag
            # run.push_tag('best_models')
            # publish the run
            
            run.publish()
            # print feedback
            print('View run online: https://www.openml.org/r/' + str(run.run_id))
            print('Setup', openml.runs.get_run(run.run_id).setup_id)
            print('Flow', openml.runs.get_run(run.run_id).flow_id)
            print()

        except Exception as e:
            print(e)


Run 8261487 on task 3021
2020-08-11 19:04:50.235504
boolean index did not match indexed array along dimension 0; dimension is 28 but corresponding boolean dimension is 29
Run 8261494 on task 3021
2020-08-11 19:04:50.360689
boolean index did not match indexed array along dimension 0; dimension is 28 but corresponding boolean dimension is 29
Run 8261509 on task 3021
2020-08-11 19:04:50.476811
boolean index did not match indexed array along dimension 0; dimension is 28 but corresponding boolean dimension is 29
Run 8261514 on task 3021
2020-08-11 19:04:50.579511
boolean index did not match indexed array along dimension 0; dimension is 28 but corresponding boolean dimension is 29
Run 8261516 on task 3021
2020-08-11 19:04:50.693682
boolean index did not match indexed array along dimension 0; dimension is 28 but corresponding boolean dimension is 29
Run 8261517 on task 3021
2020-08-11 19:04:50.820064
boolean index did not match indexed array along dimension 0; dimension is 28 but correspondin

KeyboardInterrupt: 