# Run missing models

## Import modules

In [3]:
import openml
from openml import tasks, flows, runs
import sklearn
from sklearn import feature_selection
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import pprint
from collections import OrderedDict, Counter
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
import re
import random
import numpy as np
from datetime import datetime
import sys
import json
from itertools import combinations
import signal

import importlib
import utils.functions_analyze_runs
# Re-execute the local helper module so that edits made to it on disk are
# picked up without restarting the kernel; the `from ... import` below then
# binds the freshly reloaded functions.
importlib.reload(utils.functions_analyze_runs)
# NOTE(review): importlib.reload only re-executes the top-level `sklearn`
# package, not its already-imported submodules - confirm this is still needed.
importlib.reload(sklearn)
from utils.functions_analyze_runs import get_run_info_rf, get_run_info

# set api key
# The key is the first line of the local '.key' file. The context manager
# guarantees the file handle is closed, and strip() drops any trailing
# newline / carriage return so the key is sent without stray whitespace.
with open('.key', 'r') as key_file:
    openml.config.apikey = key_file.readline().strip()


In [4]:
# Load the (setup_id, task_id) pairs whose runs are still missing.
# The columns are selected by name rather than by position so that the frame
# always has setup_id first and task_id second, regardless of how the CSV
# columns are ordered or whether an extra index column was written to it.
missing = pd.read_csv('rf_missing.csv')[['setup_id', 'task_id']]

In [5]:
# Show the loaded frame as the cell output for a quick visual check.
missing

Unnamed: 0,setup_id,task_id
0,8261526,45
1,8261527,45
2,8261528,45
3,8261540,45
4,8261544,45
...,...,...
1337,8262187,167140
1338,8262195,167140
1339,8262202,167140
1340,8262208,167140


## Iterate through missing runs in random order

In [6]:
# Shuffle the rows: frac=1 draws every row exactly once, in random order.
# No random_state is passed, so the order differs on every execution.
missing = missing.sample(frac=1)

In [7]:
# Endless loop: keep drawing one random missing (setup, task) pair, rebuild the
# model for that setup, run it on the task and publish the run to OpenML.
# The loop never finishes on its own - interrupt the kernel to stop it.

# wall-clock budget for a single run, in seconds
RUN_TIME_LIMIT = 3600


def _raise_run_timeout(signum, frame):
    """SIGALRM handler that aborts the run in progress with a TimeoutError."""
    raise TimeoutError('run exceeded the time limit of %d seconds' % RUN_TIME_LIMIT)


# SIGALRM's default action terminates the process (here: the kernel), so a
# handler is required for signal.alarm() to merely skip a run that is too slow.
signal.signal(signal.SIGALRM, _raise_run_timeout)

while True:

    # draw one missing run at random
    miss = missing.sample()

    # task of the sampled row; cast to a plain int because pandas returns a
    # numpy integer
    i = int(miss['task_id'].iloc[0])

    # get task
    task = openml.tasks.get_task(i)

    # get dataset object
    data = openml.datasets.get_dataset(task.dataset_id)

    # get relevant info from dataset object
    X, y, categorical_indicator, attribute_names = data.get_data(
        dataset_format='array',
        target=data.default_target_attribute)

    # boolean masks with feature types
    cat = categorical_indicator
    num = [not is_categorical for is_categorical in categorical_indicator]

    # create column transformers (missing values are imputed by a separate
    # pipeline step placed in front of the column transformer, see below)
    numeric_transformer = make_pipeline(StandardScaler())

    categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num),
            ('cat', categorical_transformer, cat)])

    # loop over the setup(s) of the sampled row
    for k in miss['setup_id']:
        k = int(k)

        print('Run', k, 'on task', i)
        print(datetime.now())

        # set time limit; it is cancelled again in the `finally` clause
        signal.alarm(RUN_TIME_LIMIT)

        try:
            # get params of the classifier step of the stored setup
            params = openml.setups.initialize_model(k).steps[2][1].get_params()

            # define classifier
            clf = RandomForestClassifier(**params)

            # pick pipeline according to feature types
            if not any(categorical_indicator):
                pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), clf)
            elif all(categorical_indicator):
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'), clf)
            else:
                pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), preprocessor, clf)

            # run the model on the task
            run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=True)

            # print feedback
            print('Publish openml run...')

            # publish the run
            run.publish()

            # fetch the published run once and report its identifiers
            published = openml.runs.get_run(run.run_id)
            print('View run online: https://www.openml.org/r/' + str(run.run_id))
            print('Setup', published.setup_id)
            print('Flow', published.flow_id)
            print()

        except Exception as e:
            # best effort: report the failure (including a timeout) and go on
            # with the next sampled run
            print(type(e).__name__ + ':', e)

        finally:
            # cancel a pending alarm so it cannot fire while the next task or
            # dataset is being downloaded
            signal.alarm(0)


Run 8261570 on task 3481
2020-05-18 21:23:25.771525


KeyboardInterrupt: 