In [2]:
from requests import get
import numpy as np
import pandas as pd
import json
from os import listdir
import seaborn as sns
import matplotlib.pyplot as plt
%load_ext autotime
%matplotlib inline  

In [None]:
"""
Workflow:
- Only look at data from supervised classification tasks from OpenML
- Get all data relating to when a sklearn primitive is used
   - Looking at all supervised classification tasks, get all flows
   - Only look at tasks where a sklearn flow is used
   - Get run information for sklearn runs for those tasks
   - Get dataset information for those tasks
   - Get accuracy information for those runs
- Get all data relating to when a weka primitive is used
   - Only use tasks where sklearn flow is used to keep dataset consistent
   - Only look at tasks where a sklearn flow is used
   - Get run information for sklearn runs for those tasks
   - Get accuracy information for those runs
- Get dataset qualities and features
"""

In [2]:
# API Key for downloading data from OpenML
api_key = "1dbab546719d8085308a5bfde18381d7"

time: 3.58 ms


In [4]:
# Download all the flows from OpenML and save all flow names by there id
all_flows = get("https://www.openml.org/api/v1/json/flow/list?api_key=" + api_key).json()
flow_id_names = {}
for flow in all_flows['flows']['flow']:
    flow_id_names[flow['id']] = flow['name']
    
# with open('OpenML_Data/flow_id_names.json', 'w') as outfile:
#     json.dump(flow_id_names, outfile)

time: 5.04 ms


In [5]:
# Download all the tasks from OpenML and get all task ids for superivsed classification
all_tasks = get("https://www.openml.org/api/v1/json/task/list?api_key=" + api_key).json()
supervised_classification_task_ids = []
for task in all_tasks['tasks']['task']:
    if task['task_type'] == 'Supervised Classification':
        supervised_classification_task_ids.append(task['task_id'])

time: 4.38 s


In [8]:
# Get all runs for all supervised classification tasks and get all unique flow ids used
supervised_classification_flow_ids = []
for task_id in supervised_classification_task_ids:
    try:
        runs = get("https://www.openml.org/api/v1/json/run/list/task/" + str(task_id) + "/limit/10000?api_key=" + api_key)
        runs = runs.json()
    except:
        runs = json.loads(runs.text.replace('\n', ''))
    if 'error' not in runs:
        for run in runs['runs']['run']:
            supervised_classification_flow_ids.append(run['flow_id'])
        supervised_classification_flow_ids = list(set(supervised_classification_flow_ids))

time: 4.68 ms


In [None]:
# Get the flow descriptions for all extracted flow ids
supervised_classification_flow_descriptions = []
for flow in supervised_classification_flow_ids[1:]:
    flow_description = get("https://www.openml.org/api/v1/json/flow/" + str(flow) + "?api_key=" + api_key).json()
    if 'error' not in flow_description:
        supervised_classification_flow_descriptions.append(flow_description)

In [None]:
### Sklearn Download

In [45]:
# Get all the flow ids for sklearn flows
sklearn_flows = []
for flow in supervised_classification_flow_descriptions:
    if flow['flow']['name'][0:7] == 'sklearn':
        sklearn_flows.append(flow['flow']['id'])

time: 6.08 ms


In [None]:
# Get all flow descriptions for sklearn flows and runs for each sklearn flow
num_flows = len(sklearn_flows)
sklearn_flow_descriptions = {}
sklearn_runs_info = {}
for flow in sklearn_flows:
    runs_info = get("https://www.openml.org/api/v1/json/run/list/flow/" + flow + "?api_key=" + api_key).json()
    flow_description = get("https://www.openml.org/api/v1/json/flow/" + flow + "?api_key=" + api_key).json()
    if ('error' not in runs_info) and ('error' not in flow_description):
        sklearn_runs_info[flow] = runs_info
        sklearn_flow_descriptions[flow] = flow_description
        
# with open('OpenML_Data/sklearn_flow_descriptions.json', 'w') as outfile:
#     json.dump(sklearn_flow_descriptions, outfile)
# with open('OpenML_Data/sklearn_runs_info.json', 'w') as outfile:
#     json.dump(sklearn_runs_info, outfile)

In [78]:
# Get all unique task and setup ids for sklearn runs
sklearn_task_ids = []
sklearn_setup_ids = []
for flow,runs in sklearn_runs_info.items():
    for run in runs['runs']['run']:
        sklearn_task_ids.append(run['task_id'])
        sklearn_setup_ids.append(run['setup_id'])
sklearn_task_ids = list(set(sklearn_task_ids))
sklearn_setup_ids = list(set(sklearn_setup_ids))

time: 58.8 ms


In [None]:
# Get task descriptions for all tasks with any runs that use a sklearn flow
sklearn_task_descriptions = {}
for task_id in sklearn_task_ids:
    task_description = get("https://www.openml.org/api/v1/json/task/" + str(task_id) + "?api_key=" + api_key).json()
    sklearn_task_descriptions[task_id] = task_description
    
# with open('OpenML_Data/sklearn_task_descriptions.json', 'w') as outfile:
#     json.dump(sklearn_task_descriptions, outfile)

In [None]:
# Get the setup descriptions for all sklearn setup ids
sklearn_setup_descriptions = {}
for setup_id in sklearn_setup_ids:
    setup_description = get("https://www.openml.org/api/v1/json/setup/" + str(setup_id) + "?api_key=" + api_key).json()
    sklearn_setup_descriptions[setup_id] = setup_description
    
# with open('OpenML_Data/sklearn_setup_descriptions.json', 'w') as outfile:
#     json.dump(sklearn_setup_descriptions, outfile)

In [143]:
# Get the dataset ids for all sklearn tasks
sklearn_data_set_ids = []
for task_id,task_description in sklearn_task_descriptions.items():
    data_set_id = task_description['task']['input'][0]['data_set']['data_set_id']
    sklearn_data_set_ids.append(data_set_id)

time: 4.6 ms


In [None]:
# Get the dataset descriptions for all sklearn datasets
sklearn_data_set_descriptions = {}
for data_set_id in sklearn_data_set_ids:
    data_set_description = get("https://www.openml.org/api/v1/json/data/" + str(data_set_id) + "?api_key=" + api_key).json()
    sklearn_data_set_descriptions[data_set_id] = data_set_description
    
# with open('OpenML_Data/sklearn_data_set_descriptions.json', 'w') as outfile:
#     json.dump(sklearn_data_set_descriptions, outfile)

In [None]:
# Get the run descriptions for all sklearn runs (takes very long)
sklearn_run_descriptions = {}
for flow,runs in sklearn_runs_info.items():
    for run in runs['runs']['run']:
        run_id = run['run_id']
        run_description = get("https://www.openml.org/api/v1/json/run/" + str(run_id) + "?api_key=" + api_key).json()
        sklearn_run_descriptions[run_id] = run_description
        
# with open('OpenML_Data/sklearn_run_descriptions.json', 'w') as outfile:
#     json.dump(sklearn_run_descriptions, outfile)

In [19]:
# Get all sklearn primitives for each run
flows_per_run = {}
for run_id,run_description in sklearn_run_descriptions.items():
    flow_id = run_description['run']['flow_id']
    flow_names = []
    flow_description = sklearn_flow_descriptions[flow_id]['flow']
    if 'component' in flow_description:
        compnents = flow_description['component']
        if type(compnents) == dict:
            flow_names.append(compnents['flow']['class_name'])
            flows_per_run[run_id] = flow_names
        else:
            for flow in flow_description['component']:
                flow_names.append(flow['flow']['class_name'])
            flows_per_run[run_id] = flow_names
    else:
        flows_per_run[run_id] = [flow_description['name']]

time: 5.08 s


In [20]:
# Get the number of times each sklearn flow is run
flow_names = {}
for run,flows in flows_per_run.items():
    for flow in flows:
        if flow in flow_names:
            flow_names[flow] += 1
        else:
            flow_names[flow] = 1

time: 36.1 ms


In [21]:
# Load the accuracy of all runs for all sklearn tasks (computed on brown ccv)
task_accuracies = {}
for file in listdir("OpenML_Data/accuracy_results/results")[1:]:
    with open('OpenML_Data/accuracy_results/results/' + file) as data_file:
        acc = json.load(data_file)
    task_accuracies.update(acc)

time: 36.7 ms


In [25]:
# Get the adjusted average accuracy (flow accuracy minus mean task accuracy) for all sklearn flows
sklearn_flow_accuracies = {}
for run_id, run_description in sklearn_run_descriptions.items():
    for metric in run_description['run']['output_data']['evaluation']:
        if metric['name'] == 'predictive_accuracy':
            accuracy = float(metric['value'])
            try:
                avg_task_acc = task_accuracies[run_description['run']['task_id']]
                adjusted_accuracy = (accuracy - avg_task_acc)/avg_task_acc
                for flow in flows_per_run[run_id]:
                    if flow in sklearn_flow_accuracies:
                        sklearn_flow_accuracies[flow].append(adjusted_accuracy)
                    else:
                        sklearn_flow_accuracies[flow] = [adjusted_accuracy]
                break
            except:
                pass
            
# with open('OpenML_Data/sklearn_flow_accuracies.json', 'w') as outfile:
#     json.dump(sklearn_flow_accuracies, outfile)

time: 3.64 s


In [46]:
# Create dataframe of average sklearn flow adjusted accuracies
mean_flow_accuracies = pd.DataFrame(columns=['Name', 'Adj_Accuracy', 'Count', 'sd'])
for flow,accuracies in sklearn_flow_accuracies.items():
    mean_flow_accuracies = mean_flow_accuracies.append(pd.DataFrame([[flow,np.asarray(accuracies).astype(np.float).mean(), len(accuracies), np.asarray(accuracies).astype(np.float).std()]], columns=['Name', 'Adj_Accuracy', 'Count', 'sd']))
mean_flow_accuracies = mean_flow_accuracies[mean_flow_accuracies['Count'] >= 10]

time: 595 ms


In [3]:
# Reformatting flow accuracies for plotting
flow_accuracies_df = pd.DataFrame(columns=['Name', 'Adj_Accuracy'])
for flow,accuracies in sklearn_flow_accuracies.items():
    for accuracy in accuracies:
        flow_accuracies_df = flow_accuracies_df.append(pd.DataFrame([[flow,accuracy]], columns=['Name', 'Adj_Accuracy']))
flow_accuracies_df = flow_accuracies_df[flow_accuracies_df.groupby('Name').Adj_Accuracy.transform(len) > 10]

# flow_accuracies_df.to_csv("OpenML_Data/sklearn_flow_accuracies.csv")

In [5]:
# Load data 
with open('OpenML_Data/sklearn_run_descriptions.json') as data_file:
    sklearn_run_descriptions = json.load(data_file)
with open('OpenML_Data/flow_descriptions.json') as data_file:
    flow_descriptions = json.load(data_file)

time: 22.8 s


In [None]:
### Weka Download

In [15]:
# Get all flows that are weka (java)
weka_flows = []
for flow in flow_descriptions:
    if flow['flow']['name'][0:4] == 'weka':
        weka_flows.append(flow['flow']['id'])

time: 9.91 ms


In [None]:
# Get all flow descriptions for weka flows and runs for each weka flow (23 minutes)
weka_flow_descriptions = {}
weka_runs_info = {}
for flow in weka_flows:
    runs_info = get("https://www.openml.org/api/v1/json/run/list/flow/" + flow + "?api_key=" + api_key).json()
    flow_description = get("https://www.openml.org/api/v1/json/flow/" + flow + "?api_key=" + api_key).json()
    if ('error' not in runs_info) and ('error' not in flow_description):
        weka_runs_info[flow] = runs_info
        weka_flow_descriptions[flow] = flow_description
        
# with open('OpenML_Data/weka_flow_descriptions.json', 'w') as outfile:
#     json.dump(weka_flow_descriptions, outfile)
# with open('OpenML_Data/weka_runs_info.json', 'w') as outfile:
#     json.dump(weka_runs_info, outfile)

In [20]:
# Get all unique task and setup ids for weka runs
weka_task_ids = []
weka_setup_ids = []
for flow,runs in weka_runs_info.items():
    for run in runs['runs']['run']:
        weka_task_ids.append(run['task_id'])
        weka_setup_ids.append(run['setup_id'])
weka_task_ids = list(set(weka_task_ids))
weka_setup_ids = list(set(weka_setup_ids))

time: 210 ms


In [None]:
# # Get the setup descriptions for all weka setup ids (55 minutes)
weka_setup_descriptions = {}
for setup_id in weka_setup_ids:
    setup_description = get("https://www.openml.org/api/v1/json/setup/" + str(setup_id) + "?api_key=" + api_key).json()
    weka_setup_descriptions[setup_id] = setup_description
    
# with open('OpenML_Data/weka_setup_descriptions.json', 'w') as outfile:
#     json.dump(weka_setup_descriptions, outfile)

In [34]:
# Get all task ids for sklearn tasks
with open('OpenML_Data/sklearn_task_descriptions.json') as data_file:
    sklearn_task_descriptions = json.load(data_file)
sklearn_task_ids = []
for _,task in sklearn_task_descriptions.items():
    sklearn_task_ids.append(task['task']['task_id'])

time: 3.61 ms


In [None]:
### The following code was run on Brown CCV enironment

In [None]:
# Get the run descriptions for all weka runs (26 hours)
weka_run_descriptions = {}
for flow,runs in weka_runs_info.items():
    for run in runs['runs']['run']:
        if str(run['task_id']) in sklearn_task_ids:
            run_id = run['run_id']
            run_description = get("https://www.openml.org/api/v1/json/run/" + str(run_id) + "?api_key=" + api_key).json()
            weka_run_descriptions[run_id] = run_description
            
# with open('OpenML_Data/weka_run_descriptions.json', 'w') as outfile:
#     json.dump(weka_run_descriptions, outfile)

In [None]:
# Get all weka primitives for each run
flows_per_run = {}
for run_id,run_description in weka_run_descriptions.items():
    flow_id = run_description['run']['flow_id']
    flow_names = []
    flow_description = weka_flow_descriptions[flow_id]['flow']
    name = flow_description['name']
    flow_names.append(name)
    flows_per_run[run_id] = flow_names

In [None]:
# Get the number of times each weka flow is run
flow_names = {}
for run,flows in flows_per_run.items():
    for flow in flows:
        if flow in flow_names:
            flow_names[flow] += 1
        else:
            flow_names[flow] = 1

In [None]:
# Get the adjusted average accuracy (flow accuracy minus mean task accuracy) for all weka flows
weka_flow_accuracies = {}
for run_id, run_description in weka_run_descriptions.items():
    for metric in run_description['run']['output_data']['evaluation']:
        if metric['name'] == 'predictive_accuracy':
            accuracy = float(metric['value'])
            try:
                avg_task_acc = task_accuracies[run_description['run']['task_id']]
                adjusted_accuracy = (accuracy - avg_task_acc)/avg_task_acc
                for flow in flows_per_run[run_id]:
                    if flow in weka_flow_accuracies:
                        weka_flow_accuracies[flow].append(adjusted_accuracy)
                    else:
                        weka_flow_accuracies[flow] = [adjusted_accuracy]
                break
            except:
                pass

In [26]:
# Read in weka flow accuracies if not already in the environment
with open('OpenML_Data/weka_flow_accuracies.json') as data_file:
    weka_flow_accuracies = json.load(data_file)

time: 128 ms


In [32]:
# Create dataframe of average weka flow adjusted accuracies
mean_flow_accuracies = pd.DataFrame(columns=['Name', 'Adj_Accuracy', 'Count', 'sd'])
for flow,accuracies in weka_flow_accuracies.items():
    mean_flow_accuracies = mean_flow_accuracies.append(pd.DataFrame([[flow,np.asarray(accuracies).astype(np.float).mean(), len(accuracies), np.asarray(accuracies).astype(np.float).std()]], columns=['Name', 'Adj_Accuracy', 'Count', 'sd']))
mean_flow_accuracies = mean_flow_accuracies[mean_flow_accuracies['Count'] >= 1000]

# mean_flow_accuracies.to_csv('OpenML_Data/weka_mean_flow_accuracies.csv')

time: 1.69 s


In [47]:
# Reformatting flow accuracies for plotting (9 minutes)
flow_accuracies_df = pd.DataFrame(columns=['Name', 'Adj_Accuracy'])
for flow,accuracies in weka_flow_accuracies.items():
    for accuracy in accuracies:
        flow_accuracies_df = flow_accuracies_df.append(pd.DataFrame([[flow,accuracy]], columns=['Name', 'Adj_Accuracy']))
flow_accuracies_df = flow_accuracies_df[flow_accuracies_df.groupby('Name').Adj_Accuracy.transform(len) > 1000]

# flow_accuracies_df.to_csv("OpenML_Data/weka_flow_accuracies_df.csv")

time: 7min 15s


In [None]:
### Dataset download

In [3]:
# Load data
with open('OpenML_Data/sklearn_task_descriptions.json') as data_file:
    sklearn_task_descriptions = json.load(data_file)  
with open('OpenML_Data/sklearn_data_set_descriptions.json') as data_file:
    sklearn_data_set_descriptions = json.load(data_file)

time: 66.7 ms


In [23]:
# Get all sklearn data set ids and file ids
data_set_ids = []
data_set_file_ids = {}
for task_id,task_description in sklearn_task_descriptions.items():
    data_set_id = task_description['task']['input'][0]['data_set']['data_set_id']
    data_set_file_id = sklearn_data_set_descriptions[data_set_id]['data_set_description']['file_id']
    data_set_ids.append(data_set_id)
    data_set_file_ids[data_set_id] = data_set_file_id

time: 8.11 ms


In [1]:
# Download all sklearn dataset qualities and features from OpenML (run on Brown CCV enviornment, takes very long)
num_data_sets = len(data_set_ids)
sklearn_data_set_features = {}
sklearn_data_set_qualities = {}
for data_set_id in data_set_ids:
    url='https://www.openml.org/data/get_csv/' + str(data_set_file_ids[data_set_id])
    response = get(url).content
    with open("Data/" + str(data_set_id) + ".csv", 'wb') as f:
        f.write(response)
    qualities = get("https://www.openml.org/api/v1/json/data/qualities/" + str(data_set_id) + "?api_key=" + api_key).json()
    try:
        features = get("https://www.openml.org/api/v1/json/data/features/" + str(data_set_id) + "?api_key=" + api_key).json()
    except:
        features = {}
        print("Failed on data set id =" + str(data_set_id))
    sklearn_data_set_qualities[data_set_id] = qualities
    sklearn_data_set_features[data_set_id] = features
    
# with open('OpenML_Data/sklearn_data_set_qualities.json', 'w') as outfile:
#     json.dump(sklearn_data_set_qualities, outfile)

# with open('OpenML_Data/sklearn_data_set_features.json', 'w') as outfile:
#     json.dump(sklearn_data_set_features, outfile)