In [None]:
# imports
import json
import matplotlib.pyplot as plt
import numpy as np
import openmlpimp
import os
import pandas as pd
import pickle

# Maps a human-readable classifier name (printed next to each plot) to the
# directory suffix under which its results are stored. The suffix is appended
# to 'data/fanova/' and 'data/priors/' by the cells below.
classifiers = {
  # random forest is currently disabled:
  #'random forest': '6969/vanilla/', 
  'adaboost':      '6970/vanilla', 
  'svm (rbf)':     '7707/kernel_rbf', 
  'svm (sigmoid)': '7707/kernel_sigmoid'
}

%matplotlib inline

# Marginal Contribution

In [None]:
def marginal_plots(sorted_values, keys, fig_title):
    """Draw one violin per entry of ``sorted_values`` and show the figure.

    Parameters
    ----------
    sorted_values : sequence of sequences of numbers
        One dataset per violin, drawn left to right at x = 0, 1, 2, ...
    keys : iterable of str
        Tick labels, one per violin, in the same order as ``sorted_values``.
    fig_title : str
        Printed to stdout before the figure is shown; it is not drawn on
        the figure itself.
    """
    positions = list(range(len(sorted_values)))

    plt.figure()
    plt.violinplot(list(sorted_values), positions)
    # Dashed zero reference line spanning all violins. Color and line style
    # are given once each: combining the 'k-' format string with
    # linestyle='--' defines the line style twice.
    plt.plot([-0.5, len(sorted_values) - 0.5], [0, 0], color='k', linestyle='--', lw=1)
    plt.xticks(positions, list(keys), rotation=45, ha='right')
    plt.ylabel('marginal contribution')
    print(fig_title)
    plt.show()
    plt.close()


def determine_relevant(data, max_items=None, max_interactions=None):
    """Order the entries of ``data`` by median value, largest first.

    Parameters
    ----------
    data : dict
        Maps a key (str) to a non-empty sequence of numbers.
    max_items : int or None
        If given, only the first ``max_items`` entries of the (already
        interaction-filtered) ordering are returned.
    max_interactions : int or None
        Keys containing ``'__'`` are counted as interactions; once more than
        ``max_interactions`` of them have been seen, further ones are
        dropped. ``None`` (the default) keeps all of them.

    Returns
    -------
    (sorted_values, keys) : tuple of two lists
        The value sequences and their keys, in matching order.
    """
    from statistics import median

    sorted_values = []
    keys = []
    interactions_seen = 0
    for key in sorted(data, key=lambda k: median(data[k]), reverse=True):
        if '__' in key:
            interactions_seen += 1
            # None means "no cap". Without this check the default argument
            # raised TypeError (int > None) on the first interaction key.
            if max_interactions is not None and interactions_seen > max_interactions:
                continue

        sorted_values.append(data[key])
        keys.append(key)

    if max_items is not None:
        sorted_values = sorted_values[:max_items]
        keys = keys[:max_items]

    return sorted_values, keys

In [None]:
# One violin plot of marginal contributions per configured classifier.
for classifier, directory_suffix in classifiers.items():
    fanova_directory = 'data/fanova/' + directory_suffix
    contributions = openmlpimp.utils.obtain_marginal_contributions(fanova_directory)
    # Only the second element of the result is plotted here.
    total_ranks, marginal_contribution, _ = contributions
    # Keep at most the three highest-ranked interaction entries.
    sorted_values, keys = determine_relevant(
        marginal_contribution,
        max_interactions=3,
    )
    marginal_plots(sorted_values, keys, classifier)

# Most important hyperparameter per dataset

In [None]:
# Task meta-features used as the scatter plot's x and y coordinates.
x_axis_feature = 'NumberOfInstances'
y_axis_feature = 'NumberOfFeatures'

# The task meta-features do not depend on the classifier: read them once
# (closing the file afterwards) instead of re-opening the file per classifier.
with open('data/fanova/task_qualities.json', 'r') as fp:
    task_qualities = json.load(fp)

for classifier, directory_suffix in classifiers.items():
    # Keyed by the most important hyperparameter of a task; each value is a
    # list with one entry per task for which that hyperparameter ranked first.
    x_vals = {}
    y_vals = {}
    area = {}

    directory = 'data/fanova/' + directory_suffix
    for task_id in os.listdir(directory):
        task_dir = os.path.join(directory, task_id)
        if not os.path.isdir(task_dir):
            continue
        pimp_file = os.path.join(task_dir, 'pimp_values_fanova.json')
        interaction_file = os.path.join(task_dir, 'pimp_values_fanova_interaction.json')

        # Tasks lacking either result file are skipped.
        if not (os.path.isfile(pimp_file) and os.path.isfile(interaction_file)):
            continue

        # Merge the values from both files into a single mapping.
        with open(pimp_file, 'r') as fp:
            hyperparameters = json.load(fp)
        with open(interaction_file, 'r') as fp:
            hyperparameters.update(json.load(fp))

        most_important = max(hyperparameters, key=hyperparameters.get)
        value = hyperparameters[most_important]

        if most_important not in x_vals:
            x_vals[most_important] = []
            y_vals[most_important] = []
            area[most_important] = []
        x_vals[most_important].append(float(task_qualities[task_id][x_axis_feature]))
        y_vals[most_important].append(float(task_qualities[task_id][y_axis_feature]))
        # Marker area scales with the value of the winning entry.
        area[most_important].append(float(value) * 50)

    plotted_items = []
    legend_keys = []
    for param in x_vals.keys():
        current = plt.scatter(x_vals[param], y_vals[param], s=area[param], alpha=0.9)
        plotted_items.append(current)
        legend_keys.append(param)

    legend = plt.legend(plotted_items, legend_keys, scatterpoints=1, loc='upper right')
    # Give all legend markers the same size. `legend_handles` is the name used
    # by Matplotlib >= 3.7; `legendHandles` is the older spelling, which was
    # deprecated and later removed.
    legend_handles = getattr(legend, 'legend_handles', None)
    if legend_handles is None:
        legend_handles = legend.legendHandles
    for handle in legend_handles:
        handle.set_sizes([50])

    print(classifier)
    # dimensions of the datasets
    plt.axis((450,100000,3,2100))
    plt.xscale("log")
    plt.yscale("log")

    plt.xlabel(x_axis_feature, fontsize='xx-large')
    plt.ylabel(y_axis_feature, fontsize='xx-large')
    plt.show()

# Verification
The plots in the submission were generated with an external plotting library. To keep this notebook easy to follow, we plot only the raw results here.

In [None]:
# TODO

# Prior experiments

In [None]:
def plot_violin(results):
    """Show a violin plot of per-task mean score differences (kde - uniform).

    Parameters
    ----------
    results : dict
        Maps a task id to a dict. Only tasks whose dict has exactly the two
        keys ``'kde'`` and ``'uniform'``, each a non-empty mapping with
        numeric values, contribute a data point; all other tasks are ignored.

    For every contributing task the plotted value is
    ``mean(kde scores) - mean(uniform scores)``. If no task contributes,
    a note is printed and no figure is drawn.
    """
    data = []
    for task_id in results:
        task_results = results[task_id]
        if len(task_results) != 2 or 'uniform' not in task_results or 'kde' not in task_results:
            continue
        scores_kde = task_results['kde']
        scores_uniform = task_results['uniform']
        if len(scores_kde) == 0 or len(scores_uniform) == 0:
            continue

        mean_kde = sum(scores_kde.values()) / len(scores_kde)
        mean_uniform = sum(scores_uniform.values()) / len(scores_uniform)
        data.append(mean_kde - mean_uniform)

    if not data:
        # violinplot cannot draw an empty dataset; report instead of crashing.
        print('No task has both kde and uniform results; nothing to plot.')
        return

    plt.figure(figsize=(4, 12))
    # tick_params expects booleans: the strings 'off' are truthy and would
    # leave the x ticks and labels visible.
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    # Dashed zero reference line across the single violin (centred at x=1).
    plt.plot([0.5, 1.5], [0, 0], color='k', linestyle='--', lw=1)
    plt.violinplot(data)
    plt.show()
    plt.close()


# For every configured classifier, load its cached test results and plot them.
for classifier, directory_suffix in classifiers.items():
    directory = 'data/priors/' + directory_suffix
    cache_file = directory + '/cache_test.pkl'
    if not os.path.isfile(cache_file):
        # Single formatted message: passing two positional arguments to
        # ValueError renders the message as a tuple.
        raise ValueError('Could not find cache file: ' + cache_file)
    # NOTE(review): unpickling can execute arbitrary code; only load cache
    # files that come from a trusted source.
    with open(cache_file, 'rb') as fp:
        cache_results_test = pickle.load(fp)
    print(classifier)
    plot_violin(cache_results_test)