In [None]:
import operator
from itertools import accumulate
from os import listdir
from os.path import splitext, join as join_path, exists as file_exists

import numpy as np
import pandas as pd
from bokeh.models import HoverTool, CDSView, BooleanFilter
from bokeh.plotting import figure, ColumnDataSource
from bokeh.layouts import gridplot
from bokeh.resources import INLINE
from bokeh.io import output_notebook, show

from classifiers import RegretTrial
from colors import from_file as colors_from_file
from images import ImageUtils, NeuralNetwork, ImageDataset

# Direct Bokeh output to the notebook. Passing the INLINE resources asks Bokeh to embed
# BokehJS in the notebook output instead of referencing a CDN copy (per the Bokeh docs).
output_notebook(resources=INLINE)

## Functions

In [None]:
def create_constant_list(value, size):
    """Return a list made of `size` repetitions of `value`.

    The same object is repeated (not copied), so a mutable `value` is shared
    by every slot of the result. A `size` of zero or less yields an empty list.
    """
    repeated = [value] * size
    return repeated


def trials_to_dataframe(trials):
    """Summarise every trial as one row: identifier, mean regret, mean accuracy.

    Args:
        trials: a re-iterable collection (it is traversed once per column) of
            objects exposing `get_persistent_id()`, `mean_regret()` and
            `mean_accuracy()`.

    Returns:
        pandas.DataFrame with the columns 'trial_label', 'regret' and
        'accuracy', one row per trial, in the order the trials were given.
    """
    # (output column, name of the zero-argument trial method that fills it)
    column_getters = (
        ('trial_label', 'get_persistent_id'),
        ('regret', 'mean_regret'),
        ('accuracy', 'mean_accuracy'),
    )
    columns = {
        column: [getattr(trial, method_name)() for trial in trials]
        for column, method_name in column_getters
    }
    return pd.DataFrame(columns)


def trials_to_label_dataframe(trials):
    """Build cumulative searched/found curves for every (trial, new label) pair.

    For each trial, `trial.load_summary()` is expected to return a mapping
    ``{old_label: {label: count}}`` (presumably classification counts per old
    label; confirm against `RegretTrial`). For each new label the old labels
    are visited in the order given by `trial.label_distance_order(new_label)`
    and two running totals are recorded, each starting at 0:

    * searched: fraction of all summarised examples covered so far;
    * found: fraction of the new label's examples covered so far.

    Old labels that are missing from the summary are treated as contributing
    nothing, the same way `trials_to_dataframe_all` treats them, instead of
    raising `KeyError`.

    Args:
        trials: iterable of objects exposing `load_summary()`, `old_ys()`,
            `new_ys()`, `label_distance_order(new_label)` (yielding
            ``(old_label, distance)`` pairs) and `get_persistent_id()`.

    Returns:
        pandas.DataFrame with the columns 'trial_label', 'searched', 'found'.

    Raises:
        ZeroDivisionError: with plain Python numbers, when a trial's summary
            holds no examples at all or none for one of its new labels.
    """
    rows = []
    for trial in trials:
        summary = trial.load_summary()
        trial_id = trial.get_persistent_id()
        dataset_size = sum(
            sum(summary.get(old_label, {}).values())
            for old_label in trial.old_ys()
        )
        for new_label in trial.new_ys():
            new_label_size = sum(counts.get(new_label, 0) for counts in summary.values())
            # Resolve the visiting order once; both curves walk the same sequence.
            distance_order = [old_label for old_label, _ in trial.label_distance_order(new_label)]
            searched = [0] + list(accumulate(
                sum(summary.get(old_label, {}).values()) / dataset_size
                for old_label in distance_order
            ))
            found = [0] + list(accumulate(
                summary.get(old_label, {}).get(new_label, 0) / new_label_size
                for old_label in distance_order
            ))
            rows.extend(
                (trial_id, searched_so_far, found_so_far)
                for searched_so_far, found_so_far in zip(searched, found)
            )
    return pd.DataFrame(rows, columns=['trial_label', 'searched', 'found'])


def trials_to_dataframe_all(trials):
    """Flatten every trial into one long table of per-label search curves.

    For each trial and each of its new labels, the old labels are visited in
    the order returned by `trial.label_distance_order(new_label)`. One row is
    produced per step, preceded by a starting row at step 0 whose 'old_label'
    and 'label_accuracy' are NaN and whose 'searched'/'found' are 0.

    Columns of the result:
        order, old_label, searched, found, label_accuracy: per-step values
            (searched/found are running totals of fractions of the whole
            summary and of the new label's examples respectively);
        trial_label, new_labels, label_mean_regret, label_max_regret:
            constant within one (trial, new label) group;
        trial_mean_regret, trial_mean_accuracy: the mean of the
            'label_mean_regret' / 'label_accuracy' columns over all rows of
            the trial (NaN entries are skipped by pandas' mean).

    The per-group row index (0..k) is kept as-is, so index values repeat in
    the concatenated result.

    Args:
        trials: iterable of objects exposing `load_summary()`, `old_ys()`,
            `new_ys()`, `get_persistent_id()`, `label_distance_order()`,
            `label_accuracy()`, `label_mean_regret()`, `label_max_regret()`.

    Returns:
        pandas.DataFrame holding the rows of all trials.
    """
    step_columns = ['order', 'old_label', 'searched', 'found', 'label_accuracy']
    per_trial_frames = []
    for trial in trials:
        summary = trial.load_summary()
        dataset_size = sum(
            sum(summary.get(old_label, {}).values())
            for old_label in trial.old_ys()
        )
        trial_id = trial.get_persistent_id()
        per_label_frames = []
        for new_label in trial.new_ys():
            new_label_size = sum(counts.get(new_label, 0) for counts in summary.values())
            ranked_old_labels = [
                old_label
                for old_label, _distance in trial.label_distance_order(new_label)
            ]
            # Running totals, each prefixed with the "nothing searched yet" point.
            searched_steps = [
                sum(summary.get(old_label, {}).values()) / dataset_size
                for old_label in ranked_old_labels
            ]
            searched = [0, *accumulate(searched_steps)]
            found_steps = [
                summary.get(old_label, {}).get(new_label, 0) / new_label_size
                for old_label in ranked_old_labels
            ]
            found = [0, *accumulate(found_steps)]
            # Per-step columns; the leading NaN lines up with the starting point.
            accuracy = [np.nan]
            accuracy.extend(trial.label_accuracy(old_label) for old_label in ranked_old_labels)
            step_rows = list(zip(
                range(len(found)),
                [np.nan, *ranked_old_labels],
                searched,
                found,
                accuracy,
            ))
            label_frame = pd.DataFrame(step_rows, columns=step_columns)
            # Columns that hold a single value for the whole (trial, new label) group.
            group_constants = {
                'trial_label': trial_id,
                'new_labels': new_label,
                'label_mean_regret': trial.label_mean_regret(new_label),
                'label_max_regret': trial.label_max_regret(new_label),
            }
            for column, constant in group_constants.items():
                label_frame[column] = constant
            per_label_frames.append(label_frame)
        # Trial-level aggregates are broadcast onto every row of the trial.
        trial_frame = pd.concat(per_label_frames)
        mean_regret = trial_frame['label_mean_regret'].mean()
        trial_frame['trial_mean_regret'] = mean_regret
        mean_accuracy = trial_frame['label_accuracy'].mean()
        trial_frame['trial_mean_accuracy'] = mean_accuracy
        per_trial_frames.append(trial_frame)
    return pd.concat(per_trial_frames)


def create_image_tribulation(directory, dataset_str):
    """Return the per-label trial table for every model in `directory`, cached as CSV.

    The table is stored at ``tribulations/<directory>_<dataset_str>.csv``. If
    that file exists it is loaded and returned; otherwise one `RegretTrial` is
    built per file in `directory` whose name ends with 'hdf5' (in sorted
    order), the trials are flattened with `trials_to_dataframe_all`, and the
    result is written to the cache before being returned.

    The cache is written together with the frame's index, so it is read back
    with the first CSV column as the index. That way the cached result has the
    same columns and index values as the freshly computed one, instead of
    gaining an extra 'Unnamed: 0' column.

    Args:
        directory: directory holding the saved models; also used as the
            `path_prefix` of every `RegretTrial` and in the cache file name.
        dataset_str: either 'cifar10' or 'cifar100'.

    Returns:
        pandas.DataFrame as produced by `trials_to_dataframe_all`.

    Raises:
        AssertionError: if `dataset_str` is not one of the supported names.
    """
    # Local import: the file only imports `listdir` and a few `os.path` helpers.
    import os

    assert dataset_str in ['cifar10', 'cifar100']
    df_filename = 'tribulations/' + '_'.join([directory, dataset_str]) + '.csv'
    if file_exists(df_filename):
        return pd.read_csv(df_filename, index_col=0)

    utils = ImageUtils(dataset_str)
    dataset = ImageDataset(dataset_str)
    trials = []
    for filename in sorted(listdir(directory)):
        if not filename.endswith('hdf5'):
            continue
        path = join_path(directory, filename)
        print(path)
        classifier = NeuralNetwork(path)
        regret_trial = RegretTrial(classifier, utils, dataset, path_prefix=directory)
        trials.append(regret_trial)
    df = trials_to_dataframe_all(trials)
    # Create the cache directory on demand so the computed frame is not lost
    # to a missing 'tribulations/' folder.
    os.makedirs(os.path.dirname(df_filename), exist_ok=True)
    df.to_csv(df_filename)
    return df


def plot_tribulation(df, title='CIFAR 100 Pilot'):
    """Show two side-by-side Bokeh figures for a trial table.

    Left: one line per distinct 'trial_label', plotting 'found' against
    'searched'. Right: an x marker per row of `df` at
    ('trial_mean_accuracy', 'trial_mean_regret'), with a hover tooltip showing
    the trial label, accuracy and regret. Both figures draw from the same
    ColumnDataSource and offer only the box-select tool; selected glyphs are
    drawn in red (sharing the source is what Bokeh uses to link selections
    across figures).

    Args:
        df: DataFrame with at least the columns 'trial_label', 'searched',
            'found', 'trial_mean_accuracy' and 'trial_mean_regret'.
        title: title of the left figure; the right figure uses
            ``title + ' Trials'``. Defaults to the previously hard-coded
            'CIFAR 100 Pilot', so existing calls render exactly as before.

    Returns:
        None. The grid of figures is displayed via `bokeh.io.show`.
    """
    source = ColumnDataSource(df)
    tools = ['box_select']
    roc_fig = figure(
        width=400, height=400,
        title=title,
        x_axis_label='Proportion of All Data Searched',
        y_axis_label='Proportion of Label Data Found',
        tools=tools,
    )
    # Looked up once; every per-trial filter below is derived from this column.
    trial_labels = source.data['trial_label']
    for trial in df['trial_label'].unique():
        roc_fig.line(
            x='searched',
            y='found',
            selection_color='red',
            source=source,
            # Restrict this line to the rows that belong to the current trial.
            view=CDSView(
                source=source,
                filters=[
                    BooleanFilter([label == trial for label in trial_labels]),
                ],
            ),
        )
    regret_fig = figure(
        width=400, height=400,
        title=title + ' Trials',
        x_axis_label='Classifier Test Accuracy',
        y_axis_label='Absolute Regret (lower is better)',
        tools=tools,
    )
    renderer = regret_fig.x(
        x='trial_mean_accuracy',
        y='trial_mean_regret',
        selection_color='red',
        source=source,
    )
    # Attach the hover tool to the scatter renderer only.
    regret_fig.add_tools(HoverTool(renderers=[renderer], tooltips=[
        ('trial', '@trial_label'),
        ('accuracy', '@trial_mean_accuracy'),
        ('regret', '@trial_mean_regret'),
    ]))
    figures = [roc_fig, regret_fig]
    show(gridplot([figures]))

## Color Domain

## CIFAR-10, Three Labels Complete Sweep

### Load Data

In [None]:
# Preview the first rows of the CIFAR-10 table. It is read from
# tribulations/cifar-10_cifar10.csv when that cache file exists; otherwise it is
# computed from the 'hdf5' model files in cifar-10/ and the cache is written.
create_image_tribulation('cifar-10', 'cifar10').head()

### Plot Regret

In [None]:
# Plot the CIFAR-10 table. If the CSV cache already exists (normally written by the
# Load Data cell above), this call re-reads it instead of recomputing the trials.
plot_tribulation(create_image_tribulation('cifar-10', 'cifar10'))

## CIFAR-100 Pilot

### Load Data

In [None]:
# Preview the first rows of the CIFAR-100 pilot table. It is read from
# tribulations/cifar-100-pilot_cifar100.csv when that cache file exists; otherwise it is
# computed from the 'hdf5' model files in cifar-100-pilot/ and the cache is written.
create_image_tribulation('cifar-100-pilot', 'cifar100').head()

### Plot Regret

In [None]:
# Plot the CIFAR-100 pilot table. If the CSV cache already exists (normally written by
# the Load Data cell above), this call re-reads it instead of recomputing the trials.
plot_tribulation(create_image_tribulation('cifar-100-pilot', 'cifar100'))

## CIFAR-100 Orthogonal Exploration

In [None]:
# Preview the first rows of the CIFAR-100 orthogonal-exploration table. It is read from
# tribulations/cifar-100-orthog_cifar100.csv when that cache file exists; otherwise it is
# computed from the 'hdf5' model files in cifar-100-orthog/ and the cache is written.
create_image_tribulation('cifar-100-orthog', 'cifar100').head()