In [None]:
import re
from collections import namedtuple
from datetime import datetime
from os import listdir
from os.path import join as join_path, exists as file_exists, realpath, expanduser, basename, splitext

import pandas as pd
from bokeh.models import HoverTool, CDSView, BooleanFilter
from bokeh.plotting import figure, ColumnDataSource
from bokeh.layouts import gridplot
from bokeh.resources import INLINE
from bokeh.io import output_notebook, show

from classifiers import bucket_alist
from colors import from_file as color_from_file
from images import from_file as image_from_file, NeuralNetwork

# Configure Bokeh to render plots in notebook output cells, using the INLINE
# resources object imported above.
output_notebook(resources=INLINE)

## Functions

In [None]:
def bucket_to_dict(bucketed_list):
    """Map every bucket member to the position of its bucket in sorted order.

    Args:
        bucketed_list: iterable of ``(key, members)`` pairs.

    Returns:
        dict from member to the zero-based index of its bucket after sorting
        the pairs. A member that appears in several buckets keeps the index of
        the last one in sorted order.
    """
    ordered_buckets = sorted(bucketed_list)
    return {
        member: rank
        for rank, (_, members) in enumerate(ordered_buckets)
        for member in members
    }

# Values that depend only on an old class. trial_to_dataframe computes them
# once per old class and copies them onto every row for that class.
OldLabelData = namedtuple(
    'OldLabelData',
    'old_class old_label old_label_size '
    'label_true_positive_init label_true_positive_update',
)

# Column order of the dataframe built by trial_to_dataframe. TrialRow (below)
# uses the same names, in the same order, as its fields.
TRIAL_COLUMNS = (
    # trial information
    ['trial_id', 'mean_regret', 'mean_regret_scaled',
     'true_positive_init', 'true_positive_update']
    # old label information
    + ['old_class', 'old_label', 'old_label_size',
       'label_true_positive_init', 'label_true_positive_update']
    # new label information
    + ['new_class', 'new_label',
       'label_mean_regret', 'label_max_regret', 'label_mean_regret_scaled']
    # ranking information
    + ['heuristic', 'heuristic_rank',
       'misclassification', 'misclassification_rank']
)

# One record per (new class, old class) pair of a trial.
TrialRow = namedtuple('TrialRow', TRIAL_COLUMNS)


def trial_to_dataframe(trial):
    """Flatten one trial into a dataframe of (new class, old class) rows.

    One row is produced for every combination of a new class and a distinct
    old class. Trial-level values are repeated on every row, old-class values
    on every row for that old class, and new-class values on every row for
    that new class.

    Args:
        trial: object providing the accessors called below (``old_ys``,
            ``new_ys``, ``load_summary``, ``domain_utils.class_to_label`` and
            the regret / true-positive / ordering methods).

    Returns:
        pandas.DataFrame whose columns are TRIAL_COLUMNS, in that order.
    """
    # trial information
    trial_id = trial.get_persistent_id()
    mean_regret = trial.mean_regret()
    mean_regret_scaled = trial.mean_regret_scaled()
    true_positive_init = trial.true_positive_init()
    true_positive_update = trial.true_positive_update()
    # pre-calculate old label information once; it is duplicated across rows
    old_label_data = {}
    for old_class in trial.old_ys():
        old_label_data[old_class] = OldLabelData(
            old_class=old_class,
            old_label=trial.domain_utils.class_to_label(old_class),
            old_label_size=sum(trial.load_summary()[old_class].values()),
            label_true_positive_init=trial.label_true_positive_init(old_class),
            label_true_positive_update=trial.label_true_positive_update(old_class),
        )
    # loop through new labels and create dataframe rows
    trial_df_rows = []
    for new_class in trial.new_ys():
        # new class information
        new_label = trial.domain_utils.class_to_label(new_class)
        label_mean_regret = trial.label_mean_regret(new_class)
        label_max_regret = trial.label_max_regret(new_class)
        label_mean_regret_scaled = trial.label_mean_regret_scaled(new_class)
        # ranking information: raw values keyed by old class, and the bucket
        # index of each old class after bucketing those values
        # (presumably the order methods return (old_class, value) pairs -- TODO confirm)
        misclassifications = dict(trial.label_misclassification_order(new_class))
        heuristics = dict(trial.label_distance_order(new_class))
        misclassification_ranks = bucket_to_dict(bucket_alist(misclassifications))
        heuristic_ranks = bucket_to_dict(bucket_alist(heuristics))
        # cross with every old label
        for old_class, old_data in old_label_data.items():
            trial_df_rows.append(TrialRow(
                # trial information
                trial_id=trial_id,
                mean_regret=mean_regret,
                mean_regret_scaled=mean_regret_scaled,
                true_positive_init=true_positive_init,
                true_positive_update=true_positive_update,
                # old label information
                old_class=old_class,
                old_label=old_data.old_label,
                old_label_size=old_data.old_label_size,
                label_true_positive_init=old_data.label_true_positive_init,
                label_true_positive_update=old_data.label_true_positive_update,
                # new label information
                new_class=new_class,
                new_label=new_label,
                label_mean_regret=label_mean_regret,
                label_max_regret=label_max_regret,
                label_mean_regret_scaled=label_mean_regret_scaled,
                # ranking information: the raw value goes under the bare name
                # and the bucket index under `*_rank`, for both measures.
                # NOTE(review): cached .trial_df / .tribulation files written
                # while these two heuristic columns were swapped need to be
                # regenerated.
                heuristic=heuristics[old_class],
                heuristic_rank=heuristic_ranks[old_class],
                misclassification=misclassifications[old_class],
                misclassification_rank=misclassification_ranks[old_class],
            ))
    return pd.DataFrame(trial_df_rows, columns=TRIAL_COLUMNS)


def create_tribulation(directory):
    """Convert a collection of trials (a "tribulation") to a dataframe.

    Since what we are interested in is how the new classes are distributed
    among the old classes, each trial will contribute:

        |old_labels| * |new_labels|

    rows to the final dataframe. To make plotting easier, some information will
    be duplicated amongst these rows. In addition to trial-level data (which
    will be the same for all rows from a trial), information specific to the
    old label (label true positive rate at both the initialization and update
    stages) and to the new label (the mean and max regret) will also be
    duplicated.

    Results are cached as CSV inside the directory: one ``<trial_id>.trial_df``
    file per trial and one ``<directory name>.tribulation`` file for the
    combined dataframe. Existing cache files are read instead of recomputed.

    Args:
        directory: path to the trial directory; ``~`` is expanded. The
            directory name must start with ``color`` or ``cifar``, which
            selects the loader used for trial files.

    Returns:
        pandas.DataFrame with the rows of every trial and a fresh 0..n-1 index.

    Raises:
        ValueError: if the domain cannot be determined from the directory
            name, or if the directory contains no trial files.
    """
    directory = realpath(expanduser(directory))
    tribulation_file = join_path(directory, basename(directory) + '.tribulation')
    if file_exists(tribulation_file):
        return pd.read_csv(tribulation_file)
    if basename(directory).startswith('color'):
        from_file = color_from_file
    elif basename(directory).startswith('cifar'):
        from_file = image_from_file
    else:
        raise ValueError('Cannot determine domain for directory {}'.format(directory))
    trial_dfs = {}
    # sorted() makes the processing (and therefore row) order reproducible;
    # listdir() itself returns entries in arbitrary order.
    for i, filename in enumerate(sorted(listdir(directory))):
        trial_id = splitext(filename)[0]
        # only file stems with exactly one underscore are treated as trials
        if len(trial_id.split('_')) != 2:
            continue
        # several files (e.g. the trial and its .trial_df cache) share a stem
        if trial_id in trial_dfs:
            continue
        print(i, datetime.now().isoformat(), trial_id)
        trial_df_file = join_path(directory, trial_id + '.trial_df')
        if file_exists(trial_df_file):
            trial_df = pd.read_csv(trial_df_file)
        else:
            trial = from_file(join_path(directory, filename))
            trial_df = trial_to_dataframe(trial)
            trial_df.to_csv(trial_df_file, index=False)
        trial_dfs[trial_id] = trial_df
    if not trial_dfs:
        raise ValueError('No trial files found in directory {}'.format(directory))
    # ignore_index gives the same 0..n-1 index that reading the cached CSV
    # yields, instead of repeating each trial's own 0..k index.
    tribulation_df = pd.concat(trial_dfs.values(), ignore_index=True)
    tribulation_df.to_csv(tribulation_file, index=False)
    return tribulation_df

def create_color_tribulation(directory):
    """Load a color-domain tribulation and add the parameters encoded in trial ids.

    Trial ids are parsed as
    ``colors<num_centroids>_s<random_seed>n<dataset_size>k<num_colors>``.

    Args:
        directory: trial directory, passed to create_tribulation.

    Returns:
        The tribulation dataframe with four extra columns: ``random_seed``
        (left as the matched string) and ``num_centroids``, ``dataset_size``,
        ``num_colors`` (converted to int).

    Raises:
        ValueError: if any trial id does not match the naming scheme.
    """
    df = create_tribulation(directory)
    # compile once instead of re-resolving the pattern for every row
    pattern = re.compile(
        'colors(?P<num_centroids>[0-9]*)_'
        's(?P<random_seed>[0-9.]*)'
        'n(?P<dataset_size>[0-9]*)'
        'k(?P<num_colors>[0-9]*)'
    )
    # kept as a separate Series so no temporary column is added to df
    matches = df['trial_id'].apply(pattern.match)
    unmatched = df.loc[matches.isnull(), 'trial_id'].unique()
    if len(unmatched):
        raise ValueError('Trial ids do not match the color naming scheme: {}'.format(
            ', '.join(str(trial_id) for trial_id in unmatched)
        ))
    for attr in ['random_seed', 'num_centroids', 'dataset_size', 'num_colors']:
        values = matches.apply(lambda match: match.group(attr))
        if attr != 'random_seed':
            values = values.astype(int)
        df[attr] = values
    return df

def create_image_tribulation(directory):
    """Load an image-domain tribulation and add classifier parameters.

    The part of each trial id before the underscore names the classifier; its
    network is read from ``<classifier id>.hdf5`` in the trial directory.

    Args:
        directory: trial directory, passed to create_tribulation.

    Returns:
        The tribulation dataframe with ``int_labels``, ``batch_size`` and
        ``num_epochs`` columns copied from each row's NeuralNetwork.
    """
    df = create_tribulation(directory)
    # resolve the path the same way create_tribulation does, so that the
    # .hdf5 files are looked up next to the trial files (e.g. for '~' paths)
    network_dir = realpath(expanduser(directory))
    classifier_ids = df['trial_id'].apply(lambda s: s.split('_')[0])
    # every trial contributes many rows, so build each network once per
    # classifier rather than once per row
    networks = {
        classifier_id: NeuralNetwork(join_path(network_dir, classifier_id + '.hdf5'))
        for classifier_id in classifier_ids.unique()
    }
    df['int_labels'] = classifier_ids.apply(lambda cid: networks[cid].int_labels)
    df['batch_size'] = classifier_ids.apply(lambda cid: networks[cid].batch_size)
    df['num_epochs'] = classifier_ids.apply(lambda cid: networks[cid].num_epochs)
    return df

def plot_tribulation_roc(source):
    """Draw a 400x400 line plot of the ``found`` column against ``searched``.

    NOTE(review): the columns ``searched`` and ``found`` are not among
    TRIAL_COLUMNS -- confirm which data source is meant to provide them.

    Args:
        source: data source handed unchanged to the line glyph.

    Returns:
        The Bokeh figure, with box selection as its only tool.
    """
    figure_options = dict(
        tools=['box_select'],
        width=400,
        height=400,
        x_axis_label='Proportion of All Data Searched',
        y_axis_label='Proportion of Label Data Found',
    )
    fig = figure(**figure_options)
    # selected segments are highlighted in red
    fig.line(source=source, x='searched', y='found', selection_color='red')
    return fig

def plot_tribulation_regret(source):
    """Scatter scaled mean regret against initial true-positive rate.

    Args:
        source: a ColumnDataSource, or anything ColumnDataSource accepts
            (it is wrapped when it is not already one). It must provide the
            ``trial_id``, ``true_positive_init`` and ``mean_regret_scaled``
            columns.

    Returns:
        A 400x400 Bokeh figure with box-select and hover tools; both axes
        span 0 to 1.1.
    """
    data = source if isinstance(source, ColumnDataSource) else ColumnDataSource(source)
    fig = figure(
        tools=['box_select'],
        width=400, height=400,
        x_axis_label='Classifier Test Accuracy',
        x_range=[0, 1.1],
        y_axis_label='Regret (lower is better)',
        y_range=[0, 1.1],
    )
    markers = fig.x(
        source=data,
        x='true_positive_init',
        y='mean_regret_scaled',
        selection_color='red',
    )
    # restrict hovering to the scatter markers
    hover = HoverTool(
        renderers=[markers],
        tooltips=[
            ('trial', '@trial_id'),
            ('accuracy', '@true_positive_init'),
            ('regret', '@mean_regret_scaled'),
        ],
    )
    fig.add_tools(hover)
    return fig

def plot_tribulation(df):
    """Lay out the ROC and regret plots of a tribulation side by side.

    Both figures are built on one ColumnDataSource created from ``df``.

    Args:
        df: tribulation data accepted by ColumnDataSource.

    Returns:
        A one-row Bokeh gridplot: ROC figure first, regret figure second.
    """
    shared_source = ColumnDataSource(df)
    return gridplot([[
        plot_tribulation_roc(shared_source),
        plot_tribulation_regret(shared_source),
    ]])

## Color Domain

### Load Data

In [None]:
# Preview the first rows of the color-domain tribulation dataframe.
create_color_tribulation('colors').head()

### Plot Regret

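    A grid of subplots, one per combination of
        num_centroids: 10, 20, 50, 100 (rows, largest at the top)
        num_new_labels: 20, 50, 100, 200 (columns; matched against the `num_colors` column)
        with the constraint that num_colors > num_centroids
    Within each subplot, the x-axis is dataset_size (log scale): 1000, 10000, 100000, 1000000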

In [None]:
def plot_color_regret():
    """Plot scaled mean regret against dataset size for the color domain.

    Loads the ``colors`` tribulation, averages ``mean_regret_scaled`` over
    trials that share ``num_centroids``, ``dataset_size`` and ``num_colors``,
    and draws one subplot per (num_centroids, num_colors) combination.
    Combinations where the number of colors does not exceed the number of
    centroids are left empty.

    Returns:
        A Bokeh gridplot with one row per num_centroids value (largest first)
        and one column per num_colors value.
    """
    # load data
    df = create_color_tribulation('colors')
    group_columns = ['num_centroids', 'dataset_size', 'num_colors']
    # reduce to one row per trial, then average the regret over the trials in
    # each group. Selecting the regret column before mean() keeps the string
    # columns (trial_id, random_seed) out of the aggregation.
    per_trial = df[
        ['trial_id', 'random_seed'] + group_columns + ['mean_regret_scaled']
    ].drop_duplicates()
    plot_df = (
        per_trial
        .groupby(group_columns)['mean_regret_scaled']
        .mean()
        .reset_index()
    )
    plot_df = plot_df.astype({column: int for column in group_columns})
    # build grid plot; every subplot is a filtered view of one shared source
    grid = []
    source = ColumnDataSource(plot_df)
    for num_centroids in reversed([10, 20, 50, 100]):
        grid_row = []
        for num_new_labels in [20, 50, 100, 200]:
            if num_new_labels <= num_centroids:
                grid_row.append(None)
                continue
            # vectorized row selection, passed to Bokeh as a plain list of bools
            selected = (
                (plot_df['num_centroids'] == num_centroids)
                & (plot_df['num_colors'] == num_new_labels)
            )
            fig = figure(
                width=200, height=200,
                title='{} + {} = {}'.format(num_centroids, num_new_labels - num_centroids, num_new_labels),
                x_axis_type='log',
                x_range=[90, 1100000],
                y_range=[0, 0.025],
            )
            fig.x(
                x='dataset_size',
                y='mean_regret_scaled',
                source=source,
                view=CDSView(
                    source=source,
                    filters=[BooleanFilter(selected.tolist())],
                ),
            )
            grid_row.append(fig)
        grid.append(grid_row)
    return gridplot(grid)

# Render the color-domain regret grid in the notebook.
show(plot_color_regret())

## CIFAR-10, Three Labels Complete Sweep

### Load Data

In [None]:
# Preview the first rows of the cifar10-threes tribulation dataframe.
create_image_tribulation('cifar10-threes').head()

### Plot Regret

In [None]:
# Scatter scaled mean regret against initial true-positive rate for the
# cifar10-threes trials.
show(plot_tribulation_regret(create_image_tribulation('cifar10-threes')))

## CIFAR-100 Pilot

### Load Data

In [None]:
# Preview the first rows of the cifar100-pilot tribulation dataframe.
create_image_tribulation('cifar100-pilot').head()

### Plot Regret

In [None]:
# Scatter scaled mean regret against initial true-positive rate for the
# cifar100-pilot trials.
show(plot_tribulation_regret(create_image_tribulation('cifar100-pilot')))

## CIFAR-100 Orthogonal Exploration

In [None]:
# Preview the first rows of the cifar100-orthog tribulation dataframe.
create_image_tribulation('cifar100-orthog').head()

In [None]:
def plot_cifar100_orthog(directory='cifar100-orthog'):
    """Show regret plots for the two one-parameter sweeps of the orthogonal study.

    Two rows of plots are shown, each with its own ``show`` call:

    1. number of labels varied over 5, 10, ..., 25 at 200 epochs;
    2. number of epochs varied over 200, 400, 600 at 10 labels.

    Trials are selected by exact comparison on the ``int_labels`` and
    ``num_epochs`` columns that create_image_tribulation attaches to every
    row. A combination without any trials yields an empty plot.

    Args:
        directory: trial directory to load; defaults to ``cifar100-orthog``.

    Returns:
        None. The plots are displayed as a side effect.
    """
    tribulation = create_image_tribulation(directory)
    labels = list(range(5, 30, 5))
    epochs = list(range(200, 800, 200))
    # value held fixed while the other parameter is swept
    default_labels = 10
    default_epoch = 200

    # per-row classifier parameters, computed once for all plots
    num_labels_per_row = tribulation['int_labels'].apply(len)
    num_epochs_per_row = tribulation['num_epochs']

    def regret_plot(num_labels, num_epochs):
        # Regret plot for the trials of one (num_labels, num_epochs) setting.
        selected = (
            (num_labels_per_row == num_labels)
            & (num_epochs_per_row == num_epochs)
        )
        return plot_tribulation_regret(tribulation[selected])

    label_plots = [regret_plot(num_labels, default_epoch) for num_labels in sorted(labels)]
    show(gridplot([label_plots]))

    epoch_plots = [regret_plot(default_labels, num_epochs) for num_epochs in sorted(epochs)]
    show(gridplot([epoch_plots]))

# Display the label-count sweep and the epoch sweep for cifar100-orthog.
plot_cifar100_orthog()