In [None]:
import re
from collections import namedtuple
from datetime import datetime
from os import listdir
from os.path import join as join_path, exists as file_exists, realpath, expanduser, basename, splitext

import numpy as np
import pandas as pd
from bokeh.models import HoverTool, CDSView, BooleanFilter
from bokeh.plotting import figure, ColumnDataSource
from bokeh.layouts import gridplot
from bokeh.resources import INLINE
from bokeh.io import output_notebook, show

from classifiers import bucket_alist
from colors import from_file as color_from_file
from images import from_file as image_from_file, NeuralNetwork
from trial_dataframe import create_tribulation

# Load BokehJS into the notebook so that later show() calls render in the
# cell output. INLINE resources embed the JavaScript in the notebook rather
# than fetching it from a CDN.
output_notebook(resources=INLINE)

## Functions

In [None]:
def create_color_tribulation(directory):
    """Load the color-domain trials in `directory` and add their parameters as columns.

    The experiment parameters are parsed out of each row's `trial_id`, which
    must start with `colors<num_centroids>_s<random_seed>n<dataset_size>k<num_colors>`.

    Arguments:
        directory: Passed unchanged to create_tribulation.

    Returns:
        The DataFrame from create_tribulation with four added columns:
        random_seed (kept as the matched string), and num_centroids,
        dataset_size and num_colors (cast to int).

    Raises:
        ValueError: If any trial_id does not match the expected pattern.
    """
    df = create_tribulation(directory)
    pattern = re.compile(
        'colors(?P<num_centroids>[0-9]*)_'
        's(?P<random_seed>[0-9.]*)'
        'n(?P<dataset_size>[0-9]*)'
        'k(?P<num_colors>[0-9]*)'
    )
    # Keep the match objects in a local Series rather than in a scratch
    # column of df, so that an existing column is never overwritten/deleted.
    matches = df['trial_id'].apply(pattern.match)
    unparsed = df.loc[matches.isna(), 'trial_id']
    if not unparsed.empty:
        # Fail with the offending ids instead of an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(
            'trial_id values do not match the color trial pattern: {}'.format(
                sorted(set(unparsed))
            )
        )
    for attr in ['random_seed', 'num_centroids', 'dataset_size', 'num_colors']:
        df[attr] = matches.apply(lambda match, attr=attr: match.group(attr))
        if attr != 'random_seed':
            df[attr] = df[attr].astype(int)
    return df

def create_image_tribulation(directory):
    """Load the image-domain trials in `directory` and add network metadata columns.

    The part of each `trial_id` before the first underscore names a saved
    network, which is read from `<directory>/<name>.hdf5`.

    Arguments:
        directory: Passed to create_tribulation and used as the location of
            the .hdf5 network files.

    Returns:
        The DataFrame from create_tribulation with three added columns:
        int_labels (str() of the network's int_labels), batch_size (int)
        and num_epochs (int).
    """
    df = create_tribulation(directory)
    network_names = df['trial_id'].apply(lambda s: s.split('_')[0])
    # Rows that share a network name point at the same file, so load each
    # network once instead of once per row. NeuralNetwork comes from the
    # notebook's top-level imports.
    metadata = {}
    for name in network_names.unique():
        network = NeuralNetwork(join_path(directory, name + '.hdf5'))
        metadata[name] = {
            'int_labels': str(network.int_labels),
            'batch_size': int(network.batch_size),
            'num_epochs': int(network.num_epochs),
        }
    for column in ['int_labels', 'batch_size', 'num_epochs']:
        df[column] = network_names.apply(
            lambda name, column=column: metadata[name][column]
        )
    return df

def filter_improvements(df):
    """Keep only the rows from epochs at which test accuracy improved.

    For every distinct `int_labels` value, the (num_epochs,
    true_positive_init) pairs are walked in order of increasing num_epochs.
    The earliest epoch is always kept; a later epoch is kept only when its
    true_positive_init is strictly greater than every value seen so far
    (starting from a baseline of 0).

    Arguments:
        df: DataFrame with at least the columns int_labels, num_epochs and
            true_positive_init.

    Returns:
        A DataFrame with the rows of `df` (original index preserved) that
        belong to a kept epoch, grouped by int_labels in order of first
        appearance. An empty input yields an empty frame with the same
        columns.
    """
    kept = []
    for int_labels in df['int_labels'].unique():
        is_label = df['int_labels'] == int_labels
        improvement = (
            df.loc[is_label, ['num_epochs', 'true_positive_init']]
            .drop_duplicates()
            .sort_values('num_epochs')
        )
        # the first recorded epoch is kept even if its accuracy is 0
        epochs = {improvement.iloc[0]['num_epochs']}
        last_accuracy = 0
        for row in improvement.itertuples():
            if row.true_positive_init > last_accuracy:
                epochs.add(row.num_epochs)
                last_accuracy = row.true_positive_init
        kept.append(df[is_label & df['num_epochs'].isin(epochs)])
    if not kept:
        # pd.concat([]) raises; return an empty frame that keeps the schema
        return df.iloc[0:0].copy()
    # Concatenate once: avoids quadratic copying and avoids seeding the
    # result with an empty DataFrame, which pandas deprecates in concat.
    return pd.concat(kept)

def plot_tribulation_roc(source):
    """Draw the search curve: label data found against all data searched.

    Arguments:
        source: Data source handed to the line glyph; it must provide the
            columns `searched` and `found`.

    Returns:
        A 400x400 figure with a box-select tool and a single line whose
        selected points are drawn in red.
    """
    figure_options = dict(
        width=400, height=400,
        x_axis_label='Proportion of All Data Searched',
        y_axis_label='Proportion of Label Data Found',
        tools=['box_select'],
    )
    line_options = dict(
        x='searched',
        y='found',
        selection_color='red',
        source=source,
    )
    plot = figure(**figure_options)
    plot.line(**line_options)
    return plot

def plot_tribulation_regret(source):
    """Scatter scaled mean regret against classifier test accuracy.

    Arguments:
        source: Either a ColumnDataSource or anything ColumnDataSource can
            be built from. It must provide the columns trial_id,
            true_positive_init and mean_regret_scaled.

    Returns:
        A 400x400 figure with both axes fixed to [0, 1.1], a box-select
        tool, "x" markers (red when selected) and a hover tool showing the
        trial id, accuracy and regret of each marker.
    """
    # wrap raw data so the glyph and the hover tool read from one source
    data = source if isinstance(source, ColumnDataSource) else ColumnDataSource(source)

    plot = figure(
        width=400, height=400,
        x_range=[0, 1.1],
        x_axis_label='Classifier Test Accuracy',
        y_range=[0, 1.1],
        y_axis_label='Regret (lower is better)',
        tools=['box_select'],
    )
    markers = plot.x(
        x='true_positive_init',
        y='mean_regret_scaled',
        selection_color='red',
        source=data,
    )

    tooltip_rows = [
        ('trial', '@trial_id'),
        ('accuracy', '@true_positive_init'),
        ('regret', '@mean_regret_scaled'),
    ]
    hover = HoverTool(renderers=[markers], tooltips=tooltip_rows)
    plot.add_tools(hover)
    return plot

def plot_tribulation(df):
    """Lay out the search-curve and regret plots for `df` in one row.

    Both plots are built on the same ColumnDataSource, created here from
    `df`, so they share one selection state.

    Arguments:
        df: Data accepted by ColumnDataSource, holding the columns that
            plot_tribulation_roc and plot_tribulation_regret read.

    Returns:
        The gridplot layout containing [search-curve figure, regret figure].
    """
    shared_source = ColumnDataSource(df)
    panels = [
        plot_tribulation_roc(shared_source),
        plot_tribulation_regret(shared_source),
    ]
    return gridplot([panels])

## Color Domain

### Regret vs. Data Size

In [None]:
def plot_color_regret():
    """Build a grid of regret-vs-dataset-size plots for the color domain.

    Trials are read with create_color_tribulation('colors'). The scaled
    mean regret is averaged over all trials that share the same
    (num_centroids, dataset_size, num_colors) combination.

    The grid has one row per centroid count (100, 50, 20, 10 from top to
    bottom) and one column per total number of colors (20, 50, 100, 200).
    A cell is left empty (None) when the number of colors does not exceed
    the number of centroids. Each panel is titled
    "<centroids> + <added colors> = <total colors>".

    Returns:
        The gridplot layout holding the panels.
    """
    # load data
    df = create_color_tribulation('colors')
    # group by independent variables
    group_columns = ['num_centroids', 'dataset_size', 'num_colors']
    plot_df = (
        df[['trial_id', 'random_seed'] + group_columns + ['mean_regret_scaled']]
        .drop_duplicates()
        # Select the regret column BEFORE aggregating: trial_id and
        # random_seed are strings, and averaging them raises TypeError on
        # pandas >= 2.0.
        .groupby(group_columns)['mean_regret_scaled']
        .mean()
        .reset_index()
        .astype({column: int for column in group_columns})
    )
    # build grid plot
    grid = []
    source = ColumnDataSource(plot_df)
    for num_centroids in reversed([10, 20, 50, 100]):
        row = []
        for num_new_labels in [20, 50, 100, 200]:
            if num_new_labels <= num_centroids:
                # no colors would be added on top of the centroids
                row.append(None)
                continue
            fig = figure(
                width=200, height=200,
                title='{} + {} = {}'.format(num_centroids, num_new_labels - num_centroids, num_new_labels),
                x_axis_type='log',
                x_range=[90, 1100000],
                y_range=[0, 0.025],
            )
            # all panels share one source; each shows only its own rows
            in_panel = (
                (plot_df['num_centroids'] == num_centroids)
                & (plot_df['num_colors'] == num_new_labels)
            )
            fig.x(
                x='dataset_size',
                y='mean_regret_scaled',
                source=source,
                view=CDSView(
                    source=source,
                    filters=[BooleanFilter(in_panel.tolist())],
                ),
            )
            row.append(fig)
        grid.append(row)
    return gridplot(grid)

# Render the color-domain regret grid in the cell output.
show(plot_color_regret())

## CIFAR-10, Three Labels Complete Sweep with History

### Regret vs. Accuracy

In [None]:
# Regret vs. classifier accuracy for the 'cifar10-threes-history-new' trials,
# restricted by filter_improvements to the epochs at which accuracy improved.
show(plot_tribulation_regret(filter_improvements(create_image_tribulation('cifar10-threes-history-new'))))

In [None]:
# Pairwise correlations between the numeric columns of the filtered trials.
# The frame also holds string columns (e.g. trial_id, int_labels); they are
# dropped explicitly because DataFrame.corr() no longer skips them silently
# on pandas >= 2.0.
(
    filter_improvements(create_image_tribulation('cifar10-threes-history-new'))
    .select_dtypes(include=['number', 'bool'])
    .corr()
)

### Accuracy and Regret During Training

In [None]:
def _plot_metric_over_training(metric, title, y_axis_label):
    """Plot one metric against training epoch for the CIFAR-10 history trials.

    Trials are read with create_image_tribulation('cifar10-threes-history-new')
    and reduced with filter_improvements. One default-styled line is drawn
    per distinct int_labels value, plus a thick red line with the mean of
    `metric` at each epoch.

    Arguments:
        metric: Name of the column plotted on the y axis.
        title: Figure title.
        y_axis_label: Label of the y axis.

    Returns:
        A 400x400 figure with the y axis fixed to [0, 1.05].
    """
    df = create_image_tribulation('cifar10-threes-history-new')
    df = filter_improvements(df)
    fig = figure(
        width=400, height=400,
        title=title,
        x_axis_label='Epoch',
        y_axis_label=y_axis_label,
        y_range=[0, 1.05],
    )
    # one line per label combination
    for int_labels in df['int_labels'].unique():
        trial_df = df[df['int_labels'] == int_labels]
        trial_df = trial_df[['num_epochs', metric]].drop_duplicates()
        fig.line(
            x='num_epochs',
            y=metric,
            source=ColumnDataSource(trial_df),
        )
    # Mean over label combinations at each epoch. The string 'mean' is used
    # instead of np.mean, which pandas >= 2.1 warns about when passed as an
    # aggregation callable.
    mean_df = (
        df[['int_labels', 'num_epochs', metric]]
        .drop_duplicates()
        .pivot_table(
            index='num_epochs',
            values=metric,
            aggfunc='mean',
        )
    )
    fig.line(
        x='num_epochs',
        y=metric,
        color='#CC0000',
        line_width=5,
        source=ColumnDataSource(mean_df),
    )
    return fig


def plot_accuracy_over_training():
    """Plot true_positive_init (test accuracy) against training epoch.

    Returns:
        The figure titled 'Accuracy During Training'; see
        _plot_metric_over_training for its contents.
    """
    return _plot_metric_over_training(
        metric='true_positive_init',
        title='Accuracy During Training',
        y_axis_label='Accuracy',
    )


def plot_regret_over_training():
    """Plot mean_regret_scaled against training epoch.

    Returns:
        The figure titled 'Regret During Training'; see
        _plot_metric_over_training for its contents.
    """
    return _plot_metric_over_training(
        metric='mean_regret_scaled',
        title='Regret During Training',
        y_axis_label='Regret',
    )

# Show the accuracy and regret training curves side by side in one row.
show(gridplot([[plot_accuracy_over_training(), plot_regret_over_training()]]))

## CIFAR-100 Pilot

### Preview Data

### Plot Regret

## CIFAR-100 Orthogonal Exploration