### Amortized Deletion Experiments

In [3]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import cufflinks as cf
import plotly.offline

from IPython.display import display, HTML
from ipywidgets import interactive_output, HBox, VBox, Layout 
from ipywidgets import Dropdown, Checkbox, ToggleButtons, SelectionRangeSlider, SelectionSlider
import ipywidgets as widgets

# Notebook-wide display configuration: put cufflinks in offline mode and let
# pandas show up to 100 columns so the wide results table is not truncated.
cf.go_offline()
pd.set_option('display.max_columns', 100)

# The experiment name selects which results file is loaded.
experiment = 'deletion'
data_dir = '../../'

results_path = os.path.join(data_dir, 'results_{}.csv'.format(experiment))
df = pd.read_csv(results_path)

# Last expression of the cell: previews the first rows of the table.
df.head()

Unnamed: 0,dataset,model_type,criterion,adversary,rs,n_estimators,max_depth,lmbda,topd,min_support,epsilon,method,train_time,amortized,amortized_worst_case,speedup_vs_naive,auc,acc,bacc,ap,auc_diff_avg,auc_diff_std,acc_diff_avg,acc_diff_std,bacc_diff_avg,bacc_diff_std,ap_diff_avg,ap_diff_std,num_retrains,avg_retrain_depth,percent_complete,n_nodes_avg,n_exact_avg,n_semi_avg
0,surgical,forest,gini,random,1,100,1,-1.0,-1,-1,0.0,naive,0.481645,0.386931,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,1.0,-1.0,-1.0,-1.0
1,surgical,forest,gini,random,1,100,1,-1.0,-1,-1,0.0,exact,0.230117,0.002833,0.004777,136.603896,0.764379,0.749915,0.5,0.508783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0.0,1.0,-1.0,-1.0,-1.0
2,surgical,forest,gini,random,1,100,1,1e-08,1,2500,0.01,cedar,0.211897,0.003348,0.005103,115.579381,0.782852,0.749915,0.5,0.557423,-0.018108,6.7e-05,0.0,0.0,0.0,0.0,-0.047766,0.000158,0,-1.0,1.0,3.0,0.0,1.0
3,surgical,forest,gini,random,1,100,1,0.0001,1,2500,0.01,cedar,0.213165,0.012069,0.016279,32.060301,0.782636,0.749915,0.5,0.559613,0.000663,0.005868,0.0,0.0,0.0,0.0,0.005265,0.014775,3574,0.0,1.0,3.0,0.0,1.0
4,surgical,forest,gini,random,1,100,1,0.1,1,2500,0.01,cedar,0.184732,0.016124,0.01852,23.996838,0.764974,0.749915,0.5,0.508851,0.001567,0.000667,0.0,0.0,0.0,0.0,0.007086,0.003446,5785,0.0,1.0,3.0,0.0,1.0


In [4]:
# ---- plot controls: axis choice, grouping column and baseline overlays ----
x = Dropdown(
    description='x',
    options=['topd', 'max_depth'],
)
x_scale = Checkbox(description='log scale (x-axis)', value=False)
metric = SelectionSlider(
    description='Metric',
    options=['auc', 'acc', 'bacc', 'ap'],
    value='auc',
)
categories = Dropdown(
    description='Categories',
    options=['lmbda', 'topd', 'max_depth'],
)
naive = Checkbox(description='naive', value=True)
exact = Checkbox(description='exact (lambda=-1)', value=True)
retrain_depth = Checkbox(description='retrain depths', value=True)

# ---- experiment selection ----
dataset = Dropdown(
    description='Dataset',
    options=[
        'surgical', 'adult', 'bank_marketing', 'flight_delays', 'diabetes',
        'olympics', 'census', 'credit_card', 'synthetic', 'higgs',
    ],
)
criterion = ToggleButtons(
    description='Criterion',
    options=['gini', 'entropy'],
)
adversary = ToggleButtons(
    description='Adversary',
    options=['random', 'root'],
    value='random',
)
rs = SelectionSlider(description='rs', options=[1])

# ---- hyperparameter selection ----
# For the range sliders, `index` holds (low, high) positions into `options`,
# not the option values themselves.
trees = SelectionSlider(
    description='No. trees',
    options=[10, 100, 250, 500],
    value=100,
)
depth = SelectionRangeSlider(
    description='Max depth',
    options=[1, 3, 5, 10, 20],
    index=(4, 4),
)
topd = SelectionRangeSlider(
    description='Top d',
    options=[1, 2, 3, 4, 5],
    index=(0, 4),
)
min_support = SelectionSlider(
    description='Min Support',
    options=[2500],
    value=2500,
)
lmbda = SelectionRangeSlider(
    description='Lambda',
    options=[1e-10, 1e-8, 1e-6, 1e-4, 1e-1],
    index=(1, 1),
)
epsilon = SelectionRangeSlider(
    description='Epsilon',
    options=[0.01, 0.1, 1.0],
    index=(2, 2),
)

# ---- control panel layout ----
# Three columns, shown left to right as: experiment, hyperparameters, plot.
box_1 = VBox([x, categories, x_scale, naive, exact, retrain_depth])
box_2 = VBox([dataset, criterion, adversary, metric, rs])
box_3 = VBox([trees, depth, topd, min_support, lmbda, epsilon])
ui = HBox([box_2, box_3, box_1])

# plot graphs
def f(x, x_scale, metric, categories,
      dataset, criterion, adversary, rs,
      trees, depth, topd, min_support, lmbda, epsilon,
      naive, exact, retrain_depth):
    """Filter the module-level results table `df` and draw a 2x3 grid of plots.

    The six panels show, against the column named by `x`: 'amortized',
    'speedup_vs_naive', the chosen metric, its '<metric>_diff_avg',
    'percent_complete' (or 'avg_retrain_depth') and 'num_retrains'.
    One line is drawn per distinct value of the `categories` column.
    If no row survives the filters, the current figure is cleared instead.

    Parameters
    ----------
    x : str
        Column plotted on the x-axis of every panel.
    x_scale : bool
        Use a logarithmic x-axis on every panel.
    metric : str
        Utility metric column; '<metric>_diff_avg' is plotted as well.
    categories : str
        Column whose distinct values each get their own line.
    dataset, criterion, adversary, rs
        Equality filters on the columns of the same name.
    trees
        Equality filter on 'n_estimators'.
    depth, topd, lmbda, epsilon : tuple
        Inclusive (low, high) ranges on 'max_depth', 'topd', 'lmbda'
        and 'epsilon' respectively.
    min_support
        Equality filter on 'min_support'.
    naive, exact : bool
        Overlay the rows whose 'method' is 'naive' / 'exact' as baselines.
    retrain_depth : bool
        Show 'avg_retrain_depth' in the fifth panel instead of
        'percent_complete'.
    """
    # Filters that apply to every method, baselines included.  The random
    # state belongs here so the baselines match the selected `rs` as well.
    shared = df[
        (df['dataset'] == dataset)
        & (df['criterion'] == criterion)
        & (df['adversary'] == adversary)
        & (df['rs'] == rs)
        & (df['n_estimators'] == trees)
        & (df['max_depth'] >= depth[0])
        & (df['max_depth'] <= depth[1])
    ]

    # Baselines are split off before the remaining hyperparameter filters
    # (topd, min_support, lmbda, epsilon) are applied to the other rows.
    naive_df = shared[shared['method'] == 'naive']
    exact_df = shared[shared['method'] == 'exact']

    temp = shared[
        (shared['topd'] >= topd[0])
        & (shared['topd'] <= topd[1])
        & (shared['min_support'] == min_support)
        & (shared['lmbda'] >= lmbda[0])
        & (shared['lmbda'] <= lmbda[1])
        & (shared['epsilon'] >= epsilon[0])
        & (shared['epsilon'] <= epsilon[1])
    ]

    # Nothing to plot: just clear the current figure.
    if temp.empty:
        plt.clf()
        return

    y = ['amortized', 'speedup_vs_naive', metric, '{}_diff_avg'.format(metric),
         'percent_complete', 'num_retrains']
    titles = ['Absolute efficiency (lower -> better)', 'Relative efficiency (higher -> better)',
              'Absolute utility (higher -> better)', 'Relative utility (lower -> better)',
              'Completed deletions', 'Retrains']

    if retrain_depth:
        y[-2] = 'avg_retrain_depth'
        titles[-2] = 'Retrain depth'

    nrows, ncols = 2, 3
    fig = plt.figure(figsize=(15, 4.5), constrained_layout=True)
    gs = gridspec.GridSpec(nrows=nrows, ncols=ncols, figure=fig)
    # Column-major order: axs[0], axs[1] form the first column, and so on.
    axs = [fig.add_subplot(gs[j, i]) for i in range(ncols) for j in range(nrows)]

    for ax, column, title in zip(axs, y, titles):
        ax.set_xlabel(x)
        ax.set_ylabel(column)
        ax.set_title(title)
        if x_scale:
            ax.set_xscale('log')

    colors = ['green', 'orange', 'magenta', 'purple', 'cyan', 'blue']
    alpha = 0.75

    for i, (label, gf) in enumerate(temp.groupby(categories)):
        # Cycle through the palette so more than len(colors) groups do not
        # raise IndexError, and sort by x so each line runs left to right.
        color = colors[i % len(colors)]
        ordered = gf.sort_values(x, kind='mergesort')
        for ax, column in zip(axs, y):
            ax.plot(ordered[x], ordered[column], label=label, color=color,
                    alpha=alpha, marker='o')

    def draw_baseline(frame, axis_indices, label, color, linestyle):
        """Overlay one baseline method on the panels listed in `axis_indices`."""
        for k in axis_indices:
            if x == 'topd':
                # With topd on the x-axis the baseline is drawn as a single
                # horizontal reference line taken from its first row.
                axs[k].axhline(frame.iloc[0][y[k]], label=label, color=color,
                               alpha=alpha, marker='o', linestyle=linestyle)
            else:
                ordered = frame.sort_values(x, kind='mergesort')
                axs[k].plot(ordered[x], ordered[y[k]], label=label, color=color,
                            alpha=alpha, marker='o', linestyle=linestyle)

    # The naive baseline is only drawn on the two efficiency panels.
    if naive and not naive_df.empty:
        draw_baseline(naive_df, [0, 1], 'naive', 'brown', '--')

    # The exact baseline is drawn on every panel.
    if exact and not exact_df.empty:
        draw_baseline(exact_df, range(len(axs)), 'exact', 'black', ':')

    axs[0].legend(title=categories)

# Per-dataset widget defaults, keyed by dataset name.  Each value is
# (number of trees, (low, high) index pair for the depth slider, metric).
dataset_dict = {
    'surgical': (100, (4, 4), 'acc'),
    'adult': (10, (4, 4), 'acc'),
    'bank_marketing': (100, (3, 3), 'auc'),
    'flight_delays': (250, (4, 4), 'auc'),
    'diabetes': (250, (4, 4), 'acc'),
    'olympics': (250, (4, 4), 'auc'),
    'census': (250, (4, 4), 'auc'),
    'credit_card': (250, (4, 4), 'ap'),
    'synthetic': (250, (4, 4), 'acc'),
    'higgs': (100, (3, 3), 'acc'),
}

def update_trees(*args):
    """Apply the selected dataset's defaults to the dependent widgets.

    Sets the tree-count and metric widgets from `dataset_dict`; the depth
    range is only reset while 'topd' is the x-axis.  Positional arguments
    (the change notification passed by `observe`) are ignored.
    """
    n_trees, depth_index, default_metric = dataset_dict[dataset.value]

    trees.value = n_trees
    metric.value = default_metric

    if x.value == 'topd':
        depth.index = depth_index

def update_categories(*args):
    """Adjust the grouping column and depth range to suit the chosen x-axis.

    With 'max_depth' on the x-axis, lines are grouped by 'topd' and the depth
    slider spans all of its options.  With 'topd' on the x-axis, lines are
    grouped by 'lmbda' and the depth slider returns to the dataset's default.
    Positional arguments (the change notification) are ignored.
    """
    axis = x.value

    if axis == 'max_depth':
        categories.value = 'topd'
        depth.index = (0, 4)
        return

    if axis == 'topd':
        categories.value = 'lmbda'
        depth.index = dataset_dict[dataset.value][1]

# Re-sync the dependent widgets whenever the dataset or x-axis choice changes.
dataset.observe(update_trees, names='value')
x.observe(update_categories, names='value')

# Map each keyword argument of `f` to the widget that supplies its value.
out = interactive_output(
    f,
    {
        'x': x,
        'x_scale': x_scale,
        'metric': metric,
        'categories': categories,
        'dataset': dataset,
        'criterion': criterion,
        'adversary': adversary,
        'rs': rs,
        'trees': trees,
        'depth': depth,
        'topd': topd,
        'min_support': min_support,
        'lmbda': lmbda,
        'epsilon': epsilon,
        'naive': naive,
        'exact': exact,
        'retrain_depth': retrain_depth,
    },
)
display(ui, out)

HBox(children=(VBox(children=(Dropdown(description='Dataset', options=('surgical', 'adult', 'bank_marketing', …

Output()