### Worst-Case Adversary Analysis

In [106]:
import os
from itertools import cycle
from copy import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import cufflinks as cf
import plotly.offline

from IPython.display import display, HTML
from ipywidgets import interactive_output, HBox, VBox, Layout 
from ipywidgets import Dropdown, Checkbox, ToggleButtons, SelectionRangeSlider, SelectionSlider
import ipywidgets as widgets

# NOTE(review): presumably switches cufflinks/plotly to offline rendering so
# no plotly account is needed -- confirm against the cufflinks docs.
cf.go_offline()
# Raise the pandas display limit so wide frames show up to 100 columns.
pd.set_option('display.max_columns', 100)

# Root directory of the saved experiment results; passed to `get_results`
# as `in_dir` by the plotting callback below.
in_dir = '../../temp_worst_adversary/'

def get_results(in_dir, dataset='surgical', operation='delete',
                model_type='forest', criterion='gini', random_state=1,
                n_estimators=250, max_depth=20, subsample_size=1,
                topd=0, min_support=2, keep_frac=0.0):
    """Load one experiment's `results.npy` as a DataFrame with derived columns.

    The file is looked up under `in_dir` in a nested directory whose levels
    are built from the experiment settings, in this order: dataset,
    operation, model type, criterion, `rs_<random_state>`,
    `trees_<n_estimators>`, `depth_<max_depth>`, `sub_<subsample_size>`,
    `keep_<keep_frac>`, `topd_<topd>`, `support_<min_support>`.

    Returns:
        A DataFrame built from the stored records, extended with
        `cum_*` (running total) and `avg_*` (50-row rolling mean) versions
        of `search_time`, `delete_time` and `sample_cost`; or None when the
        results file does not exist.
    """
    path_parts = [
        in_dir, dataset, operation, model_type, criterion,
        'rs_{}'.format(random_state),
        'trees_{}'.format(n_estimators),
        'depth_{}'.format(max_depth),
        'sub_{}'.format(subsample_size),
        'keep_{}'.format(keep_frac),
        'topd_{}'.format(topd),
        'support_{}'.format(min_support),
        'results.npy',
    ]
    results_path = os.path.join(*path_parts)

    # Missing experiment: signal it with None so callers can skip it.
    if not os.path.exists(results_path):
        return None

    # The file holds pickled Python objects; `[()]` unwraps a 0-d object
    # array if that is how the records were saved.
    records = list(np.load(results_path, allow_pickle=True)[()])
    frame = pd.DataFrame(records)

    metrics = ('search_time', 'delete_time', 'sample_cost')

    # Running totals first, then rolling means, to keep the column order
    # cum_* before avg_*.
    for metric in metrics:
        frame['cum_' + metric] = frame[metric].cumsum()
    for metric in metrics:
        # With a fixed window of 50 the first 49 rows are NaN.
        frame['avg_' + metric] = frame[metric].rolling(window=50).mean()

    return frame

In [108]:
def f(dataset, n_estimators, max_depth, topd, keep_frac, log_scale, analysis):
    """Plot deletion time, sample cost and search time per subsample size.

    Draws a 1x3 figure with one line per subsample size in
    [1, 10, 100, 1000] for which `get_results` finds a results file;
    missing configurations are skipped.

    Args:
        dataset, n_estimators, max_depth, topd, keep_frac: forwarded to
            `get_results` to select the experiment.
        log_scale: when truthy, every panel uses a logarithmic y-axis.
        analysis: 'cumulative' plots the `cum_*` columns, 'average' the
            `avg_*` columns; any other value plots the raw columns.
    """
    # Column-name prefix selecting the raw, cumulative or averaged series.
    prefix = {'cumulative': 'cum_', 'average': 'avg_'}.get(analysis, '')

    # (data column suffix, y-label suffix, title) for each of the panels.
    panels = [('delete_time', 'deletion_time (s)', 'Deletion Time'),
              ('sample_cost', 'sample_cost (#)', 'Sample Cost'),
              ('search_time', 'search_time (s)', 'Search')]

    _, axs = plt.subplots(1, 3, figsize=(15, 5))

    has_data = False
    for sub_size in [1, 10, 100, 1000]:
        df = get_results(in_dir=in_dir, dataset=dataset, n_estimators=n_estimators,
                         max_depth=max_depth, topd=topd, subsample_size=sub_size,
                         keep_frac=keep_frac)

        if df is None:
            continue

        has_data = True
        line_label = 'sub_size={}'.format(sub_size)
        for ax, (column, _, _) in zip(axs, panels):
            ax.plot(df.index, df[prefix + column], label=line_label)

    # Axis decoration does not depend on the data, so apply it once here
    # rather than on every loop iteration.
    for ax, (_, ylabel, title) in zip(axs, panels):
        ax.set_xlabel('no. samples')
        ax.set_ylabel(prefix + ylabel)
        ax.set_title(title)
        if log_scale:
            ax.set_yscale('log')

    # Only the first panel carries the legend; skip it when nothing was
    # plotted so matplotlib does not warn about an empty legend.
    if has_data:
        axs[0].legend()

    plt.tight_layout()
    plt.show()

# Per-dataset defaults as (n_estimators, max_depth); the keys also provide
# the dropdown entries, in this order.
dataset_dict = {
    'surgical': (250, 20),
    'vaccine': (250, 10),
    'adult': (10, 20),
    'bank_marketing': (100, 10),
    'flight_delays': (250, 20),
    'diabetes': (250, 20),
    'olympics': (250, 20),
    'census': (250, 20),
    'credit_card': (250, 20),
    'synthetic': (250, 20),
    'higgs': (100, 10),
}

# Input controls.
dataset = Dropdown(description='Dataset', options=list(dataset_dict))
n_estimators = SelectionSlider(description='No. trees', options=[10, 100, 250], value=250)
max_depth = SelectionSlider(description='Max depth', options=[1, 3, 5, 10, 20], value=20)
subsample_size = SelectionRangeSlider(description='Sample size',
                                      options=[1, 10, 100, 1000],
                                      index=(0, 0))
topd = SelectionSlider(description='Topd', options=list(range(20)), value=0)
keep_frac = SelectionSlider(description='Keep frac', options=[0.0, 0.1], value=0.0)
log_scale = Checkbox(description='Log scale', value=False)
analysis = SelectionSlider(description='Analysis',
                           options=['raw', 'cumulative', 'average'],
                           value='raw')

# Layout: three columns of controls placed side by side.
box_1 = VBox(children=[dataset, n_estimators, max_depth])
box_2 = VBox(children=[topd, keep_frac])
box_3 = VBox(children=[analysis, log_scale])
ui = HBox(children=[box_1, box_2, box_3])

def update_parameters(*args):
    """Set the tree-count and depth sliders to the selected dataset's defaults.

    Accepts and ignores any positional arguments so it can be registered
    as a widget observer.
    """
    default_trees, default_depth = dataset_dict[dataset.value]
    n_estimators.value = default_trees
    max_depth.value = default_depth

# Refresh the dependent sliders whenever the selected dataset changes.
dataset.observe(update_parameters, names='value')

# Bind each control to the plotting callback's keyword argument of the
# same name, then show the controls followed by the plot output.
out = interactive_output(
    f,
    dict(dataset=dataset,
         n_estimators=n_estimators,
         max_depth=max_depth,
         topd=topd,
         keep_frac=keep_frac,
         log_scale=log_scale,
         analysis=analysis),
)
display(ui, out)

HBox(children=(VBox(children=(Dropdown(description='Dataset', options=('surgical', 'vaccine', 'adult', 'bank_m…

Output()