# prologue

### set up notebook and load package

In [None]:
# load what we need
import numpy as np
import CHIRPS.datasets as ds
import CHIRPS.datasets_proprietary as dsp
import CHIRPS.reproducible as rp
import multiprocessing as mp

## Sensitivity Analysis

In [None]:
# Optional memory and computation cost management.
# CHIRPS is time economical but memory intensive when many instances are
# explained at once, so both parallel stages can be switched off here.
forest_walk_async = True
explanation_async = True

# Leave two cores free for the rest of the system, but never request fewer
# than one worker: cpu_count() - 2 is zero or negative on machines with
# one or two cores.
n_cores = max(1, mp.cpu_count() - 2)

# How many instances from the test set do you want to explain?
# A number bigger than the test set will be interpreted as 'all'.
n_instances = 100
# Optionally start at a specific instance (useful for diagnosis if something crashes).
start_instance = 0

# Model whose explanations are analysed. Recognised values:
# model = 'RandomForest'
# model = 'AdaBoost1'
# model = 'AdaBoost2'
model = 'GBM'

# Data sets to run the analysis on; each entry is an attribute of
# CHIRPS.datasets (imported as ds).
datasets = [
    ds.adult, ds.bankmark, ds.car,
    ds.cardio, ds.credit, ds.german,
    ds.lending_tiny_samp, ds.nursery, ds.rcdv,
]

# Where results are saved. Alternative locations used on other machines:
# project_dir = 'V:\\whiteboxing\\2020\\GBM'
# project_dir = 'C:\\Users\\Crutt\\Documents\\whiteboxing\\2020\\GBM'
project_dir = '/datadisk/whiteboxing/2020/GBM'

# Seeds, kept separate so each source of randomness can be varied on its own.
random_state_splits = 123  # train / test split of the data
random_state_rf = 123      # construction of the forest / ensemble
random_state_exp = 123     # runs of the explainer algorithm (affects bootstrap eval)

verbose = True

# CHIRPS default parameters - see the papers for details.
precis_threshold = 0.99
merging_bootstraps = 20  # training bootstraps used to test improvement when growing a rule
pruning_bootstraps = 20  # training bootstraps used to test deterioration when pruning a rule
delta = 0.2              # pruning deterioration tolerance parameter

# Tuning options; the model-specific 'grid' entry is added further down.
tuning = dict(override=False)

# Sensitivity analysis driver.
# Each branch below
#   1. adds the model-specific hyper-parameter 'grid' to `tuning`,
#   2. prepares the benchmark items with rp.benchmarking_prep, and
#   3. builds `kwargs_grid`: a dict mapping a round number to the CHIRPS
#      keyword arguments used in that round.
# The rounds themselves are executed by the single loop after the branches.
if model == 'RandomForest':
    save_sensitivity_path = 'rf_sensitivity'
    tuning.update({'grid' : {'n_estimators': [(i + 1) * 200 for i in range(8)], 'max_depth' : [32]}})
    benchmark_items = rp.benchmarking_prep(datasets, model, tuning, project_dir,
                                           random_state=random_state_rf,
                                           random_state_splits=random_state_splits,
                                           start_instance=start_instance,
                                           verbose=verbose, n_cores=n_cores)

    # Full factorial design, 3 * 2 * 3 * 2 * 2 = 72 rounds. Every array has
    # length 72 and cycles through its levels with a different period, so each
    # combination of levels occurs exactly once.
    alpha_paths = np.tile([0.9, 0.5, 0.1], 24)
    disc_path_bins = np.tile(np.repeat([4, 8], 3), 12)
    score_func = np.tile(np.repeat([5, 3, 1], 6), 4)
    support_paths = np.tile(np.repeat([0.3, 0.2], 18), 2)
    weighting = np.repeat(['kldiv', 'nothing'], 36)

    kwargs_grid = {k : {'alpha_paths' : ap, 'disc_path_bins' : dpb, 'disc_path_eqcounts' : False,
                        'score_func' : sf, 'weighting' : w, 'support_paths' : sp,
                        'precis_threshold' : precis_threshold,
                        'merging_bootstraps' : merging_bootstraps,
                        'pruning_bootstraps' : pruning_bootstraps,
                        'which_trees' : 'majority',
                        'delta' : delta}
                   for k, (ap, dpb, sf, w, sp)
                   in enumerate(zip(alpha_paths, disc_path_bins, score_func, weighting, support_paths))}

elif model in ('AdaBoost1', 'AdaBoost2'):
    if model == 'AdaBoost1':
        algo = 'SAMME'
        save_sensitivity_path = 'ada1_sensitivity'
        support_paths = np.tile(np.tile([0.05, 0.02, 0.01], 16), 2)
        # Only the first 36 rounds are run; in those `which_trees` is always
        # 'majority' (see the np.repeat below).
        n_kwargs = 36
    else:
        algo = 'SAMME.R'
        save_sensitivity_path = 'ada2_sensitivity'
        support_paths = np.tile(np.tile([0.005, 0.002, 0.001], 16), 2)
        n_kwargs = 72

    max_depth = list(range(1, 5))
    tuning.update({'grid' : {'base_estimator' : [rp.DecisionTreeClassifier(max_depth=d) for d in max_depth],
                            'n_estimators': [(i + 1) * 200 for i in range(8)], 'algorithm': [algo]}})
    # Use the configured seeds rather than literal values, so changing
    # random_state_rf / random_state_splits also takes effect for AdaBoost.
    benchmark_items = rp.benchmarking_prep(datasets, model, tuning, project_dir,
                                           random_state=random_state_rf,
                                           random_state_splits=random_state_splits,
                                           start_instance=start_instance,
                                           verbose=verbose, n_cores=n_cores)

    # Full factorial design, 3 * 2 * 2 * 3 * 2 = 72 combinations. Some arrays
    # are longer than 72; zip() stops at range(n_kwargs), so the surplus
    # elements are never used.
    disc_path_bins = np.tile(np.tile(np.repeat([4, 8], 3), 6), 2)
    disc_path_eqcounts = np.tile(np.tile(np.repeat([True, False], 6), 6), 2)
    weighting = np.tile(np.repeat(['chisq', 'kldiv', 'nothing'], 12), 2)
    which_trees = np.repeat(['majority', 'conf_weighted'], 36)

    kwargs_grid = {k : {'paths_lengths_threshold' : 5, 'alpha_paths' : 0.0,
                        'disc_path_bins' : dpb, 'disc_path_eqcounts' : dpeq,
                        'score_func' : 1, 'weighting' : w, 'support_paths' : sp,
                        'merging_bootstraps' : merging_bootstraps,
                        'pruning_bootstraps' : pruning_bootstraps,
                        'which_trees' : wchtr,
                        'delta' : delta}
                   for k, dpb, dpeq, w, sp, wchtr
                   in zip(range(n_kwargs), disc_path_bins, disc_path_eqcounts,
                          weighting, support_paths, which_trees)}

else: # GBM
    # NOTE(review): results are written under 'rf_sensitivity' even for GBM.
    # This looks like a copy-paste from the RandomForest branch; it is kept
    # unchanged so that existing result locations do not move - confirm.
    save_sensitivity_path = 'rf_sensitivity'
    tuning.update({'grid' : {'subsample' : [(i + 1) * 0.25 for i in range(3)],
                    'n_estimators': [i * 200 for i in range(1, 9)],
                    'max_depth' : list(range(1, 5)),
                    'learning_rate': np.full(4, 10.0)**np.arange(-3, 1)}})
    benchmark_items = rp.benchmarking_prep(datasets, model, tuning, project_dir,
                                           random_state=random_state_rf,
                                           random_state_splits=random_state_splits,
                                           start_instance=start_instance,
                                           verbose=verbose, n_cores=n_cores)

    # Full factorial design, 4 * 2 * 2 = 16 rounds; every array has length 16.
    support_paths = np.tile([0.05, 0.1, 0.15, 0.2], 4)
    disc_path_bins = np.tile(np.repeat([4, 8], 4), 2)
    disc_path_eqcounts = np.repeat([True, False], 8)

    kwargs_grid = {k : {'paths_lengths_threshold' : 5, 'alpha_paths' : 0.0,
                        'disc_path_bins' : dpb, 'disc_path_eqcounts' : dpeq,
                        'score_func' : 1, 'weighting' : 'kldiv',
                        'support_paths' : sp,
                        'merging_bootstraps' : merging_bootstraps,
                        'pruning_bootstraps' : pruning_bootstraps,
                        'which_trees' : 'targetclass',
                        'delta' : delta}
                   for k, (dpb, dpeq, sp)
                   in enumerate(zip(disc_path_bins, disc_path_eqcounts, support_paths))}

# Run one benchmarking round per entry of the grid built above.
for round_id, round_kwargs in kwargs_grid.items():
    bi_copy = rp.deepcopy(benchmark_items) # to avoid running down the internal counters
    control = {'method' : 'CHIRPS', 'model' : model,
                'n_instances' : n_instances,
                'random_state' : random_state_exp,
                'kwargs' : round_kwargs,
                'forest_walk_async' : forest_walk_async,
                'explanation_async' : explanation_async,
                'n_cores' : n_cores,
                'save_sensitivity_path' : save_sensitivity_path}

    print('Sensitivity round: ' + str(round_id))
    rp.do_benchmarking(bi_copy, verbose, **control)

print('finished')