# prologue

### set up notebook and load package

In [3]:
# load what we need
import CHIRPS.datasets as ds
import CHIRPS.reproducible as rp

# CHIRPS default set up
merging_bootstraps = 20
pruning_bootstraps = 20
delta = 0.05

# async switches, passed through to rp.do_benchmarking for CHIRPS runs
forest_walk_async = False
chirps_explanation_async = False

n_instances = 5

# choose exactly one model to benchmark
# model = 'RandomForest'
# model = 'AdaBoost1'
model = 'AdaBoost2'
# model = 'GBM'

# comparison methods - these flags are only consulted in the RandomForest branch below
do_Anchors = True
do_dfrgTrs = True

datasets = [rp.datasets[0]] # here can opt for just one, e.g. [rp.datasets[0]] (as an iterator)
start_instance = 0 # here can opt to start at a specific instance

project_dir = 'V:\\whiteboxing\\tests' # defaults to a directory "whiteboxing" in the working directory
# project_dir = 'C:\\Users\\Crutt\\Documents\\whiteboxing\\tests'
random_state_splits = 123 # change this if you want to try different splits of the data into test / train
random_state_rf = 123 # change this if you want to try with different forest construction
random_state_exp = 123 # change this if you want to try with different runs of the explainer algorithm (affects bootstrap eval)

verbose = True

tuning = {'grid' : None, 'override' : False}


def _prep_benchmark_items():
    """Call rp.benchmarking_prep with the settings configured in this cell.

    Every model branch goes through this helper so that random_state_rf and
    random_state_splits take effect whichever model is selected (previously
    the AdaBoost and GBM branches passed a literal 123 for both).
    `tuning` is read at call time, so update its grid before calling.
    """
    return rp.benchmarking_prep(datasets, model, tuning, project_dir,
                                random_state=random_state_rf,
                                random_state_splits=random_state_splits,
                                start_instance=start_instance, verbose=verbose)


def _chirps_control(chirps_kwargs):
    """Return the keyword arguments for a CHIRPS run of rp.do_benchmarking.

    chirps_kwargs -- dict of CHIRPS parameters, passed on under the 'kwargs' key.
    """
    return {'method' : 'CHIRPS', 'model' : model,
            'n_instances' : n_instances,
            'random_state' : random_state_exp,
            'kwargs' : chirps_kwargs,
            'forest_walk_async' : forest_walk_async,
            'chirps_explanation_async' : chirps_explanation_async}


if model == 'RandomForest':
    tuning.update({'grid' : None}) # defaults to n_trees [200, 400, ..., 1600]
    benchmark_items = _prep_benchmark_items()

    kwargs = {'support_paths' : 0.1, 'alpha_paths' : 0.5, 'disc_path_bins' : 4,
              'score_func' : 1, 'weighting' : 'chisq',
              'merging_bootstraps' : merging_bootstraps,
              'pruning_bootstraps' : pruning_bootstraps, 'delta' : delta}

    control = _chirps_control(kwargs)
    rp.do_benchmarking(benchmark_items, verbose, **control)

    if do_Anchors:
        control = {'method' : 'Anchors',
                   'model' : model,
                   'n_instances' : n_instances,
                   'random_state' : random_state_exp}
        rp.do_benchmarking(benchmark_items, verbose, **control)

    if do_dfrgTrs:
        control = {'method' : 'defragTrees',
                   'Kmax' : 1, 'restart' : 1, 'maxitr' : 1,
                   'model' : model,
                   'n_instances' : n_instances,
                   'random_state' : random_state_exp}
        rp.do_benchmarking(benchmark_items, verbose, **control)

elif model in ('AdaBoost1', 'AdaBoost2'):
    # the two AdaBoost set-ups differ only in the boosting algorithm
    # and in the number of bins passed as 'disc_path_bins'
    if model == 'AdaBoost1':
        algo, adaboost_path_bins = 'SAMME', 4
    else:
        algo, adaboost_path_bins = 'SAMME.R', 8

    max_depth = [i for i in range(1, 5)]
    tuning.update({'grid' : {'base_estimator' : [rp.DecisionTreeClassifier(max_depth=d) for d in max_depth],
                             'n_estimators': [(i + 1) * 200 for i in range(8)], 'algorithm': [algo]}})
    benchmark_items = _prep_benchmark_items()

    kwargs = {'paths_lengths_threshold' : 5,
              'support_paths' : 0.01, 'alpha_paths' : 0.0,
              'disc_path_bins' : adaboost_path_bins, 'disc_path_eqcounts' : True,
              'score_func' : 1, 'weighting' : 'kldiv',
              'which_trees' : 'majority',
              'merging_bootstraps' : merging_bootstraps,
              'pruning_bootstraps' : pruning_bootstraps, 'delta' : delta}

    control = _chirps_control(kwargs)

    # honour the `verbose` setting above rather than forcing True
    rp.do_benchmarking(benchmark_items, verbose=verbose, **control)

else: # GBM
    # only the preparation step is run for this model; no benchmarking call follows
    benchmark_items = _prep_benchmark_items()

Preprocessing adult_small_samp data and model for adult_small_samp with random state = 123
Split data into main train-test and build forest
using previous tuning parameters


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Best OOB Accuracy Estimate during tuning: 0.8473
Best parameters:{'algorithm': 'SAMME.R', 'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'n_estimators': 200, 'random_state': 123}

Discretise data and train model for Anchors
using previous tuning parameters
Best OOB Accuracy Estimate during tuning: 0.8473
Best parameters:{'algorithm': 'SAMME.R', 'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None

  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


CHIRPS time elapsed: 7.5620 seconds
CHIRPS with async = False

Evaluating found explanations
Results saved to V:\whiteboxing\tests\adult_small_samp\AdaBoost2_CHIRPS_rnst_123
CHIRPS batch results eval time elapsed: 0.1730 seconds



In [None]:
# for notebook plotting
%matplotlib inline 

# load what we need
import time
import timeit
import numpy as np
import CHIRPS.structures as strcts
import CHIRPS.routines as rt
import CHIRPS.reproducible as rp
import CHIRPS.boosting_scratch as bs
from CHIRPS import p_count_corrected, if_nexists_make_dir, chisq_indep_test, entropy_corrected, contingency_test, confidence_weight
from IPython.display import Image
from sklearn.externals.six import StringIO
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from pydotplus import graph_from_dot_data

# demo datasets that ship with package. all from UCI unless stated otherwise
# import CHIRPS.datasets as ds
# ds.adult_data, ds.adult_samp_data, ds.adult_small_samp_data Large dataset ships with manageable sub samples
# ds.bankmark_data, ds.bankmark_samp_data
# ds.car_data
# ds.cardio_data this is the cardiotocography dataset
# ds.credit_data
# ds.german_data
# ds.lending_data, ds.lending_samp_data, ds.lending_small_samp_data, ds.lending_tiny_samp_data from Kaggle. see datasets_from_source file for links
# ds.nursery_data, ds.nursery_samp_data
# ds.rcdv_data, ds.rcdv_samp_data from US government see datasets_from_source file for links

### common config - can be omitted if defaults are OK

In [None]:
# shared settings read by the cells below
# NOTE(review): project_dir is an absolute, machine-specific path - adjust before running elsewhere
project_dir = 'V:\\whiteboxing\\examples' # defaults to a directory "whiteboxing" in the working directory
# project_dir = 'C:\\Users\\Crutt\\Documents\\whiteboxing\\examples'
random_state_splits = 123 # one off for splitting the data into test / train
random_state = 123 # for everything else - e.g. building a new rf with same data

# Build a Random Forest Model to Predict and Explain
First, a wrapper is created for the dataset. Use one that ships with the package, or create your own.
Then split the data into training and (hold out) test sets using the convenience functions in the package. These return an object that contains the split data in various representations, such as Pandas DataFrames and encoded, sparse matrices.

In [None]:
# load one of the included datasets
# project_dir will default to directory name CHIRPS in the working directory if not given
# random_state will default to 123
mydata = rp.datasets[0](random_state=random_state, project_dir=project_dir)
meta_data = mydata.get_meta()
# meta_data['get_save_path'] is a callable; calling it yields the path used for saving results
save_path = meta_data['get_save_path']()

# split the data. here using a basic sampling method.
# the returned object is a wrapper class that contains
# the train and test splits for X and y

# also the encoded versions of X_train and X_test that the rf will use
# this is because we prefer onehot encoded over allowing categorical vars to be represented as integer
# scikit would treat these as ordinal, which is inappropriate

# also some meta-data: priors for y, the indexes from the input data

# also some convenience functions for leave-one-out testing

# train test split - one off hard-coded random state.
# random state can be omitted
# and will default to the state held in the dataset container
# which defaults to 123 if omitted in the constructor
train_index, test_index = mydata.get_tt_split_idx(random_state=random_state_splits)
# optionally, indexes can be omitted and will default to scikit's train_test_split method
tt = mydata.tt_split(train_index, test_index)

In [None]:
# build model
model = 'GBM'

# wider search grid, kept for reference:
# tuning_grid = {'max_depth' : [d for d in [i for i in range(1, 5)]],
#               'n_estimators' : [(i + 1) * 200 for i in range(2)],
#               'learning_rate' : [i/10 for i in range(1, 11)],
#               'subsample' : [0.25, 0.5, 0.75, 1.0]}

# minimal grid: each list below evaluates to a single candidate value
tuning_grid = {
    'max_depth' : list(range(1, 2)),
    'n_estimators' : [200 * (i + 1) for i in range(1, 2)],
    'learning_rate' : [i / 10 for i in range(1, 2)],
    'subsample' : [1.0],
}

# the fitted model is kept in `rf`, the name the remaining cells use
rf = rp.forest_prep(
    ds_container=tt,
    meta_data=meta_data,
    model=model,
    tuning_grid=tuning_grid,
    override_tuning=True,
    save_path=save_path,
    plot_cm=True,
    plot_cm_norm=True,
)


In [None]:
# control for async processes - each tree walk can be done in its own core
# and so can each explanation (e.g. rule conditions merge by hill-climbing)
# these will default to false if not passed explicitly to the explainer function
# on a multi-core machine there should be a good speed up for large batches
# when the batch_size advantage exceeds the overhead of setting up multi-processing
# timings will be printed to screen so you can see if it helps
forest_walk_async=False
chirps_explanation_async=False

# how many instances to explain in total from a test/unseen set
n_instances = 1

# this will normalise the above parameters to the size of the dataset
n_instances = rt.n_instance_ceiling(ds_container=tt, n_instances=n_instances)

# this gets the next batch out of the data_split_container according to the required number of instances
# all formats can be extracted, depending on the requirement
# unencoded, encoded (sparse matrix is the type returned by scikit), ordinary dense matrix also available
# the second returned item is not needed here and is discarded via `_`
instances, _, instances_enc, instances_enc_matrix, labels = tt.get_next(n_instances, which_split='test') # default

In [None]:
# model predictions for the instance(s) in the current batch, plus the batch index
preds = rf.predict(X=instances_enc)
preds_idx = labels.index

# the walker wraps the fitted model together with the dataset meta data
f_walker = strcts.forest_walker(forest=rf, meta_data=meta_data)

print('Walking forest for ' + str(len(labels)) + ' instances... (please wait)')

# time the walk itself
forest_walk_start_time = timeit.default_timer()

# the walk returns a batch_paths_container (even for just one instance);
# it takes the X instances as a dense, ordinary numpy matrix, available from the data_split_container
bp_container = f_walker.forest_walk(
    instances=instances_enc_matrix,
    labels=preds,  # we're explaining the prediction, not the true label!
    forest_walk_async=forest_walk_async,
)

forest_walk_end_time = timeit.default_timer()
forest_walk_elapsed_time = forest_walk_end_time - forest_walk_start_time

print('Forest Walk with async = ' + str(forest_walk_async))
print('Forest Walk time elapsed:', "{:0.4f}".format(forest_walk_elapsed_time), 'seconds')

In [None]:
# run the walker's full_survey over the encoded batch and `labels`,
# then display the walker's tree_outputs attribute
f_walker.full_survey(instances_enc, labels)
f_walker.tree_outputs

In [None]:
# feed the model's decision_function output through the loss object's private
# _score_to_proba helper (relies on a private API of the fitted model)
rf.loss_._score_to_proba(rf.decision_function(instances_enc))
#dir(rf.loss_)

In [None]:
# logistic sigmoid exp(s) / (1 + exp(s)) of the flattened decision scores, computed by hand
np.exp(rf.decision_function(instances_enc).ravel())/(1 + np.exp(rf.decision_function(instances_enc).ravel()))

In [None]:
# display the result of the model's private _init_decision_function for the batch
rf._init_decision_function(instances_enc)

In [None]:
# display the model's decision_function output for the batch
rf.decision_function(instances_enc)

In [None]:
# labels the model itself assigns to the training sample
sample_labels = rf.predict(tt.X_train_enc)

# one explainer container for every instance held in the batch paths container;
# the sample may be any representative data set - tt.y_train could be passed
# as sample_labels instead of the model's own predictions
CHIRPS = strcts.batch_CHIRPS_explainer(
    bp_container,
    forest=rf,
    sample_instances=tt.X_train_enc,
    sample_labels=sample_labels,
    meta_data=meta_data,
)

print('Running CHIRPS on a batch of ' + str(len(labels)) + ' instances... (please wait)')

# time the explanation run
ce_start_time = timeit.default_timer()

CHIRPS.batch_run_CHIRPS(
    target_classes=preds,
    chirps_explanation_async=chirps_explanation_async,
    # path mining
    paths_lengths_threshold=5,
    support_paths=0.01,
    alpha_paths=0.0,
    disc_path_bins=4,
    # scoring
    score_func=1,
    weighting='chisq',
    # bootstrapped merge / prune
    merging_bootstraps=20,
    pruning_bootstraps=20,
    delta=0.1,
)

ce_end_time = timeit.default_timer()
ce_elapsed_time = ce_end_time - ce_start_time
print('CHIRPS time elapsed:', "{:0.4f}".format(ce_elapsed_time), 'seconds')
print('CHIRPS with async = ' + str(chirps_explanation_async))

In [None]:
# iterate over all the test instances to determine the various scores using leave-one-out testing
print('evaluating found explanations')
print()
# the same timestamp is handed to the evaluator and used for the elapsed time printed below
results_start_time = timeit.default_timer()

# evaluate the explainers for the batch identified by labels.index
rt.evaluate_CHIRPS_explainers(CHIRPS, tt, labels.index, # for batch runs: tt.y_test.index,
                              forest=rf,
                              meta_data=meta_data,
                              model=model,
                              eval_start_time=results_start_time,
                              print_to_screen=True, # set True when running single instances
                              eval_alt_labelings=True,
                              eval_rule_complements=True,
                              save_results_path=save_path,
                              dataset_name='test',
                              save_results_file='CHIRPS' + '_rnst_' + str(random_state),
                              save_CHIRPS=False)

results_end_time = timeit.default_timer()
results_elapsed_time = results_end_time - results_start_time
print('CHIRPS batch results eval time elapsed:', "{:0.4f}".format(results_elapsed_time), 'seconds')
# this completes the CHIRPS runs

In [None]:
import numpy as np
n_kwargs = 96

# Each array is one column of a 96-row parameter grid. The columns cycle at
# different rates, so every combination of
#   which_trees (2) x weighting (4) x disc_path_eqcounts (2) x disc_path_bins (2) x support_paths (3)
# occurs exactly once.
disc_path_bins = np.tile(np.repeat([4, 8], 3), 16)
disc_path_eqcounts = np.tile(np.repeat([True, False], 6), 8)
support_paths = np.tile([0.05, 0.02, 0.01], 32)
weighting = np.tile(np.repeat(['chisq', 'kldiv', 'lodds', 'nothing'], 12), 2)
which_trees = np.repeat(['majority', 'conf_weighted'], 48)

# map the row number to the CHIRPS keyword arguments for that row;
# the bootstrap counts and delta come from the configuration cell
kwargs_grid = {
    k : {
        'paths_lengths_threshold' : 5,
        'alpha_paths' : 0.0,
        'disc_path_bins' : dpb,
        'disc_path_eqcounts' : dpeq,
        'score_func' : 1,
        'weighting' : w,
        'support_paths' : sp,
        'merging_bootstraps' : merging_bootstraps,
        'pruning_bootstraps' : pruning_bootstraps,
        'which_trees' : wchtr,
        'delta' : delta,
    }
    for k, dpb, dpeq, w, sp, wchtr in zip(
        range(n_kwargs),
        disc_path_bins,
        disc_path_eqcounts,
        weighting,
        support_paths,
        which_trees,
    )
}

kwargs_grid

In [None]:
# NOTE(review): `fail` is not defined in any visible cell, so this raises NameError;
# presumably a deliberate stop so that "Run All" halts before the scratch cells below - confirm
fail

In [None]:
# try p_count_corrected on one observation per class label with four unequal values
# as the third positional argument (other cells pass this as `weights=`)
p_count_corrected([0, 1, 2, 3], [0, 1,2,3], [ 3.46192192, 78.32036219, 58.81097884, 62.69848053])

In [None]:
# four values with a single clear maximum
x = np.array([3.35743396e-06, 4.16315283e-01, 2.07411357e-01, 3.76270003e-01])
# positions of every entry that differs from the maximum (np.where returns a tuple of index arrays)
y = np.where(x != x.max())
# largest value among the entries that are not equal to the maximum
x[y].max()

In [None]:
# 3 x 4 matrix of example weights: one row per observation, one column per class label
weights=np.array([[-77.14896904 , 30.46996862 , 23.13920312 , 23.5397973 ],
 [-78.17874831 , 27.48592897 , 23.07330235 , 27.61951699],
 [-52.94961755 , 53.9136018 ,  51.9856333 , -52.94961755]])

# one column index per row of weights
arr=[1,3,1]
print(np.shape(weights)[0]) # number of rows
# fancy indexing: picks weights[i, arr[i]] for each row i
print(weights[range(np.shape(weights)[0]),arr])
print(np.array(weights[range(np.shape(weights)[0]), arr]))

# only the final expression (the shape) is displayed; the 'counts' lookup result is discarded
p_count_corrected(arr, [0,1,2,3], weights=weights)['counts']
weights.shape

In [None]:
# apply confidence_weight to four example rows of class probabilities (each row sums to 1)
cf = confidence_weight(np.array([[0.1, 0.1, 0.2, 0.6], [0.05, 0.05, 0.2, 0.7], [0.1, 0.5, 0.2, 0.2], [0.1, 0.2, 0.3, 0.4]]))
#print(cf)
# use the result as the weights for counting the labels [3,3,1,3] over classes 0..3
p_count_corrected([3,3,1,3], [0, 1, 2, 3], weights=cf)
# cf[range(4), [0,1,2,1]]

In [None]:
# first three rows of the encoded test set
X = tt.X_test_enc[0:3,]
# for each of the three confidence_weight modes: apply it to every estimator's
# predict_proba output, then print the mean across estimators
pred_raw = [confidence_weight(estimator.predict_proba(X), 'proba') for estimator in rf.estimators_]
print(np.mean(pred_raw, axis = 0))
pred_log = [confidence_weight(estimator.predict_proba(X), 'log_proba') for estimator in rf.estimators_]
print(np.mean(pred_log, axis = 0))
pred = [confidence_weight(estimator.predict_proba(X), 'conf_weight') for estimator in rf.estimators_]
print(np.mean(pred, axis = 0))

In [None]:
# step-by-step conversion of summed 'conf_weight' outputs into normalised row probabilities
pred = [confidence_weight(estimator.predict_proba(X), 'conf_weight') for estimator in rf.estimators_]
pred = sum(pred) # element-wise sum over estimators
pred /= rf.estimator_weights_.sum() # scale by the total estimator weight
print(pred)
pred = pred / (len(pred[0]) - 1) # divide by (number of columns - 1)
pred = np.exp(pred)
print(pred)
# row sums, with zero sums replaced by 1 so that the division below cannot divide by zero
normalizer = pred.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
print(normalizer)
pred /= normalizer
pred


In [None]:
# same normalisation as for 'conf_weight', but starting from the mean of the 'log_proba' outputs
pred = [confidence_weight(estimator.predict_proba(X), 'log_proba') for estimator in rf.estimators_]
pred = np.mean(pred, axis=0) # mean across estimators
#pred /= rf.estimator_weights_.sum()
print(pred)
pred = np.exp(pred)
print(pred)
# row sums, with zero sums replaced by 1 so that the division below cannot divide by zero
normalizer = pred.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
print(normalizer)
pred /= normalizer
pred

In [None]:
# the model's own predict_proba for X, for comparison with the hand-computed values
rf.predict_proba(X)
#np.exp(rf.decision_function(X))
# pred = rf.decision_function(X) + abs(rf.decision_function(X).min(axis=1).reshape(-1, 1))
# normalizer = pred.sum(axis=1)[:, np.newaxis]
# pred /= normalizer
# pred
#np.array(rf.decision_function(X)) * rf.estimator_weights_.sum()

In [None]:
# hard-coded example vectors; x is divided by the length of y (4)
y = np.array([-24.05452802,  -1.01532182, -12.34344625, -12.62110368])
x = np.array([-34.63778423,  34.47983437,   0.49546108,  -0.33751122]) / len(y)
z = np.array([0.01626313, 0.49611181, 0.22083229, 0.26679277]) # not used further in this cell
x # bare expression mid-cell: evaluated but not displayed
# only this last expression is displayed: y centred on its mean, times 3
(y - np.mean(y)) * 3
#np.exp(y) / np.sum(np.exp(y))
# np.exp(x + np.mean(y)) / np.sum(np.exp(x + np.mean(y)))

In [None]:
# display the element at [0][0] of the batch paths container's path_detail
bp_container.path_detail[0][0]

In [None]:
from operator import itemgetter
# pull ('pred_class', 'estimator_weight') out of path_detail[t][0] for t in range(1) (i.e. t = 0 only)
# and transpose the pairs into two parallel lists
tree_preds, estimator_weights = [i for i in map(list, zip(*[itemgetter('pred_class', 'estimator_weight')(bp_container.path_detail[t][0]) for t in range(1)]))]

# count the predicted classes over all class indices, weighted by estimator weight
# (not the last expression of the cell, so the result is not displayed)
p_count_corrected(tree_preds, [i for i in range(len(meta_data['class_names']))], weights=estimator_weights)

# same extraction, this time pairing each predicted class with its 'confidence_weight'
tree_preds, confidence_weights = [i for i in map(list, zip(*[itemgetter('pred_class', 'confidence_weight')(bp_container.path_detail[t][0]) for t in range(1)]))]

confidence_weights

In [None]:
# NOTE(review): n_classes is only assigned in later cells of this notebook (n_classes = rf.n_classes_),
# so this cell depends on execution order
# row sums of the summed per-estimator 'log_proba' outputs, the row mean, and the mean-centred values
print(sum(pred_log).sum(axis=1)[:, np.newaxis])
print(sum(pred_log).sum(axis=1)[:, np.newaxis]/n_classes)
print(sum(pred_log) - sum(pred_log).sum(axis=1)[:, np.newaxis]/n_classes)

In [None]:
# count the model's predicted classes for X over class indices 0..n_classes-1
# NOTE(review): n_classes is only assigned in later cells of this notebook
p_count_corrected(rf.predict(X), [i for i in range(n_classes)])

In [None]:
# NOTE(review): rebinds `preds`, which earlier cells use for the model's predictions on the batch
# per-estimator predictions for X, flattened into one 1-D array
preds = np.array([estimator.predict(X) for estimator in rf.estimators_])
preds = preds.reshape(1, -1)[0]
# example weighting: predictions equal to 1 count double
wts = [2 if p == 1 else 1 for p in preds]
p_count_corrected(preds, [i for i in range(n_classes)], wts)

In [None]:
# list the attributes of the first estimator's private _abc_impl object
dir(rf.estimators_[0]._abc_impl)

In [None]:
# hand-computed transform of the model's probabilities for the first encoded test row
# NOTE(review): n_classes is only assigned in later cells of this notebook
print(rf.predict_proba(tt.X_test_enc[0]))

proba = rf.predict_proba(tt.X_test_enc[0])
# clip probabilities below machine epsilon up to epsilon so that the log stays finite
proba[proba < np.finfo(proba.dtype).eps] = np.finfo(proba.dtype).eps
log_proba = np.log(proba)
# print(log_proba)
# print(log_proba.sum(axis=1)[:, np.newaxis])
# print( (log_proba - (1. / n_classes)
#                               * log_proba.sum(axis=1)[:, np.newaxis]))
# (K - 1) * (log p - row mean of log p), with K = n_classes
pred = (n_classes - 1) * (log_proba - (1. / n_classes)
                              * log_proba.sum(axis=1)[:, np.newaxis]) 
print(pred)
pred /= rf.estimator_weights_.sum() # scale by the total estimator weight
if n_classes == 2:
    # binary case: flip the sign of the first column and print the row sums
    pred[:, 0] *= -1
    print(pred.sum(axis=1))
print(pred)

In [None]:
# (n_classes - 1) * (log_proba - (1. / n_classes)
#                               * log_proba.sum(axis=1)[:, np.newaxis])

# the commented expression above with n_classes hard-coded to 4; uses log_proba from an earlier cell
3 * (log_proba - (1/4) * log_proba.sum(axis=1)[:, np.newaxis])

In [None]:
# NOTE(review): _samme_proba is not imported in any visible cell of this notebook - confirm where it comes from
# sum the per-estimator _samme_proba outputs, then turn them into normalised row probabilities
pred = sum(_samme_proba(estimator, n_classes, X)
                       for estimator in rf.estimators_)
print(pred)
print(rf.estimator_weights_.sum())
pred /= rf.estimator_weights_.sum() # scale by the total estimator weight
print(pred)

pred = np.exp((1. / (n_classes - 1)) * pred)
print(pred)
# row sums, with zero sums replaced by 1 so that the division below cannot divide by zero
normalizer = pred.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
pred /= normalizer
print(normalizer)
print(pred)

In [None]:
# NOTE(review): _samme_proba is not imported in any visible cell of this notebook - confirm where it comes from
# Sum of the per-estimator _samme_proba outputs over the fitted ensemble.
# This read `self.estimators_`, but there is no `self` in notebook scope, so the
# cell stopped with a NameError before reaching the statements below.
sum(_samme_proba(estimator, n_classes, X)
                       for estimator in rf.estimators_)

# per estimator: boolean comparison of its prediction for the batch with each entry of rf.classes_
# NOTE(review): the comparison relies on broadcasting predict() output against rf.classes_ -
# fine for a single instance; confirm the shapes before using a larger batch
r_pred = [(estimator.predict(instances_enc) == rf.classes_) for estimator in rf.estimators_]
# print(r_pred)
# the same comparison, transposed and scaled by the estimator's weight
rw_pred = [(estimator.predict(instances_enc) == rf.classes_).T * w for estimator, w in zip(rf.estimators_, rf.estimator_weights_)]
# print(rw_pred)
# weighted vote total per class: sum of the weighted comparisons over all estimators
pred = sum((estimator.predict(instances_enc) == rf.classes_).T * w
           for estimator, w in zip(rf.estimators_,
                                   rf.estimator_weights_))


In [None]:
# number of classes as reported by the fitted model
n_classes = rf.n_classes_
print(n_classes)
# estimator-weighted sum of each estimator's predict_proba for the batch
proba = sum(estimator.predict_proba(instances_enc) * w
                        for estimator, w in zip(rf.estimators_,
                                                rf.estimator_weights_))

print(proba)
proba /= rf.estimator_weights_.sum() # scale by the total estimator weight
print(proba)
proba = np.exp((1. / (n_classes - 1)) * proba)
print(proba)
# row sums, with zero sums replaced by 1 so that the division below cannot divide by zero
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
print(normalizer)
proba /= normalizer
print(proba)

In [None]:
# NOTE(review): _samme_proba is not imported in any visible cell of this notebook - confirm where it comes from
# NOTE(review): this overrides n_classes with a literal 4 and rebinds X to the first encoded training row
# if algorithm == 'SAMME.R':
n_classes = 4
X = tt.X_train_enc[0]
pred = sum(_samme_proba(estimator, n_classes, X)
                   for estimator in rf.estimators_)
#             else:   # self.algorithm == "SAMME"
#             pred = sum((estimator.predict(X) == classes).T * w
#                        for estimator, w in zip(self.estimators_,
#                                                self.estimator_weights_))

pred /= rf.estimator_weights_.sum() # scale by the total estimator weight
if n_classes == 2:
    # binary case: flip the sign of the first column (not reached while n_classes is 4)
    pred[:, 0] *= -1

# pred.sum(axis=1)
pred

In [None]:
# how is GB working to calculate the predict function
# NOTE(review): gb, X_test and ne are not defined in any visible cell of this notebook
import numpy as np
n = 300

# predicted class for each of the first n test rows
gbpreds = [gb.predict(X_test[i])[0] for i in range(n)]

# for the same rows: total of the outputs of the first regressor of each of the `ne` stages
reg_tots = np.empty(n)
for i in range(n):
    reg_tots[i] = sum(gb.estimators_[j][0].predict(X_test[i])[0] for j in range(ne))

# largest total among rows predicted 0 (mx), smallest total among rows predicted 1 (mn)
mn, mx = np.inf, -np.inf
for pred, tot in zip(gbpreds, reg_tots):
    if pred == 0:
        mx = max(mx, tot)
    elif pred == 1:
        mn = min(mn, tot)

print(mx, mn)

# Preparing unseen data

Again note:
test set has never been "seen" by random forest during training
test set has been only used to assess model (random forest) accuracy - no additional tuning after this
test set has not been involved in generating the explainer

## optional: memory and computation cost management
#### CHIRPS is time economical but memory intensive to compute for lots of instances at once
option 1: choose a smaller number of instances to explain

In [None]:
# control for async processes - each tree walk can be done in its own core
# and so can each explanation (e.g. rule conditions merge by hill-climbing)
# these will default to false if not passed explicitly to the explainer function
# on a multi-core machine there should be a good speed up for large batches
# when the batch_size advantage exceeds the overhead of setting up multi-processing
# timings will be printed to screen so you can see if it helps
forest_walk_async=False
chirps_explanation_async=False

# the number of instances can be controlled by
# batch_size - how many instances to explain at one time
batch_size = 1
# how many instances to explain in total from a test/unseen set
n_instances = 1

# this will normalise the above parameters to the size of the dataset
n_instances, n_batches = rt.batch_instance_ceiling(ds_container=tt, n_instances=n_instances, batch_size=batch_size)

# this gets the next batch out of the data_split_container according to the required number of instances
# all formats can be extracted, depending on the requirement
# unencoded, encoded (sparse matrix is the type returned by scikit), ordinary dense matrix also available
# note that batch_size (not n_instances) is the count requested here; the second returned item is discarded
instances, _, instances_enc, instances_enc_matrix, labels = tt.get_next(batch_size, which_split='test') # default

option 2: just run the whole test set

In [None]:
# instances = tt.X_test; instances_enc = tt.X_test_enc; instances_enc_matrix = tt.X_test_enc_matrix; labels = tt.y_test

## Make predictions from the decision forest on the unseen data
Important point, no compromise on model accuracy

In [None]:
# get all the model predictions for the test instance(s) we're looking at
preds_idx = labels.index # index of the current batch
preds = rf.predict(X=instances_enc) # predicted classes, used as the labels to explain in later cells

# CHIRPS Step 1:
## Extract Tree Prediction Paths
### Fit a forest_walker object to the dataset and decision forest
This is a wrapper that extracts the paths of all the given instances. For CHIRPS, we want a large sample. The whole training set or other representative sample will do.

It can also report interesting statistics (treating the forest as a set of random tree-structured variables).

In [None]:
# wrapper object needs the decision forest itself and the dataset meta data (we have a convenience function for this)
# wrap the fitted model and the dataset meta data in a forest_walker; its forest_walk method is called in a later cell
f_walker = strcts.forest_walker(forest = rf, meta_data=meta_data)

Now the work of extracting all the paths for each instance is done

In [None]:
print('Walking forest for ' + str(len(labels)) + ' instances... (please wait)')

# time the walk itself
forest_walk_start_time = timeit.default_timer()

# the walk returns a batch_paths_container (even for just one instance);
# it takes the X instances as a dense, ordinary numpy matrix, available from the data_split_container
bp_container = f_walker.forest_walk(
    instances=instances_enc_matrix,
    labels=preds,  # we're explaining the prediction, not the true label!
    forest_walk_async=forest_walk_async,
)

forest_walk_end_time = timeit.default_timer()
forest_walk_elapsed_time = forest_walk_end_time - forest_walk_start_time

print('Forest Walk with async = ' + str(forest_walk_async))
print('Forest Walk time elapsed:', "{:0.4f}".format(forest_walk_elapsed_time), 'seconds')

# CHIRPS Steps 2-4: 
## Frequent pattern mining of paths.
## Score and sort mined path segments.
## Merge path segments into one rule.

This is a wrapper object that will execute steps 2-4 on all the instance-paths in the batch_paths_container.

Note that true_divide warnings are OK. It just means that a continuous variable is unbounded in some way i.e. no greater/less than discontinuity is used in the CHIRPS explanation.

Note also, here we are using the training set to create the explainers. We could use a different dataset as long as it is representative of the training set that built the decision forest. Most important that we don't use the dataset that we wish to explain.

In [None]:
# labels the model itself assigns to the training sample
sample_labels = rf.predict(tt.X_train_enc)

# one explainer container for every instance held in the batch paths container;
# the sample may be any representative data set - tt.y_train could be passed
# as sample_labels instead of the model's own predictions
CHIRPS = strcts.batch_CHIRPS_explainer(
    bp_container,
    forest=rf,
    sample_instances=tt.X_train_enc,
    sample_labels=sample_labels,
    meta_data=meta_data,
)

print('Running CHIRPS on a batch of ' + str(len(labels)) + ' instances... (please wait)')

# time the explanation run
ce_start_time = timeit.default_timer()

CHIRPS.batch_run_CHIRPS(
    target_classes=preds,
    chirps_explanation_async=chirps_explanation_async,
    # path mining
    support_paths=0.1,
    alpha_paths=0.9,
    disc_path_bins=4,
    # scoring
    score_func=5,
    weighting='chisq',
    # bootstrapped merge / prune
    merging_bootstraps=20,
    pruning_bootstraps=20,
    delta=0.1,
)

ce_end_time = timeit.default_timer()
ce_elapsed_time = ce_end_time - ce_start_time
print('CHIRPS time elapsed:', "{:0.4f}".format(ce_elapsed_time), 'seconds')
print('CHIRPS with async = ' + str(chirps_explanation_async))

# Viewing and Evaluating CHIRPS explanations
Evaluation is done using unseen data to see how well the explanations generalise. The data_split_container object (tt) has a leave-one-out function that is used during the routine to ensure that the datum we are explaining is excluded from the evaluation.

In [None]:
# iterate over the explained instances to determine the various scores using leave-one-out testing
print('evaluating found explanations')
print()
# the same timestamp is handed to the evaluator and used for the elapsed time printed below
results_start_time = timeit.default_timer()

# Evaluate against the index of the batch that was actually explained.
# `labels` is either the batch returned by tt.get_next(...) (option 1) or
# tt.y_test itself (option 2), so labels.index lines up with the explainers
# held in CHIRPS in both cases. tt.y_test.index only lines up when the whole
# test set was explained.
rt.evaluate_CHIRPS_explainers(CHIRPS, tt, labels.index,
                              forest=rf,
                              meta_data=meta_data,
                              eval_start_time=results_start_time,
                              print_to_screen=True, # set True when running single instances
                              eval_alt_labelings=True,
                              eval_rule_complements=True,
                              save_results_path=save_path,
                              dataset_name='test',
                              save_results_file='CHIRPS' + '_rnst_' + str(random_state),
                              save_CHIRPS=False)

results_end_time = timeit.default_timer()
results_elapsed_time = results_end_time - results_start_time
print('CHIRPS batch results eval time elapsed:', "{:0.4f}".format(results_elapsed_time), 'seconds')
# this completes the CHIRPS runs

In [None]:
from pandas import Series

# Label the training rows with the model's own predictions, indexed like y_train.
forest = rf
train_predictions = forest.predict(tt.X_train_enc)
train_pred_labels = Series(train_predictions, index=tt.y_train.index)

# Evaluate the first explainer's pruned rule on the training sample.
first_explainer = CHIRPS.CHIRPS_explainers[0]
first_explainer.evaluate_rule(rule='pruned',
                              sample_instances=tt.X_train_enc,
                              sample_labels=train_pred_labels)

In [None]:
# Show the `posterior` attribute of the first explainer in CHIRPS.CHIRPS_explainers.
CHIRPS.CHIRPS_explainers[0].posterior

In [None]:
preds = rf.predict_proba(X=instances_enc)

In [None]:
preds

In [None]:
np.log(preds) - 0.5 * np.log(preds).sum(axis=1)[:, np.newaxis]

In [None]:
# Sum the per-estimator SAMME.R scores over the whole ensemble.
# The expression was pasted from inside a class method, where `self`, `n_classes`
# and `X` exist; at notebook scope they are rebound to the fitted model `rf`,
# its class count and the encoded instances used by the surrounding cells.
pred = sum(_samme_proba(estimator, rf.n_classes_, instances_enc)
           for estimator in rf.estimators_)

In [None]:
# Accumulate the two-class _samme_proba score of every estimator in rf.
pred = 0
for weak_learner in rf.estimators_:
    pred = pred + _samme_proba(weak_learner, 2, instances_enc)

In [None]:
# Built-in sum over the first estimator's two-class _samme_proba output.
# NOTE(review): if that output is a 2-D array (rows = instances), this adds the
# rows together and yields one total per class — confirm that is intended.
sum(_samme_proba(rf.estimators_[0], 2, instances_enc))

In [None]:
# Predictions from the ensemble's first base estimator alone, for instances_enc.
rf.estimators_[0].predict(instances_enc)

In [None]:
# List the attribute and method names available on the fitted model.
# NOTE(review): exploratory aid; consider removing from the finished notebook.
dir(rf)

In [None]:
import math
# Logistic transform of each estimator weight, shifted down by 0.5.
[exp_w / (1 + exp_w) - 0.5 for exp_w in map(math.exp, rf.estimator_weights_)]

In [None]:
# Show rf.estimator_errors_.
# NOTE(review): presumably rf is a scikit-learn AdaBoost model here, where this
# holds one error value per boosted estimator — confirm.
rf.estimator_errors_

In [None]:
# The model's decision-function output for the encoded instances.
rf.decision_function(instances_enc)

In [None]:
# Display the encoded instances that the surrounding cells pass to the model.
instances_enc

In [None]:
# Predicted class for each encoded instance.
rf.predict(instances_enc)

In [None]:
# Predicted class probabilities for each encoded instance.
rf.predict_proba(instances_enc)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Fit a label encoder on the distinct values of the 'workclass' column.
workclass_levels = mydata.data['workclass'].unique()
le_dict = LabelEncoder().fit(workclass_levels)

In [None]:
# Display the raw data table held by mydata.
mydata.data

In [None]:
# Positional indices of the columns whose dtype is named 'object'.
categoricals = [
    position
    for position, column_dtype in enumerate(mydata.data.dtypes.values)
    if column_dtype.name == 'object'
]

In [None]:
# Treat every column as categorical: one positional index per column of mydata.data.
# Derived from the data instead of a hand-typed 0..14 list (which silently
# overwrote the computed value), so it always matches the actual column count.
categoricals = list(range(len(mydata.data.dtypes)))

In [None]:
# Build a one-hot encoder that infers its categories, then fit it on the whole table.
unfitted_encoder = OneHotEncoder(categories='auto')
oh_dict = unfitted_encoder.fit(mydata.data)

In [None]:
# Category levels the fitted one-hot encoder recorded for its input columns.
oh_dict.categories_

In [None]:
# Names of the one-hot output columns produced by the fitted encoder.
# NOTE(review): newer scikit-learn releases replace get_feature_names with
# get_feature_names_out — confirm against the version this project pins.
oh_dict.get_feature_names()

In [None]:
# One-hot encode the 'workclass' column on its own.
# oh_dict was fitted on the whole of mydata.data, so it cannot transform a single
# 1-D column; a dedicated encoder is fitted on the 2-D single-column frame instead.
OneHotEncoder(categories='auto').fit_transform(mydata.data[['workclass']])

In [None]:
# The distinct 'workclass' labels the label encoder was fitted on.
le_dict.classes_

In [None]:
# Integer codes for every row's 'workclass' value, using the fitted label encoder.
le_dict.transform(mydata.data['workclass'])

In [None]:
# Entropy (natural log) of a four-way distribution whose first probability is 0, computed by hand.
# Inside the log the zero is replaced by a tiny positive stand-in so that the
# 0 * log(0) term evaluates to 0 instead of nan; the weights keep the true 0.0.
-np.sum(np.log([0.000000000000000000000000000000001, 0.8430962343096225, 0.07322175732217619, 0.08368200836820137]) * \
[0.0, 0.8430962343096225, 0.07322175732217619, 0.08368200836820137])

In [None]:
# Element-wise natural log of four probabilities, one of them very close to 1.
# The first and last entries are tiny positive values (presumably stand-ins for zero — TODO confirm).
np.log([0.00000000000001, 0.9999999782235003, 2.1776499755250584e-08, 0.00000000000000001])

In [None]:
# Natural log of a single probability just below 1.
np.log(0.9999999782235003)

In [None]:
# Spot-check entropy_corrected on a distribution that contains exact zeros.
# NOTE(review): the helper is defined outside this section; how it treats the
# zero entries cannot be seen from here — confirm.
entropy_corrected([0.95, 0.0, 0.0, 0.05])

In [None]:
# Spot-check contingency_test on two two-class distributions with the 'kldiv' option.
contingency_test([0.4, 0.6], [0.9, 0.1], 'kldiv')

In [None]:
# Settings shared by every configuration in the grid.
merging_bootstraps = 20
pruning_bootstraps = 20
delta = 0.1 # prune rule terms if loss of precision no greater than delta

# One entry per configuration (48 in total); together the four arrays enumerate
# every combination of weighting x equal-count flag x bin count x support level.
weighting = np.repeat(['chisq', 'kldiv', 'lodds', 'nothing'], 12)
disc_path_eqcounts = np.tile(np.repeat([True, False], 6), 4)
disc_path_bins = np.tile(np.repeat([4, 8], 3), 8)
support_paths = np.tile([0.05, 0.02, 0.01], 16)

# Map configuration number -> keyword arguments for that configuration.
kwargs_grid = {
    config_id: {'paths_lengths_threshold' : 5, 'alpha_paths' : 0.0,
                'disc_path_bins' : n_bins, 'disc_path_eqcounts' : eq_counts,
                'score_func' : 1, 'weighting' : weight_name, 'support_paths' : min_support,
                'merging_bootstraps' : merging_bootstraps,
                'pruning_bootstraps' : pruning_bootstraps,
                'delta' : delta}
    for config_id, (n_bins, eq_counts, weight_name, min_support)
    in enumerate(zip(disc_path_bins, disc_path_eqcounts, weighting, support_paths))
}
kwargs_grid