# prologue

### set up notebook and load package

In [1]:
# for notebook plotting
%matplotlib inline 

# load what we need
import time
import timeit
import numpy as np
import CHIRPS.structures as strcts
import CHIRPS.datasets as ds
import CHIRPS.datasets_proprietary as dsp
import CHIRPS.routines as rt

# demo datasets that ship with package. all from UCI unless stated otherwise
# import CHIRPS.datasets as ds
# ds.adult_data, ds.adult_samp_data, ds.adult_small_samp_data Large dataset ships with manageable sub samples
# ds.bankmark_data, ds.bankmark_samp_data
# ds.car_data
# ds.cardio_data this is the cardiotocography dataset
# ds.credit_data
# ds.german_data
# ds.lending_data, ds.lending_samp_data, ds.lending_small_samp_data, ds.lending_tiny_samp_data from Kaggle. see datasets_from_source file for links
# ds.nursery_data, ds.nursery_samp_data
# ds.rcdv_data, ds.rcdv_samp_data from US government see datasets_from_source file for links

### common config - can be omitted if defaults are OK

In [2]:
# seeds
# random_state_splits: one-off seed for splitting the data into test / train
# random_state: seed for everything else - e.g. building a new rf with same data
random_state_splits = 123
random_state = 123

# where results are saved
project_dir = '/datadisk/whiteboxing/examples2'
# project_dir = 'V:\\whiteboxing\\2020' # defaults to a directory "whiteboxing" in the working directory
# project_dir = 'C:\\Users\\Crutt\\Documents\\whiteboxing\\2020'

# Build a Random Forest Model to Predict and Explain
First, a wrapper is created for the dataset. Use one that ships with the package, or create your own.
Then split the data into a training set and a (hold-out) test set using the convenience functions in the package. These return an object that contains the split data in various representations, such as Pandas DataFrames and encoded, sparse matrices.

In [3]:
# load one of the included datasets
# project_dir will default to directory name CHIRPS in the working directory if not given
# NOTE(review): the config cell's comment names the default directory "whiteboxing" - confirm which is right
# random_state will default to 123
# override_tuning is reused further down: stored in the tuning dict and passed to rt.forest_prep
override_tuning = False
mydata = ds.german(random_state=random_state_splits, project_dir=project_dir)

# meta data for the dataset; the 'get_save_path' entry is a callable that yields the save location
meta_data = mydata.get_meta()
save_path = meta_data['get_save_path']()

# split the data. here using a basic sampling method.
# the returned object is a wrapper class that contains
# the train and test splits for X and y

# also the encoded versions of X_train and X_test that the rf will use
# this is because we prefer onehot encoded over allowing categorical vars to be represented as integer
# scikit would treat these as ordinal, which is inappropriate

# also some meta-data: priors for y, the indexes from the input data

# also some convenience functions for leave-one-out testing

# train test split - one off hard-coded random state.
# random state can be omitted
# and will default to the state held in the dataset container
# which defaults to 123 if omitted in the constructor
train_index, test_index = mydata.get_tt_split_idx(random_state=random_state_splits)
# optionally, indexes can be omitted and will default to scikit's train_test_split method
tt = mydata.tt_split(train_index, test_index)

# CHOOSE ONE
# model = 'RandomForest'
# model = 'AdaBoost1' # SAMME
# model = 'AdaBoost2' # SAMME.R
model = 'GBM'

# decide if to run the whole tuning routine again (long for Adaboost)
# RF routine has a default tuning grid, so 'grid' can stay None, or be replaced with other options
tuning = {'grid': None, 'override': override_tuning}

# ensemble sizes tried for every model type: 200, 400, ..., 1600
n_estimators_grid = [i * 200 for i in range(1, 9)]

if model == 'RandomForest':
    which_trees = 'majority'
    tuning['grid'] = {'n_estimators': n_estimators_grid,
                      'max_depth': [32]}

elif model in ('AdaBoost1', 'AdaBoost2'):
    if model == 'AdaBoost1':
        # classic (and multi-class) AdaBoost
        algo = 'SAMME'
        which_trees = 'majority'
    else:
        # real-valued AdaBoost
        algo = 'SAMME.R'
        which_trees = 'conf_weighted'
    # one candidate base tree per depth 1..4
    max_depth = list(range(1, 5))
    tuning['grid'] = {'base_estimator': [rt.DecisionTreeClassifier(max_depth=d) for d in max_depth],
                      'n_estimators': n_estimators_grid,
                      'algorithm': [algo]}

elif model == 'GBM':
    tuning['grid'] = {'subsample': [0.5],
                      'n_estimators': n_estimators_grid,
                      'max_depth': list(range(1, 5)),
                      # powers of ten from 1e-3 to 1e0: 0.001, 0.01, 0.1, 1.0
                      'learning_rate': 10.0 ** np.arange(-3, 1)}

else:
    # fail here rather than silently tuning a mistyped model name with the GBM grid
    raise ValueError("Unknown model '" + str(model) + "': choose one of "
                     "'RandomForest', 'AdaBoost1', 'AdaBoost2', 'GBM'")

# prepare the model of the chosen type through the package routine;
# both confusion-matrix plots are switched off
rf = rt.forest_prep(
    ds_container=tt,
    meta_data=meta_data,
    model=model,
    tuning_grid=tuning['grid'],
    override_tuning=override_tuning,
    save_path=save_path,
    plot_cm=False,
    plot_cm_norm=False,
)

# what the model predicts on the (encoded) training sample
sample_labels = rf.predict(tt.X_train_enc)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


using previous tuning parameters
Best OOB Accuracy Estimate during tuning: 0.7643
Best parameters:{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 1600, 'subsample': 0.5, 'random_state': 123}

[[176  36]
 [ 35  53]]


# Preparing unseen data

Again note:
test set has never been "seen" by random forest during training
test set has been only used to assess model (random forest) accuracy - no additional tuning after this
test set has not been involved in generating the explainer

## optional: memory and computation cost management
#### CHIRPS is time economical but memory intensive to compute for lots of instances at once
option 1: choose a smaller number of instances to explain

In [4]:
# async switches - each tree walk can run in its own core, and so can each
# explanation (e.g. rule conditions merge by hill-climbing)
# both default to false if not passed explicitly to the explainer function
# on a multi-core machine there should be a good speed up for large batches,
# once the batch_size advantage exceeds the overhead of setting up multi-processing
# timings are printed to screen so you can see if it helps
forest_walk_async, explanation_async = True, True

# total number of instances to explain from a test/unseen set
# no need to know how large the dataset is: this function prevents you maxing out
# (or put n_instances = None for the whole dataset)
n_instances = rt.n_instance_ceiling(n_instances=30, ds_container=tt)

# fetch the next batch from the data_split_container, starting from the first test row
# all formats come back together: unencoded, encoded (sparse matrix is the type
# returned by scikit) and an ordinary dense matrix
tt.current_row_test = 0
next_batch = tt.get_next(n_instances, which_split='test') # 'test' is the default
instances, _, instances_enc, instances_enc_matrix, labels = next_batch

option 2: just run the whole test set

In [5]:
# instances = tt.X_test; instances_enc = tt.X_test_enc; instances_enc_matrix = tt.X_test_enc_matrix; labels = tt.y_test

## Make predictions from the decision forest on the unseen data
Important point, no compromise on model accuracy

In [6]:
# model predictions for the test instance(s) under examination
preds = rf.predict(X=instances_enc)
# index of those same instances, taken from their labels
preds_idx = labels.index

# CHIRPS Step 1:
## Extract Tree Prediction Paths
### Fit a forest_walker object to the dataset and decision forest
This is a wrapper that extracts the paths of all the given instances. For CHIRPS, we want a large sample. The whole training set or other representative sample will do.

It can also report interesting statistics (treating the forest as a set of random tree-structured variables).

In [7]:
# GBM gets the regression-trees walker, every other model type the classification-trees walker
walker_cls = strcts.regression_trees_walker if model == 'GBM' else strcts.classification_trees_walker

# the walker wraps the decision forest itself together with the dataset meta data
f_walker = walker_cls(forest=rf, meta_data=meta_data)

In [8]:
print('Walking forest for ', len(labels), ' instances... (please wait)', sep='')

# timer on
forest_walk_start_time = timeit.default_timer()

# walk the forest - the walker takes the X instances as a dense, ordinary numpy matrix
# (available in the data_split_container) and builds a paths_container, even for one instance
f_walker.forest_walk(
    instances=instances_enc_matrix,
    labels=preds,  # we're explaining the prediction, not the true label!
    forest_walk_async=forest_walk_async,
)

# timer off
forest_walk_end_time = timeit.default_timer()
forest_walk_elapsed_time = forest_walk_end_time - forest_walk_start_time

print('Forest Walk with async = ', forest_walk_async, sep='')
print('Forest Walk time elapsed:', format(forest_walk_elapsed_time, '0.4f'), 'seconds')

Walking forest for 30 instances... (please wait)
Forest Walk with async = True
Forest Walk time elapsed: 2.1279 seconds


# CHIRPS Steps 2-4: 
## Frequent pattern mining of paths.
## Score and sort mined path segments.
## Merge path segments into one rule.

This is a wrapper object that will execute steps 2-4 on all the instance-paths in the batch_paths_container.

Note that true_divide warnings are OK. It just means that a continuous variable is unbounded in some way i.e. no greater/less than discontinuity is used in the CHIRPS explanation.

Note also, here we are using the training set to create the explainers. We could use a different dataset as long as it is representative of the training set that built the decision forest. Most important that we don't use the dataset that we wish to explain.

In [9]:
# Explain the whole batch with the algorithm that matches the model type:
# GBHIPS for Gradient Boosted Trees, CHIRPS for Random Forest and AdaBoost.
# The model types differ only in the container class and a handful of settings,
# so those are chosen first and the construct / time / run / report steps are shared.

# settings that are the same for every model type
run_settings = dict(alpha_paths=0.0,
                    score_func=1,
                    precis_threshold=0.99,
                    merging_bootstraps=20,
                    pruning_bootstraps=20,
                    delta=0.2)

if model == 'GBM':
    algorithm_name = 'GBHIPS'
    container_class = strcts.GBHIPS_container
    run_settings.update(paths_lengths_threshold=5,
                        which_trees='targetclass',
                        support_paths=0.05,
                        disc_path_bins=6,
                        weighting='kldiv')
    # For unbalanced data in binary classification, if an instance has a negative margin,
    # that means it moved towards the boundary (say prob = 0.5 or lodds = 0)
    # but did not move far enough to change class (say from > 0.75 prior).
    # However, one can say it has some similarities to the other class.
    # These can be viewed by setting the other class as the target and
    # which_trees = 'signdelta' (trees that agree with the sign of the total change of lodds)
elif model == 'RandomForest':
    algorithm_name = 'CHIRPS'
    container_class = strcts.CHIRPS_container
    run_settings.update(which_trees=which_trees,
                        support_paths=0.01,
                        disc_path_bins=4,
                        weighting='nothing')
elif model == 'AdaBoost1':
    algorithm_name = 'CHIRPS'
    container_class = strcts.CHIRPS_container
    run_settings.update(which_trees=which_trees,
                        support_paths=0.01,
                        disc_path_bins=4,
                        weighting='kldiv')
elif model == 'AdaBoost2':
    algorithm_name = 'CHIRPS'
    container_class = strcts.CHIRPS_container
    run_settings.update(which_trees=which_trees,
                        support_paths=0.001,
                        disc_path_bins=8,
                        weighting='kldiv')
else:
    # fail here instead of leaving `explanations` undefined for the cells that follow
    raise ValueError("Unknown model '" + str(model) + "': choose one of "
                     "'RandomForest', 'AdaBoost1', 'AdaBoost2', 'GBM'")

# create the container object for the forest path detail
explanations = container_class(f_walker.path_detail,
                               forest=rf,
                               sample_instances=tt.X_train_enc, # any representative sample can be used
                               sample_labels=sample_labels,
                               meta_data=meta_data)

print('Running ' + algorithm_name + ' on a batch of ' + str(len(labels)) + ' instances... (please wait)')
# start a timer
ce_start_time = timeit.default_timer()

# run the explanation algorithm on all the instance path details
explanations.run_explanations(target_classes=preds, # we're explaining the prediction, not the true label!
                              explanation_async=explanation_async,
                              random_state=random_state,
                              **run_settings)

ce_end_time = timeit.default_timer()
ce_elapsed_time = ce_end_time - ce_start_time
print(algorithm_name + ' time elapsed:', "{:0.4f}".format(ce_elapsed_time), 'seconds')
print(algorithm_name + ' with async = ' + str(explanation_async))

Running GBHIPS on a batch of 30 instances... (please wait)
as_chirps for batch_idx 0

start mining for batch_idx 0 with support = 0.05

  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


as_chirps for batch_idx 1
start mining for batch_idx 1 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


as_chirps for batch_idx 2
start mining for batch_idx 2 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


as_chirps for batch_idx 3
start mining for batch_idx 3 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


as_chirps for batch_idx 4
start mining for batch_idx 4 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


as_chirps for batch_idx 5
start mining for batch_idx 5 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


as_chirps for batch_idx 6
start mining for batch_idx 6 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


as_chirps for batch_idx 7
start mining for batch_idx 7 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


found 88 patterns from 761 trees for batch_idx 1
start score sort for batch_idx 1 (88) patterns
start merge rule for batch_idx 1 (88) patterns
found 98 patterns from 838 trees for batch_idx 2
start score sort for batch_idx 2 (98) patterns
start merge rule for batch_idx 2 (98) patterns
found 96 patterns from 990 trees for batch_idx 0
start score sort for batch_idx 0 (96) patterns
start merge rule for batch_idx 0 (96) patterns
found 97 patterns from 1055 trees for batch_idx 4
start score sort for batch_idx 4 (97) patterns
start merge rule for batch_idx 4 (97) patterns
found 96 patterns from 796 trees for batch_idx 6
start score sort for batch_idx 6 (96) patterns
found 93 patterns from 1010 trees for batch_idx 3
start score sort for batch_idx 3 (93) patterns
start merge rule for batch_idx 6 (96) patterns
start merge rule for batch_idx 3 (93) patterns
found 91 patterns from 841 trees for batch_idx 7found 96 patterns from 907 trees for batch_idx 5

start score sort for batch_idx 7 (91) patt

  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


[('dur', True, 16.60345), ('crhis_A31', True, 0.5)]
0.9003322259136213 0.3508890469416785 0.1335678339047698 0.1344174482723094
merge complete for batch_idx 1 (88) patterns
start get explainer for batch_idx 1
as_chirps for batch_idx 9
start mining for batch_idx 9 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


found 95 patterns from 909 trees for batch_idx 8
start score sort for batch_idx 8 (95) patterns
start merge rule for batch_idx 8 (95) patterns
[('chk_A13', False, 0.5)]
0.9464285714285714 0.07633357041251779 0.07111993080483947 0.046035686125636195
merge complete for batch_idx 5 (96) patterns
start get explainer for batch_idx 5
[('chk_A13', False, 0.5)]
0.9464285714285714 0.07633357041251779 0.06292643007197037 0.04179123415546814
merge complete for batch_idx 6 (96) patterns
start get explainer for batch_idx 6
as_chirps for batch_idx 10
start mining for batch_idx 10 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


as_chirps for batch_idx 11

  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin



start mining for batch_idx 11 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


[('emp_A72', False, 0.5), ('dur', False, 22.5), ('chk_A14', True, 0.5)]
0.7878787878787878 0.043688425267372634 0.1570871058834033 0.12110773360838115
merge complete for batch_idx 2 (98) patterns
start get explainer for batch_idx 2
[('chk_A14', False, 0.5), ('amt', True, 7660.23034)]
0.974169741697417 0.37068812233285914 0.10810247293527306 0.089715766012196
merge complete for batch_idx 3 (93) patterns
start get explainer for batch_idx 3
as_chirps for batch_idx 12
found 96 patterns from 931 trees for batch_idx 9
start score sort for batch_idx 9 (96) patterns
start mining for batch_idx 12 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


start merge rule for batch_idx 9 (96) patternsas_chirps for batch_idx 13

start mining for batch_idx 13 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


[('dur', False, 33.11538), ('chk_A14', True, 0.5), ('pps_A43', True, 0.5), ('debt_A103', True, 0.5)]
0.7058823529411765 0.06802328644433908 0.13941368386146744 0.09601869373778718
merge complete for batch_idx 0 (96) patterns
start get explainer for batch_idx 0
[('chk_A11', False, 0.5), ('age', True, 25.23626), ('dur', False, 9.5)]
0.6666666666666666 0.05563458195037142 0.15191603511178955 0.1305515204601579
merge complete for batch_idx 7 (91) patterns
start get explainer for batch_idx 7
as_chirps for batch_idx 14
start mining for batch_idx 14 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


found 93 patterns from 921 trees for batch_idx 10
start score sort for batch_idx 10 (93) patterns
found 90 patterns from 895 trees for batch_idx 11
start score sort for batch_idx 11 (90) patterns
start merge rule for batch_idx 10 (93) patterns
as_chirps for batch_idx 15
start mining for batch_idx 15 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


start merge rule for batch_idx 11 (90) patterns


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


found 95 patterns from 1142 trees for batch_idx 12
start score sort for batch_idx 12 (95) patterns
start merge rule for batch_idx 12 (95) patterns
found 96 patterns from 951 trees for batch_idx 13
start score sort for batch_idx 13 (96) patterns
start merge rule for batch_idx 13 (96) patterns
found 97 patterns from 928 trees for batch_idx 15
start score sort for batch_idx 15 (97) patterns
start merge rule for batch_idx 15 (97) patterns
found 94 patterns from 994 trees for batch_idx 14
start score sort for batch_idx 14 (94) patterns
start merge rule for batch_idx 14 (94) patterns
[('dur', True, 16.68939)]
0.8801261829652997 0.3472617354196301 0.12892739526831545 0.10414612880396862
merge complete for batch_idx 8 (95) patterns
start get explainer for batch_idx 8
as_chirps for batch_idx 16
start mining for batch_idx 16 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


[('chk_A11', True, 0.5)]
0.8574144486692015 0.40529871977240395 0.14892824439265662 0.10910564652150785
merge complete for batch_idx 9 (96) patterns
start get explainer for batch_idx 9
[('svng_A64', False, 0.5)]
0.9230769230769231 0.052302631578947364 0.06938853791729446 0.054675405091459885
merge complete for batch_idx 12 (95) patterns
start get explainer for batch_idx 12
as_chirps for batch_idx 17
start mining for batch_idx 17 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


as_chirps for batch_idx 18
start mining for batch_idx 18 with support = 0.05
found 97 patterns from 1015 trees for batch_idx 16

  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin



start score sort for batch_idx 16 (97) patterns


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


start merge rule for batch_idx 16 (97) patterns
[('amt', False, 11725.47059), ('emp_A74', True, 0.5)]
0.7857142857142857 0.01703809072230125 0.11712458000648203 0.073129386913114
merge complete for batch_idx 13 (96) patterns
start get explainer for batch_idx 13
as_chirps for batch_idx 19
start mining for batch_idx 19 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


[('svng_A64', False, 0.5)]
0.9230769230769231 0.052302631578947364 0.07025762177877748 0.05175484457960335
merge complete for batch_idx 15 (97) patterns
start get explainer for batch_idx 15
as_chirps for batch_idx 20
start mining for batch_idx 20 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


found 98 patterns from 901 trees for batch_idx 17
start score sort for batch_idx 17 (98) patterns
start merge rule for batch_idx 17 (98) patterns
found 97 patterns from 977 trees for batch_idx 18
start score sort for batch_idx 18 (97) patterns
start merge rule for batch_idx 18 (97) patterns
[('dur', True, 13.14394)]
0.8876404494382022 0.3109886201991465 0.16211936505188063 0.14945363773208165
merge complete for batch_idx 10 (93) patterns
start get explainer for batch_idx 10
as_chirps for batch_idx 21
start mining for batch_idx 21 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


found 93 patterns from 904 trees for batch_idx 19
start score sort for batch_idx 19 (93) patterns


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


start merge rule for batch_idx 19 (93) patterns
[('amt', False, 11262.48077)]
0.7222222222222222 0.02263315947526474 0.09946290576359922 0.06267639293705192
merge complete for batch_idx 18 (97) patterns
start get explainer for batch_idx 18
as_chirps for batch_idx 22
start mining for batch_idx 22 with support = 0.05
found 90 patterns from 953 trees for batch_idx 20
start score sort for batch_idx 20 (90) patterns


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


start merge rule for batch_idx 20 (90) patterns
[('dur', True, 11.8125)]
0.9477611940298507 [('chk_A11', False, 0.5), ('pers_A92', False, 0.5), ('crhis_A34', True, 0.5), ('emp_A75', True, 0.5), ('debt_A103', True, 0.5)]
0.7209302325581395 0.05724145197829408 0.12858675617794685 0.11089159499307770.18189900426742533
merge complete for batch_idx 11 (90) patterns
start get explainer for batch_idx 11
 0.09619538912249552 0.07705367055126326

  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans



merge complete for batch_idx 14 (94) patterns
start get explainer for batch_idx 14
as_chirps for batch_idx 23
start mining for batch_idx 23 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


as_chirps for batch_idx 24
start mining for batch_idx 24 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


found 96 patterns from 979 trees for batch_idx 23
start score sort for batch_idx 23 (96) patternsfound 96 patterns from 995 trees for batch_idx 21

found 97 patterns from 768 trees for batch_idx 24start score sort for batch_idx 21 (96) patterns

start score sort for batch_idx 24 (97) patterns
start merge rule for batch_idx 23 (96) patterns
start merge rule for batch_idx 24 (97) patterns
start merge rule for batch_idx 21 (96) patterns
found 93 patterns from 937 trees for batch_idx 22
start score sort for batch_idx 22 (93) patterns
start merge rule for batch_idx 22 (93) patterns
[('chk_A14', False, 0.5)]
0.9479166666666666 0.3737731152204836 0.12929116581594347 0.11233689571381035
merge complete for batch_idx 16 (97) patterns
start get explainer for batch_idx 16
as_chirps for batch_idx 25
start mining for batch_idx 25 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans
  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


[('amt', False, 7968.1413), ('emp_A74', True, 0.5), ('pps_A41', True, 0.5)]
0.8214285714285714 0.0367788841473052 0.11740726347195611 0.08552511876908173
merge complete for batch_idx 19 (93) patterns
start get explainer for batch_idx 19
as_chirps for batch_idx 26
start mining for batch_idx 26 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin


[('dur', False, 28.5), ('chk_A12', False, 0.5)]
0.6382978723404256 0.062233285917496446 0.1504022376129984 0.10557205257728658
merge complete for batch_idx 17 (98) patterns

  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans



start get explainer for batch_idx 17
[('chk_A14', False, 0.5)]
0.9479166666666666 0.3737731152204836 0.13153748848623814 0.10907187636037934
merge complete for batch_idx 20 (90) patterns
start get explainer for batch_idx 20
as_chirps for batch_idx 27
start mining for batch_idx 27 with support = 0.05


  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


found 90 patterns from 1031 trees for batch_idx 25


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin



start merge rule for batch_idx 25 (90) patternsstart score sort for batch_idx 25 (90) patterns
as_chirps for batch_idx 28
start mining for batch_idx 28 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


[('svng_A64', False, 0.5)]
0.9230769230769231 0.052302631578947364 0.07586760547388684 0.052069916762445226
merge complete for batch_idx 21 (96) patterns
start get explainer for batch_idx 21
as_chirps for batch_idx 29
start mining for batch_idx 29 with support = 0.05


  np.histogram(uppers, upper_bins)[0]).round(5) # can result in nans if no value falls into bin
  np.histogram(lowers, lower_bins)[0]).round(5) # can result in nans


[('prop_A121', False, 0.5)]
0.8820754716981132 0.2557788051209104 0.10278431484937683 0.1020067919262398
merge complete for batch_idx 24 (97) patterns
start get explainer for batch_idx 24
[('chk_A11', True, 0.5), ('amt', True, 10699.61111)]
0.8786692759295499 0.45252489331436696 0.12078505935826428 0.12347541259629799
merge complete for batch_idx 23 (96) patterns
start get explainer for batch_idx 23
found 96 patterns from 915 trees for batch_idx 26
start score sort for batch_idx 26 (96) patterns
found 97 patterns from 1027 trees for batch_idx 27
start score sort for batch_idx 27 (97) patterns
start merge rule for batch_idx 26 (96) patterns
start merge rule for batch_idx 27 (97) patterns
found 100 patterns from 972 trees for batch_idx 28
start score sort for batch_idx 28 (100) patterns
start merge rule for batch_idx 28 (100) patterns
[('crhis_A31', False, 0.5)]
0.525 0.05235235235235236 0.06512342916278079 0.051609275456392774
merge complete for batch_idx 26 (96) patterns
start get expl

# Viewing Explanations
For evaluation, see the alternative notebook. This session assumes that the system has been tested and tuned.

It is sufficient at this point to examine the explanation statistics over the training set, since we are explaining the influence of the training set on the model-building algorithm.

In [10]:
# Replay the explanations found above for the selected instances (labels.index)
# and time the whole call. The same start timestamp is also passed to the routine
# as eval_start_time.
print('demonstrating found explanations', end='\n\n')
results_start_time = timeit.default_timer()

# results file name: model name, fixed tag, then the random seed
save_results_file = ''.join([model, '_CHIRPS_rnst_demo', str(random_state)])

rt.demonstrate_explainers(
    explanations,
    tt,
    labels.index, # for full batch runs: tt.y_test.index,
    # model under explanation and its descriptive data
    forest=rf,
    model=model,
    meta_data=meta_data,
    # evaluation switches
    eval_start_time=results_start_time,
    eval_alt_labelings=True,
    eval_rule_complements=True,
    print_to_screen=True, # set True when running single instances
    # where (and whether) to persist results
    save_results_path=save_path,
    save_results_file=save_results_file,
    dataset_name='',
    save_CHIRPS=False,
)

# wall-clock duration of the demonstration step, reported to 4 decimal places
results_end_time = timeit.default_timer()
results_elapsed_time = results_end_time - results_start_time
print(
    'CHIRPS batch results eval time elapsed:',
    "{:0.4f}".format(results_elapsed_time),
    'seconds',
)
# this completes the CHIRPS runs

demonstrating found explanations

INSTANCE RESULTS
instance id: 131 with true class label: 1 (bad)

Model Results for Instance
target (predicted) class: 1 (bad)
target class prior (training data): 0.22857142857142856
forest vote share (unseen instance): 0.61875
forest vote margin (unseen instance): 0.23750000000000004
confidence weighted forest vote share (unseen instance): 0.7567882199505239
confidence weighted forest vote margin (unseen instance): 0.5135764399010467

rule: dur > 33.11538 AND chk_A14 False AND pps_A43 False AND debt_A103 False
rule cardinality: 4
Fraction of total points of rule: 0.13941368386146744
Fraction of total weight of rule: 0.09601869373778718

Results - Reference Sample + Pruned Rule
target class prior (unseen data): 0.22857142857142856
rule coverage (unseen data): 0.06990014265335236
rule xcoverage (unseen data): 0.06970128022759602
rule precision (unseen data): 0.7291666666666666
rule stability (unseen data): 0.7058823529411765
rule recall (unseen data): 0

INSTANCE RESULTS
instance id: 584 with true class label: 0 (good)

Model Results for Instance
target (predicted) class: 0 (good)
target class prior (training data): 0.7714285714285715
forest vote share (unseen instance): 0.63125
forest vote margin (unseen instance): 0.26249999999999996
confidence weighted forest vote share (unseen instance): 0.6962086734693407
confidence weighted forest vote margin (unseen instance): 0.3924173469386812

rule: chk_A14 True AND amt <= 7660.23034
rule cardinality: 2
Fraction of total points of rule: 0.10810247293527306
Fraction of total weight of rule: 0.089715766012196

Results - Reference Sample + Pruned Rule
target class prior (unseen data): 0.7714285714285715
rule coverage (unseen data): 0.38373751783166904
rule xcoverage (unseen data): 0.38264580369843526
rule precision (unseen data): 0.9813432835820896
rule stability (unseen data): 0.974169741697417
rule recall (unseen data): 0.48703703703703705
rule f1 score (unseen data): 0.650990099009901
rule NP

INSTANCE RESULTS
instance id: 195 with true class label: 1 (bad)

Model Results for Instance
target (predicted) class: 0 (good)
target class prior (training data): 0.7714285714285715
forest vote share (unseen instance): 0.575625
forest vote margin (unseen instance): 0.15125000000000005
confidence weighted forest vote share (unseen instance): 0.5488346163776414
confidence weighted forest vote margin (unseen instance): 0.09766923275528189

rule: dur <= 13.14394
rule cardinality: 1
Fraction of total points of rule: 0.16211936505188063
Fraction of total weight of rule: 0.14945363773208165

Results - Reference Sample + Pruned Rule
target class prior (unseen data): 0.7714285714285715
rule coverage (unseen data): 0.3780313837375178
rule xcoverage (unseen data): 0.37695590327169276
rule precision (unseen data): 0.8939393939393939
rule stability (unseen data): 0.8876404494382022
rule recall (unseen data): 0.43703703703703706
rule f1 score (unseen data): 0.5870646766169154
rule NPV (unseen data)

INSTANCE RESULTS
instance id: 327 with true class label: 0 (good)

Model Results for Instance
target (predicted) class: 0 (good)
target class prior (training data): 0.7714285714285715
forest vote share (unseen instance): 0.58
forest vote margin (unseen instance): 0.15999999999999998
confidence weighted forest vote share (unseen instance): 0.6432723737018423
confidence weighted forest vote margin (unseen instance): 0.2865447474036838

rule: svng_A64 True
rule cardinality: 1
Fraction of total points of rule: 0.07025762177877748
Fraction of total weight of rule: 0.05175484457960335

Results - Reference Sample + Pruned Rule
target class prior (unseen data): 0.7714285714285715
rule coverage (unseen data): 0.052781740370898715
rule xcoverage (unseen data): 0.05263157894736842
rule precision (unseen data): 0.9722222222222222
rule stability (unseen data): 0.9230769230769231
rule recall (unseen data): 0.06481481481481481
rule f1 score (unseen data): 0.12152777777777778
rule NPV (unseen data): 0

INSTANCE RESULTS
instance id: 299 with true class label: 0 (good)

Model Results for Instance
target (predicted) class: 0 (good)
target class prior (training data): 0.7714285714285715
forest vote share (unseen instance): 0.621875
forest vote margin (unseen instance): 0.24374999999999997
confidence weighted forest vote share (unseen instance): 0.6984271327115645
confidence weighted forest vote margin (unseen instance): 0.3968542654231285

rule: svng_A64 True
rule cardinality: 1
Fraction of total points of rule: 0.07586760547388684
Fraction of total weight of rule: 0.052069916762445226

Results - Reference Sample + Pruned Rule
target class prior (unseen data): 0.7714285714285715
rule coverage (unseen data): 0.052781740370898715
rule xcoverage (unseen data): 0.05263157894736842
rule precision (unseen data): 0.9722222222222222
rule stability (unseen data): 0.9230769230769231
rule recall (unseen data): 0.06481481481481481
rule f1 score (unseen data): 0.12152777777777778
rule NPV (unseen dat

INSTANCE RESULTS
instance id: 792 with true class label: 0 (good)

Model Results for Instance
target (predicted) class: 0 (good)
target class prior (training data): 0.7714285714285715
forest vote share (unseen instance): 0.644375
forest vote margin (unseen instance): 0.28875
confidence weighted forest vote share (unseen instance): 0.7399634287641512
confidence weighted forest vote margin (unseen instance): 0.4799268575283019

rule: dur <= 10.39796 AND crhis_A31 False
rule cardinality: 2
Fraction of total points of rule: 0.11425632276949316
Fraction of total weight of rule: 0.09396838426968764

Results - Reference Sample + Pruned Rule
target class prior (unseen data): 0.7714285714285715
rule coverage (unseen data): 0.17118402282453637
rule xcoverage (unseen data): 0.17069701280227595
rule precision (unseen data): 0.9831932773109243
rule stability (unseen data): 0.9672131147540983
rule recall (unseen data): 0.21666666666666667
rule f1 score (unseen data): 0.3550834597875569
rule NPV (uns