In [1]:
import sys
cpath = !pwd
sys.path.append(cpath[0][:-43]) 
sys.path.append(cpath[0][:-43]+'/algorithms')

In [2]:
from lionforests import LionForests
from algorithms.simpleSurrogate import GlobalSurrogateTree, LocalSurrogateTree
from algorithms.DefragTrees.defragTrees import DefragModel
from CHIRPS.structures import data_container
import CHIRPS.routines as rt
import CHIRPS.structures as strcts

from scipy import sparse

from datasets.dataset import Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd 
import numpy as np
np.seterr(invalid='ignore')
import warnings
warnings.filterwarnings("ignore")
import time

from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.metrics import f1_score, precision_score

from anchor import anchor_tabular

glass = Dataset()

In [3]:
X, y, feature_names, class_names = glass.load_glass()
y = [int(i) for i in y]
y = np.array(y)
#one_h_percent = int(min(10*len(X)/100,100))
#print("Instances:",one_h_percent)
new_fn = []
for i in feature_names:
    new_fn.append(i.replace(' ','_'))
feature_names = new_fn

In [54]:
def measure(X_train, X_test, y_train, y_test, feature_names, class_names, iterr):
    parameters = [{
        'max_depth': [10],
        'max_features': ['sqrt'],
        'bootstrap': [True],
        'min_samples_leaf' : [2],
        'n_estimators': [1000]
    }]
    lf = LionForests(None, False, None, feature_names, class_names)
    lf.fit(X_train, y_train, params=parameters)    
    train = lf.utilizer.transform(X_train)
    test = lf.utilizer.transform(X_test)

    predictions = lf.model.predict(train)
    test_predictions = lf.model.predict(test)
    
    def techniques(model, train, y_train, predictions, test, feature_names, class_names, lf, task):
        
        
        #BaselineTechnique ==============================================================================
        gt = GlobalSurrogateTree(train, predictions, feature_names, task)
        lt = LocalSurrogateTree(train, predictions, feature_names, task, 150)

        #DefragTechnique ================================================================================
        Kmax = 10
        splitter = DefragModel.parseSLtrees(model) # parse sklearn tree ensembles into the array of (feature index, threshold)
        mdl = DefragModel(modeltype=task, maxitr=100, qitr=0, tol=1e-6, restart=20, verbose=0, njobs=6)
        mdl.fit(train, predictions, splitter, Kmax, fittype='FAB', featurename=feature_names)
        def def_cov(instances):
            ts = time.time()
            score, cover, coll = mdl.evaluate(instances[1:],lf.model.predict(instances[1:]))
            def_predictions = mdl.predict(instances[1:])
            rule = mdl.find_rule_length(instances[0])
            te = time.time() - ts
            return rule, cover, def_predictions, te
        
        #Anchors =======================================================================================
        explainer = anchor_tabular.AnchorTabularExplainer(class_names, feature_names, train)
        def anchors_method(instance):
            exp = explainer.explain_instance(instance, lf.model.predict, threshold=0.95)
            anchors_dict = {}
            for i in exp.names():
                terms = i.split(' ')
                if len(terms) == 3:
                    anchors_dict[terms[0]] = [[terms[1],float(terms[2])]]
                else:
                    anchors_dict[terms[2]] = [[terms[3],float(terms[4])],[terms[1],float(terms[0])]]
            return anchors_dict, 0
        #"""
        #CHIRPS =======================================================================================
        project_dir = cpath[0][:-43]+'/algorithms/CHIRPS'
        temp_y = np.array([int(i+1) for i in y_train])
        temp_frame = pd.DataFrame(np.hstack((train,temp_y.reshape(len(temp_y),1))),columns=feature_names+['class'])
        temp_frame['class']=temp_frame['class'].astype(int)
        mydata = data_container(
                data = temp_frame, class_col = 'class', var_names = feature_names,
                project_dir = project_dir, save_dir = 'glass_'+str(iterr), random_state=123)
        meta_data = mydata.get_meta()
        f_walker = strcts.classification_trees_walker(forest=model, meta_data=meta_data)
        f_walker.forest_walk(instances = test, labels = model.predict(test), forest_walk_async = True)

        explanations = strcts.CHIRPS_container(f_walker.path_detail,
                                        forest=model,
                                        sample_instances=sparse.csr_matrix(train), # any representative sample can be used
                                        sample_labels=predictions,
                                        meta_data=meta_data)
        chts = time.time()
        explanations.run_explanations(target_classes=model.predict(test), # we're explaining the prediction, not the true label!
                                explanation_async=False,
                                random_state=123,
                                which_trees='majority',
                                alpha_paths=0.0,
                                support_paths=0.1,
                                score_func=1,
                                precis_threshold=0.99,
                                disc_path_bins=4,
                                merging_bootstraps=20,
                                pruning_bootstraps=20,
                                delta=0.2,
                                weighting='kldiv')
        chte = (time.time()-chts)/len(test)
        
        def chirps_method(idx):
            chirps_dict = {}
            for i in explanations.explainers[idx].pruned_rule:
                if i[1]:
                    chirps_dict[i[0]] = [['<=',float(i[2])]]
                else:
                    chirps_dict[i[0]] = [['>',float(i[2])]]
            return chirps_dict, 0, chte
        #"""
        #LionForests
        def lf_rule(instance):
            temp = lf.explain(instance, ar_algorithm='apriori', cl_algorithm='SC')[5]
            rule = {}
            for key,value in temp.items():
                rule[key] = [['<=',value[1]],['>',value[0]]]
            return rule

        return {'gs':gt.rule,'ls':lt.rule,'an':anchors_method,'lf':lf_rule, 'df': def_cov, 'ch':chirps_method}
        #return {'gs':gt.rule,'ls':lt.rule,'an':anchors_method,'lf':lf_rule, 'df': def_cov}
    interpretation = techniques(lf.model, train, y_train, predictions, test, feature_names, class_names, lf, 'classification')
    def rule_cov(instance, feature_names, rule):
        covered = True
        for k in range(len(instance)):
            feature = feature_names[k]
            if feature in rule.keys():
                if len(rule[feature]) == 2:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
                    if instance[k] <= rule[feature][1][1]:#THIS <=
                        #covered = False
                        return 0
                elif rule[feature][0][0] == '>':
                    if instance[k] <= rule[feature][0][1]:
                        #covered = False
                        return 0
                else:
                    if instance[k] > rule[feature][0][1]: #<=
                        #covered = False
                        return 0
        
        return 1
    def rule_cov_LF(instance, feature_names, rule):
        covered = True
        for k in range(len(instance)):
            feature = feature_names[k]
            if feature in rule.keys():
                if len(rule[feature]) == 2:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
                    if instance[k] < rule[feature][1][1]:#THIS <=
                        #covered = False
                        return 0
                elif rule[feature][0][0] == '>':
                    if instance[k] <= rule[feature][0][1]:
                        #covered = False
                        return 0
                else:
                    if instance[k] > rule[feature][0][1]: #<=
                        #covered = False
                        return 0
        
        return 1
    
    #loo = LeaveOneOut()
    #loo.get_n_splits(test)
    rule_generator = interpretation
    full_coverage = {'gs':0, 'ls':0, 'an':0, 'lf':0, 'df':0, 'ch': 0}
    rule_length = {'gs':0, 'ls':0, 'an':0, 'lf':0, 'df':0, 'ch': 0}
    f_precision = {'gs':[], 'ls':[], 'an':[], 'lf':[], 'df':[], 'ch':[]}
    time_response = {'gs':[], 'ls':[], 'an':[], 'lf':[], 'df':[], 'ch':[]}
    
    
    x_train_temp = train
    x_test_temp = test

    y_train_temp = predictions
    y_test_temp = test_predictions
    
    x_train_temp_lf = lf.utilizer.inverse_transform(x_train_temp)
    x_test_temp_lf = lf.utilizer.inverse_transform(x_test_temp)    
    
    for test_ind in range(len(test)):
        for name, method in rule_generator.items():
            if name == 'ch':
                rule, op, te = method(test_ind)
                coverage = 0
                precision = []
                co = 0
                for i in x_test_temp:
                    res = rule_cov(i, feature_names, rule)
                    coverage = coverage + res
                    if res == 1:
                        precision.append([y_test_temp[test_ind], y_test_temp[co]])
                    co = co + 1
                if len(precision) > 1:
                    precision = np.array(precision)
                    f_precision[name].append(precision_score(precision[:,:1],precision[:,1:],average='micro'))
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp)
            elif name == 'df':
                #FIX RULE LENGTH!
                rule, cover, predictions, te = method(np.concatenate((x_test_temp[test_ind:test_ind+1],x_test_temp)))
                f_precision[name].append(precision_score(predictions,y_test_temp,average='micro'))
                full_coverage[name] = full_coverage[name] + cover
            elif name == 'lf':
                ts = time.time()
                rule = method(x_test_temp_lf[test_ind])
                te = time.time() - ts
                coverage = 0
                precision = []
                co = 0
                for i in x_test_temp_lf:
                    res = rule_cov_LF(i, feature_names, rule)
                    coverage = coverage + res
                    if res == 1:
                        precision.append([y_test_temp[test_ind], y_test_temp[co]])
                    co = co + 1
                if len(precision) >= 1:
                    precision = np.array(precision)
                    f_precision[name].append(precision_score(precision[:,:1],precision[:,1:],average='micro'))
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp_lf)
            else:
                ts = time.time()
                rule, prediction = method(x_test_temp[test_ind])
                te = time.time() - ts
                coverage = 0
                precision = []
                co = 0 
                for i in x_test_temp:
                    if name == 'anchors':
                        res = rule_cov(i, feature_names, rule)
                    else:    
                        res = rule_cov(i, feature_names, rule)
                    coverage = coverage + res
                    if res == 1 and name=='anchors':
                        precision.append([y_test_temp[test_ind], y_test_temp[co]])
                    elif res == 1:
                        precision.append([prediction, y_test_temp[co]])
                    co = co + 1
                if len(precision) > 1:
                    precision = np.array(precision)
                    f_precision[name].append(precision_score(precision[:,:1],precision[:,1:],average='micro'))
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp)
            time_response[name].append(te)
            #print(rule)
            rule_length[name] = rule_length[name] + len(rule)
    return rule_generator, full_coverage, rule_length, f_precision, time_response

In [34]:
from sklearn.model_selection import StratifiedKFold
from collections import Counter
total_results = []
kf = StratifiedKFold(n_splits=10, random_state=777)
folds = 0
test_size = []

In [35]:
for train_index, test_index in kf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('# of Fold: ' + str(folds+1) + ', size of test: ' + str(len(X_test)))    
    print(Counter(y_train), Counter(y_test))
    results = measure(X_train, X_test, y_train, y_test, feature_names, class_names, folds)
    test_size.append(len(X_test))
    total_results.append(results)
    folds=folds+1

# of Fold: 1, size of test: 22
Counter({1: 68, 0: 63, 5: 26, 2: 15, 3: 12, 4: 8}) Counter({1: 8, 0: 7, 5: 3, 2: 2, 3: 1, 4: 1})
len self.path_detail
22
Working on CHIRPS for instance 0 of 22
as_chirps for batch_idx 0
start mining for batch_idx 0 with support = 0.1
reduced 1 patterns out of 64 by numeric redundancy
found 63 patterns from 596 trees for batch_idx 0
start score sort for batch_idx 0 (63) patterns
start merge rule for batch_idx 0 (63) patterns
[('magnesium', False, 0.63623), ('potassium', True, 0.05979), ('refractive_index', False, 0.26203), ('calcium', True, 0.40274)]
0.71875 0.1275669687017766 0.15358075859633258 0.1985782722917632
merge complete for batch_idx 0 (63) patterns
start get explainer for batch_idx 0
as_chirps for batch_idx 1
start mining for batch_idx 1 with support = 0.1
found 54 patterns from 526 trees for batch_idx 1
start score sort for batch_idx 1 (54) patterns
start merge rule for batch_idx 1 (54) patterns
[('aluminum', True, 0.41159), ('refractive_index'

[('aluminum', False, 0.35004), ('aluminum', True, 0.40968), ('refractive_index', True, 0.26292)]
0.42857142857142855 0.039754327191513124 0.21633069123077986 0.2986683576410907
merge complete for batch_idx 16 (47) patterns
start get explainer for batch_idx 16
as_chirps for batch_idx 17
start mining for batch_idx 17 with support = 0.1
found 6 patterns from 336 trees for batch_idx 17
start score sort for batch_idx 17 (6) patterns
start merge rule for batch_idx 17 (6) patterns
[('barium', False, 0.0965)]
0.7575757575757576 0.1340437125385966 0.36470921836926157 0.4015904572564612
merge complete for batch_idx 17 (6) patterns
start get explainer for batch_idx 17
as_chirps for batch_idx 18
start mining for batch_idx 18 with support = 0.1
found 24 patterns from 553 trees for batch_idx 18
start score sort for batch_idx 18 (24) patterns
start merge rule for batch_idx 18 (24) patterns
[('aluminum', False, 0.41247), ('barium', True, 0.11164), ('potassium', True, 0.02206)]
0.42857142857142855 0.03

[('magnesium', False, 0.72697), ('refractive_index', True, 0.26715), ('calcium', True, 0.27187)]
0.7631578947368421 0.1568171435690851 0.32756731363903313 0.3809938971229294
merge complete for batch_idx 11 (30) patterns
start get explainer for batch_idx 11
as_chirps for batch_idx 12
start mining for batch_idx 12 with support = 0.1
found 30 patterns from 920 trees for batch_idx 12
start score sort for batch_idx 12 (30) patterns
start merge rule for batch_idx 12 (30) patterns
[('aluminum', False, 0.3481), ('potassium', False, 0.0976), ('magnesium', False, 0.51414), ('refractive_index', True, 0.26079)]
0.7333333333333333 0.11860957680966817 0.26239937837125094 0.2916850950906679
merge complete for batch_idx 12 (30) patterns
start get explainer for batch_idx 12
as_chirps for batch_idx 13
start mining for batch_idx 13 with support = 0.1
found 24 patterns from 382 trees for batch_idx 13
start score sort for batch_idx 13 (24) patterns
start merge rule for batch_idx 13 (24) patterns
[('aluminu

[('aluminum', False, 0.35123), ('magnesium', False, 0.75143)]
0.7692307692307693 0.16175137985007002 0.17907377625395327 0.21214071214071215
merge complete for batch_idx 6 (44) patterns
start get explainer for batch_idx 6
as_chirps for batch_idx 7
start mining for batch_idx 7 with support = 0.1
found 35 patterns from 535 trees for batch_idx 7
start score sort for batch_idx 7 (35) patterns
start merge rule for batch_idx 7 (35) patterns
[('aluminum', False, 0.35058), ('magnesium', False, 0.73552), ('calcium', True, 0.27525)]
0.8055555555555556 0.14951808221435042 0.32280795061998036 0.3865948533812089
merge complete for batch_idx 7 (35) patterns
start get explainer for batch_idx 7
as_chirps for batch_idx 8
start mining for batch_idx 8 with support = 0.1
found 33 patterns from 956 trees for batch_idx 8
start score sort for batch_idx 8 (33) patterns
start merge rule for batch_idx 8 (33) patterns
[('aluminum', False, 0.35379), ('refractive_index', True, 0.27365), ('magnesium', False, 0.5665

found 49 patterns from 897 trees for batch_idx 1
start score sort for batch_idx 1 (49) patterns
start merge rule for batch_idx 1 (49) patterns
[('magnesium', False, 0.58357), ('refractive_index', False, 0.2652), ('refractive_index', True, 0.30511), ('aluminum', True, 0.36643)]
0.7948717948717948 0.1631969370662838 0.2708929291598093 0.29199260418148204
merge complete for batch_idx 1 (49) patterns
start get explainer for batch_idx 1
as_chirps for batch_idx 2
start mining for batch_idx 2 with support = 0.1
found 49 patterns from 941 trees for batch_idx 2
start score sort for batch_idx 2 (49) patterns
start merge rule for batch_idx 2 (49) patterns
[('aluminum', True, 0.346), ('refractive_index', False, 0.2648), ('refractive_index', True, 0.30566)]
0.7692307692307693 0.1618808327351041 0.21733152441518502 0.27468742026027054
merge complete for batch_idx 2 (49) patterns
start get explainer for batch_idx 2
as_chirps for batch_idx 3
start mining for batch_idx 3 with support = 0.1
found 57 pat

[('magnesium', True, 0.54064), ('sodium', True, 0.42074)]
0.4583333333333333 0.08695410755434632 0.2920266803447511 0.2758521913491836
merge complete for batch_idx 17 (24) patterns
start get explainer for batch_idx 17
as_chirps for batch_idx 18
start mining for batch_idx 18 with support = 0.1
found 23 patterns from 686 trees for batch_idx 18
start score sort for batch_idx 18 (23) patterns
start merge rule for batch_idx 18 (23) patterns
[('aluminum', False, 0.32992), ('potassium', True, 0.02259), ('barium', True, 0.09171), ('magnesium', True, 0.5577)]
0.46153846153846156 0.03498470613939261 0.42068490674445846 0.45578695800614555
merge complete for batch_idx 18 (23) patterns
start get explainer for batch_idx 18
as_chirps for batch_idx 19
start mining for batch_idx 19 with support = 0.1
found 8 patterns from 988 trees for batch_idx 19
start score sort for batch_idx 19 (8) patterns
start merge rule for batch_idx 19 (8) patterns
[('barium', False, 0.11274)]
0.7272727272727273 0.13324105557

found 24 patterns from 752 trees for batch_idx 12
start score sort for batch_idx 12 (24) patterns
start merge rule for batch_idx 12 (24) patterns
[('aluminum', True, 0.46733)]
0.39156626506024095 0.1870967741935484 0.013090962062164322 0.03250075233222992
merge complete for batch_idx 12 (24) patterns
start get explainer for batch_idx 12
as_chirps for batch_idx 13
start mining for batch_idx 13 with support = 0.1
found 17 patterns from 347 trees for batch_idx 13
start score sort for batch_idx 13 (17) patterns
start merge rule for batch_idx 13 (17) patterns
[('magnesium', True, 0.56488), ('sodium', False, 0.47949), ('barium', True, 0.1178)]
0.5 0.039783783783783784 0.647686097246557 0.6290877796901894
merge complete for batch_idx 13 (17) patterns
start get explainer for batch_idx 13
as_chirps for batch_idx 14
start mining for batch_idx 14 with support = 0.1
found 18 patterns from 406 trees for batch_idx 14
start score sort for batch_idx 14 (18) patterns
start merge rule for batch_idx 14 (

[('barium', True, 0.13143)]
0.39655172413793105 0.1693548387096774 0.015519036139183331 0.027842227378190258
merge complete for batch_idx 7 (21) patterns
start get explainer for batch_idx 7
as_chirps for batch_idx 8
start mining for batch_idx 8 with support = 0.1
found 22 patterns from 519 trees for batch_idx 8
start score sort for batch_idx 8 (22) patterns
start merge rule for batch_idx 8 (22) patterns
[('aluminum', True, 0.46497)]
0.38650306748466257 0.1899193548387097 0.02058704083728897 0.03665158371040724
merge complete for batch_idx 8 (22) patterns
start get explainer for batch_idx 8
as_chirps for batch_idx 9
start mining for batch_idx 9 with support = 0.1
found 23 patterns from 692 trees for batch_idx 9
start score sort for batch_idx 9 (23) patterns
start merge rule for batch_idx 9 (23) patterns
[('sodium', True, 0.50397)]
0.4110429447852761 0.21524193548387097 0.02421638744646022 0.04976619906479626
merge complete for batch_idx 9 (23) patterns
start get explainer for batch_idx 

[('aluminum', True, 0.33926), ('magnesium', False, 0.68195), ('refractive_index', False, 0.26003), ('iron', True, 0.36824)]
0.75 0.283359375 0.2602400358823273 0.29950869236583527
merge complete for batch_idx 4 (64) patterns
start get explainer for batch_idx 4
Working on CHIRPS for instance 5 of 21
as_chirps for batch_idx 5
start mining for batch_idx 5 with support = 0.1
reduced 1 patterns out of 50 by numeric redundancy
found 49 patterns from 640 trees for batch_idx 5
start score sort for batch_idx 5 (49) patterns
start merge rule for batch_idx 5 (49) patterns
[('magnesium', False, 0.76135), ('refractive_index', False, 0.41688)]
0.6956521739130435 0.0843359375 0.28439578601904536 0.2402883460152182
merge complete for batch_idx 5 (49) patterns
start get explainer for batch_idx 5
as_chirps for batch_idx 6
start mining for batch_idx 6 with support = 0.1
found 53 patterns from 828 trees for batch_idx 6
start score sort for batch_idx 6 (53) patterns
start merge rule for batch_idx 6 (53) pa

ValueError: setting an array element with a sequence.

In [59]:
full_coverage = {'gs':[], 'ls':[], 'an':[], 'lf':[], 'df':[], 'ch': []}
rule_length = {'gs':[], 'ls':[], 'an':[], 'lf':[], 'df':[], 'ch': []}
f_precision = {'gs':[], 'ls':[], 'an':[], 'lf':[], 'df':[], 'ch':[]}
f_time = {'gs':[], 'ls':[], 'an':[], 'lf':[], 'df':[], 'ch':[]}
k = 0
for i in total_results:
    for name, method in i[0].items():
        full_coverage[name].append(i[1][name]/test_size[k])
        rule_length[name].append(i[2][name]/test_size[k])
        f_precision[name].append(np.array(i[3][name]).mean())
        f_time[name].append(np.array(i[4][name]).mean())
    k = + 1
for name, method in total_results[0][0].items():
    print(name,np.array(full_coverage[name]).mean(),',',
          np.array(rule_length[name]).mean(),',',
          np.array(f_precision[name]).mean(),',',
          np.array(f_time[name]).mean())

gs 0.1314377541650269 , 4.127272727272727 , 0.782446842373313 , 0.00015431926363990422
ls 0.1463662600026236 , 3.713636363636364 , 0.8011129545835427 , 2.1776219859439747
an 0.0909058113603568 , 5.15 , 0.4306231361231361 , 39.86219572069668
lf 0.043952512134330324 , 8.149999999999999 , 1.0 , 2.8186567497631865
df 0.9272727272727274 , 2.5045454545454544 , 0.5196248196248197 , 0.23814570038521615
ch 0.2848100672775998 , 2.75974025974026 , 0.8417211100750307 , 0.500580770063587
