In [1]:
import sys
cpath = !pwd
sys.path.append('/usr/src/app/algorithms/')
sys.path.append('/usr/src/app/')

In [2]:
from lionforests import LionForests
from algorithms.simpleSurrogate import GlobalSurrogateTree, LocalSurrogateTree
from algorithms.DefragTrees.defragTrees import DefragModel
from CHIRPS.structures import data_container
import CHIRPS.routines as rt
import CHIRPS.structures as strcts
from anchor.anchor_tabular import AnchorTabularExplainer

from scipy import sparse

from datasets.dataset import Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd 
import numpy as np
np.seterr(invalid='ignore')
import warnings
warnings.filterwarnings("ignore")
import time

from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.metrics import f1_score, precision_score, mean_absolute_error


from sklearn.utils._testing import ignore_warnings 
from sklearn.exceptions import FitFailedWarning, ConvergenceWarning 

In [3]:
abalone = Dataset()
X, y, feature_names, class_names = abalone.load_abalone(type='regression')

In [4]:
#one_h_percent = int(min(10*len(X)/100,100))
#print("Instances:",one_h_percent)

In [11]:
def measure(X_train, X_test, y_train, y_test, feature_names, class_names, random_state=10):
    parameters = [{
            'max_depth': [5],#10
            'max_features': [None],
            'bootstrap': [True],
            'min_samples_leaf' : [5],
            'n_estimators': [1000]
    }]
    lf = LionForests(None, False, None, feature_names, class_names)
    lf.fit(X_train, y_train, params=parameters)    
    train = lf.utilizer.transform(X_train)
    test = lf.utilizer.transform(X_test)

    predictions = lf.model.predict(train)
    test_predictions = lf.model.predict(test)

    def techniques(model, train, y_train, predictions, test, feature_names, class_names, lf, task, random_state=10):

        #BaselineTechnique ==============================================================================
        print('Prepare GS')
        gt = GlobalSurrogateTree(train, predictions, feature_names, task, random_state)
        print('Prepare LS')
        lt = LocalSurrogateTree(train, predictions, feature_names, task, 150, random_state)
        #print('    GT and LT Trained')
        #DefragTechnique ================================================================================
        print('Prepare DF')
        

        Kmax = 10
        splitter = DefragModel.parseSLtrees(model) # parse sklearn tree ensembles into the array of (feature index, threshold)

        mdl = DefragModel(modeltype=task, maxitr=10, qitr=0, tol=1e-6, restart=2, verbose=0, njobs=1, seed=random_state)
        mdl.fit(train, predictions, splitter, Kmax, fittype='FAB', featurename=feature_names)
        def def_cov(instances):
            ts = time.time()
            score, cover, coll = mdl.evaluate(instances[1:],lf.model.predict(instances[1:]))
            def_predictions = mdl.predict(instances[1:])
            length, nodes = mdl.find_rule_length(instances[0])
            max_len = len(length)
            comp = {1:'>',0:'<='}
            rules = {}
            for f in feature_names:
                rules[f] = []
            counter = 0
            for rule, node in list(zip(mdl.rule_,nodes)):
                for conj in range(len(node)):
                    if node[conj] and counter<=max_len-1:
                        rules[feature_names[int(rule[conj][0]-1)]].append([comp[int(rule[conj][1])],rule[conj][2]])
                        counter += 1
            new_rules = {}
            for k, v in rules.items():
                if len(v) == 1:
                    new_rules[k] = v
                else:
                    mmin = None
                    mmax = None
                    for value in v:
                        if value[0]=='<=':
                            if mmin is None or mmin > value[1]:
                                mmin = value[1]
                        if value[0]=='>':
                            if mmax is None or mmax < value[1]:
                                mmax = value[1]
                    if mmin is not None:
                        new_rules[k] = [['<=', mmin]]
                    if mmax is not None:
                        new_rules[k] = [['>', mmax]]
            te = time.time()
            #print(mdl)
            return length, cover, def_predictions, te-ts, new_rules
        #print('    Defrag Trained')
        print('Prepare LF')
        #LionForests
        def lf_rule(instance):
            temp = lf.explain(instance, instance_qe=1, method='R1', instance_random_state=random_state)
            error = temp[-1]
            rule = {}
            for key,value in temp[5].items():
                rule[key] = [['<=',value[1]],['>',value[0]]]
            return rule, error
        #print('    LF Ready')
        
        return {'gs':gt.rule,'ls':lt.rule,'lf':lf_rule, 'df': def_cov}
        
    interpretation = techniques(lf.model, train, y_train, predictions, test, feature_names, class_names, lf, 'regression', random_state)
    def rule_cov(instance, feature_names, rule):
        covered = True
        for k in range(len(instance)):
            feature = feature_names[k]
            if feature in rule.keys():
                if len(rule[feature]) == 2:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
                    if instance[k] <= rule[feature][1][1]:#THIS <=
                        #covered = False
                        return 0
                elif rule[feature][0][0] == '>':
                    if instance[k] <= rule[feature][0][1]:
                        #covered = False
                        return 0
                else:
                    if instance[k] > rule[feature][0][1]: #<=
                        #covered = False
                        return 0
        
        return 1
    def rule_cov_LF(instance, feature_names, rule):
        covered = True
        for k in range(len(instance)):
            feature = feature_names[k]
            if feature in rule.keys():
                if len(rule[feature]) == 2:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
                    if instance[k] < rule[feature][1][1]:#THIS <=
                        return 0
                elif rule[feature][0][0] == '>':
                    if instance[k] <= rule[feature][0][1]:
                        return 0
                else:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
        return 1

    rule_generator = interpretation
    full_coverage = {'gs':0, 'ls':0, 'lf':0, 'df':0}
    rule_length = {'gs':0, 'ls':0, 'lf':0, 'df':0}
    f_mae = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    time_response = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    rules = {'gs':[], 'ls':[], 'lf':[], 'df':[]}

    #full_coverage = {'lf':0}
    #rule_length = {'lf':0}
    #f_precision = {'lf':[]}
    #time_response = {'lf':[]}
    
    x_train_temp = train
    x_test_temp = test

    y_train_temp = predictions
    y_test_temp = test_predictions
    
    x_train_temp_lf = lf.utilizer.inverse_transform(x_train_temp)
    x_test_temp_lf = lf.utilizer.inverse_transform(x_test_temp)  
    
    ktime = time.time()
    for test_ind in range(len(test)):
        clear_output()
        if test_ind % 100 ==0:
            print(round(test_ind/(len(test))*100,2),'in:', time.time()-ktime)
            ktime = time.time()

        for name, method in rule_generator.items():
            print(name)
            if name == 'df':
                #FIX RULE LENGTH!
                rule, cover, predictions, te, new_rules = method(np.concatenate((x_test_temp[test_ind:test_ind+1],x_test_temp)))
                f_mae[name].append(mean_absolute_error(predictions,y_test_temp))
                full_coverage[name] = full_coverage[name] + cover
                rules['df'].append(new_rules)
            elif name == 'lf':
                ts = time.time()
                rule, error = method(x_test_temp_lf[test_ind])
                te = time.time() - ts
                coverage = 0
                mae = []
                co = 0
                for i in x_test_temp_lf:
                    res = rule_cov_LF(i, feature_names, rule)
                    coverage = coverage + res
                    if res == 1:
                        if str(error) != 'nan' and str(error) != 'None':
                            mae.append(error)
                    co = co + 1
                if len(mae) >= 1:
                    #print(mae)
                    mae = np.array(mae)
                    f_mae[name].append(mae.mean())
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp_lf)
                rules['lf'].append(rule)
            else:
                ts = time.time()
                rule, prediction = method(x_test_temp[test_ind])
                te = time.time()-ts
                coverage = 0
                error = []
                co = 0 
                for i in x_test_temp:
                    res = rule_cov(i, feature_names, rule)
                    coverage = coverage + res
                    error.append([prediction, y_test_temp[co]])
                    co = co + 1
                if len(error) >= 1:
                    error = np.array(error)
                    f_mae[name].append(mean_absolute_error(error[:,:1],error[:,1:]))
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp)
                rules[name].append(rule)
            time_response[name].append(te)
            rule_length[name] = rule_length[name] + len(rule)
    return rule_generator, full_coverage, rule_length, f_mae, time_response, rules

In [12]:
from IPython.display import clear_output
total_results2 = []
test_size_2 = []
for rand in [7, 10, 77]: #7
    total_results = []
    test_size = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=7)
    results = measure(X_train, X_test, y_train, y_test, feature_names, class_names, random_state=rand)
    total_results.append(results)
    test_size.append(len(X_test))
    test_size_2.append(test_size)
    total_results2.append(total_results)
    clear_output()

In [16]:
from utilities.lionforests_utility import path_similarity
rule_variance = {'gs':[], 'ls':[],'lf':[], 'df':[]}


folds = 0
test_size = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=7)

min_max_feature_values = {}
for i in range(len(feature_names)):
        min_max_feature_values[feature_names[i]] = [min(X_train[:, i]), max(X_train[:, i])] 
for name in ['gs', 'ls', 'lf', 'df']:
    for k in range(len(total_results2[0][folds][-1][name])):
        r1 = total_results2[0][0][-1][name][k]
        r2 = total_results2[1][0][-1][name][k]
        r3 = total_results2[2][0][-1][name][k]
        #try:
        rule_variance[name].append((path_similarity(r1, r2, feature_names, min_max_feature_values)+
                                       path_similarity(r1, r3, feature_names, min_max_feature_values)+
                                       path_similarity(r2, r3, feature_names, min_max_feature_values))/3)

In [17]:
f_full_coverage = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
f_rule_length = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
f_f_precision = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
f_f_time = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
for b in range(3):
    full_coverage = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    rule_length = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    f_precision = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    f_time = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    k = 0
    for i in total_results2[b]:
        for name, method in i[0].items():
            full_coverage[name].append(i[1][name]/test_size_2[b][k])
            rule_length[name].append(i[2][name]/test_size_2[b][k])
            f_precision[name].append(np.array(i[3][name]).mean())
            f_time[name].append(np.array(i[4][name]).mean())
        k = + 1
    for name, method in i[0].items():
        f_full_coverage[name].append(np.array(full_coverage[name]).mean())
        f_rule_length[name].append(np.array(rule_length[name]).mean())
        f_f_precision[name].append(np.array(f_precision[name]).mean())
        f_f_time[name].append(np.array(f_time[name]).mean())
for name, method in i[0].items():
    print(name,  '| %5.4f  %5.3f | %5.4f %5.3f | %5.4f  %5.3f | %5.4f  %5.3f | %5.4f  %5.3f' 
          % (np.array(f_full_coverage[name]).mean(),np.array(f_full_coverage[name]).std(),
             np.array(f_rule_length[name]).mean(),np.array(f_rule_length[name]).std(),
             np.array(f_f_precision[name]).mean(),np.array(f_f_precision[name]).std(),
             np.array(f_f_time[name]).mean(),np.array(f_f_time[name]).std(),
             np.array(rule_variance[name]).mean(),np.array(rule_variance[name]).std()))

gs | 0.0019  0.000 | 4.7659 0.304 | 2.2717  0.000 | 0.0005  0.000 | 0.7240  0.136
ls | 0.0038  0.000 | 3.8768 0.031 | 2.2701  0.003 | 3.3311  0.035 | 0.7255  0.152
lf | 0.0013  0.000 | 4.9268 0.000 | 0.7759  0.000 | 1.2344  0.001 | 1.0000  0.000
df | 0.9942  0.005 | 11.1981 0.488 | 0.9533  0.017 | 1.3334  0.058 | 0.5784  0.077


In [18]:
import csv  

with open('abalone r.csv', 'w', encoding='UTF8') as f:
    writer = csv.writer(f)
    for name, method in i[0].items():
        writer.writerow([name, np.array(f_full_coverage[name]).mean(),np.array(f_full_coverage[name]).std(),
             np.array(f_rule_length[name]).mean(),np.array(f_rule_length[name]).std(),
             np.array(f_f_precision[name]).mean(),np.array(f_f_precision[name]).std(),
             np.array(f_f_time[name]).mean(),np.array(f_f_time[name]).std(),
             np.array(rule_variance[name]).mean(),np.array(rule_variance[name]).std()])