In [1]:
import sys
cpath = !pwd
sys.path.append(cpath[0][:-43]) 
sys.path.append(cpath[0][:-43]+'/algorithms')

In [2]:
from lionforests import LionForests
from algorithms.simpleSurrogate import GlobalSurrogateTree, LocalSurrogateTree
from algorithms.DefragTrees.defragTrees import DefragModel
from CHIRPS.structures import data_container
import CHIRPS.routines as rt
import CHIRPS.structures as strcts

from scipy import sparse

from datasets.dataset import Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd 
import numpy as np
np.seterr(invalid='ignore')
import warnings
warnings.filterwarnings("ignore")
import time

from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.metrics import f1_score, precision_score, mean_absolute_error

from anchor import anchor_tabular

In [3]:
abalone = Dataset()
X, y, feature_names, class_names = abalone.load_abalone(type='regression')

In [4]:
#one_h_percent = int(min(10*len(X)/100,100))
#print("Instances:",one_h_percent)

In [25]:
def measure(X_train, X_test, y_train, y_test, feature_names, class_names):
    parameters = [{
            'max_depth': [5],#10
            'max_features': [None],
            'bootstrap': [True],
            'min_samples_leaf' : [5],
            'n_estimators': [1000]
    }]
    lf = LionForests(None, False, None, feature_names, class_names)
    lf.fit(X_train, y_train, params=parameters)    
    train = lf.utilizer.transform(X_train)
    test = lf.utilizer.transform(X_test)

    predictions = lf.model.predict(train)
    test_predictions = lf.model.predict(test)

    def techniques(model, train, y_train, predictions, test, feature_names, class_names, lf, task):

        #BaselineTechnique ==============================================================================
        gt = GlobalSurrogateTree(train, predictions, feature_names, task)
        lt = LocalSurrogateTree(train, predictions, feature_names, task, 150)
        #print('    GT and LT Trained')
        #DefragTechnique ================================================================================
        Kmax = 10
        splitter = DefragModel.parseSLtrees(model) # parse sklearn tree ensembles into the array of (feature index, threshold)
        mdl = DefragModel(modeltype=task, maxitr=10, qitr=0, tol=1e-6, restart=2, verbose=0, njobs=7)
        mdl.fit(train, predictions, splitter, Kmax, fittype='FAB', featurename=feature_names)
        def def_cov(instances):
            ts = time.time()
            score, cover, coll = mdl.evaluate(instances[1:],lf.model.predict(instances[1:]))
            def_predictions = mdl.predict(instances[1:])
            rule = mdl.find_rule_length(instances[0])
            return rule, cover, def_predictions, time.time() - ts
        #print('    Defrag Trained')

        #LionForests
        def lf_rule(instance):
            temp = lf.explain(instance, instance_qe=1, method='R1')
            error = temp[-1]
            rule = {}
            for key,value in temp[5].items():
                rule[key] = [['<=',value[1]],['>',value[0]]]
            return rule, error
        #print('    LF Ready')
        
        return {'gs':gt.rule,'ls':lt.rule,'lf':lf_rule, 'df': def_cov}
        
        
    interpretation = techniques(lf.model, train, y_train, predictions, test, feature_names, class_names, lf, 'regression')
    def rule_cov(instance, feature_names, rule):
        covered = True
        for k in range(len(instance)):
            feature = feature_names[k]
            if feature in rule.keys():
                if len(rule[feature]) == 2:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
                    if instance[k] <= rule[feature][1][1]:#THIS <=
                        #covered = False
                        return 0
                elif rule[feature][0][0] == '>':
                    if instance[k] <= rule[feature][0][1]:
                        #covered = False
                        return 0
                else:
                    if instance[k] > rule[feature][0][1]: #<=
                        #covered = False
                        return 0
        
        return 1
    def rule_cov_LF(instance, feature_names, rule):
        covered = True
        for k in range(len(instance)):
            feature = feature_names[k]
            if feature in rule.keys():
                if len(rule[feature]) == 2:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
                    if instance[k] < rule[feature][1][1]:#THIS <=
                        return 0
                elif rule[feature][0][0] == '>':
                    if instance[k] <= rule[feature][0][1]:
                        return 0
                else:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
        return 1

    rule_generator = interpretation
    full_coverage = {'gs':0, 'ls':0, 'lf':0, 'df':0}
    rule_length = {'gs':0, 'ls':0, 'lf':0, 'df':0}
    f_mae = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    time_response = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    #full_coverage = {'lf':0}
    #rule_length = {'lf':0}
    #f_precision = {'lf':[]}
    #time_response = {'lf':[]}
    
    x_train_temp = train
    x_test_temp = test

    y_train_temp = predictions
    y_test_temp = test_predictions
    
    x_train_temp_lf = lf.utilizer.inverse_transform(x_train_temp)
    x_test_temp_lf = lf.utilizer.inverse_transform(x_test_temp)  
    
    ktime = time.time()
    for test_ind in range(len(test)):
        if test_ind % 100 ==0:
            print(round(test_ind/(len(test))*100,2),'in:', time.time()-ktime)
            ktime = time.time()

        for name, method in rule_generator.items():
            if name == 'df':
                #FIX RULE LENGTH!
                rule, cover, predictions, te = method(np.concatenate((x_test_temp[test_ind:test_ind+1],x_test_temp)))
                f_mae[name].append(mean_absolute_error(predictions,y_test_temp))
                full_coverage[name] = full_coverage[name] + cover
            elif name == 'lf':
                ts = time.time()
                rule, error = method(x_test_temp_lf[test_ind])
                te = time.time() - ts
                coverage = 0
                mae = []
                co = 0
                for i in x_test_temp_lf:
                    res = rule_cov_LF(i, feature_names, rule)
                    coverage = coverage + res
                    if res == 1:
                        if str(error) != 'nan' and str(error) != 'None':
                            mae.append(error)
                    co = co + 1
                if len(mae) >= 1:
                    #print(mae)
                    mae = np.array(mae)
                    f_mae[name].append(mae.mean())
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp_lf)
            else:
                ts = time.time()
                rule, prediction = method(x_test_temp[test_ind])
                te = time.time()-ts
                coverage = 0
                error = []
                co = 0 
                for i in x_test_temp:
                    res = rule_cov(i, feature_names, rule)
                    coverage = coverage + res
                    error.append([prediction, y_test_temp[co]])
                    co = co + 1
                if len(error) >= 1:
                    error = np.array(error)
                    f_mae[name].append(mean_absolute_error(error[:,:1],error[:,1:]))
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp)
            time_response[name].append(te)
            rule_length[name] = rule_length[name] + len(rule)
    return rule_generator, full_coverage, rule_length, f_mae, time_response

In [26]:
def print_current(total_results):
    full_coverage = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    rule_length = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    f_mae = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    f_time = {'gs':[], 'ls':[], 'lf':[], 'df':[]}

    for i in total_results:
        for name, method in i[0].items():
            full_coverage[name].append(i[1][name]/len(i[4][name]))
            rule_length[name].append(i[2][name]/len(i[4][name]))
            if len(i[3][name]) >= 1:
                f_mae[name].append(np.array(i[3][name]))                
            f_time[name].append(np.array(i[4][name]))
    for name, method in i[0].items():
        print(name,np.array(full_coverage[name]).mean(),',',
              np.array(rule_length[name]).mean(),',',
              np.array(f_mae[name]).mean(),',',
              np.array(f_time[name]).mean())

In [27]:
total_results = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=7)
results = measure(X_train, X_test, y_train, y_test, feature_names, class_names)
total_results.append(results)
total_results.append(results)
print_current(total_results)

0.0 in: 0.0
12.41 in: 457.6901707649231
24.81 in: 460.0012035369873
37.22 in: 455.0652394294739
49.63 in: 475.29728078842163
62.03 in: 464.4246275424957
74.44 in: 476.98477697372437
86.85 in: 457.24604177474976
99.26 in: 475.82029151916504
gs 0.0021704462191134695 , 4.336228287841191 , 2.2712562637024156 , 0.00029073400473772146
ls 0.0036743653368963764 , 3.9168734491315136 , 2.2666611517846578 , 2.6961218535752214
lf 0.0013299755555418605 , 4.925558312655087 , 0.7759244648299498 , 0.8281060655418756
df 0.99627791563274 , 10.405707196029777 , 0.9631773486581872 , 1.1215068693492312
