In [1]:
import sys
cpath = !pwd
sys.path.append(cpath[0][:-43]) 
sys.path.append(cpath[0][:-43]+'/algorithms')

In [2]:
from lionforests import LionForests
from algorithms.simpleSurrogate import GlobalSurrogateTree, LocalSurrogateTree
from algorithms.DefragTrees.defragTrees import DefragModel

from scipy import sparse

from datasets.dataset import Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd 
import numpy as np
np.seterr(invalid='ignore')
import warnings
warnings.filterwarnings("ignore")
import time

from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.metrics import f1_score, precision_score, mean_absolute_error

In [3]:
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
data = load_boston()
X = data['data']
y = data['target']
feature_names = list(data['feature_names'])
class_names=["House Price"]

In [4]:
#one_h_percent = int(min(10*len(X)/100,100))
#print("Instances:",one_h_percent)

In [26]:
def measure(X_train, X_test, y_train, y_test, feature_names, class_names):
    parameters = [{
        'max_depth': [10], #10
        'max_features': [0.75],
        'bootstrap': [True],
        'min_samples_leaf' : [1],
        'n_estimators': [1000]
    }]
    lf = LionForests(None, False, None, feature_names, class_names)
    lf.fit(X_train, y_train, params=parameters)    
    train = lf.utilizer.transform(X_train)
    test = lf.utilizer.transform(X_test)

    predictions = lf.model.predict(train)
    test_predictions = lf.model.predict(test)

    def techniques(model, train, y_train, predictions, test, feature_names, class_names, lf, task):

        #BaselineTechnique ==============================================================================
        gt = GlobalSurrogateTree(train, predictions, feature_names, task)
        lt = LocalSurrogateTree(train, predictions, feature_names, task, 150)
        #print('    GT and LT Trained')
        #DefragTechnique ================================================================================
        Kmax = 15
        splitter = DefragModel.parseSLtrees(model) # parse sklearn tree ensembles into the array of (feature index, threshold)
        mdl = DefragModel(modeltype=task, maxitr=15, qitr=0, tol=1e-6, restart=2, verbose=0, njobs=7)
        mdl.fit(train, predictions, splitter, Kmax, fittype='FAB', featurename=feature_names)
        def def_cov(instances):
            ts = time.time()
            score, cover, coll = mdl.evaluate(instances[1:],lf.model.predict(instances[1:]))
            def_predictions = mdl.predict(instances[1:])
            rule = mdl.find_rule_length(instances[0])
            return rule, cover, def_predictions, time.time()-ts
        #print('    Defrag Trained')

        #LionForests
        def lf_rule(instance):
            temp = lf.explain(instance, instance_qe=5.9, method='R1')
            error = temp[-1]
            rule = {}
            for key,value in temp[5].items():
                rule[key] = [['<=',value[1]],['>',value[0]]]
            return rule, error
        print('    LF Ready')

        return {'gs':gt.rule,'ls':lt.rule,'lf':lf_rule, 'df': def_cov}        
        
    interpretation = techniques(lf.model, train, y_train, predictions, test, feature_names, class_names, lf, 'regression')
    def rule_cov(instance, feature_names, rule):
        covered = True
        for k in range(len(instance)):
            feature = feature_names[k]
            if feature in rule.keys():
                if len(rule[feature]) == 2:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
                    if instance[k] <= rule[feature][1][1]:#THIS <=
                        return 0
                elif rule[feature][0][0] == '>':
                    if instance[k] <= rule[feature][0][1]:
                        return 0
                else:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
        return 1
    def rule_cov_LF(instance, feature_names, rule):
        covered = True
        for k in range(len(instance)):
            feature = feature_names[k]
            if feature in rule.keys():
                if len(rule[feature]) == 2:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
                    if instance[k] < rule[feature][1][1]:#THIS <=
                        return 0
                elif rule[feature][0][0] == '>':
                    if instance[k] <= rule[feature][0][1]:
                        return 0
                else:
                    if instance[k] > rule[feature][0][1]: #<=
                        return 0
        return 1


    rule_generator = interpretation
    full_coverage = {'gs':0, 'ls':0, 'lf':0, 'df':0}
    rule_length = {'gs':0, 'ls':0, 'lf':0, 'df':0}
    f_mae = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
    time_response = {'gs':[], 'ls':[], 'lf':[], 'df':[]}

    x_train_temp = train
    x_test_temp = test

    y_train_temp = predictions
    y_test_temp = test_predictions
    
    x_train_temp_lf = lf.utilizer.inverse_transform(x_train_temp)
    x_test_temp_lf = lf.utilizer.inverse_transform(x_test_temp)   

    for test_ind in range(len(test)):
        for name, method in rule_generator.items():
            if name == 'df':
                #FIX RULE LENGTH!
                rule, cover, predictions, te = method(np.concatenate((x_test_temp[test_ind:test_ind+1],x_test_temp)))
                f_mae[name].append(mean_absolute_error(predictions,y_test_temp))
                full_coverage[name] = full_coverage[name] + cover
            elif name == 'lf':
                ts = time.time()
                rule, error = method(x_test_temp_lf[test_ind])
                te = time.time() - ts
                coverage = 0
                mae = []
                co = 0
                for i in x_test_temp_lf:
                    res = rule_cov_LF(i, feature_names, rule)
                    coverage = coverage + res
                    if res == 1:
                        if str(error) != 'nan' and str(error) != 'None':
                            mae.append(error)
                    co = co + 1
                if len(mae) >= 1:
                    mae = np.array(mae)
                    f_mae[name].append(mae.mean())
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp_lf)
            else:
                ts = time.time()
                rule, prediction = method(x_test_temp[test_ind])
                te = time.time() - ts
                coverage = 0
                error = []
                co = 0 
                for i in x_test_temp:
                    res = rule_cov(i, feature_names, rule)
                    coverage = coverage + res
                    error.append([prediction, y_test_temp[co]])
                    co = co + 1
                if len(error) >= 1:
                    error = np.array(error)
                    f_mae[name].append(mean_absolute_error(error[:,:1],error[:,1:]))
                full_coverage[name] = full_coverage[name] + coverage/len(x_test_temp)
            time_response[name].append(te)
            rule_length[name] = rule_length[name] + len(rule)
    return rule_generator, full_coverage, rule_length, f_mae, time_response

In [27]:
from sklearn.model_selection import KFold
total_results = []
kf = KFold(n_splits=10, random_state=77)
folds = 0
test_size = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    test_size.append(len(X_test))
    print('# of Fold: ' + str(folds+1) + ', size of test: ' + str(len(X_test)))    
    results = measure(X_train, X_test, y_train, y_test, feature_names, class_names)
    folds=folds+1
    total_results.append(results)

# of Fold: 1, size of test: 51
    LF Ready
# of Fold: 2, size of test: 51
    LF Ready
# of Fold: 3, size of test: 51
    LF Ready
# of Fold: 4, size of test: 51
    LF Ready
# of Fold: 5, size of test: 51
    LF Ready
# of Fold: 6, size of test: 51
    LF Ready
# of Fold: 7, size of test: 50
    LF Ready
# of Fold: 8, size of test: 50
    LF Ready
# of Fold: 9, size of test: 50
    LF Ready
# of Fold: 10, size of test: 50
    LF Ready


In [28]:
full_coverage = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
rule_length = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
f_mae = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
f_time = {'gs':[], 'ls':[], 'lf':[], 'df':[]}
k = 0
for i in total_results:
    for name, method in i[0].items():
        full_coverage[name].append(i[1][name]/test_size[k])
        rule_length[name].append(i[2][name]/test_size[k])
        f_mae[name].append(np.array(i[3][name]).mean())
        f_time[name].append(np.array(i[4][name]).mean())
    k = + 1
for name, method in i[0].items():
    print(name,np.array(full_coverage[name]).mean(),',',
          np.array(rule_length[name]).mean(),',',
          np.array(f_mae[name]).mean(),',',
          np.array(f_time[name]).mean())

gs 0.09432218377547097 , 4.315686274509804 , 6.065150799921204 , 0.00019103739308375938
ls 0.0923429450211457 , 4.141176470588235 , 6.0572074419406965 , 2.743038597686618
lf 0.018361399461745496 , 10.545098039215686 , 5.643175438989966 , 0.8673101750542136
df 0.9882352941176471 , 0.5823529411764705 , 5.811773819718651 , 0.9181597727700777
