# TESTING

In [None]:
!unzip Marlena.zip

In [73]:
from XMTR import MTR
from GlobalLocalVariants import GlobalSurrogateTree, LocalSurrogateTree
from Marlena.algorithms.MARLENA.marlena.marlena.marlena import MARLENA
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
import warnings
warnings.simplefilter(action='ignore')
import time
import csv

def calc_al_error(instance, perc):
  """Derive the allowed error for an instance's target value(s).

  The target is shifted by 0.1 and then multiplied by `perc`.
  Works element-wise when `instance` is a NumPy array.
  """
  # the 0.1 shift keeps the allowed error away from zero for a zero target
  shifted = instance + 0.1
  return shifted * perc


def rule_cov(instance, feature_names, rule):
  """Return 1 if `instance` satisfies every condition of `rule`, otherwise 0.

  Args:
    instance: sequence of feature values, aligned with `feature_names`.
    feature_names: feature name for each position of `instance`.
    rule: mapping feature name -> conditions, in one of two layouts
      (chosen per feature from the type of the first element):
        * GS/LS: list of [operator, threshold] lists, operator '>' or '<='
        * MTR:   [min, max] pair describing an inclusive range
      Features absent from `rule` are unconstrained.

  Returns:
    1 when the instance is covered by the rule, 0 as soon as one condition fails.
  """
  for value, feature in zip(instance, feature_names):
    if feature not in rule:
      continue
    conditions = rule[feature]
    if isinstance(conditions[0], list): # for GS/LS
      for cond in conditions:
        operator, threshold = cond[0], cond[1]
        if operator == '>' and value <= threshold:
          return 0
        if operator == '<=' and value > threshold:
          return 0
    else: # if it comes from MTR: [min, max]
      lower, upper = conditions[0], conditions[1]
      if value > upper or value < lower:
        return 0
  return 1


def calcMae(actualPred, MTRpred, GSpred, LSpred, MARLENApreds):
  """Summarise MTR's local errors and how far each baseline is from MTR's predictions.

  Args:
    actualPred: ground-truth targets; currently unused, kept so existing
      callers do not need to change.
    MTRpred: iterable of per-instance 2-D arrays whose column 0 holds MTR's
      prediction per target and column 1 the matching local error.
    GSpred, LSpred, MARLENApreds: per-instance predictions of the global
      surrogate, local surrogate and MARLENA.

  Returns:
    [mean MTR local error per target,
     MAE(GS, MTR) per target,
     MAE(LS, MTR) per target,
     MAE(MARLENA, MTR) per target]
  """
  # split the (prediction, error) pairs into two instance-by-target matrices
  MTRerrors = np.array([subarray[:,1] for subarray in MTRpred])
  MTRpreds = np.array([subarray[:,0] for subarray in MTRpred])

  # MTR: mean of its own reported local error, per target
  maeActual_with_error = np.mean(MTRerrors, axis=0)

  # baselines: per-target MAE against MTR's predictions (local error not involved)
  maeGS = mean_absolute_error(GSpred, MTRpreds, multioutput="raw_values")
  maeLS = mean_absolute_error(LSpred, MTRpreds, multioutput="raw_values")
  maeMAR = mean_absolute_error(MARLENApreds, MTRpreds, multioutput="raw_values")

  return [maeActual_with_error, maeGS, maeLS, maeMAR]

def doTest(X_train, X_test, y_train, y_test, f_n, t_n, percentage): 
  """Benchmark MTR, global/local surrogate trees and MARLENA on one train/test split.

  For every row of X_test each method produces a rule and a prediction. The
  function records the explanation time, the rule length, the predictions, and
  the share of X_test rows that each rule covers (via rule_cov).

  Args:
    X_train, X_test: feature matrices, indexed by integer row position.
    y_train, y_test: target matrices aligned with the X arguments.
    f_n: feature names, in column order.
    t_n: target names.
    percentage: factor forwarded to calc_al_error to derive the allowed error
      handed to MTR_obj.explain for each test row.

  Returns:
    A 6-tuple, with per-method arrays ordered MTR, GS, LS, MARLENA:
      maeResults     -- list returned by calcMae
      avgRuleLengths -- summed rule lengths divided by len(X_test)
      time_response  -- dict: method name -> list of per-instance explain times (seconds)
      coverage       -- mean fraction of X_test rows covered by each method's rules
      mean of the per-instance estimators[0] values
      estimators[1] of the last explained instance
    estimators comes from MTR_obj.getAmountOfReduction(); presumably
    (reduced, original) estimator counts -- confirm against MTR.

  NOTE(review): with an empty X_test the loop never runs, so the divisions by
  len(X_test) and the final `estimators[1]` lookup cannot succeed.
  """
  # train models
  print('   training MTR...') 
  MTR_obj = MTR(model=None, X_train = X_train, X_test=X_test, y_train=y_train, y_test=y_test, feature_names=f_n, target_names=t_n)
  model = MTR_obj.getModel()
  # the surrogates below are fit on the model's predictions for X_train, not on y_train
  predictions = model.predict(X_train)

  print('   training GS...') 
  GS = GlobalSurrogateTree(X_train, predictions, f_n)
  print('   training LS...') 
  LS = LocalSurrogateTree(X_train, predictions, f_n, 20) # neighbours should be >= 10
  print('   training Marlena...')
  # only instantiated here; MARLENA's explanation work happens per instance below
  marlena = MARLENA(neigh_type='mixed', random_state=42)


  # per-instance collections, one entry per row of X_test
  actualpreds = []
  MTRpreds = []
  GSpreds = []
  LSpreds = []
  MARLENApreds = []

  time_response = {'MTR': [], 'GS': [], 'LS': [], 'MARLENA': []}
  avgEstimators = []
  # accumulator slots: 0=MTR, 1=GS, 2=LS, 3=MARLENA
  coverage = np.array([0,0,0,0])
  avgRuleLengths = np.array([0,0,0,0])
  for i in range(len(X_test)):
    print("   ",i+1,"/", len(X_test), "tests")
    # number of X_test rows covered by the rules generated for this instance
    inside_coverage = np.array([0,0,0,0])
    instance = X_test[i]
    # actual
    actualpreds.append(y_test[i])

    # MTR
    error = calc_al_error(y_test[i], percentage)
    ts = time.time()
    MTRrule = MTR_obj.explain(instance, error) # explain instance
    te = time.time() - ts
    time_response['MTR'].append(te)
    # the getters below are read right after explain(); presumably they report
    # on the most recent explanation -- confirm against MTR
    estimators = MTR_obj.getAmountOfReduction() # get estimators
    avgEstimators.append(estimators[0])
    decisionsAndErrors = MTR_obj.getDecisionsAndErros() # get preds/errors
    MTRpreds.append(decisionsAndErrors)
    feature_limits = MTR_obj.getFeatureLimits()
    avgRuleLengths[0] += len(feature_limits.keys())

    # GS 
    ts = time.time()
    GSrule, GSprediction = GS.rule(instance)
    te = time.time() - ts
    time_response['GS'].append(te)
    GSpreds.append(GSprediction)
    avgRuleLengths[1] += len(GSrule.keys())

    # LS
    ts = time.time()
    LSrule, LSprediction = LS.rule(instance)
    te = time.time() - ts
    time_response['LS'].append(te)
    LSpreds.append(LSprediction)
    avgRuleLengths[2] += len(LSrule.keys())

    # MARLENA
    # NOTE(review): X2E does not depend on i; it is rebuilt on every iteration
    # (outside the timed region)
    i2e = pd.Series(instance, index=f_n)
    X2E = pd.DataFrame(X_train, columns=f_n)
    ts = time.time()
    # returns rule, mask(MarlenaPrediction), list_split_conditions, len_rule, instance_imporant_feat, fidelity, hit, DT
    _, MarlenaPrediction, list_split_conditions, len_rule, _, _, _, _ = marlena.extract_explanation(i2e, X2E, model, f_n, [],
                                              t_n, k=10, size=50, alpha=0.7)
    te = time.time() - ts
    time_response['MARLENA'].append(te)
    MARLENApreds.append(MarlenaPrediction)
    avgRuleLengths[3] += len_rule #len(list_split_conditions.keys())

    # calculate the coverage
    # each rule is checked against every test row, including the explained instance itself
    for test_instance in X_test:
      MTRcov = rule_cov(test_instance, f_n, feature_limits)
      GScov = rule_cov(test_instance, f_n, GSrule) 
      LScov = rule_cov(test_instance, f_n, LSrule) 
      MARcov = rule_cov(test_instance, f_n, list_split_conditions) 
      inside_coverage[0] += MTRcov
      inside_coverage[1] += GScov
      inside_coverage[2] += LScov
      inside_coverage[3] += MARcov
    # accumulate this instance's covered fraction of X_test
    coverage = np.add(coverage, inside_coverage/len(X_test))

  actualpreds = np.array(actualpreds)
  MTRpreds = np.array(MTRpreds)
  GSpreds = np.array(GSpreds)
  LSpreds = np.array(LSpreds)
  MARLENApreds = np.array(MARLENApreds)

  # average the per-instance covered fractions over all explained instances
  coverage = coverage/len(X_test)

  avgRuleLengths = avgRuleLengths/len(X_test)
  maeResults = calcMae(actualpreds, MTRpreds, GSpreds, LSpreds, MARLENApreds)

  #print('MTR|', "mae: ", maeResults[0].round(3), "| ruleL:", avgRuleLengths[0], "| TIME:", np.array(time_response['MTR']).mean(), "| Coverage:",coverage[0], "| avg estimators:", round(np.array(avgEstimators).mean(),3),"/",estimators[1])
  #print(' GS|', "mae: ", maeResults[1].round(3), "| ruleL:", avgRuleLengths[1], "| TIME:", np.array(time_response['GS']).mean(), "| Coverage:",coverage[1])
  #print(' LS|', "mae: ", maeResults[2].round(3), "| ruleL:", avgRuleLengths[2], "| TIME:", np.array(time_response['LS']).mean(), "| Coverage:",coverage[2])
  #print('MAR|', "mae: ", maeResults[3].round(3), "| ruleL:", avgRuleLengths[3], "| TIME:", np.array(time_response['MARLENA']).mean(), "| Coverage:",coverage[3])
  return maeResults, avgRuleLengths, time_response, coverage, np.array(avgEstimators).mean(), estimators[1]

In [74]:
######
# K-fold evaluation driver.
# Expects X, y, f_n and t_n to be defined already (by one of the dataset
# cells); the allowed-error factor is the last argument passed to doTest.
######
method_names = ('MTR', 'GS', 'LS', 'MARLENA')

# one list per method, one entry appended per fold
maeResults_all = {name: [] for name in method_names}
avgRuleLengths_all = {name: [] for name in method_names}
time_response_all = {name: [] for name in method_names}
coverage_all = {name: [] for name in method_names}
avgEstimators_all = {'reduced': [], 'original': []}

kf = KFold(n_splits=15, shuffle=True, random_state=42)
counter = 1
for train_index, test_index in kf.split(X):
  X_train, y_train = X[train_index], y[train_index]
  X_test, y_test = X[test_index], y[test_index]
  print("iteration:", counter)
  counter += 1
  results = doTest(X_train, X_test, y_train, y_test, f_n, t_n, 0.1)
  fold_mae, fold_rule_len, fold_times, fold_cov, fold_reduced, fold_original = results

  # doTest orders its per-method arrays as MTR, GS, LS, MARLENA
  for slot, name in enumerate(method_names):
    maeResults_all[name].append(fold_mae[slot])                       # mae
    avgRuleLengths_all[name].append(fold_rule_len[slot])              # rule length
    time_response_all[name].append(np.array(fold_times[name]).mean()) # mean explain time
    coverage_all[name].append(fold_cov[slot])                         # coverage

  # estimators
  avgEstimators_all['reduced'].append(fold_reduced)
  avgEstimators_all['original'].append(fold_original)

# one summary line per method, averaged over all folds
for label, name in zip(('MTR|', ' GS|', ' LS|', 'MAR|'), method_names):
  summary = [
    label,
    "mae: ", np.array(maeResults_all[name]).mean(axis=0).round(3),
    "| ruleL:", np.array(avgRuleLengths_all[name]).mean(),
    "| TIME:", np.array(time_response_all[name]).mean(),
    "| Coverage:", np.array(coverage_all[name]).mean(),
  ]
  if name == 'MTR':
    # only MTR reports the estimator reduction
    summary += [
      "| avg estimators:", round(np.array(avgEstimators_all['reduced']).mean(),3),
      "/", np.array(avgEstimators_all['original']).mean(),
    ]
  print(*summary)


iteration: 1
   training MTR...
   training GS...
   training LS...
   training Marlena...
    1 / 7 tests
    2 / 7 tests
    3 / 7 tests
    4 / 7 tests
    5 / 7 tests
    6 / 7 tests
    7 / 7 tests
iteration: 2
   training MTR...
   training GS...
   training LS...
   training Marlena...
    1 / 7 tests
    2 / 7 tests
    3 / 7 tests
    4 / 7 tests
    5 / 7 tests
    6 / 7 tests
    7 / 7 tests
iteration: 3
   training MTR...
   training GS...
   training LS...
   training Marlena...
    1 / 7 tests
    2 / 7 tests
    3 / 7 tests
    4 / 7 tests
    5 / 7 tests
    6 / 7 tests
    7 / 7 tests
iteration: 4
   training MTR...
   training GS...
   training LS...
   training Marlena...
    1 / 7 tests
    2 / 7 tests
    3 / 7 tests
    4 / 7 tests
    5 / 7 tests
    6 / 7 tests
    7 / 7 tests
iteration: 5
   training MTR...
   training GS...
   training LS...
   training Marlena...
    1 / 7 tests
    2 / 7 tests
    3 / 7 tests
    4 / 7 tests
    5 / 7 tests
    6 / 7 tests
 

In [60]:
def getCsv(name, maeResults_all, t_n, decimals=5):
  """Write the per-target mean of every method's fold results to ``<name>.csv``.

  Args:
    name: output path without the '.csv' extension.
    maeResults_all: dict method name -> list of per-fold arrays (one value
      per target); the folds are averaged with ``mean(axis=0)``.
    t_n: target names; becomes the 'targets' column.
    decimals: number of decimal places kept for each mean (default 5).

  Side effects:
    Prints each method's rounded values and writes the CSV (no index column).
  """
  table = {'targets': list(t_n)}
  for key, fold_values in maeResults_all.items():
    # values are stored as strings so the CSV shows exactly the rounded figures
    column = [str(round(x, decimals)) for x in np.array(fold_values).mean(axis=0)]
    print(column)
    table[key] = column
  pd.DataFrame(table).to_csv(name+'.csv', index=False)

In [70]:
# Export the fold-averaged results to students.csv:
# a header row ('method' followed by the target names), then one row per method.
with open('students.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    columns = ['method', *t_n]
    writer.writerow(columns)
    for key, fold_values in maeResults_all.items():
        per_target_means = np.array(fold_values).mean(axis=0)
        row = [key] + [round(x, 5) for x in per_target_means]
        writer.writerow(row)

In [61]:
# Write the fold-averaged results to mae.csv; getCsv also prints each method's rounded values.
getCsv('mae', maeResults_all, t_n)

['0.26131', '0.24489', '0.19292']
['0.0934', '0.10087', '0.06185']
['0.09749', '0.10911', '0.0548']
['0.05124', '0.05787', '0.04744']


In [41]:
# Fold-averaged value per target, one line per method.
for method in ('MTR', 'GS', 'LS', 'MARLENA'):
  print(np.array(maeResults_all[method]).mean(axis=0))

[0.26131443 0.24488955 0.19292454]
[0.09340469 0.10086946 0.06185448]
[0.0974892  0.10911252 0.05479754]
[0.05123746 0.05787344 0.04743976]


# slump

In [None]:
# Load the slump dataset; element 0 of loadarff's result holds the records.
slump_data = arff.loadarff('slump.arff')
slump_df = pd.DataFrame(slump_data[0])

# The first 7 columns are the features, the remaining columns the targets.
column_names = slump_df.columns
f_n, t_n = column_names[:7], column_names[7:]

# Features as a NumPy array.
X = slump_df[f_n].to_numpy()

# Targets as a NumPy array, min-max scaled.
scaler = MinMaxScaler()
y = scaler.fit_transform(slump_df[t_n].to_numpy())

# andro

In [None]:
############
# andro data 49x30, 6 targets
############

# Load the ARFF file; element 0 of loadarff's result holds the records.
andro_data = arff.loadarff('andro.arff')
andro_df = pd.DataFrame(andro_data[0])

# The first 30 columns are the features, the remaining columns the targets.
column_names = andro_df.columns
f_n, t_n = column_names[:30], column_names[30:]

# Features as a NumPy array.
X = andro_df[f_n].to_numpy()

# Targets as a NumPy array, min-max scaled.
scaler = MinMaxScaler()
y = scaler.fit_transform(andro_df[t_n].to_numpy())

# kf = KFold(n_splits=2, shuffle=True, random_state=42)
# counter = 1
# for train_index, test_index in kf.split(X):
#   X_train, X_test = X[train_index], X[test_index]
#   y_train, y_test = y[train_index], y[test_index]
#   print("iteration:", counter)
#   counter += 1
#   doTest(X_train, X_test, y_train, y_test, f_n, t_n, 0.1)
#   break

iteration: 1
training MTR...
training GS...
training LS...
training Marlena...
1 / 25 tests
2 / 25 tests
3 / 25 tests
4 / 25 tests
5 / 25 tests
6 / 25 tests
7 / 25 tests
8 / 25 tests
9 / 25 tests
10 / 25 tests
11 / 25 tests
12 / 25 tests
13 / 25 tests
14 / 25 tests
15 / 25 tests
16 / 25 tests
17 / 25 tests
18 / 25 tests
19 / 25 tests
20 / 25 tests
21 / 25 tests
22 / 25 tests
23 / 25 tests
24 / 25 tests
25 / 25 tests
MTR| mae:  [0. 0. 0. 0. 0. 0.] | ruleL: 30.0 | TIME: 0.12080965042114258 | Coverage: 0.0192 | avg estimators: 100.0 / 100
 GS| mae:  [0.059 0.055 0.057 0.059 0.058 0.055] | ruleL: 3.88 | TIME: 0.0005713367462158203 | Coverage: 0.20960000000000004
 LS| mae:  [0.061 0.066 0.054 0.057 0.057 0.055] | ruleL: 3.8 | TIME: 2.6210045337677004 | Coverage: 0.20160000000000008
MAR| mae:  [0.072 0.075 0.102 0.102 0.061 0.057] | ruleL: 5.0 | TIME: 0.09715690612792968 | Coverage: 0.08800000000000002


# osales NaN values

In [None]:
# ############
# # osales 639x413, 12 targets
# ############

# Load the ARFF file and drop every row that contains a NaN.
osales_data = arff.loadarff('osales.arff')
osales_df = pd.DataFrame(osales_data[0])
osales_df=osales_df.dropna()

# NOTE(review): the header above mentions 413 columns and 12 targets, but the
# slicing below uses the first 30 columns as features and everything from
# column 30 on as targets -- confirm this split is intended.
column_names = osales_df.columns
f_n, t_n = column_names[:30], column_names[30:]

# Features and targets as NumPy arrays.
# NOTE(review): unlike the other dataset cells, the targets are not min-max
# scaled here and the allowed-error factor passed to doTest is 100.
X = osales_df[f_n].to_numpy()
y = osales_df[t_n].to_numpy()

# Run doTest on the first fold only (the loop breaks after one iteration);
# its return value is not kept.
kf = KFold(n_splits=2, shuffle=True, random_state=42)
counter = 1
for train_index, test_index in kf.split(X):
  X_train, y_train = X[train_index], y[train_index]
  X_test, y_test = X[test_index], y[test_index]
  print("iteration:", counter)
  counter += 1
  doTest(X_train, X_test, y_train, y_test, f_n, t_n, 100)
  break

# wq

In [None]:
############
# wq 1060x16, 14 targets
############

# Load the ARFF file; element 0 of loadarff's result holds the records.
wq_data = arff.loadarff('wq.arff')
wq_df = pd.DataFrame(wq_data[0])

# Only a subset of the columns is used: the first 5 columns as features and
# the columns from position 20 onwards as targets.
column_names = wq_df.columns
f_n, t_n = column_names[:5], column_names[20:]

# Features as a NumPy array.
X = wq_df[f_n].to_numpy()

# Targets as a NumPy array, min-max scaled.
scaler = MinMaxScaler()
y = scaler.fit_transform(wq_df[t_n].to_numpy())

# kf = KFold(n_splits=30, shuffle=True, random_state=42)
# counter = 1
# for train_index, test_index in kf.split(X):
#   X_train, X_test = X[train_index], X[test_index]
#   y_train, y_test = y[train_index], y[test_index]
#   print("iteration:", counter)
#   counter += 1
#   doTest(X_train, X_test, y_train, y_test, f_n, t_n, 0.1)
#   break

iteration: 1
training MTR...
training GS...
training LS...
training Marlena...
1 / 36 tests
2 / 36 tests
3 / 36 tests
4 / 36 tests
5 / 36 tests
6 / 36 tests
7 / 36 tests
8 / 36 tests
9 / 36 tests
10 / 36 tests
11 / 36 tests
12 / 36 tests
13 / 36 tests
14 / 36 tests
15 / 36 tests
16 / 36 tests
17 / 36 tests
18 / 36 tests
19 / 36 tests
20 / 36 tests
21 / 36 tests
22 / 36 tests
23 / 36 tests
24 / 36 tests
25 / 36 tests
26 / 36 tests
27 / 36 tests
28 / 36 tests
29 / 36 tests
30 / 36 tests
31 / 36 tests
32 / 36 tests
33 / 36 tests
34 / 36 tests
35 / 36 tests
36 / 36 tests
MTR| mae:  [0.004 0.007 0.004 0.005 0.008 0.007 0.006 0.004 0.007 0.008] | ruleL: 5.0 | TIME: 0.14439587460623848 | Coverage: 0.027006172839506182 | avg estimators: 98.972 / 100
 GS| mae:  [0.03  0.053 0.018 0.034 0.081 0.051 0.035 0.027 0.043 0.039] | ruleL: 3.7222222222222223 | TIME: 0.0005907615025838217 | Coverage: 0.09259259259259256
 LS| mae:  [0.019 0.038 0.016 0.027 0.053 0.034 0.021 0.011 0.019 0.026] | ruleL: 2.5

# facebook

In [None]:
############
# facebook 500x14, 4 targets
############
fb_df = pd.read_csv('dataset_Facebook.csv', sep=';')#, nrows=200)

# Fill NaN in 'like', 'share' and 'Paid' with 0.
# A single DataFrame-level fillna is used instead of
# `fb_df[col].fillna(0, inplace=True)`: that form modifies a temporary column
# object and leaves fb_df unchanged when pandas Copy-on-Write is active.
fb_df = fb_df.fillna({'like': 0, 'share': 0, 'Paid': 0})

# Drop the 'Type' column.
fb_df = fb_df.drop(columns=['Type'])


# get column names
column_names = fb_df.columns

# get data/target names: first 14 remaining columns are features, the rest targets
f_n = column_names[:14]
#t_n = ['comment', 'share']
t_n = column_names[14:]

X = fb_df[f_n]
y = fb_df[t_n]

# convert to numpy
X = X.to_numpy()
y = y.to_numpy()

# scale target values
scaler = MinMaxScaler()
y = scaler.fit_transform(y)

# kf = KFold(n_splits=25, shuffle=True, random_state=42)
# counter = 1
# for train_index, test_index in kf.split(X):
#   X_train, X_test = X[train_index], X[test_index]
#   y_train, y_test = y[train_index], y[test_index]
#   print("iteration:", counter)
#   counter += 1
#   doTest(X_train, X_test, y_train, y_test, f_n, t_n, 0.1)
#   break

iteration: 1
training MTR...
training GS...
training LS...
training Marlena...
1 / 20 tests
2 / 20 tests
3 / 20 tests
4 / 20 tests
5 / 20 tests
6 / 20 tests
7 / 20 tests
8 / 20 tests
9 / 20 tests
10 / 20 tests
11 / 20 tests
12 / 20 tests
13 / 20 tests
14 / 20 tests
15 / 20 tests
16 / 20 tests
17 / 20 tests
18 / 20 tests
19 / 20 tests
20 / 20 tests
MTR| mae:  [0.004 0.004 0.004 0.004] | ruleL: 14.0 | TIME: 0.3586753010749817 | Coverage: 0.05000000000000001 | avg estimators: 99.6 / 100
 GS| mae:  [0.005 0.007 0.008 0.007] | ruleL: 4.2 | TIME: 0.0005957722663879394 | Coverage: 0.14500000000000005
 LS| mae:  [0.005 0.008 0.006 0.007] | ruleL: 2.35 | TIME: 2.575191414356232 | Coverage: 0.32999999999999996
MAR| mae:  [0.004 0.005 0.005 0.005] | ruleL: 6.2 | TIME: 0.17716871500015258 | Coverage: 0.07500000000000001


# River flow

In [None]:
import pandas as pd
# 9125x576 + 8
# Read RF2, drop rows containing NaN, discard the first column
# (presumably an index/id column -- confirm), and keep the first 200 rows.
RF2_df = pd.read_csv('RF2.csv').dropna(axis=0)
RF2_df = RF2_df[RF2_df.columns[1:]].reset_index(drop=True).iloc[:200]

# Columns 0-575 are treated as features and the rest as targets; only the
# first 15 features and the first 5 targets are used.
RF2_cols = RF2_df.columns
f_n = RF2_cols[:15]
t_n = RF2_cols[576:581]

# Features as a NumPy array.
X = RF2_df[f_n].to_numpy()

# Targets as a NumPy array, min-max scaled.
scaler = MinMaxScaler()
y = scaler.fit_transform(RF2_df[t_n].to_numpy())

# kf = KFold(n_splits=10, shuffle=True, random_state=42)
# counter = 1
# for train_index, test_index in kf.split(X):
#   X_train, X_test = X[train_index], X[test_index]
#   y_train, y_test = y[train_index], y[test_index]
#   print("iteration:", counter)
#   counter += 1
#   doTest(X_train, X_test, y_train, y_test, f_n, t_n, 0.08)
#   break

iteration: 1
training MTR...
training GS...
training LS...
training Marlena...
1 / 20 tests
2 / 20 tests
3 / 20 tests
4 / 20 tests
5 / 20 tests
6 / 20 tests
7 / 20 tests
8 / 20 tests
9 / 20 tests
10 / 20 tests
11 / 20 tests
12 / 20 tests
13 / 20 tests
14 / 20 tests
15 / 20 tests
16 / 20 tests
17 / 20 tests
18 / 20 tests
19 / 20 tests
20 / 20 tests
MTR| mae:  [0.021 0.019 0.017 0.018 0.021] | ruleL: 15.0 | TIME: 0.13760486841201783 | Coverage: 0.05000000000000001 | avg estimators: 97.3 / 100
 GS| mae:  [0.02  0.04  0.027 0.016 0.031] | ruleL: 4.55 | TIME: 0.0005395770072937012 | Coverage: 0.09000000000000002
 LS| mae:  [0.014 0.04  0.016 0.012 0.016] | ruleL: 3.75 | TIME: 2.6330909371376037 | Coverage: 0.07250000000000004
MAR| mae:  [0.01  0.029 0.016 0.009 0.015] | ruleL: 5.8 | TIME: 0.123377525806427 | Coverage: 0.06500000000000002


In [None]:
# AVG OF K-FOLD
#https://github.com/tsoumakas/mulan/blob/master/data/multi-target/andro.arff
#https://github.com/tsoumakas/mulan/blob/master/data/multi-target/wq.arff
#https://github.com/tsoumakas/mulan/blob/master/data/multi-target/osales.arff

# test values

In [None]:
# Scratch cell: display the collected ground-truth targets.
# NOTE(review): in the visible notebook `actualpreds` is only assigned inside
# doTest, so this cell relies on a global left in the kernel -- confirm it
# still runs after Restart & Run All.
actualpreds

array([[24.  , 60.  , 45.82],
       [24.  , 60.  , 45.82],
       [24.  , 60.  , 45.82],
       [24.  , 60.  , 45.82]])

In [None]:
# Scratch cell: display the collected MTR (prediction, error) pairs per target.
# NOTE(review): in the visible notebook `MTRpreds` is only assigned inside
# doTest, so this cell relies on a global left in the kernel -- confirm it
# still runs after Restart & Run All.
MTRpreds

array([[[13.5575,  2.685 ],
        [37.44  ,  4.645 ],
        [45.8797,  3.5429]],

       [[13.5575,  2.685 ],
        [37.44  ,  4.645 ],
        [45.8797,  3.5429]],

       [[13.5575,  2.685 ],
        [37.44  ,  4.645 ],
        [45.8797,  3.5429]],

       [[13.5575,  2.685 ],
        [37.44  ,  4.645 ],
        [45.8797,  3.5429]]])

In [None]:
# Take column 1 (the error) of every per-instance MTR array and
# average it over the instances, giving one value per target.
new_array = np.array([pairs[:, 1] for pairs in MTRpreds])
column_averages = new_array.mean(axis=0)
column_averages

array([2.685 , 4.645 , 3.5429])

In [None]:
# First entry of each collection: actual targets, MTR pairs, GS and LS predictions.
for collected in (actualpreds, MTRpreds, GSpreds, LSpreds):
  print(collected[0])

[24.   60.   45.82]
[[13.5575  2.685 ]
 [37.44    4.645 ]
 [45.8797  3.5429]]
[16.68    42.815   38.35685]
[12.38875 33.85    52.72455]


In [None]:
# Scratch check: allowed error for test row 10 with a factor of 0.1.
# Relies on `y_test` left in the kernel by an earlier cell.
print(calc_al_error(y_test[10], 0.1))

[2.0500000000000003, 4.82, 4.93]


In [None]:
# experiments
# mae, len, cov, time
# k-fold
# allowed_error []
#https://github.com/intelligence-csd-auth-gr/LionLearn/blob/master/LionForests_Multi/experiments/C3.%20WaterQuality.ipynb

# OLD

using MTR

In [None]:
# Build MTR on the current split and explain a single instance.
# NOTE(review): X_train, X_test, y_train, y_test, f_n, t_n and `instance` are
# not defined in this cell; they must already exist in the kernel.
MTR_obj = MTR(model=None, X_train = X_train, X_test=X_test, y_train=y_train, y_test=y_test, feature_names=f_n, target_names=t_n)
rule = MTR_obj.explain(instance, 5) # the last argument is the allowed error
# read back the feature limits and the (prediction, error) pairs after explain()
featureLimits = MTR_obj.getFeatureLimits()
decisionsAndErrors = MTR_obj.getDecisionsAndErros()
print(decisionsAndErrors)
print(rule)

# this model will be used for L/G surrogate
model = MTR_obj.getModel()
predictions = model.predict(X_train)

allowed_error [5 5 5]
reduced_rules:  89 / 100
[[13.5575, 2.685], [37.44, 4.645], [45.8797, 3.5429]]
if 167.0<=Water<=168.5 & 904.0<=Coarse_Aggr<=917.5 & 0.0<=Slag<=0.05 & 801.5<=Fine_Aggr<=805.0 & 309.5<=Cemment<=310.0 & 142.5<=Fly_ash<=143.0 & 9.5<=SP<=10.0 then SLUMP_cm: 13.5575 +/- 2.685 error, FLOW_cm: 37.44 +/- 4.645 error, Compressive_Strength_Mpa: 45.8797 +/- 3.5429 error


using global surrogate

In [None]:
# Fit the global surrogate tree on the model's predictions for X_train.
GS = GlobalSurrogateTree(X_train, predictions, f_n)
# rule() returns a (rule, prediction) pair for the instance
r, GSp = GS.rule(instance)
# NOTE(review): rule() is called a second time here just to print its result
print(GS.rule(instance))

({'Water': [['<=', 182.25]], 'Slag': [['<=', 66.39999961853027]], 'Coarse_Aggr': [['<=', 1048.2999877929688], ['>', 904.0]], 'Fly_ash': [['<=', 210.9499969482422]]}, array([16.68   , 42.815  , 38.35685]))


using local surrogate

In [None]:
# Fit the local surrogate tree on the model's predictions for X_train.
LS = LocalSurrogateTree(X_train, predictions, f_n, 40) # neighbours should be >= 10
# rule() returns a (rule, prediction) pair for the instance
rl, LSp = LS.rule(instance)
# NOTE(review): rule() is called a second time here just to print its result
print(LS.rule(instance))

({'Fly_ash': [['<=', 210.7123150630344], ['>', 34.602104331589345]], 'Water': [['<=', 181.24161005543618]], 'Cemment': [['>', 298.04265509501226]]}, array([12.38875, 33.85   , 52.72455]))
