In [None]:
!unzip Marlena.zip

In [2]:
from XMTR import MTR
from GlobalLocalVariants import GlobalSurrogateTree, LocalSurrogateTree
from Marlena.algorithms.MARLENA.marlena.marlena.marlena import MARLENA
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
import warnings
warnings.simplefilter(action='ignore')
import time
import csv

def calc_al_error(instance, perc):
  """Return the allowed error for `instance`: (|instance| + 0.1) * perc.

  The constant 0.1 offset keeps the result away from zero when
  `instance` itself is 0 (as long as `perc` is non-zero).
  """
  magnitude = abs(instance)
  padded = magnitude + 0.1
  return padded * perc


def rule_cov(instance, feature_names, rule):
  """Return 1 if `instance` satisfies every condition of `rule`, else 0.

  Args:
    instance: sequence of feature values, positionally aligned with
      `feature_names`.
    feature_names: sequence giving the feature name for each position.
    rule: mapping from feature name to its condition(s). Two layouts are
      handled:
        * a list of lists (GS/LS style); each inner list starts with an
          operator, '>' or '<=', followed by a threshold;
        * a flat pair (MTR style) read as [min, max], both inclusive.

  Features that do not appear in `rule` are unconstrained.
  """
  for k, value in enumerate(instance):
    feature = feature_names[k]
    if feature not in rule:
      continue
    conditions = rule[feature]
    if isinstance(conditions[0], list):  # GS/LS: list of [operator, threshold]
      for cond in conditions:
        if cond[0] == '>' and value <= cond[1]:
          return 0
        if cond[0] == '<=' and value > cond[1]:
          return 0
    else:  # MTR: [min, max] range, bounds included
      if value > conditions[1] or value < conditions[0]:
        return 0
  return 1


def calcMae(actualPred, MTRpred, GSpred, LSpred, MARLENApreds):
  """Summarise MTR's error and how far the other explainers are from MTR.

  Args:
    actualPred: accepted for call compatibility; not used in the computation.
    MTRpred: iterable of 2-D arrays, one per explained instance. Column 0
      is read as MTR's prediction and column 1 as its error.
    GSpred, LSpred, MARLENApreds: predictions of the global surrogate,
      local surrogate and MARLENA, compared against MTR's predictions.

  Returns:
    A list [mtr_error, maeGS, maeLS, maeMAR]. The first entry is the mean
    of MTR's error column over instances (axis 0); the other three are
    sklearn mean absolute errors with multioutput="raw_values" between
    the respective predictions and MTR's predictions.
  """
  # Split MTR's output into its error column and its prediction column.
  MTRerrors = np.array([subarray[:, 1] for subarray in MTRpred])
  MTRpreds = np.array([subarray[:, 0] for subarray in MTRpred])

  # Mean of MTR's reported errors across instances.
  maeActual_with_error = np.mean(MTRerrors, axis=0)

  # MAE of each competing explainer measured against MTR's predictions.
  maeGS = mean_absolute_error(GSpred, MTRpreds, multioutput="raw_values")
  maeLS = mean_absolute_error(LSpred, MTRpreds, multioutput="raw_values")
  maeMAR = mean_absolute_error(MARLENApreds, MTRpreds, multioutput="raw_values")

  return [maeActual_with_error, maeGS, maeLS, maeMAR]

def doTest(X_train, X_test, y_train, y_test, f_n, t_n, percentage): 
  """Compare MTR, GS, LS and MARLENA explanations on one train/test split.

  Fits a RandomForestRegressor on the training data, wraps it in an MTR
  object, builds the global and local surrogate trees from the model's
  predictions on X_train, and then explains every row of X_test with all
  four methods while recording prediction, rule length, wall-clock time
  and rule coverage over X_test. A summary is printed at the end.

  Args:
    X_train, X_test: feature matrices; X_test is indexed row by row.
    y_train, y_test: targets matching X_train / X_test.
    f_n: feature names (one per column of X).
    t_n: target names.
    percentage: multiplier handed to calc_al_error to derive the allowed
      error passed to MTR for each test instance.

  Returns:
    A 6-tuple: (maeResults from calcMae, average rule length per method,
    dict of per-instance explanation times per method, average coverage
    per method, mean of the first value returned by
    MTR_obj.getAmountOfReduction(), and the second value returned by the
    last such call).

  NOTE(review): MTRrule, MARrule and estimators are only bound inside the
  loop, so an empty X_test would raise NameError in the summary section.
  """
  # train models
  print('   training MTR...') 
  RFmodel = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=10, max_features=None, n_jobs=-1, random_state=42)
  RFmodel.fit(X_train, y_train)
  MTR_obj = MTR(model=RFmodel, X_train = X_train, X_test=X_test, y_train=y_train, y_test=y_test, feature_names=f_n, target_names=t_n)
  # The surrogates below are trained on this model's predictions, not on y_train.
  model = MTR_obj.getModel()
  predictions = model.predict(X_train)

  print('   training GS...') 
  GS = GlobalSurrogateTree(X_train, predictions, f_n)
  print('   training LS...') 
  LS = LocalSurrogateTree(X_train, predictions, f_n, 20) # per the original note, the last argument is the number of neighbours and should be >= 10
  print('   training Marlena...')
  marlena = MARLENA(neigh_type='mixed', random_state=42)


  # Per-instance outputs collected over the test set.
  actualpreds = []
  MTRpreds = []
  GSpreds = []
  LSpreds = []
  MARLENApreds = []

  time_response = {'MTR': [], 'GS': [], 'LS': [], 'MARLENA': []}
  avgEstimators = []
  # Accumulators indexed as [MTR, GS, LS, MARLENA].
  coverage = np.array([0,0,0,0])
  # NOTE(review): integer array; if len_rule is ever non-integer the
  # in-place add below would be truncated -- confirm MARLENA returns an int.
  avgRuleLengths = np.array([0,0,0,0])
  for i in range(len(X_test)):
    print("   ",i+1,"/", len(X_test), "tests")
    # Number of test rows covered by this instance's rule, per method.
    inside_coverage = np.array([0,0,0,0])
    instance = X_test[i]
    # actual
    actualpreds.append(y_test[i])

    # MTR
    error = calc_al_error(y_test[i], percentage)
    # Only the explain() call is timed; bookkeeping happens afterwards.
    ts = time.time()
    MTRrule = MTR_obj.explain(instance, error) # explain instance
    te = time.time() - ts
    time_response['MTR'].append(te)
    estimators = MTR_obj.getAmountOfReduction() # get estimators
    # presumably [0] is the reduced estimator count and [1] the original
    # count (the summary prints them as "reduced / original") -- verify in MTR
    avgEstimators.append(estimators[0])
    decisionsAndErrors = MTR_obj.getDecisionsAndErros() # get preds/errors
    MTRpreds.append(decisionsAndErrors)
    feature_limits = MTR_obj.getFeatureLimits()
    avgRuleLengths[0] += len(feature_limits.keys())

    # GS 
    ts = time.time()
    GSrule, GSprediction = GS.rule(instance)
    te = time.time() - ts
    time_response['GS'].append(te)
    GSpreds.append(GSprediction)
    avgRuleLengths[1] += len(GSrule.keys())

    # LS
    ts = time.time()
    LSrule, LSprediction = LS.rule(instance)
    te = time.time() - ts
    time_response['LS'].append(te)
    LSpreds.append(LSprediction)
    avgRuleLengths[2] += len(LSrule.keys())

    # MARLENA
    # MARLENA is given pandas objects, built here outside the timed window.
    # NOTE(review): X2E does not depend on i and could be built once before
    # the loop, provided extract_explanation does not mutate it -- confirm.
    i2e = pd.Series(instance, index=f_n)
    X2E = pd.DataFrame(X_train, columns=f_n)
    ts = time.time()
    # returns rule, mask(MarlenaPrediction), list_split_conditions, len_rule, instance_imporant_feat, fidelity, hit, DT
    MARrule, MarlenaPrediction, list_split_conditions, len_rule, _, _, _, _ = marlena.extract_explanation(i2e, X2E, model, f_n, [],
                                              t_n, k=10, size=50, alpha=0.7)
    te = time.time() - ts
    time_response['MARLENA'].append(te)
    MARLENApreds.append(MarlenaPrediction)
    avgRuleLengths[3] += len_rule #len(list_split_conditions.keys())

    # calculate the coverage
    # Count how many test rows (including the explained one) satisfy each rule.
    for test_instance in X_test:
      MTRcov = rule_cov(test_instance, f_n, feature_limits)
      GScov = rule_cov(test_instance, f_n, GSrule) 
      LScov = rule_cov(test_instance, f_n, LSrule) 
      MARcov = rule_cov(test_instance, f_n, list_split_conditions) 
      inside_coverage[0] += MTRcov
      inside_coverage[1] += GScov
      inside_coverage[2] += LScov
      inside_coverage[3] += MARcov
    # Add this rule's covered fraction of X_test to the running total.
    coverage = np.add(coverage, inside_coverage/len(X_test))

  actualpreds = np.array(actualpreds)
  MTRpreds = np.array(MTRpreds)
  GSpreds = np.array(GSpreds)
  LSpreds = np.array(LSpreds)
  MARLENApreds = np.array(MARLENApreds)

  # Turn the summed fractions into the mean covered fraction per rule.
  coverage = coverage/len(X_test)

  avgRuleLengths = avgRuleLengths/len(X_test)
  maeResults = calcMae(actualpreds, MTRpreds, GSpreds, LSpreds, MARLENApreds)
  print("   =========")
  print("   Rules")
  # Only the rules of the last explained test instance are shown.
  print("   MTR: ", MTRrule)
  print("   MARLENA: ", MARrule)
  print("   =========")
  print('   MTR|', "mae: ", maeResults[0].round(5),"->", np.array(maeResults[0].round(3)).mean(), "| ruleL:", avgRuleLengths[0], "| TIME:", np.array(time_response['MTR']).mean(), "| Coverage:",coverage[0], "| avg estimators:", round(np.array(avgEstimators).mean(),3),"/",estimators[1])
  print('    GS|', "mae: ", maeResults[1].round(5),"->", np.array(maeResults[1].round(3)).mean(), "| ruleL:", avgRuleLengths[1], "| TIME:", np.array(time_response['GS']).mean(), "| Coverage:",coverage[1])
  print('    LS|', "mae: ", maeResults[2].round(5),"->", np.array(maeResults[2].round(3)).mean(), "| ruleL:", avgRuleLengths[2], "| TIME:", np.array(time_response['LS']).mean(), "| Coverage:",coverage[2])
  print('   MAR|', "mae: ", maeResults[3].round(5),"->", np.array(maeResults[3].round(3)).mean(), "| ruleL:", avgRuleLengths[3], "| TIME:", np.array(time_response['MARLENA']).mean(), "| Coverage:",coverage[3])
  return maeResults, avgRuleLengths, time_response, coverage, np.array(avgEstimators).mean(), estimators[1]

In [5]:
# RF2 dataset; per the original note its size is 9125 x (576 features + 8 targets).
# Rows containing any missing value are discarded on load.
RF2_df = pd.read_csv('RF2.csv').dropna(axis=0)

# Drop the first column and renumber the remaining rows from 0.
# (For a quick debugging run, slice the frame here, e.g. RF2_df.iloc[:200].)
RF2_df = RF2_df[RF2_df.columns[1:]].reset_index(drop=True)

# Only the first 200 columns are used as features; everything from column
# 576 onwards is a target. (f_n / t_n can be sliced further for small runs.)
RF2_cols = RF2_df.columns
f_n = RF2_cols[:200]
t_n = RF2_cols[576:]

# Feature and target matrices as numpy arrays.
X = RF2_df[f_n].to_numpy()
y = RF2_df[t_n].to_numpy()

# Keep just 10% of the rows for the experiment to shorten the run time;
# the 90% "test" part returned here is not used as a hold-out set.
X, X_test, y, y_test = train_test_split(X, y, test_size=.9, random_state=42)

# Rescale every target column to the range [1, 10].
scaler = MinMaxScaler(feature_range=(1,10))
y = scaler.fit_transform(y)

# Passed to doTest as the multiplier used by calc_al_error.
allowed_error = 1.8

In [6]:
# One list of per-fold results for every explainer, in doTest's result order.
METHOD_KEYS = ('MTR', 'GS', 'LS', 'MARLENA')
maeResults_all = {key: [] for key in METHOD_KEYS}
avgRuleLengths_all = {key: [] for key in METHOD_KEYS}
time_response_all = {key: [] for key in METHOD_KEYS}
coverage_all = {key: [] for key in METHOD_KEYS}
avgEstimators_all = {'reduced': [], 'original': []}

splits = 10
kf = KFold(n_splits=splits, shuffle=True, random_state=42)
counter = 1
for train_index, test_index in kf.split(X):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  print("iteration:", counter, "/", splits)
  counter += 1
  results = doTest(X_train, X_test, y_train, y_test, f_n, t_n, allowed_error)
  fold_mae, fold_rule_len, fold_times, fold_cov, fold_reduced, fold_original = results

  # mae, rule length and coverage come back as sequences ordered like
  # METHOD_KEYS; the timings come back as a dict keyed by method name.
  for position, key in enumerate(METHOD_KEYS):
    maeResults_all[key].append(fold_mae[position])
    avgRuleLengths_all[key].append(fold_rule_len[position])
    time_response_all[key].append(np.array(fold_times[key]).mean())
    coverage_all[key].append(fold_cov[position])

  # estimator counts reported for MTR in this fold
  avgEstimators_all['reduced'].append(fold_reduced)
  avgEstimators_all['original'].append(fold_original)


def _summary_fields(key):
  """Return the print arguments shared by every method's summary line."""
  mae_per_target = np.array(maeResults_all[key]).mean(axis=0)
  return (
    "mae: ", mae_per_target.round(5),
    "->", mae_per_target.mean().round(5),
    "| ruleL:", np.array(avgRuleLengths_all[key]).mean(),
    "| TIME:", np.array(time_response_all[key]).mean(),
    "| Coverage:", np.array(coverage_all[key]).mean(),
  )


# Averages over all folds; only MTR additionally reports estimator counts.
print('MTR|', *_summary_fields('MTR'), "| avg estimators:", round(np.array(avgEstimators_all['reduced']).mean(),3), "/", np.array(avgEstimators_all['original']).mean())
print(' GS|', *_summary_fields('GS'))
print(' LS|', *_summary_fields('LS'))
print('MAR|', *_summary_fields('MARLENA'))


iteration: 1 / 10
   training MTR...
   training GS...
   training LS...
   training Marlena...
    1 / 77 tests
    2 / 77 tests
    3 / 77 tests
    4 / 77 tests
    5 / 77 tests
    6 / 77 tests
    7 / 77 tests
    8 / 77 tests
    9 / 77 tests
    10 / 77 tests
    11 / 77 tests
    12 / 77 tests
    13 / 77 tests
    14 / 77 tests
    15 / 77 tests
    16 / 77 tests
    17 / 77 tests
    18 / 77 tests
    19 / 77 tests
    20 / 77 tests
    21 / 77 tests
    22 / 77 tests
    23 / 77 tests
    24 / 77 tests
    25 / 77 tests
    26 / 77 tests
    27 / 77 tests
    28 / 77 tests
    29 / 77 tests
    30 / 77 tests
    31 / 77 tests
    32 / 77 tests
    33 / 77 tests
    34 / 77 tests
    35 / 77 tests
    36 / 77 tests
    37 / 77 tests
    38 / 77 tests
    39 / 77 tests
    40 / 77 tests
    41 / 77 tests
    42 / 77 tests
    43 / 77 tests
    44 / 77 tests
    45 / 77 tests
    46 / 77 tests
    47 / 77 tests
    48 / 77 tests
    49 / 77 tests
    50 / 77 tests
    51 / 77 t