# Raw code

In [None]:
from sklearn.metrics import mean_absolute_error
import random
import statistics
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.ensemble import RandomForestRegressor
from mlxtend.preprocessing import TransactionEncoder
from sklearn.multioutput import MultiOutputRegressor
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor


class MTR:
  def __init__(self, model=None, X_train=None, X_test=None, y_train=None, y_test=None, feature_names=None, target_names=None):  
    """Init function
        Args:
            model: The trained RF model
            trainData: the data that the RF was trained on
            feature_names: The names of the features from our dataset
            target_names: The names of the targets from our dataset
            mae: the mean absolute error of the trained RF model
            targets: the number of target values
        Attributes:
            model: The classifier/regression model
            trees: The trees of an trained ensemble system
            feature_names: The names of the features
            min_max_feature_values: A helping dictionary for the path/feature reduction process
            ranked_features: The features ranked based on SHAP Values (Small-Medium Datasets) or Feature Importance (Huge Datasets)
    """
    if X_train is None or X_test is None or y_train is None or y_test is None:
      print("non specified data")
    if feature_names is None:
      print("non specified features names")
    if target_names is None:
      print("non specified target names")

    self.trainData = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test

    if model is not None:
      self.model = model
    else:
      # add gridsearch
      parameters = [{
         'criterion': ['squared_error'],#, 'absolute_error'],
         #'max_depth': [2],#, 2, 5],   ----> if it does not extend fully, it may have an issue
         'max_features': ['sqrt'],#, 'log2', 0.75, None],
         'min_samples_leaf' : [1, 2, 5, 10]
      }]
      RF = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
      clf = GridSearchCV(estimator=RF, param_grid=parameters, cv=10, n_jobs=-1, verbose=0, scoring='neg_mean_absolute_error')
      clf.fit(self.trainData, self.y_train)
      RF = clf.best_estimator_
      self.model = RF
    self.allowed_error = 0
    self.amountOfReduction = None
    self.trees = self.model.estimators_  # model is never None
    self.predicted = self.model.predict(self.X_test)
    self.feature_names = feature_names
    self.target_names = target_names
    if target_names is not None:
      self.targets = len(target_names)
    self.silly_local_importance = {} # It will fill in only if AR reduction will be applied!
    self.min_max_feature_values = {}
    self.ranked_features = {}
    self.feature_rule_limits = {} # for testing
    self.decisions_and_erros = [] # for testing
    

  def getModel(self):
    if self.model is not None:
      return self.model
    print("you should define the model first")

  def getAllowedError(self):
    return self.allowed_error

  # for testing
  def getFeatureLimits(self):
    return self.feature_rule_limits

  # for testing
  def getDecisionsAndErros(self):
    return self.decisions_and_erros

  def getAmountOfReduction(self):
    return self.amountOfReduction

  def fitError(self, allowed_error=None):
    # if error is int, create array of 1xself.targets with the same error
    # if error is list of len=1, the final errors on the rule will average less than the error
    # if error is list with len=self.targets, the final errors on the rule will be less than the respective error
    if allowed_error is not None:
      if type(allowed_error) == int:
        self.allowed_error = np.array([allowed_error] * self.targets)
      else:
         self.allowed_error = np.array(allowed_error)
    else:
      self.allowed_error = mean_absolute_error(self.predicted, self.y_test, multioutput="raw_values")

  def explain(self, instance, allowed_error=None):
    # fit the model
    self.fitError(allowed_error)

    rules, predictions = self.label_paths(instance) # ranges=rules

    # find min/max of all leaves per tree
    minmax = self.find_regression_trees_min_maxes(self.feature_names)

    # reduce the rules
    reduced_rules, reduced_probabilities, local_error = self._reduce_through_association_rules(rules, predictions)
    self.amountOfReduction = [len(reduced_rules), len(rules)]

    # compose the final rule
    return self.composeRule(instance, reduced_rules, local_error)



  def label_paths(self, instance):
    """label_paths function finds the ranges and predictions for each label
    Args:
        instance: The instance we want to find the paths
    Return:
        a list which contains a dictionary with features as keys and their min max ranges as values per tree
        and a list with the predictions of each tree for the examined instance
    """
    ranges = []
    predictions = []
    for tree in self.trees:
      n_tree_prediction = []
      for np in tree.predict([instance]):
        n_tree_prediction.append(np) 
      tree_prediction = n_tree_prediction
      path = tree.decision_path([instance])
      leq = {}  # leq: less equal ex: x <= 1
      b = {}  # b: bigger ex: x > 0.6
      local_range = {}
      for node in path.indices:
        feature_id = tree.tree_.feature[node]
        feature = self.feature_names[feature_id]
        threshold = tree.tree_.threshold[node]
        if threshold != -2.0:
          if instance[feature_id] <= threshold:
            leq.setdefault(feature, []).append(threshold)
          else:
            b.setdefault(feature, []).append(threshold)
      for k in leq:
        local_range.setdefault(k, []).append(['<=', min(leq[k])])  # !!
      for k in b:
        local_range.setdefault(k, []).append(['>', max(b[k])])  # !!
      ranges.append(local_range)
      predictions.append(list(tree_prediction[0]))
    return ranges, predictions


  def tree_to_code(self, tree, feature_names):
    tree_ = tree.tree_
    feature_name = [feature_names[i] for i in tree_.feature]
    leaf_nodes = []
    def recurse(node, depth):
      indent = "  " * depth
      if tree_.feature[node] != -2:
        name = feature_name[node]
        threshold = tree_.threshold[node]
        temp = []
        [temp.append(t) for t in recurse(tree_.children_left[node], depth + 1)]
        [temp.append(t) for t in recurse(tree_.children_right[node], depth + 1)]
        return temp
      else:
          return([node])

    leaf_nodes.append(recurse(0, 1))
    return(leaf_nodes[0])


  # We want this to find the min/max leaf values of the excluded trees
  def find_regression_trees_min_maxes(self, feature_names):
    """ finds min max of the leaves for each tree
    Args:
        trees: list of estimators, the examined trees
        num_of_targets: the number of the targets in the regression
    Return:
        A dict that contains a list of num_of_targets*2 values, first half are the min
        of each target, last half are maxes.
        example: [array([0.]), array([20.]), array([17.19]), | array([29.]), array([78.]), array([58.53])]
    """
    trees = self.trees
    min_max_leaf_prediction_per_tree = {}
    for i in range(len(trees)):
      tree = trees[i]
      min_max_leaf_prediction_per_tree[i] = [None for i in range(self.targets*2)]
      leaf_nodes = self.tree_to_code(tree, feature_names)
      for l in leaf_nodes:
        # here value returns the 3 targets per leaf and we want their minmax
        value = tree.tree_.value[l]
        for target in range(len(value)):
          if min_max_leaf_prediction_per_tree[i][target+len(value)] is None or value[target] > min_max_leaf_prediction_per_tree[i][target+len(value)]:
              min_max_leaf_prediction_per_tree[i][target+len(value)] = value[target]
          if  min_max_leaf_prediction_per_tree[i][target] is None or value[target] < min_max_leaf_prediction_per_tree[i][target]:
              min_max_leaf_prediction_per_tree[i][target] = value[target]
    self.min_max_leaf_prediction_per_tree = min_max_leaf_prediction_per_tree
    return min_max_leaf_prediction_per_tree



  # Algorithm 6
  #https://github.com/intelligence-csd-auth-gr/LionLearn/blob/1931ed50a2ca47f80243ce31e258d3f5fa9e701f/LionForests/lionforests.py#L1022
  def _reduce_through_distribution_multi(self, instance, rules, predictions, instance_qe, method, targets):
    """ path reduction
    Args:
        instance: used for the random seed
        rules: we get them from func label_paths
        predictions: predictions per tree, for the given target. we get them from func label_paths
        instance_qe: allowed error
        method: R2 for inner and R3 for outter
        target: a list with index of targets
    Return:
        the reduced rules and reduced predictions for the predifined targets
    """
    # the final results
    local_error_per_target = []
    reduced_rules_per_target = []
    reduced_predictions_per_target = []

    # loop for each selected target
    for target in targets:
      reduced_rules = rules
      reduced_predictions = predictions[:,target]
      real_prediction = np.array(predictions[:,target]).mean()
      min_errors = abs(instance_qe)
      min_s = 0

      for s in [.1, .2, .5, 1, 2, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100]:
        np.random.seed(42)
        normal_dis = np.random.normal(real_prediction, np.array(predictions[:,target]).std()/s, 100)
        c = 0
        l_reduced_rules = []
        l_reduced_predictions = []


        # for each tree basically
        for i in predictions[:,target]:
          # R2 is inner and R3 is outter
          if (method == 'R2' and not (i < normal_dis.min() or i > normal_dis.max())) or (method == 'R3' and (i < normal_dis.min() or i > normal_dis.max())):
              l_reduced_rules.append(rules[c])
              l_reduced_predictions.append(predictions[:,target][c])
          else:
              # the min
              dis_a = abs(i - self.min_max_leaf_prediction_per_tree[c][target][0])
              # the max
              dis_b = abs(i - self.min_max_leaf_prediction_per_tree[c][target+self.targets][0])

              if dis_a < dis_b:
                  l_reduced_predictions.append(self.min_max_leaf_prediction_per_tree[c][target+self.targets][0])
              else:
                  l_reduced_predictions.append(self.min_max_leaf_prediction_per_tree[c][target][0])
          c = c + 1

        l_error = abs(np.array(l_reduced_predictions).mean() - real_prediction)
        if l_error < abs(instance_qe) and len(l_reduced_rules) < len(reduced_rules):
            reduced_rules = l_reduced_rules
            reduced_predictions = l_reduced_predictions
            min_errors = l_error
            min_s = s

      local_error_per_target.append(l_error)
      reduced_rules_per_target.append(reduced_rules)
      reduced_predictions_per_target.append(reduced_predictions)
    return local_error_per_target, reduced_rules_per_target, reduced_predictions_per_target


  #https://github.com/intelligence-csd-auth-gr/LionLearn/blob/1931ed50a2ca47f80243ce31e258d3f5fa9e701f/LionForests/lionforests.py#L715
  def _reduce_through_association_rules(self, rules, probabilities):
    """Path reduction driven by association rules over the features of the paths.

    Phase 1 grows a feature set, one feature at a time, in the order the
    features appear in the antecedents of the mined association rules
    (sorted by ascending confidence). Trees whose path only uses selected
    features are kept; the others contribute their worst-case leaf value.
    The phase stops as soon as every target's error is within the allowed
    error. If that never happens, all trees are kept.
    Phase 2 then drops kept trees from the end, one entry at a time, while
    the error stays strictly below the allowed error.

    Args:
        rules: we get them from func label_paths
        probabilities: predictions per tree and target; we get them from
          func label_paths
    Return:
        reduced rules, reduced predictions (one row per tree, with worst-case
        values for dropped trees) and the per-target error between the
        original and the reduced predictions.
        Requires self.min_max_leaf_prediction_per_tree to be populated
        (see find_regression_trees_min_maxes). Also stores the feature
        weights in self.silly_local_importance.
    """
    # One "transaction" per tree: the features tested along its path
    itemsets = [list(rule) for rule in rules]

    # one-hot transform into boolean array and put in df
    encoder = TransactionEncoder()
    one_hot = encoder.fit(itemsets).transform(itemsets)
    df = pd.DataFrame(one_hot, columns=encoder.columns_)

    # run association rules and get frequent itemsets (fi) (ADD fpgrowth)
    temp_fi = apriori(df, min_support=0.1, use_colnames=True)
    if len(temp_fi.values) == 0:
      # Nothing to reduce with. Return the same 3-tuple shape as the normal
      # exit (callers unpack three values); unreduced means zero error.
      return rules, probabilities, np.zeros(self.targets)

    frequent_itemsets = (
      association_rules(temp_fi, metric="support", min_threshold=0.1)
      .sort_values(by="confidence", ascending=True)
    )

    # Collect features (in order of first appearance) and their importance
    antecedents = []
    antecedents_weights = {}
    seen_antecedents = set()
    wcounter = 0
    for antecedent in list(frequent_itemsets['antecedents']):
      key = tuple(antecedent)
      if key not in seen_antecedents:
        seen_antecedents.add(key)
        for item in list(antecedent):
          if item not in antecedents:
            antecedents.append(item)
      for item in list(antecedent):
        wcounter = wcounter + 1
        antecedents_weights[item] = antecedents_weights.get(item, 0) + 1 / wcounter
    self.silly_local_importance = antecedents_weights # dict {feature: importance}
    size_of_ar = len(antecedents)

    allowed = np.abs(self.allowed_error)
    # Start "infinitely wrong" so phase 1 (or the reset below) always runs,
    # even for a zero tolerance, which used to yield an empty rule set.
    local_error = np.full(np.shape(allowed), np.inf)
    reduced_rules = []
    reduced_probabilities = probabilities
    keep_pids = []
    k = 1

    # Phase 1: feature-based reduction
    while np.any(local_error > allowed) and k <= size_of_ar:
      selected_features = set(antecedents[:k])
      redundant_features = [
          f for f in self.feature_names if f not in selected_features]
      kept = [pid for pid, rule in enumerate(rules)
              if not any(f in rule for f in redundant_features)]
      reduced_rules, reduced_probabilities = self._apply_tree_selection(rules, probabilities, kept)
      # NOTE(review): every kept pid is recorded once per target, as in the
      # original code; phase 2 pops these entries one by one. Deduplicating
      # would change how far phase 2 can reduce, so it is preserved here.
      keep_pids = [pid for pid in kept for _ in range(self.targets)]
      local_error = mean_absolute_error(probabilities, reduced_probabilities, multioutput="raw_values")
      k += 1

    # reset: no feature subset met the tolerance, keep every tree
    if np.any(local_error > allowed) and k > size_of_ar:
      reduced_rules = list(rules)
      reduced_probabilities = [probabilities[pid] for pid in range(len(rules))]
      keep_pids = list(range(len(rules)))

    # Phase 2: drop trailing entries while the error stays within tolerance
    temp_pids = keep_pids.copy()
    last_pid = None
    while np.all(local_error < allowed) and len(temp_pids) > 2:
      last_pid = temp_pids.pop()
      reduced_rules, reduced_probabilities = self._apply_tree_selection(rules, probabilities, temp_pids)
      local_error = mean_absolute_error(probabilities, reduced_probabilities, multioutput="raw_values")

    # Undo the last removal (it is the one that may have broken the tolerance)
    if last_pid is not None:
      temp_pids.append(last_pid)
      reduced_rules, reduced_probabilities = self._apply_tree_selection(rules, probabilities, temp_pids)

    local_error = mean_absolute_error(probabilities, reduced_probabilities, multioutput="raw_values")
    return reduced_rules, reduced_probabilities, local_error

  def _worst_case_prediction(self, pid, target, prediction):
    """Return the leaf extreme of tree `pid` lying farthest from `prediction`."""
    leaf_min = self.min_max_leaf_prediction_per_tree[pid][target][0]
    leaf_max = self.min_max_leaf_prediction_per_tree[pid][target + self.targets][0]
    if abs(prediction - leaf_min) < abs(prediction - leaf_max):
      return leaf_max
    return leaf_min

  def _apply_tree_selection(self, rules, probabilities, kept_pids):
    """Return (kept rules, per-tree predictions) where dropped trees are
    replaced by their worst-case leaf value for every target."""
    kept = set(kept_pids)
    kept_rules = []
    reduced_probabilities = []
    for pid, rule in enumerate(rules):
      if pid in kept:
        kept_rules.append(rule)
        row = [probabilities[pid][target] for target in range(self.targets)]
      else:
        row = [self._worst_case_prediction(pid, target, probabilities[pid][target])
               for target in range(self.targets)]
      reduced_probabilities.append(row)
    return kept_rules, reduced_probabilities


  def _pre_feature_range_caluclation(self, rules, feature):
    ''' function that return the min and max values that a feature from the 
        reduced rules can get
      args:
        rules: the rules we got after the reduction
        features: a particular feature out of the reduced feature set
    '''
    for i in range(len(self.feature_names)):
      self.min_max_feature_values[self.feature_names[i]] = [min(self.trainData[:, i]), max(self.trainData[:, i])]

    mi = None
    ma = None
    for i in rules:
      if feature in i:
        if len(i[feature]) == 1:
          if i[feature][0][0] == "<=":
            if ma is None or ma >= i[feature][0][1]:
              ma = i[feature][0][1]
          else:
            if mi == None or mi <= i[feature][0][1]:
              mi = i[feature][0][1]
        else:
          if mi == None or mi <= i[feature][1][1]:
            mi = i[feature][1][1]
          if ma == None or ma >= i[feature][0][1]:
            ma = i[feature][0][1]
    if mi is None:
      mi = self.min_max_feature_values[feature][0]
    if ma is None:
      ma = self.min_max_feature_values[feature][1]
    return [mi, ma]


  # https://github.com/intelligence-csd-auth-gr/LionLearn/blob/1931ed50a2ca47f80243ce31e258d3f5fa9e701f/LionForests/lionforests.py#L522
  def composeRule(self, instance, reduced_rules, local_error):
    ''' function used to compose the final rule
    '''
    rule = "if "
    temp_f_mins = {}
    temp_f_maxs = {}
    self.feature_rule_limits = {}
    self.decisions_and_erros = []

    # get the features that appear on the reduced rules
    items = set()
    for r in reduced_rules:
      for feature in r:
        items.add(feature)
    local_feature_names = list(items)


    for feature in self.feature_names:
      if feature in local_feature_names:
        mi, ma = self._pre_feature_range_caluclation(reduced_rules, feature)
        temp_f_mins[feature] = mi
        temp_f_maxs[feature] = ma

    f_mins = []
    f_maxs = []
    for feature in self.feature_names:
      if feature in temp_f_mins:
        f_mins.append(temp_f_mins[feature])
      else:
        f_mins.append(0)
      if feature in temp_f_maxs:
        f_maxs.append(temp_f_maxs[feature])
      else:
        f_maxs.append(0)

    # create the decision for all target values
    decision = {}
    pred = self.model.predict([instance])[0]
    if local_error is not None:
      for tar in range(len(self.target_names)):
        decision[self.target_names[tar]] = self.target_names[tar] + ': ' + str(round(pred[tar], 4)) + " +/- " + str(round(local_error[tar], 4)) + " error"
        self.decisions_and_erros.append([pred[tar], local_error[tar]])
    else:
      for tar in range(len(self.target_names)):
        decision[self.target_names[tar]] = self.target_names[tar] + ': ' + str(round(pred[tar], 4))
        self.decisions_and_erros.append([pred[tar], 0])
    # we only use this for reference on the ranked features below, its the same for all targets
    target_name = self.target_names[0]


    d = {'Feature': self.feature_names,
          'Importance': self.model.feature_importances_}
    for ind in range(len(self.target_names)):
      self.ranked_features[self.target_names[ind]] = \
          pd.DataFrame(data=d).sort_values(
              by=['Importance'], ascending=False)['Feature'].values

    # create the rule containing mins and maxes of the reduced features
    for ranked_f in self.ranked_features[target_name]:
      f = self.feature_names.get_loc(ranked_f)
      if self.feature_names[f] in local_feature_names:
        mmi = np.array([f_mins, f_mins])[0][f]
        mma = np.array([f_maxs, f_maxs])[0][f]  # ena tab mesa
        self.feature_rule_limits[self.feature_names[f]] = [mmi, mma]
        rule = rule + str(round(mmi, 3)) + "<=" + self.feature_names[f] + "<=" + str(round(mma, 3)) + " & "

    rule = rule[:-3] + " then "
    for key in decision.keys():
      rule += decision[key] + ", "
    return rule[:-2]


# TESTING

In [None]:
!unzip Marlena.zip

In [5]:
from XMTR import MTR
from GlobalLocalVariants import GlobalSurrogateTree, LocalSurrogateTree
from Marlena.algorithms.MARLENA.marlena.marlena.marlena import MARLENA
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
import warnings
warnings.simplefilter(action='ignore')
import time

In [3]:
def calc_al_error(instance, perc):
  """Return the allowed error as a fraction `perc` of the value(s) shifted by 0.1.

  The 0.1 shift keeps the result from being zero when the value itself is 0.
  """
  shifted = instance + 0.1
  return shifted * perc


def rule_cov(instance, feature_names, rule):
  """Return 1 when `instance` satisfies every condition of `rule`, else 0.

  `rule` maps a feature name either to a list of [operator, threshold]
  pairs with operator '>' or '<=' (GS/LS style) or to a [min, max] pair
  (MTR style). Features without an entry in `rule` are unconstrained.
  """
  for position, value in enumerate(instance):
    name = feature_names[position]
    if name not in rule.keys():
      continue
    limits = rule[name]
    if type(limits[0]) is list:
      # GS/LS: list of [operator, threshold] conditions
      for condition in limits:
        operator, threshold = condition[0], condition[1]
        if operator == '>' and value <= threshold:
          return 0
        if operator == '<=' and value > threshold:
          return 0
    else:
      # MTR: [min, max] interval, both ends inclusive
      lower, upper = limits[0], limits[1]
      if value > upper:
        return 0
      if value < lower:
        return 0
  return 1


def calcMae(actualPred, MTRpred, GSpred, LSpred, MARLENApreds):
  """Summarise the MTR local errors and compare the other explainers to MTR.

  Each element of `MTRpred` is indexed as a 2-D array whose column 0 holds
  the predictions and column 1 the local errors. `actualPred` is not used.

  Returns a list with four per-target arrays:
    [mean MTR local error,
     MAE between GS and MTR predictions,
     MAE between LS and MTR predictions,
     MAE between MARLENA and MTR predictions]
  """
  error_rows = []
  prediction_rows = []
  for per_instance in MTRpred:
    prediction_rows.append(per_instance[:, 0])
    error_rows.append(per_instance[:, 1])
  MTRerrors = np.array(error_rows)
  MTRpreds = np.array(prediction_rows)

  # mean MTR local error per target
  results = [np.mean(MTRerrors, axis=0)]

  # MAE of each competitor against the MTR predictions (without local error)
  for competitor in (GSpred, LSpred, MARLENApreds):
    results.append(mean_absolute_error(competitor, MTRpreds, multioutput="raw_values"))
  return results

def doTest(X_train, X_test, y_train, y_test, f_n, t_n, percentage): 
  """Run one train/test comparison of MTR against GS, LS and MARLENA and print a summary.

  For every test instance each explainer produces a rule; the function
  records the response time, the rule length, the prediction and the
  fraction of test instances covered by the rule, then prints one summary
  line per explainer. Nothing is returned.

  Args:
      X_train, X_test: feature matrices (indexed by row)
      y_train, y_test: target matrices (indexed by row)
      f_n: feature names
      t_n: target names
      percentage: factor passed to calc_al_error to derive the allowed
        error of each instance from its true target values
  """
  # train models
  print('training MTR...') 
  MTR_obj = MTR(model=None, X_train = X_train, X_test=X_test, y_train=y_train, y_test=y_test, feature_names=f_n, target_names=t_n)
  model = MTR_obj.getModel()
  # The surrogate explainers are fitted on the model's predictions, not on y_train
  predictions = model.predict(X_train)

  print('training GS...') 
  GS = GlobalSurrogateTree(X_train, predictions, f_n)
  print('training LS...') 
  LS = LocalSurrogateTree(X_train, predictions, f_n, 20) # presumably the number of neighbours; should be >= 10
  print('training Marlena...')
  marlena = MARLENA(neigh_type='mixed', random_state=42)


  actualpreds = []
  MTRpreds = []
  GSpreds = []
  LSpreds = []
  MARLENApreds = []

  # Per-explainer accumulators; index order is always MTR, GS, LS, MARLENA
  time_response = {'MTR': [], 'GS': [], 'LS': [], 'MARLENA': []}
  avgEstimators = []
  coverage = np.array([0,0,0,0])
  avgRuleLengths = np.array([0,0,0,0])
  for i in range(len(X_test)):
    print(i+1,"/", len(X_test), "tests")
    inside_coverage = np.array([0,0,0,0])
    instance = X_test[i]
    # actual
    actualpreds.append(y_test[i])

    # MTR
    error = calc_al_error(y_test[i], percentage)
    ts = time.time()
    MTRrule = MTR_obj.explain(instance, error) # explain instance
    te = time.time() - ts
    time_response['MTR'].append(te)
    estimators = MTR_obj.getAmountOfReduction() # [kept rules, total rules]
    avgEstimators.append(estimators[0])
    decisionsAndErrors = MTR_obj.getDecisionsAndErros() # get preds/errors
    MTRpreds.append(decisionsAndErrors)
    feature_limits = MTR_obj.getFeatureLimits()
    avgRuleLengths[0] += len(feature_limits.keys())

    # GS
    ts = time.time()
    GSrule, GSprediction = GS.rule(instance)
    te = time.time() - ts
    time_response['GS'].append(te)
    GSpreds.append(GSprediction)
    avgRuleLengths[1] += len(GSrule.keys())

    # LS
    ts = time.time()
    LSrule, LSprediction = LS.rule(instance)
    te = time.time() - ts
    time_response['LS'].append(te)
    LSpreds.append(LSprediction)
    avgRuleLengths[2] += len(LSrule.keys())

    # MARLENA
    # The Series/DataFrame are built before the timer starts, so they are
    # not part of the measured response time
    i2e = pd.Series(instance, index=f_n)
    X2E = pd.DataFrame(X_train, columns=f_n)
    ts = time.time()
    # returns rule, mask(MarlenaPrediction), list_split_conditions, len_rule, instance_imporant_feat, fidelity, hit, DT
    _, MarlenaPrediction, list_split_conditions, len_rule, _, _, _, _ = marlena.extract_explanation(i2e, X2E, model, f_n, [],
                                              t_n, k=10, size=50, alpha=0.7)
    te = time.time() - ts
    time_response['MARLENA'].append(te)
    MARLENApreds.append(MarlenaPrediction)
    avgRuleLengths[3] += len_rule #len(list_split_conditions.keys())

    # calculate the coverage: how many test instances satisfy each rule
    for test_instance in X_test:
      MTRcov = rule_cov(test_instance, f_n, feature_limits)
      GScov = rule_cov(test_instance, f_n, GSrule) 
      LScov = rule_cov(test_instance, f_n, LSrule) 
      MARcov = rule_cov(test_instance, f_n, list_split_conditions) 
      inside_coverage[0] += MTRcov
      inside_coverage[1] += GScov
      inside_coverage[2] += LScov
      inside_coverage[3] += MARcov
    # fraction of the test set covered by this instance's rules
    coverage = np.add(coverage, inside_coverage/len(X_test))

  actualpreds = np.array(actualpreds)
  MTRpreds = np.array(MTRpreds)
  GSpreds = np.array(GSpreds)
  LSpreds = np.array(LSpreds)
  MARLENApreds = np.array(MARLENApreds)

  # average the per-instance coverage fractions over all explained instances
  coverage = coverage/len(X_test)

  avgRuleLengths = avgRuleLengths/len(X_test)
  maeResults = calcMae(actualpreds, MTRpreds, GSpreds, LSpreds, MARLENApreds)

  # `estimators` still holds the values of the last explained instance; its
  # second entry is the total number of rules
  print('MTR|', "mae: ", maeResults[0].round(3), "| ruleL:", avgRuleLengths[0], "| TIME:", np.array(time_response['MTR']).mean(), "| Coverage:",coverage[0], "| avg estimators:", round(np.array(avgEstimators).mean(),3),"/",estimators[1])
  print(' GS|', "mae: ", maeResults[1].round(3), "| ruleL:", avgRuleLengths[1], "| TIME:", np.array(time_response['GS']).mean(), "| Coverage:",coverage[1])
  print(' LS|', "mae: ", maeResults[2].round(3), "| ruleL:", avgRuleLengths[2], "| TIME:", np.array(time_response['LS']).mean(), "| Coverage:",coverage[2])
  print('MAR|', "mae: ", maeResults[3].round(3), "| ruleL:", avgRuleLengths[3], "| TIME:", np.array(time_response['MARLENA']).mean(), "| Coverage:",coverage[3])


# slump

In [6]:
############
# slump data: 103 rows x 10 columns, the last 3 columns are the targets
############

# load the ARFF file; loadarff returns (data, meta) and only the data is used
slump_data = arff.loadarff('slump.arff')
slump_df = pd.DataFrame(slump_data[0])

# get column names
column_names = slump_df.columns

# first 7 columns are the features, the remaining ones the targets
f_n = column_names[:7]
t_n = column_names[7:]

X = slump_df[f_n]
y = slump_df[t_n]

# convert to numpy
X = X.to_numpy()
y = y.to_numpy()

# 10-fold split, but only the first fold is evaluated (see the break below)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
counter = 1
for train_index, test_index in kf.split(X):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  print("iteration:", counter)
  counter += 1
  # allowed error = 0.1 * (target value + 0.1), see calc_al_error
  doTest(X_train, X_test, y_train, y_test, f_n, t_n, 0.1)
  break

iteration: 1
training MTR...
training GS...
training LS...
training Marlena...
1 / 11 tests
2 / 11 tests
3 / 11 tests
4 / 11 tests
5 / 11 tests
6 / 11 tests
7 / 11 tests
8 / 11 tests
9 / 11 tests
10 / 11 tests
11 / 11 tests
MTR| mae:  [1.774 3.235 1.909] | ruleL: 7.0 | TIME: 0.2058915441686457 | Coverage: 0.08264462809917357 | avg estimators: 92.273 / 100
 GS| mae:  [3.49  7.423 3.532] | ruleL: 4.0 | TIME: 0.0004927678541703658 | Coverage: 0.2231404958677686
 LS| mae:  [2.525 7.35  2.532] | ruleL: 2.4545454545454546 | TIME: 2.3376534418626265 | Coverage: 0.2809917355371901
MAR| mae:  [2.161 5.714 1.988] | ruleL: 4.636363636363637 | TIME: 0.12489815191789107 | Coverage: 0.09090909090909093


# andro

In [None]:
############
# andro data: 49 rows, 30 feature columns followed by 6 target columns
############

# load the ARFF file; loadarff returns (data, meta) and only the data is used
andro_data = arff.loadarff('andro.arff')
andro_df = pd.DataFrame(andro_data[0])

# get column names
column_names = andro_df.columns

# first 30 columns are the features, the remaining ones the targets
f_n = column_names[:30]
t_n = column_names[30:]

X = andro_df[f_n]
y = andro_df[t_n]

# convert to numpy
X = X.to_numpy()
y = y.to_numpy()

# 2-fold split, but only the first fold is evaluated (see the break below)
kf = KFold(n_splits=2, shuffle=True, random_state=42)
counter = 1
for train_index, test_index in kf.split(X):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  print("iteration:", counter)
  counter += 1
  # allowed error = 100 * (target value + 0.1), see calc_al_error
  doTest(X_train, X_test, y_train, y_test, f_n, t_n, 100)
  break

iteration: 1
training MTR...
training GS...
training LS...
training Marlena...
1 / 25 tests
2 / 25 tests
3 / 25 tests
4 / 25 tests
5 / 25 tests
6 / 25 tests
7 / 25 tests
8 / 25 tests
9 / 25 tests
10 / 25 tests
11 / 25 tests
12 / 25 tests
13 / 25 tests
14 / 25 tests
15 / 25 tests
16 / 25 tests
17 / 25 tests
18 / 25 tests
19 / 25 tests
20 / 25 tests
21 / 25 tests
22 / 25 tests
23 / 25 tests
24 / 25 tests
25 / 25 tests
MTR| mae:  [0. 0. 0. 0. 0. 0.] | ruleL: 28.88 | TIME: 0.1266466522216797 | Coverage: 0.0192 | avg estimators: 100.0 / 100
 GS| mae:  [0.26  0.052 0.195 0.136 1.874 0.133] | ruleL: 3.48 | TIME: 0.000635986328125 | Coverage: 0.17120000000000007
 LS| mae:  [0.282 0.054 0.223 0.166 2.245 0.156] | ruleL: 3.96 | TIME: 3.412086591720581 | Coverage: 0.16160000000000005
MAR| mae:  [0.3   0.064 0.253 0.185 2.725 0.177] | ruleL: 5.84 | TIME: 0.14638819694519042 | Coverage: 0.09920000000000004


# osales NaN values

In [None]:
# ############
# # osales 639x413, 12 targets
# ############

# # load data
# osales_data = arff.loadarff('osales.arff')
# osales_df = pd.DataFrame(osales_data[0])

# # get column names
# column_names = osales_df.columns

# # get data/target names
# f_n = column_names[:30]
# t_n = column_names[30:]

# X = osales_df[f_n]
# y = osales_df[t_n]

# # convert to numpy
# X = X.to_numpy()
# y = y.to_numpy()

# kf = KFold(n_splits=2, shuffle=True, random_state=42)
# counter = 1
# for train_index, test_index in kf.split(X):
#   X_train, X_test = X[train_index], X[test_index]
#   y_train, y_test = y[train_index], y[test_index]
#   print("iteration:", counter)
#   counter += 1
#   doTest(X_train, X_test, y_train, y_test, f_n, t_n, 100)
#   break

# wq

In [None]:
############
# wq 1060x16, 14 targets
############

# Parse the ARFF file; element 0 of the loadarff result holds the records.
wq_data = arff.loadarff('wq.arff')
wq_df = pd.DataFrame(wq_data[0])

# NOTE(review): the header mentions 16 features and 14 targets, but only the
# first 5 columns are taken as features and the columns from position 20
# onward as targets — confirm this sub-selection is intentional.
column_names = wq_df.columns
f_n, t_n = column_names[:5], column_names[20:]

# Feature and target matrices as numpy arrays.
X = wq_df[f_n].to_numpy()
y = wq_df[t_n].to_numpy()

# Only the first of the 25 shuffled folds is evaluated, so it is taken
# directly from the split generator.
kf = KFold(n_splits=25, shuffle=True, random_state=42)
counter = 1
train_index, test_index = next(kf.split(X))
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]
print("iteration:", counter)
counter += 1
doTest(X_train, X_test, y_train, y_test, f_n, t_n, 2)

iteration: 1
training MTR...
training GS...
training LS...
training Marlena...
1 / 43 tests
local_error [0.10701305 0.18176407 0.08481727 0.11113636 0.16815476 0.12390693
 0.11658658 0.09795887 0.1397619  0.19051948]
[[0.7220494301450185, 0.10701305067481538], [1.755568076187659, 0.18176406926406927], [0.4749187347310414, 0.08481726606726607], [1.3075536992677086, 0.11113636363636363], [2.399164564580586, 0.1681547619047619], [0.9356882673569981, 0.12390692640692641], [0.9865672783099251, 0.1165865800865801], [0.20846280400885667, 0.09795887445887445], [0.7477743061032532, 0.13976190476190475], [1.0528673302961225, 0.19051948051948053]]
2 / 43 tests
local_error [0.09019444 0.15888889 0.07407051 0.11116667 0.18       0.12458333
 0.08890152 0.06938889 0.10954365 0.16693182]
[[0.7232313691210751, 0.09019444444444444], [1.490709826937768, 0.15888888888888889], [0.18308638093932214, 0.07407051282051282], [0.1939048631107454, 0.11116666666666668], [0.7976246857717447, 0.18], [1.3349654406051

# facebook

In [None]:
############
# facebook 500x18, 4 targets
############
# NOTE(review): the header above mentions 500 rows and 4 targets, but only the
# first 200 rows are read and two target columns are selected below — confirm.
fb_df = pd.read_csv('dataset_Facebook.csv', sep=';', nrows=200)

# fill NaN
# The filled columns are assigned back to the frame. The previous form,
# fb_df[col].fillna(0, inplace=True), fills an intermediate Series: pandas 2.x
# deprecates it with a FutureWarning, and with Copy-on-Write enabled it leaves
# fb_df unchanged, so the NaNs would survive into X and y.
fb_df[['like', 'share', 'Paid']] = fb_df[['like', 'share', 'Paid']].fillna(0)
fb_df = fb_df.drop(columns=['Type'])

# get column names (after 'Type' has been removed)
column_names = fb_df.columns

# get data/target names: the first 14 remaining columns are the features
f_n = column_names[:14]
t_n = ['comment', 'share']

# convert to numpy
X = fb_df[f_n].to_numpy()
y = fb_df[t_n].to_numpy()

# Run the comparison on every one of the 10 shuffled folds.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
counter = 1
for train_index, test_index in kf.split(X):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  print("iteration:", counter)
  counter += 1
  doTest(X_train, X_test, y_train, y_test, f_n, t_n, 0.9)

iteration: 1
training MTR
training GS
training LS
0 / 20 tests
1 / 20 tests
2 / 20 tests
3 / 20 tests
4 / 20 tests
5 / 20 tests
6 / 20 tests
7 / 20 tests
8 / 20 tests
9 / 20 tests
10 / 20 tests
11 / 20 tests
12 / 20 tests
13 / 20 tests
14 / 20 tests
15 / 20 tests
16 / 20 tests
17 / 20 tests
18 / 20 tests
19 / 20 tests
MTR| mae:  [2.335 4.724] | TIME: 0.148 | Coverage: 0.05000000000000001 | avg estimators: 92.4 / 100
 GS| mae:  [6.735 9.078] | TIME: 0.001 | Coverage: 0.11499999999999996
 LS| mae:  [6.242 8.927] | TIME: 3.178 | Coverage: 0.18749999999999997
iteration: 2
training MTR
training GS
training LS
0 / 20 tests
1 / 20 tests
2 / 20 tests
3 / 20 tests
4 / 20 tests
5 / 20 tests
6 / 20 tests
7 / 20 tests
8 / 20 tests
9 / 20 tests
10 / 20 tests
11 / 20 tests
12 / 20 tests
13 / 20 tests
14 / 20 tests
15 / 20 tests
16 / 20 tests
17 / 20 tests
18 / 20 tests
19 / 20 tests
MTR| mae:  [1.796 4.197] | TIME: 0.14 | Coverage: 0.19249999999999995 | avg estimators: 78.85 / 100
 GS| mae:  [ 4.185

# River flow

In [None]:
import pandas as pd
# 9125x576 + 8
# Load the table and discard every row that contains a missing value.
RF2_df = pd.read_csv('RF2.csv').dropna(axis=0)

# Drop the first column, renumber the rows and keep only the first 200 rows.
RF2_cols = RF2_df.columns
RF2_df = RF2_df[RF2_cols[1:]].reset_index(drop=True).iloc[:200]

# Of the 576 leading columns only the first 15 are used as features, and of
# the columns from position 576 onward only the first 5 are used as targets.
RF2_cols = RF2_df.columns
f_n = RF2_cols[:576][:15]
t_n = RF2_cols[576:][:5]

# Feature and target matrices as numpy arrays.
X = RF2_df[f_n].to_numpy()
y = RF2_df[t_n].to_numpy()

# Run the comparison on every one of the 10 shuffled folds.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
counter = 1
for train_index, test_index in kf.split(X):
  print("iteration:", counter)
  counter += 1
  X_train, y_train = X[train_index], y[train_index]
  X_test, y_test = X[test_index], y[test_index]
  doTest(X_train, X_test, y_train, y_test, f_n, t_n, 0.08)

iteration: 1
training MTR
training GS
training LS
0 / 20 tests
1 / 20 tests
2 / 20 tests
3 / 20 tests
4 / 20 tests
5 / 20 tests
6 / 20 tests
7 / 20 tests
8 / 20 tests
9 / 20 tests
10 / 20 tests
11 / 20 tests
12 / 20 tests
13 / 20 tests
14 / 20 tests
15 / 20 tests
16 / 20 tests
17 / 20 tests
18 / 20 tests
19 / 20 tests
MTR| mae:  [1.698 0.003 1.627 0.901 0.66 ] | TIME: 0.139 | Coverage: 0.05000000000000001 | avg estimators: 95.55 / 100
 GS| mae:  [1.273 0.017 1.351 0.616 1.223] | TIME: 0.001 | Coverage: 0.09500000000000001
 LS| mae:  [1.176 0.016 1.078 0.678 1.194] | TIME: 2.493 | Coverage: 0.10500000000000001
iteration: 2
training MTR
training GS
training LS
0 / 20 tests
1 / 20 tests
2 / 20 tests
3 / 20 tests
4 / 20 tests
5 / 20 tests
6 / 20 tests
7 / 20 tests
8 / 20 tests
9 / 20 tests
10 / 20 tests
11 / 20 tests
12 / 20 tests
13 / 20 tests
14 / 20 tests
15 / 20 tests
16 / 20 tests
17 / 20 tests
18 / 20 tests
19 / 20 tests
MTR| mae:  [0.866 0.001 0.871 0.456 0.302] | TIME: 0.14 | Cover

In [None]:
# AVG OF K-FOLD
#https://github.com/tsoumakas/mulan/blob/master/data/multi-target/andro.arff
#https://github.com/tsoumakas/mulan/blob/master/data/multi-target/wq.arff
#https://github.com/tsoumakas/mulan/blob/master/data/multi-target/osales.arff

In [None]:
from algorithms.MARLENA.marlena.marlena.marlena import MARLENA

# Parse the slump ARFF file; element 0 of the loadarff result holds the records.
slump_data = arff.loadarff('slump.arff')
slump_df = pd.DataFrame(slump_data[0])

# The first 7 column names are used as features, the remaining ones as targets.
column_names = slump_df.columns
f_n, t_n = column_names[:7], column_names[7:]

# Feature and target matrices as numpy arrays.
X = slump_df[f_n].to_numpy()
y = slump_df[t_n].to_numpy()

# Hold out a third of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Marlena fidelity to mae at the end
MTR_obj = MTR(
    model=None,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    feature_names=f_n,
    target_names=t_n,
)
model = MTR_obj.getModel()
#predictions = model.predict(X_train)

# Explain the first held-out instance.
instance = X_test[0]
feature_names, label_names = f_n, t_n

m1 = MARLENA(neigh_type='mixed', random_state=42)
i2e = pd.Series(instance, index=feature_names)
X2E = pd.DataFrame(X_train, columns=feature_names)
# extract_explanation returns, in order:
# rule, mask, list_split_conditions, len_rule, instance_important_feat, fidelity, hit, DT
(_, MarlenaPreds, list_split_conditions, len_rule, _, _, _, _) = m1.extract_explanation(
    i2e, X2E, model, feature_names, [], label_names,
    k=10, size=50, alpha=0.7,
)

# test values

In [None]:
# Notebook display cell: echoes `actualpreds`, which is not defined in this cell.
actualpreds

array([[24.  , 60.  , 45.82],
       [24.  , 60.  , 45.82],
       [24.  , 60.  , 45.82],
       [24.  , 60.  , 45.82]])

In [None]:
# Notebook display cell: echoes `MTRpreds`, which is not defined in this cell.
MTRpreds

array([[[13.5575,  2.685 ],
        [37.44  ,  4.645 ],
        [45.8797,  3.5429]],

       [[13.5575,  2.685 ],
        [37.44  ,  4.645 ],
        [45.8797,  3.5429]],

       [[13.5575,  2.685 ],
        [37.44  ,  4.645 ],
        [45.8797,  3.5429]],

       [[13.5575,  2.685 ],
        [37.44  ,  4.645 ],
        [45.8797,  3.5429]]])

In [None]:
# From every entry of MTRpreds keep only column 1, giving one row per entry.
new_array = np.array([per_instance[:, 1] for per_instance in MTRpreds])
# Average those rows position by position (down axis 0).
column_averages = new_array.mean(axis=0)
column_averages

array([2.685 , 4.645 , 3.5429])

In [None]:
# Print the first entry of each prediction collection, one after the other.
print(actualpreds[0], MTRpreds[0], GSpreds[0], LSpreds[0], sep="\n")

[24.   60.   45.82]
[[13.5575  2.685 ]
 [37.44    4.645 ]
 [45.8797  3.5429]]
[16.68    42.815   38.35685]
[12.38875 33.85    52.72455]


In [None]:
# Print what calc_al_error returns for the test target row at index 10 with
# 0.1 as its second argument (presumably a relative allowed-error ratio —
# confirm against calc_al_error's definition).
print(calc_al_error(y_test[10], 0.1))

[2.0500000000000003, 4.82, 4.93]


In [None]:
# experiments
# mae, len, cov, time
# k-fold
# allowed_error []
#https://github.com/intelligence-csd-auth-gr/LionLearn/blob/master/LionForests_Multi/experiments/C3.%20WaterQuality.ipynb

# OLD

using MTR

In [None]:
# Build an MTR object without a pre-trained model (model=None), passing the
# current train/test split and the feature/target names.
MTR_obj = MTR(model=None, X_train = X_train, X_test=X_test, y_train=y_train, y_test=y_test, feature_names=f_n, target_names=t_n)
# Explain a single instance.
rule = MTR_obj.explain(instance, 5) # you can add as last argument the allowed error
# Read back the feature limits and the decisions/errors from the MTR object
# (presumably those of the explain() call above — confirm in MTR).
featureLimits = MTR_obj.getFeatureLimits()
decisionsAndErrors = MTR_obj.getDecisionsAndErros()
print(decisionsAndErrors)
print(rule)

# this model will be used for L/G surrogate
model = MTR_obj.getModel()
# Predictions of that model on the training inputs.
predictions = model.predict(X_train)

allowed_error [5 5 5]
reduced_rules:  89 / 100
[[13.5575, 2.685], [37.44, 4.645], [45.8797, 3.5429]]
if 167.0<=Water<=168.5 & 904.0<=Coarse_Aggr<=917.5 & 0.0<=Slag<=0.05 & 801.5<=Fine_Aggr<=805.0 & 309.5<=Cemment<=310.0 & 142.5<=Fly_ash<=143.0 & 9.5<=SP<=10.0 then SLUMP_cm: 13.5575 +/- 2.685 error, FLOW_cm: 37.44 +/- 4.645 error, Compressive_Strength_Mpa: 45.8797 +/- 3.5429 error


using global surrogate

In [None]:
# Build the global surrogate from the training inputs, the model's predictions
# for them and the feature names.
GS = GlobalSurrogateTree(X_train, predictions, f_n)
# Query the surrogate once and print that same result, so the value shown is
# exactly what r / GSp hold and rule() is not evaluated a second time.
r, GSp = GS.rule(instance)
print((r, GSp))

({'Water': [['<=', 182.25]], 'Slag': [['<=', 66.39999961853027]], 'Coarse_Aggr': [['<=', 1048.2999877929688], ['>', 904.0]], 'Fly_ash': [['<=', 210.9499969482422]]}, array([16.68   , 42.815  , 38.35685]))


using local surrogate

In [None]:
# Build the local surrogate from the training inputs, the model's predictions
# for them, the feature names and a neighbour count of 40.
LS = LocalSurrogateTree(X_train, predictions, f_n, 40) # neighbours should be >= 10
# Query the surrogate once and print that same result: a second rule() call
# would repeat the work and is not guaranteed to reproduce the values stored
# in rl / LSp.
rl, LSp = LS.rule(instance)
print((rl, LSp))

({'Fly_ash': [['<=', 210.7123150630344], ['>', 34.602104331589345]], 'Water': [['<=', 181.24161005543618]], 'Cemment': [['>', 298.04265509501226]]}, array([12.38875, 33.85   , 52.72455]))
