In [61]:
from statistics import mean

import networkx as nx
import numpy as np
import os
import pandas as pd
import json
from numpy.linalg import inv
from tqdm import tqdm

In [18]:
class RAITIA2011:
    def __init__(self, data, categorical_nodes, gamma_max, sig_level, threshold_for_discretization_dict):
        """Prepare lagged data, discretized data and the name look-up tables.

        Parameters
        ----------
        data : pandas.DataFrame
            One column per observed series.
        categorical_nodes : list
            Names of the columns treated as qualitative; all remaining columns
            are treated as quantitative.
        gamma_max : int
            Maximum lag; temporal node names are generated for lags
            ``0 .. 2 * gamma_max``.
        sig_level : float
            Cut-off later compared with ``abs(epsilon)`` in ``find_genuine_causes``.
        threshold_for_discretization_dict : dict
            Maps a column name to a list of at most two thresholds. A column
            whose entry is empty or missing simply gets no thresholded names
            (quantitative columns still need an entry for discretization).
        """
        self.graph = nx.DiGraph()
        # TODO: verify categorical data

        self.qualitative_nodes = categorical_nodes
        # Keep the column order of `data` so that node ordering (and therefore
        # plot layouts) is reproducible between runs.
        qualitative_lookup = set(self.qualitative_nodes)
        self.quantitative_nodes = [column for column in dict.fromkeys(data.columns)
                                   if column not in qualitative_lookup]

        self.gamma_max = gamma_max
        self.sig_level = sig_level
        self.threshold_for_discretization_dict = threshold_for_discretization_dict
        self.test_threshold_for_discretization()

        self.nodes_to_tnodes = dict()
        self.tnodes_to_thresholded_tnodes = dict()
        self.thresholded_tnodes_to_thresholded_nodes = dict()
        self.time_table = dict()
        self.tnodes = []
        for node in data.columns:
            self.nodes_to_tnodes[node] = []
            # Sorted copy: the caller's list is never reordered. An empty or
            # missing entry yields no thresholded names instead of an IndexError.
            thresholds = sorted(self.threshold_for_discretization_dict.get(node, ()))
            for gamma in range(2 * self.gamma_max + 1):
                # Lag 0 is named "<node>_t", lag k is named "<node>_t_<k>".
                if gamma == 0:
                    temporal_node = str(node) + "_t"
                else:
                    temporal_node = str(node) + "_t_" + str(gamma)
                self.nodes_to_tnodes[node].append(temporal_node)
                self.tnodes.append(temporal_node)
                self.time_table[temporal_node] = gamma

                # With two thresholds the smaller one marks low outliers ("<")
                # and the larger one high outliers (">"); a single threshold
                # always marks high outliers (">").
                for position, threshold in enumerate(thresholds):
                    comparator = "<" if (len(thresholds) > 1 and position == 0) else ">"
                    suffix = comparator + str(round(threshold, ndigits=2))
                    thresholded_tnode = str(temporal_node) + suffix
                    thresholded_node = str(node) + suffix
                    self.tnodes_to_thresholded_tnodes.setdefault(temporal_node, []).append(thresholded_tnode)
                    self.thresholded_tnodes_to_thresholded_nodes[thresholded_tnode] = thresholded_node

        # Inverse look-up tables.
        self.tnodes_to_nodes = {v: k for k, v_list in self.nodes_to_tnodes.items() for v in v_list}
        self.thresholded_tnodes_to_tnodes = {v: k for k, v_list in
                                             self.tnodes_to_thresholded_tnodes.items() for v in v_list}
        # Resolves either a temporal name or a thresholded temporal name to its column.
        self.tnodes_or_thresholded_tnodes_to_nodes = self.tnodes_to_nodes.copy()
        for thresholded_tnode, tnode in self.thresholded_tnodes_to_tnodes.items():
            self.tnodes_or_thresholded_tnodes_to_nodes[thresholded_tnode] = self.tnodes_to_nodes[tnode]

        # Filled by _quantitative_to_qualitative(): thresholded names whose
        # indicator is constant (normal) or varies (anomalous).
        self.normal_nodes = []
        self.anomalous_nodes = []
        self.data = self._process_data(data)
        self.discretized_data = self._quantitative_to_qualitative()

        self.prima_facie_causes = dict()
        self.genuine_causes = dict()
        self.root_causes = []

    def test_threshold_for_discretization(self):
        for k in self.threshold_for_discretization_dict.keys():
            if len(self.threshold_for_discretization_dict[k]) > 2:
                print("Error: too many thresholds for time series (" + str(k) + "). Max thresholds allowed is 2.")
                exit(0)

    # TODO: transform series with multiple thresholds into many binary variables (with True/False values)
    def _quantitative_to_qualitative(self):
        discretized_variables = []
        for tnode in self.data.columns:
            if self.tnodes_to_nodes[tnode] in self.quantitative_nodes:
                discretized_variables.append(tnode)
        discretized_data = pd.DataFrame(columns=discretized_variables)
        tnodes_to_eliminate = []
        for tnode in discretized_data.columns:
            discretized_data_col = []
            threshold_col_list = self.threshold_for_discretization_dict[self.tnodes_to_nodes[tnode]].copy()
            threshold_col_list.sort()
            if len(threshold_col_list) == 1:
                for v in self.data[tnode]:
                    if v > threshold_col_list[0]:
                        discretized_data_col.append(True)
                    else:
                        discretized_data_col.append(False)

                if len(set(discretized_data_col)) == 1:
                    self.normal_nodes.append(self.tnodes_to_thresholded_tnodes[tnode][0])
                else:
                    self.anomalous_nodes.append(self.tnodes_to_thresholded_tnodes[tnode][0])
                discretized_data[self.tnodes_to_thresholded_tnodes[tnode][0]] = discretized_data_col
            else:
                for v in self.data[tnode]:
                    #######################################################################################
                    # first approach: distinguish between top outliers and down outliers
                    one_vs_many = [False] * 2
                    if v < threshold_col_list[0]:
                        one_vs_many[0] = True
                    elif v > threshold_col_list[1]:
                        one_vs_many[1] = True
                    discretized_data_col.append(one_vs_many)
                    #######################################################################################
                    # second approach: treat top outliers and down outliers as the same outlier
                    # if (v < threshold_col_list[0]) or (v > threshold_col_list[1]):
                    #     discretized_data_col.append(True)
                    # else:
                    #     discretized_data_col.append(False)
                    #######################################################################################
                discretized_data_col = np.array(discretized_data_col)
                for i in range(len(threshold_col_list)):
                    if len(set(discretized_data_col[:, i])) != 1:
                        discretized_data[self.tnodes_to_thresholded_tnodes[tnode][i]] = \
                            discretized_data_col[:, i]
                    if len(set(discretized_data_col[:, i])) == 1:
                        self.normal_nodes.append(self.tnodes_to_thresholded_tnodes[tnode][i])
                    else:
                        self.anomalous_nodes.append(self.tnodes_to_thresholded_tnodes[tnode][i])
            if tnode not in tnodes_to_eliminate:
                tnodes_to_eliminate.append(tnode)
        discretized_data.drop(tnodes_to_eliminate, axis=1, inplace=True)
        return discretized_data

    def _process_data(self, data):
        new_data = pd.DataFrame()
        # for gamma in range(0, 2 * self.gamma_max + 1):
        #     shifted_data = data.shift(periods=-2 * self.gamma_max + gamma)
        for gamma in range(0, self.gamma_max + 1):
            shifted_data = data.shift(periods=self.gamma_max + gamma)
            new_columns = []
            for node in data.columns:
                new_columns.append(self.nodes_to_tnodes[node][gamma])
            shifted_data.columns = new_columns
            new_data = pd.concat([new_data, shifted_data], axis=1, join="outer")
        new_data.dropna(axis=0, inplace=True)
        return new_data

    def is_prima_facie(self, temporal_effect, temporal_or_thresholded_cause):
        effect_value = list(self.data[temporal_effect])

        if self.tnodes_or_thresholded_tnodes_to_nodes[temporal_or_thresholded_cause] in self.quantitative_nodes:
            cause_value = list(self.discretized_data[temporal_or_thresholded_cause])
        else:
            cause_value = list(self.data[temporal_or_thresholded_cause])

        if self.tnodes_or_thresholded_tnodes_to_nodes[temporal_effect] in self.quantitative_nodes:
            mean_e = mean(effect_value)
            list_e_c = []
            for (c, e) in zip(cause_value, effect_value):
                if c == True:
                    list_e_c.append(e)
            # return(stats.ttest_ind(effect_value, list_e_c, permutations=500, equal_var=False)[1] <= 0.05)
            return mean(list_e_c) != mean_e
        else:
            c_and_e = sum([c and e for (c, e) in zip(cause_value, effect_value)])
            c_true = sum(cause_value)
            e_true = sum(effect_value)
            events = len(effect_value)
            if c_true == 0:
                return False
            result = (c_and_e / c_true) != (e_true / events)
            return result

    def find_prima_facie_causes(self):
        nodes_t = [node for node in self.data.columns if self.time_table[node] == 0]
        for temporal_effect in nodes_t:
            for temporal_cause in self.data.columns:
                # condition on temporal priority between cause and effect
                if self.time_table[temporal_effect] - self.time_table[temporal_cause] < 0:
                    if temporal_effect != temporal_cause:
                        # take into account multi thresholding
                        if temporal_cause in self.tnodes_to_thresholded_tnodes.keys():
                            list_temporal_or_thresholded_causes = \
                                self.tnodes_to_thresholded_tnodes[temporal_cause]
                        else:
                            list_temporal_or_thresholded_causes = [temporal_cause]
                        for temporal_or_thresholded_cause in list_temporal_or_thresholded_causes:
                            if temporal_or_thresholded_cause in self.anomalous_nodes:
                                if self.is_prima_facie(temporal_effect, temporal_or_thresholded_cause):
                                    if temporal_effect in self.prima_facie_causes:
                                        self.prima_facie_causes[temporal_effect].append(temporal_or_thresholded_cause)
                                    else:
                                        self.prima_facie_causes[temporal_effect] = [temporal_or_thresholded_cause]

    def get_other_causes(self, temporal_or_thresholded_cause, temporal_effect):
        # if temporal_or_thresholded_cause in self.thresholded_tnodes_to_tnodes.keys():
        #     temporal_cause = self.thresholded_tnodes_to_tnodes[temporal_or_thresholded_cause]
        #     main_cause = self.tnodes_to_thresholded_tnodes[temporal_cause]
        # else:
        #     main_cause = [temporal_or_thresholded_cause]
        main_cause = [temporal_or_thresholded_cause]
        other_causes = [cause for cause in self.prima_facie_causes[temporal_effect] if
                        cause not in main_cause]
        return other_causes

    def calculate_probability_difference_2011_no_x(self, temporal_or_thresholded_cause, temporal_effect):
        effect_value = list(self.data[temporal_effect])

        if self.tnodes_or_thresholded_tnodes_to_nodes[temporal_or_thresholded_cause] in self.quantitative_nodes:
            cause_value = list(self.discretized_data[temporal_or_thresholded_cause])
        else:
            cause_value = list(self.data[temporal_or_thresholded_cause])

        if self.tnodes_to_nodes[temporal_effect] in self.qualitative_nodes:
            e_and_c = [e and c for (e, c) in zip(effect_value, cause_value)]

            e_and_not_c = [e and (not c) for (e, c) in zip(effect_value, cause_value)]

            return sum(e_and_c) - sum(e_and_not_c)
        else:
            list_e_c = []
            for (e, cx) in zip(effect_value, cause_value):
                if cx == True:
                    list_e_c.append(e)

            not_c = [(not c) for c in cause_value]
            list_e_not_c = []
            for (e, ncx) in zip(effect_value, not_c):
                if ncx == True:
                    list_e_not_c.append(e)
            if len(list_e_c) == 0:
                mean_e_c = 0
            else:
                mean_e_c = mean(list_e_c)
            if len(list_e_not_c) == 0:
                mean_e_not_c = 0
            else:
                mean_e_not_c = mean(list_e_not_c)
            return mean_e_c - mean_e_not_c

    def calculate_probability_difference_2011(self, temporal_or_thresholded_cause, temporal_effect, x):
        effect_value = list(self.data[temporal_effect])

        if self.tnodes_or_thresholded_tnodes_to_nodes[temporal_or_thresholded_cause] in self.quantitative_nodes:
            cause_value = list(self.discretized_data[temporal_or_thresholded_cause])
        else:
            cause_value = list(self.data[temporal_or_thresholded_cause])
        if self.tnodes_or_thresholded_tnodes_to_nodes[x] in self.quantitative_nodes:
            x_value = list(self.discretized_data[x])
        else:
            x_value = list(self.data[x])

        if self.tnodes_to_nodes[temporal_effect] in self.qualitative_nodes:
            c_and_x = [c and x for (c, x) in zip(cause_value, x_value)]
            e_and_c_and_x = [e and cx for (e, cx) in zip(effect_value, c_and_x)]

            not_c_and_x = [(not c) and x for (c, x) in zip(cause_value, x_value)]
            e_and_not_c_and_x = [e and ncx for (e, ncx) in zip(effect_value, not_c_and_x)]

            if sum(c_and_x) == 0 or sum(not_c_and_x) == 0:
                return None

            return sum(e_and_c_and_x) / sum(c_and_x) - sum(e_and_not_c_and_x) / sum(not_c_and_x)
        else:
            c_and_x = [c and x for (c, x) in zip(cause_value, x_value)]
            list_e_c = []
            for (e, cx) in zip(effect_value, c_and_x):
                if cx == True:
                    list_e_c.append(e)

            not_c_and_x = [(not c) and x for (c, x) in zip(cause_value, x_value)]
            list_e_not_c = []
            for (e, ncx) in zip(effect_value, not_c_and_x):
                if ncx == True:
                    list_e_not_c.append(e)
            if len(list_e_c) == 0:
                mean_e_c = 0
            else:
                mean_e_c = mean(list_e_c)
            if len(list_e_not_c) == 0:
                mean_e_not_c = 0
            else:
                mean_e_not_c = mean(list_e_not_c)
            return mean_e_c - mean_e_not_c

    # Auxiliary method
    def get_epsilon_average_2011(self, temporal_or_thresholded_cause, temporal_effect):
        other_causes = self.get_other_causes(temporal_or_thresholded_cause, temporal_effect)
        eps_x = 0
        if len(other_causes) != 0:
            for x in other_causes:
                eps_result = self.calculate_probability_difference_2011(temporal_or_thresholded_cause, temporal_effect,
                                                                        x)
                if eps_result == None:
                    return None
                eps_x += eps_result
            eps_avg = eps_x / len(other_causes)
            return eps_avg
        else:
            return self.calculate_probability_difference_2011_no_x(temporal_or_thresholded_cause, temporal_effect)

    # Main function of logic-based method of 2011
    def do_all_epsilon_averages_2011(self):
        list_epsilon = {}
        for temporal_effect in self.prima_facie_causes:
            for temporal_or_thresholded_cause in self.prima_facie_causes[temporal_effect]:
                list_epsilon[(temporal_or_thresholded_cause, temporal_effect)] = \
                    self.get_epsilon_average_2011(temporal_or_thresholded_cause, temporal_effect)
        return list_epsilon

    def find_genuine_causes(self, all_epsilon_averages):
        for ce in all_epsilon_averages.keys():
            if abs(all_epsilon_averages[ce]) >= self.sig_level:
                if ce[1] in self.genuine_causes:
                    self.genuine_causes[ce[1]].append(ce[0])
                else:
                    self.genuine_causes[ce[1]] = [ce[0]]
                    
    def find_prob_of_root_causes(self, epsilon):
        """Score, for every node of the summary graph, each candidate root cause.

        For a node ``child`` the node itself scores ``epsilon ** (#parents of child)``.
        Every ancestor ``root`` scores the sum over all simple paths from
        ``root`` to ``child`` of ``(1 - epsilon) ** (#edges on the path)``,
        multiplied by ``epsilon ** (#parents of root)`` when ``root`` has
        parents. Scores of one ``child`` are then normalized to sum to 1.

        Returns
        -------
        dict
            ``{child: {candidate_root: normalized_score}}``.
        """
        graph = self.construct_summary_graph(plot=False)
        prob_root = {}
        for child in graph.nodes:
            scores = {child: pow(epsilon, len(list(graph.predecessors(child))))}
            for root in nx.ancestors(graph, child):
                simple_paths = list(nx.all_simple_paths(graph, source=root, target=child))
                score = 0
                for path in simple_paths:
                    score += pow(1 - epsilon, len(path) - 1)
                parent_count = len(list(graph.predecessors(root)))
                if parent_count != 0:
                    score *= pow(epsilon, parent_count)
                scores[root] = score
            normalizer = sum(scores.values())
            prob_root[child] = {candidate: value / normalizer for candidate, value in scores.items()}
        return prob_root
    
    # todo
    def find_root_causes_of_anomalies(self):
        summary_graph = self.construct_summary_graph(plot=True)
        for node in summary_graph.nodes:
            parents_of_node = list(summary_graph.predecessors(node))
            if len(parents_of_node) == 0:
                self.root_causes.append(node)
            else:
                if (len(parents_of_node) == 1) and parents_of_node[0] == node:
                    self.root_causes.append(node)

    def construct_temporal_graph(self, plot=True):
        """Build the graph linking lagged temporal nodes to lag-0 effects.

        Nodes are all lag-0 temporal nodes plus every temporal node that is a
        genuine cause (thresholded cause names are mapped back to their
        temporal node). Edges go from cause to effect.

        Parameters
        ----------
        plot : bool
            When true the graph is drawn with nodes placed at
            ``x = -lag`` and ``y = position of the column``.

        Returns
        -------
        networkx.DiGraph
        """
        list_nodes_1 = [temporal_node for temporal_node in self.tnodes
                        if self.time_table[temporal_node] == 0]
        list_edges_1 = []

        for effect in self.genuine_causes.keys():
            for cause in self.genuine_causes[effect]:
                # Collapse a thresholded cause onto its temporal node.
                if cause in self.thresholded_tnodes_to_tnodes.keys():
                    cause = self.thresholded_tnodes_to_tnodes[cause]
                if cause not in list_nodes_1:
                    list_nodes_1.append(cause)
                if (cause, effect) not in list_edges_1:
                    list_edges_1.append((cause, effect))

        temporal_graph = nx.DiGraph()
        temporal_graph.add_nodes_from(list_nodes_1)
        temporal_graph.add_edges_from(list_edges_1)
        pos = dict()
        all_nodes = self.qualitative_nodes + self.quantitative_nodes
        for temporal_node in list_nodes_1:
            node = self.tnodes_to_nodes[temporal_node]
            y = all_nodes.index(node)
            pos[temporal_node] = [-self.time_table[temporal_node], y]

        if plot:
            # Imported here because the notebook's import cell never binds
            # `plt`; without it plt.show() raised NameError.
            import matplotlib.pyplot as plt
            nx.draw(temporal_graph, pos, with_labels=True)
            plt.show()
        return temporal_graph

    def construct_summary_graph(self, plot=True):
        """Build the column-level graph implied by the genuine causes.

        Every column is a node; an edge ``cause -> effect`` is added for every
        genuine (cause, effect) pair after mapping both temporal names back to
        their columns.

        Parameters
        ----------
        plot : bool
            When true the graph is drawn and shown.

        Returns
        -------
        networkx.DiGraph
        """
        list_nodes_1 = list(self.qualitative_nodes + self.quantitative_nodes)
        list_edges_1 = []

        for effect_t in self.genuine_causes.keys():
            effect = self.tnodes_to_nodes[effect_t]
            for cause_t in self.genuine_causes[effect_t]:
                cause = self.tnodes_or_thresholded_tnodes_to_nodes[cause_t]
                if (cause, effect) not in list_edges_1:
                    list_edges_1.append((cause, effect))

        summary_graph = nx.DiGraph()
        summary_graph.add_nodes_from(list_nodes_1)
        summary_graph.add_edges_from(list_edges_1)
        if plot:
            # Imported here because the notebook's import cell never binds
            # `plt`; without it plt.show() raised NameError.
            import matplotlib.pyplot as plt
            nx.draw(summary_graph, with_labels=True)
            plt.show()
        return summary_graph

    def construct_temporal_outlier_graph(self, plot=True):
        """Build the graph linking thresholded (outlier) causes to lag-0 effects.

        Nodes are all lag-0 temporal nodes plus every genuine cause under its
        stored (possibly thresholded) name. Edges go from cause to effect.

        Parameters
        ----------
        plot : bool
            When true the graph is drawn with nodes placed at ``x = -lag``;
            the y coordinate is the position of the name in a per-lag list in
            which each temporal node's variants are followed by a ``None``
            spacer.

        Returns
        -------
        networkx.DiGraph
        """
        # todo adapt to multi threshold
        list_nodes_0 = [temporal_node for temporal_node in self.tnodes
                        if self.time_table[temporal_node] == 0]
        list_edges_1 = []

        for effect_t in self.genuine_causes.keys():
            for cause_tt in self.genuine_causes[effect_t]:
                list_nodes_0.append(cause_tt)
                list_edges_1.append((cause_tt, effect_t))

        temporal_outlier_graph = nx.DiGraph()
        temporal_outlier_graph.add_nodes_from(list_nodes_0)
        temporal_outlier_graph.add_edges_from(list_edges_1)

        # Vertical layout: one slot per (thresholded) name, with a None spacer
        # after each temporal node's group of names.
        pos = dict()
        all_nodes = []
        all_thresholded_nodes = dict()
        for t in range(1, self.gamma_max + 1):
            all_thresholded_nodes[t] = []
        for node in self.qualitative_nodes + self.quantitative_nodes:
            for node_t_ in self.nodes_to_tnodes[node]:
                if 0 < self.time_table[node_t_] <= self.gamma_max:
                    if node_t_ in self.tnodes_to_thresholded_tnodes.keys():
                        all_thresholded_nodes[self.time_table[node_t_]] = \
                            all_thresholded_nodes[self.time_table[node_t_]] + \
                            self.tnodes_to_thresholded_tnodes[node_t_] + [None]
                        if node not in all_nodes:
                            all_nodes = all_nodes + [node] * len(self.tnodes_to_thresholded_tnodes[node_t_]) + [None]
                    else:
                        all_thresholded_nodes[self.time_table[node_t_]] = \
                            all_thresholded_nodes[self.time_table[node_t_]] + [node_t_] + [None]
                        if node not in all_nodes:
                            all_nodes = all_nodes + [node] + [None]
        for node_tt in list_nodes_0:
            node = self.tnodes_or_thresholded_tnodes_to_nodes[node_tt]
            if node_tt in self.thresholded_tnodes_to_tnodes.keys():
                node_t_ = self.thresholded_tnodes_to_tnodes[node_tt]
            else:
                node_t_ = node_tt
            if self.time_table[node_t_] > 0:
                y = all_thresholded_nodes[self.time_table[node_t_]].index(node_tt)
            else:
                y = all_nodes.index(node)
            pos[node_tt] = [-self.time_table[node_t_], y]
        if plot:
            # Imported here because the notebook's import cell never binds
            # `plt`; without it plt.show() raised NameError.
            import matplotlib.pyplot as plt
            nx.draw(temporal_outlier_graph, pos, with_labels=True)
            plt.show()
        return temporal_outlier_graph

    def construct_outlier_graph(self, plot=True):
        """Build the bipartite-style graph from outlier events to affected columns.

        Effect nodes are the columns. Cause nodes are the thresholded column
        names (e.g. ``"a>0.71"``) of the genuine causes, or ``"<column>--"``
        for a cause whose column has no threshold.

        Parameters
        ----------
        plot : bool
            When true the graph is drawn with causes at ``x = 0`` and effects
            at ``x = 1``.

        Returns
        -------
        networkx.DiGraph
        """
        list_effects = list(self.qualitative_nodes + self.quantitative_nodes)
        list_causes = []
        list_edges_1 = []

        for teffect in self.genuine_causes.keys():
            effect = self.tnodes_to_nodes[teffect]
            for tnodes_or_thresholded_tnodes in self.genuine_causes[teffect]:
                cause = self.tnodes_or_thresholded_tnodes_to_nodes[tnodes_or_thresholded_tnodes]
                if len(self.threshold_for_discretization_dict.get(cause, ())) > 0:
                    processed_cause = self.thresholded_tnodes_to_thresholded_nodes[tnodes_or_thresholded_tnodes]
                else:
                    processed_cause = cause + "--"
                if processed_cause not in list_causes:
                    list_causes.append(processed_cause)
                if (processed_cause, effect) not in list_edges_1:
                    list_edges_1.append((processed_cause, effect))

        outlier_graph = nx.DiGraph()
        outlier_graph.add_nodes_from(list_effects)
        outlier_graph.add_nodes_from(list_causes)
        outlier_graph.add_edges_from(list_edges_1)

        pos = dict()
        for effect in list_effects:
            y = list_effects.index(effect)
            pos[effect] = [1, y]
        for processed_cause in list_causes:
            y = list_causes.index(processed_cause)
            pos[processed_cause] = [0, y]

        if plot:
            # Imported here because the notebook's import cell never binds
            # `plt`; without it plt.show() raised NameError.
            import matplotlib.pyplot as plt
            nx.draw(outlier_graph, pos, with_labels=True)
            # nx.draw(G, with_labels=True, pos=nx.drawing.layout.bipartite_layout(G, list_nodes_1),)
            plt.show()
        return outlier_graph

In [None]:
class RAITIA2015:
    def __init__(self, data, categorical_nodes, gamma_max, sig_level, threshold_for_discretization_dict):
        """Store the inputs and build column <-> position look-up tables.

        Parameters
        ----------
        data : pandas.DataFrame
            One column per observed series.
        categorical_nodes : list
            Column names treated as qualitative.
        gamma_max : int
            Largest lag considered; the smallest lag is fixed to 1.
        sig_level : float
            Cut-off applied to the absolute causal significance.
        threshold_for_discretization_dict : dict
            Thresholds per column name.
        """
        self.graph = nx.DiGraph()

        self.data = data
        self.qualitative_nodes = categorical_nodes
        self.quantitative_nodes = list(set(data.columns) - set(self.qualitative_nodes))

        self.categorical_nodes = categorical_nodes
        self.gamma_min = 1
        self.gamma_max = gamma_max
        self.sig_level = sig_level
        self.threshold_for_discretization_dict = threshold_for_discretization_dict
        self.dic_prima_facie_causes = None

        # Positional look-ups used to address columns of `data.values`.
        self.column_to_index = {}
        self.index_to_column = {}
        for position, column in enumerate(data.columns):
            self.column_to_index[column] = position
            self.index_to_column[position] = column
        
    
    def generate_dic_prima_facie_cause(self, root_cause):
        dic_prima_facie_causes = {}
        for node in root_cause.keys():
            dic_prima_facie_causes[node.split('_')[0]] = []
            for cause in root_cause[node]:
                dic_prima_facie_causes[node.split('_')[0]].append(cause.split('_')[0])
            dic_prima_facie_causes[node.split('_')[0]] = list(set(dic_prima_facie_causes[node.split('_')[0]]))
            
        return dic_prima_facie_causes

    def return_ele_B(self, cause, thres_cause, effect, lag_min, lag_max):
        T_e = sampling_number = len(effect)
        T_e_c = None
        N_e_c = None
        E_e_c = None

        list_E_e_c = []
        list_N_e_c = []

        index = 0
        for i in cause:
            if i >= thres_cause:
                for lag in range(lag_min, lag_max+1):
                    if index + lag < sampling_number:
                        list_N_e_c.append(index+lag)
                        list_E_e_c.append(effect[index+lag])
            index+=1
        N_e_c = len(list_N_e_c)
        T_e_c = len(set(list_N_e_c))
        E_e_c = mean(list_E_e_c)
        E_e = mean(effect)

        f_e_c = (T_e * T_e_c)/(N_e_c*(T_e - T_e_c))

        return f_e_c*(E_e_c-E_e)


    def return_ele_A(self, cause, thres_cause, x, thres_x, effect, lag_min, lag_max):
        T_e = sampling_number = len(effect)
        T_e_c = None
        N_e_c = None
        N_e_x = None
        N_e_c_x = 0

        list_N_e_c = []
        list_N_e_x = []

        index = 0
        for i in cause:
            if i >= thres_cause:
                for lag in range(lag_min, lag_max+1):
                    if index + lag < sampling_number:
                        list_N_e_c.append(index+lag)
            index+=1

        list_T_e_c = set(list_N_e_c)
        N_e_c = len(list_N_e_c)
        T_e_c = len(list_T_e_c)

        index = 0
        for i in x:
            win_eles = []
            if i >= thres_x:
                for lag in range(lag_min, lag_max+1):
                    if index + lag < sampling_number:
                        list_N_e_x.append(index+lag)
                        win_eles.append(index+lag)
            N_e_c_x += len(set(win_eles) & list_T_e_c)
            index+=1

        N_e_x = len(list_N_e_x)

        return (N_e_c_x*T_e - N_e_x*T_e_c)/(N_e_c*(T_e - T_e_c))


    def causal_significance_estimation(self, data, poten_cause_index, thres_for_causes, effect_index, lag_min, lag_max):
        num_poten_cause = len(poten_cause_index)
        data_value = data.values

        A = np.zeros(shape=(num_poten_cause, num_poten_cause))
        B = np.zeros(shape=(num_poten_cause,1))

        for i in range(num_poten_cause):
            B[i,0] = self.return_ele_B(cause=data_value[:,i], thres_cause=thres_for_causes[i],
                                  effect=data_value[:,effect_index], lag_min=lag_min, lag_max=lag_max)
            for m in range(num_poten_cause):
                A[i,m] = self.return_ele_A(cause=data_value[:,i], thres_cause=thres_for_causes[i],
                                      x=data_value[:,m], thres_x=thres_for_causes[m],
                                      effect=data_value[:,effect_index], lag_min=lag_min, lag_max=lag_max)

        return np.matmul(inv(A),B)

    def generate_outlier_graph(self):
        """Build the summary graph of significant cause -> effect links.

        Prima facie causes are taken from ``RAITIA2011``; for every effect the
        causal significance of its potential causes (the effect itself
        excluded) is estimated and a link is kept when its absolute value
        reaches ``self.sig_level``.

        Returns
        -------
        networkx.DiGraph
            Graph whose nodes are the columns of ``self.data``.
        """
        outlier_graph = nx.DiGraph()
        outlier_graph.add_nodes_from(self.data.columns)

        ai2011 = RAITIA2011(data=self.data, categorical_nodes=self.categorical_nodes, gamma_max=self.gamma_max,
                            sig_level=self.sig_level,
                            threshold_for_discretization_dict=self.threshold_for_discretization_dict)
        ai2011.find_prima_facie_causes()
        self.dic_prima_facie_causes = self.generate_dic_prima_facie_cause(root_cause=ai2011.prima_facie_causes)

        for effect, causes in self.dic_prima_facie_causes.items():
            effect_index = self.column_to_index[effect]
            # Filter once and derive both lists from the same sequence so that
            # thres_for_causes[i] is the threshold of poten_cause_index[i].
            # The thresholds used to be built from the unfiltered list, which
            # shifted them whenever the effect was among its own causes.
            poten_causes = [cause for cause in causes if cause != effect]
            poten_cause_index = [self.column_to_index[cause] for cause in poten_causes]
            thres_for_causes = [self.threshold_for_discretization_dict[cause] for cause in poten_causes]
            vec_sig_level = self.causal_significance_estimation(data=self.data, poten_cause_index=poten_cause_index,
                                                                thres_for_causes=thres_for_causes,
                                                                effect_index=effect_index,
                                                                lag_min=self.gamma_min, lag_max=self.gamma_max)
            for link_index in range(vec_sig_level.shape[0]):
                if np.abs(vec_sig_level[link_index, 0]) >= self.sig_level:
                    outlier_graph.add_edge(self.index_to_column[poten_cause_index[link_index]], effect)
        return outlier_graph
                    
    def find_root_causes(self, outlier_graph):            
        # find the root cause
        list_root_causes = []
        for node in outlier_graph.nodes: 
            if len(list(outlier_graph.predecessors(node))) == 0:
                list_root_causes.append(node)
        return list_root_causes

In [20]:
# Load one example data set together with its per-node thresholds.
data_folder_path = os.path.join('..', 'RCA_simulated_data', 'certain', 'data')
# os.listdir returns entries in arbitrary order; sorting makes data_files[0]
# the same file on every run.
data_files = sorted(
    os.path.join(data_folder_path, f)
    for f in os.listdir(data_folder_path)
    if os.path.isfile(os.path.join(data_folder_path, f))
)

data_path = data_files[0]
param_data = pd.read_csv(data_path)

# os.path.basename (rather than split('/')) also handles Windows separators.
info_file_name = os.path.basename(data_path).replace('data', 'info').replace('csv', 'json')
graph_path = os.path.join('..', 'RCA_simulated_data', 'certain', 'data_info', info_file_name)
with open(graph_path, 'r') as json_file:
    json_graph = json.load(json_file)
param_threshold_dict = json_graph['nodes_thres']

In [58]:
# ai5 = RAITIA2015(data=param_data, categorical_nodes=[], gamma_max=3, sig_level=0.05, threshold_for_discretization_dict=param_threshold_dict)
# outlier_graph = ai5.generate_outlier_graph()
# pred_root_causes= ai5.find_root_causes(outlier_graph=outlier_graph)

In [70]:
# Maximum lag used for every run of the evaluation below.
gamma_max = 1

# Every regular file of the historical-data folder is one evaluation case.
data_folder_path = os.path.join('..', 'RCA_simulated_data', 'certain', 'historical_data')
data_files = [
    path
    for path in (os.path.join(data_folder_path, name) for name in os.listdir(data_folder_path))
    if os.path.isfile(path)
]


def cal_precision_recall(ground_truth, predicion):
    """Compute precision and recall of predicted nodes against the true ones.

    Parameters
    ----------
    ground_truth :
        Sized container of the true nodes; must support ``in`` and ``len``.
    predicion :
        Sized iterable of predicted nodes. (The parameter name keeps its
        original spelling because callers pass it by keyword.)

    Returns
    -------
    tuple
        ``(precision, recall)``. Precision is 0 when nothing was predicted and
        recall is 0 when ``ground_truth`` is empty, instead of dividing by zero.
    """
    pred_num = len(predicion)
    truth_num = len(ground_truth)

    # Every predicted node that is part of the ground truth counts as a hit.
    true_pred = sum(1 for node in predicion if node in ground_truth)

    precision = true_pred / pred_num if pred_num else 0
    # Guard the empty-ground-truth case the same way as the empty-prediction one.
    recall = true_pred / truth_num if truth_num else 0
    return (precision, recall)

# Sweep the significance level; for each value, run RAITIA2015 on every file in
# `data_files` and average precision / recall / F1 of the predicted root causes.
res = {}
# Build the grid from integer steps and round it: np.arange with a float step
# can yield values such as 0.15000000000000002, which would end up verbatim in
# the keys of `res`.
sig_levels = [round(0.05 * step, 2) for step in range(1, 11)]  # 0.05, 0.1, ..., 0.5
info_dir = os.path.join('..', 'RCA_simulated_data', 'certain', 'data_info')
for sig_level in sig_levels:
    Pre = []
    Recall = []
    F1 = []
    for data_path in tqdm(data_files):
        categorical_nodes = []
        param_data = pd.read_csv(data_path)
        # os.path.basename works with any path separator, unlike split('/').
        info_name = os.path.basename(data_path).replace('data', 'info').replace('csv', 'json')
        info_path = os.path.join(info_dir, info_name)
        with open(info_path, 'r') as json_file:
            data_info = json.load(json_file)
        param_threshold_dict = data_info['nodes_thres']
        # NOTE(review): cal_precision_recall applies len() and `in` to this
        # value; if 'intervention_node' is stored as a bare string rather than
        # a list, wrap it in a list first — confirm against the JSON files.
        true_root_causes = data_info['intervention_node']

        ai5 = RAITIA2015(data=param_data, categorical_nodes=categorical_nodes, gamma_max=gamma_max, sig_level=sig_level, threshold_for_discretization_dict=param_threshold_dict)
        outlier_graph = ai5.generate_outlier_graph()
        pred_root_causes = ai5.find_root_causes(outlier_graph=outlier_graph)

        pre, recall = cal_precision_recall(ground_truth=true_root_causes, predicion=pred_root_causes)
        Pre.append(pre)
        Recall.append(recall)
        # F1 is undefined when both precision and recall are 0; record 0.
        if pre + recall == 0:
            F1.append(0)
        else:
            F1.append(2 * pre * recall / (pre + recall))

    res[str(sig_level)] = (np.mean(Pre), np.mean(Recall), np.mean(F1))
    # Label the metrics so the output of different levels can be told apart.
    print('sig_level: ' + str(sig_level))
    print('precison: ' + str(np.mean(Pre)))
    print('recall: ' + str(np.mean(Recall)))
    print('F1: ' + str(np.mean(F1)))

# res_path = os.path.join('..', 'Results', '2015_varying_sig_level.json')
# with open(res_path, 'w') as json_file:
#     json.dump(res, json_file)

  0%|          | 0/50 [00:00<?, ?it/s]

{'a_t': ['a_t_1>0.71', 'b_t_1>0.84', 'd_t_1>0.75', 'c_t_1>0.86', 'e_t_1>0.89', 'f_t_1>0.87'], 'b_t': ['a_t_1>0.71', 'b_t_1>0.84', 'd_t_1>0.75', 'c_t_1>0.86', 'e_t_1>0.89', 'f_t_1>0.87'], 'd_t': ['a_t_1>0.71', 'b_t_1>0.84', 'd_t_1>0.75', 'c_t_1>0.86', 'e_t_1>0.89', 'f_t_1>0.87'], 'c_t': ['a_t_1>0.71', 'b_t_1>0.84', 'd_t_1>0.75', 'c_t_1>0.86', 'e_t_1>0.89', 'f_t_1>0.87'], 'e_t': ['a_t_1>0.71', 'b_t_1>0.84', 'd_t_1>0.75', 'c_t_1>0.86', 'e_t_1>0.89', 'f_t_1>0.87'], 'f_t': ['a_t_1>0.71', 'b_t_1>0.84', 'd_t_1>0.75', 'c_t_1>0.86', 'e_t_1>0.89', 'f_t_1>0.87']}


  2%|▏         | 1/50 [00:01<01:29,  1.84s/it]

{'a_t': ['a_t_1>0.87', 'b_t_1>0.74', 'c_t_1>0.84', 'f_t_1>0.78', 'd_t_1>0.73', 'e_t_1>0.86'], 'b_t': ['a_t_1>0.87', 'b_t_1>0.74', 'c_t_1>0.84', 'f_t_1>0.78', 'd_t_1>0.73', 'e_t_1>0.86'], 'c_t': ['a_t_1>0.87', 'b_t_1>0.74', 'c_t_1>0.84', 'f_t_1>0.78', 'd_t_1>0.73', 'e_t_1>0.86'], 'f_t': ['a_t_1>0.87', 'b_t_1>0.74', 'c_t_1>0.84', 'f_t_1>0.78', 'd_t_1>0.73', 'e_t_1>0.86'], 'd_t': ['a_t_1>0.87', 'b_t_1>0.74', 'c_t_1>0.84', 'f_t_1>0.78', 'd_t_1>0.73', 'e_t_1>0.86'], 'e_t': ['a_t_1>0.87', 'b_t_1>0.74', 'c_t_1>0.84', 'f_t_1>0.78', 'd_t_1>0.73', 'e_t_1>0.86']}


  4%|▍         | 2/50 [00:03<01:26,  1.80s/it]

{'a_t': ['a_t_1>0.88', 'b_t_1>0.76', 'f_t_1>0.7', 'c_t_1>0.84', 'd_t_1>0.78', 'e_t_1>0.89'], 'b_t': ['a_t_1>0.88', 'b_t_1>0.76', 'f_t_1>0.7', 'c_t_1>0.84', 'd_t_1>0.78', 'e_t_1>0.89'], 'f_t': ['a_t_1>0.88', 'b_t_1>0.76', 'f_t_1>0.7', 'c_t_1>0.84', 'd_t_1>0.78', 'e_t_1>0.89'], 'c_t': ['a_t_1>0.88', 'b_t_1>0.76', 'f_t_1>0.7', 'c_t_1>0.84', 'd_t_1>0.78', 'e_t_1>0.89'], 'd_t': ['a_t_1>0.88', 'b_t_1>0.76', 'f_t_1>0.7', 'c_t_1>0.84', 'd_t_1>0.78', 'e_t_1>0.89'], 'e_t': ['a_t_1>0.88', 'b_t_1>0.76', 'f_t_1>0.7', 'c_t_1>0.84', 'd_t_1>0.78', 'e_t_1>0.89']}


  4%|▍         | 2/50 [00:04<01:51,  2.33s/it]


KeyboardInterrupt: 