In [4]:
import copy
import json
import os
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm 

In [5]:
def dict_to_graph(graph_dict, inter_nodes):
    # Create an empty directed graph
    graph = nx.DiGraph()

    # Iterate through the dictionary and add nodes and edges to the graph
    for parent, children in graph_dict.items():
        # Add the parent node to the graph
        graph.add_node(parent)

        # Iterate through the children of the parent
        for child in children.keys():
            # Add the child node to the graph and create a directed edge from parent to child
            graph.add_node(child)
            if child not in inter_nodes:
                graph.add_edge(parent, child)
    return graph

def check_impact_of_intervention_node(graph, inter_node, data, dict_nodes_thres, dict_edges_lag, n):
    descendants_list = list(nx.descendants(graph, inter_node))
    index_inter = list(data[data[inter_node] > dict_nodes_thres[inter_node][0]].index)
    for node in descendants_list:
        shortest_path = nx.shortest_path(graph, source=inter_node, target=node)
        lag = 0
        for i in range(len(shortest_path)-1):
            lag+=dict_edges_lag[str((shortest_path[i],shortest_path[i+1]))]
        index_node = [i+lag for i in index_inter if i+lag<n]
        print(node, np.sum(data.loc[index_node][node] < dict_nodes_thres[node][0]))

In [6]:
# graphs_path: path for graphs
# save_data_path: path to save generated data
# save_info_path: path to save thresholds of nodes and lags of edges
# n : number of sampling points
# gamma_min: minimum lag
# gamma_max: maximum lag
def generate_historical_data_by_folder_SK(graphs_path, save_data_path, save_info_path, n, thres_min=0.7, thres_max=0.9, gamma_min=1, gamma_max=1, prob_inter=0.3, certain=True, epsilon=0.3):
    # np.random.seed(seed=seed)
    if not os.path.exists(save_data_path):
        # If it doesn't exist, create the folder
        os.makedirs(save_data_path)
    # check the existence of data information folder
    if not os.path.exists(save_info_path):
        # If it doesn't exist, create the folder
        os.makedirs(save_info_path)
        
    #################################################################
    #################################################################
    graph_files = [os.path.join(graphs_path, f) for f in os.listdir(graphs_path) if os.path.isfile(os.path.join(graphs_path, f))]

    for json_file_path in tqdm(graph_files):
        with open(json_file_path, 'r') as json_file:
            json_graph = json.load(json_file)

        # Convert the loaded JSON data into a NetworkX graph
        graph = dict_to_graph(graph_dict=json_graph, inter_nodes=[])

        nodes_list = list(graph.nodes())
        edges_list = list(graph.edges())
        topological_order = list(nx.topological_sort(graph))

        data = {}
        dict_nodes_thres = {}
        dict_nodes_base = {}
        coffes_edges = {}
        children_list = []
        dict_edges_lag = {}
        
        for node in nodes_list:
            dict_nodes_base[node] = [np.round(np.random.uniform(low=0, high=0.1), 2)]
            dict_nodes_thres[node] = [np.round(np.random.uniform(low=thres_min, high=thres_max), 2)]
            
            
        for edge in edges_list:
            dict_edges_lag[edge] = np.random.randint(low=gamma_min, high=gamma_max+1)
            coffes_edges[edge] = dict_nodes_thres[edge[1]][0]
            children_list.append(edge[1])

        children_list = list(set(children_list))

        for node in nodes_list:
            if node not in children_list:
                data[node] = []
                for i in range(n):
                    if np.random.uniform() < 1- prob_inter:
                        data[node].append(dict_nodes_base[node][0])
                    else: 
                        data[node].append(np.random.uniform(low=dict_nodes_thres[node][0], high=dict_nodes_thres[node][0]+1))
            else:
                data[node] = []

        # propagation of interventions
        for i in range(n):
            for node in topological_order:
                parents = list(graph.predecessors(node))
                if len(parents) != 0:
                    if np.random.uniform() < 1- prob_inter:
                        value = dict_nodes_base[node][0]
                        for par in parents:
                            lag = dict_edges_lag[(par, node)]
                            if i-lag < 0:
                                value += 0
                            elif data[par][i-lag] >= dict_nodes_thres[par][0]:
                                value += coffes_edges[(par,node)]
                        if not certain:
                            if value >= dict_nodes_thres[node][0]:
                                if np.random.uniform() < 1-epsilon:
                                    value += 0
                                else:
                                    value = dict_nodes_base[node][0] 
                        data[node].append(value)
                    else:
                        data[node].append(np.random.uniform(low=dict_nodes_thres[node][0], high=dict_nodes_thres[node][0]+1))
                else:
                    continue

        data = pd.DataFrame(data)
        
        
        edges_lag = {}
        for key,value in dict_edges_lag.items():
            edges_lag[str(key)] = value
        
        edges_coffe = {}
        for key,value in coffes_edges.items():
            edges_coffe[str(key)] = value

        info = {'nodes_thres': dict_nodes_thres, 'nodes_base': dict_nodes_base, 'edges_lag': edges_lag, 'edges_coffe': edges_coffe}

        data.to_csv(os.path.join(save_data_path, json_file_path.split('/')[1].replace('graph', 'data').replace('json', 'csv')), index=False)

        save_data_info_path = os.path.join(save_info_path, json_file_path.split('/')[1].replace('graph', 'info'))
        # Save the dictionary as a JSON file
        with open(save_data_info_path, 'w') as json_file:
            json.dump(info, json_file)

In [4]:
graphs_path = 'graphs'
save_data_path = os.path.join('certain_SK', 'historical_data')
save_info_path = os.path.join('certain_SK', 'data_info')
n = 2000
thres_min=0.7 
thres_max=0.9
gamma_min = 1 
gamma_max = 1 
prob_inter = 0.3 
epsilon = 0  
certain = True

In [7]:
generate_historical_data_by_folder_SK(graphs_path=graphs_path, save_data_path=save_data_path, save_info_path=save_info_path, 
                                      n=n, thres_min=thres_min, thres_max=thres_max, gamma_min=gamma_min, gamma_max=gamma_max, 
                                      prob_inter=prob_inter, certain=certain, epsilon=epsilon)

In [7]:
def generate_simulation_data_by_folder_SK(graphs_path, info_path, save_data_path, save_info_path, n, num_inters=1, certain=True, epsilon=0.3):
    # np.random.seed(seed=seed)
    if not os.path.exists(save_data_path):
        # If it doesn't exist, create the folder
        os.makedirs(save_data_path)

    if not os.path.exists(save_info_path):
        # If it doesn't exist, create the folder
        os.makedirs(save_info_path)

    graph_files = [os.path.join(graphs_path, f) for f in os.listdir(graphs_path) if os.path.isfile(os.path.join(graphs_path, f))]

    for json_file_path in tqdm(graph_files):
        with open(json_file_path, 'r') as json_file:
            json_graph = json.load(json_file)

        data_info_path = os.path.join(info_path, json_file_path.split('/')[1].replace('graph', 'info'))
        with open(data_info_path, 'r') as json_file:
            data_info = json.load(json_file)

        dict_nodes_thres = data_info['nodes_thres']
        dict_nodes_base = data_info['nodes_base']
        dict_edges_lag = data_info['edges_lag']
        dict_edges_coffe = data_info['edges_coffe']
        # Convert the loaded JSON data into a NetworkX graph
        graph = dict_to_graph(graph_dict=json_graph, inter_nodes=[])
        if num_inters == 1:
            inter_nodes = np.random.choice(graph.nodes, size=num_inters, replace=False)
        else:
            while True:
                inter_nodes = np.random.choice(graph.nodes, size=num_inters, replace=False)
                in_same_path = False
                for node_1 in inter_nodes:
                    for node_2 in inter_nodes:
                        if node_1 != node_2:
                            if node_1 in nx.ancestors(graph, node_2): # and node_2 not in nx.ancestors(graph, node_1)):
                                in_same_path = True
                if not in_same_path:
                    break

        graph = dict_to_graph(graph_dict=json_graph, inter_nodes=inter_nodes)

        nodes_list = list(graph.nodes())
        edges_list = list(graph.edges())
        topological_order = list(nx.topological_sort(graph))

        data = {}
        children_list = []

        for edge in edges_list:
            children_list.append(edge[1])

        children_list = list(set(children_list))

        for node in nodes_list:
            if node in inter_nodes:
                data[node] = np.random.uniform(low=dict_nodes_thres[node][0], high=dict_nodes_thres[node][0]+1, size=n)
            elif node not in children_list and node not in inter_nodes:
                data[node] = [dict_nodes_base[node][0] for i in range(n)]
            else:
                data[node] = []
        
        # transfer interventions
        for i in range(n):
            for node in topological_order:
                parents = list(graph.predecessors(node))
                if len(parents) != 0:
                    value = dict_nodes_base[node][0]
                    for par in parents:
                        lag = dict_edges_lag[str((par, node))]
                        if i-lag < 0:
                            value += 0
                        elif data[par][i-lag] >= dict_nodes_thres[par][0]:
                            value += dict_edges_coffe[str((par,node))]
                    if not certain:
                        if value >= dict_nodes_thres[node][0]:
                            if np.random.uniform() < 1-epsilon:
                                value += 0
                            else:
                                value = dict_nodes_base[node][0] 
                    data[node].append(value)
                else:
                    continue
        
        data = pd.DataFrame(data)

        info = {'nodes_thres': dict_nodes_thres, 'nodes_base': dict_nodes_base, 'edges_lag': dict_edges_lag, 'edges_coffe': dict_edges_coffe, 'intervention_node': list(inter_nodes)}

        data.to_csv(os.path.join(save_data_path, json_file_path.split('/')[1].replace('graph', 'data').replace('json', 'csv')), index=False)

        data_info_path = os.path.join(save_info_path, json_file_path.split('/')[1].replace('graph', 'info'))
        # Save the dictionary as a JSON file
        with open(data_info_path, 'w') as json_file:
            json.dump(info, json_file)

In [8]:
graphs_path = 'graphs'
info_path = os.path.join('certain_SK', 'data_info')
n = 500 # sampling points
num_inters = 1
certain = True
epsilon = 0
# save_data_path = os.path.join('certain_', 'data')
# save_info_path = os.path.join('certain_', 'data_info')
save_data_path = os.path.join('certain_SK', 'actual_data_'+str(num_inters)+'_inters')
save_info_path = os.path.join('certain_SK', 'data_info_'+str(num_inters)+'_inters')
generate_simulation_data_by_folder_SK(graphs_path=graphs_path, info_path=info_path, save_data_path=save_data_path, save_info_path=save_info_path, 
                                      n=n, num_inters=num_inters, certain=certain, epsilon=epsilon)

100%|██████████| 50/50 [00:00<00:00, 156.72it/s]


In [45]:
coffes_edges

{('a', 'b'): 0.7775816799346464,
 ('a', 'f'): 0.7407277949069354,
 ('b', 'c'): 0.1321493793871566,
 ('b', 'd'): 0.26484011777821814,
 ('c', 'd'): 0.3692049991008318,
 ('c', 'f'): 0.8241110587328954,
 ('d', 'e'): 0.8144996106075356,
 ('d', 'f'): 0.8304148711547787}

In [21]:
# graphs_path: path for graphs
# interventions_path: path of the intervention node
# data_path: path to save generated data
# info_path: path to save thresholds of nodes and lags of edges
# n : number of sampling points
# gamma_min: minimum lag
# gamma_max: maximum lag
# thres_min: minimum threshold for all nodes
# thres_max: maximum threshold for all nodes
def generate_historical_data_by_folder_PC(graphs_path, data_path, info_path, n, gamma_min,
                             gamma_max, thres_min, thres_max, prob_inter, epsilon, seed=3344, self_loops=True, max_anomaly=10):
    # np.random.seed(seed=seed)
    self_lag = 1
    if not os.path.exists(data_path):
        # If it doesn't exist, create the folder
        os.makedirs(data_path)
    # check the existence of data information folder
    if not os.path.exists(info_path):
        # If it doesn't exist, create the folder
        os.makedirs(info_path)

    graph_files = [os.path.join(graphs_path, f) for f in os.listdir(graphs_path) if os.path.isfile(os.path.join(graphs_path, f))]

    for json_file_path in tqdm(graph_files):
        with open(json_file_path, 'r') as json_file:
            json_graph = json.load(json_file)

        # Convert the loaded JSON data into a NetworkX graph
        graph = dict_to_graph(graph_dict=json_graph, inter_nodes=[])

        nodes_list = list(graph.nodes())
        edges_list = list(graph.edges())
        topological_order = list(nx.topological_sort(graph))

        data = {}
        dict_nodes_thres = {}
        children_list = []
        dict_edges_lag = {}

        for edge in edges_list:
            dict_edges_lag[edge] = np.random.randint(low=gamma_min, high=gamma_max+1)
            children_list.append(edge[1])

        children_list = list(set(children_list))
        for node in nodes_list:
            dict_nodes_thres[node] = [np.round(np.random.uniform(low = thres_min, high = thres_max), 2)]

        for node in nodes_list:
            data[node] = []
            if node not in children_list:
                # data[node] = np.random.uniform(low=0, high=1, size=n)
                for i in range(n):
                    if self_loops and i-self_lag >= 0 and data[node][i-self_lag] >= dict_nodes_thres[node][0] and np.random.uniform() < 1-epsilon:
                        data[node].append(np.random.uniform(low=dict_nodes_thres[node][0], high=1))
                    else:         
                        if np.random.uniform() < 1 - prob_inter:
                            data[node].append(np.random.uniform(low=0, high=dict_nodes_thres[node][0]))
                        else:
                            data[node].append(np.random.uniform(low=dict_nodes_thres[node][0], high=1))
                    
                    if i-max_anomaly-1>=0 and np.sum(data[node][i-max_anomaly-1:i-1]>=dict_nodes_thres[node][0])==max_anomaly:
                        data[node][i] = np.random.uniform(low=0, high=dict_nodes_thres[node][0])  
            else:
                data[node] = np.random.uniform(low=0, high=dict_nodes_thres[node][0], size=n)

        # propagation of interventions
        # for node in topological_order:
        #     values = data[node]
        #     children = list(graph.successors(node))
        #     if len(children) != 0:
        #         for i in range(len(values)):
        #             if values[i] >= dict_nodes_thres[node]:
        #                 for child in children:
        #                     lag = dict_edges_lag[(node,child)]
        #                     if i + lag < n and np.random.uniform() < 1-epsilon:
        #                         data[child][i+lag] = np.random.uniform(low=dict_nodes_thres[child], high=1)
        #     else:
        #          continue

        for node in topological_order:
            parents = list(graph.predecessors(node))
            if len(parents) != 0:
                for i in range(len(data[node])):
                    # abnormal_parent = False
                    abnormal_parent = 0
                    for par in parents:
                        par_values = data[par]
                        lag = dict_edges_lag[(par, node)]
                        if i-lag >=0 and par_values[i-lag] >= dict_nodes_thres[par][0]:
                            # abnormal_parent = True
                            abnormal_parent += 1
                    # if abnormal_parent and np.random.uniform() < 1-epsilon:
                    #     data[node][i] = np.random.uniform(low=dict_nodes_thres[node][0], high=1)
                    if self_loops and i-self_lag >=0 and data[node][i-self_lag] >= dict_nodes_thres[node][0]:
                        abnormal_parent +=1
                    if abnormal_parent != 0:
                        for m in range(abnormal_parent):
                            if np.random.uniform() < 1-epsilon:
                                data[node][i] = np.random.uniform(low=dict_nodes_thres[node][0], high=1)
                    else:
                        if np.random.uniform() < prob_inter:
                            data[node][i] = np.random.uniform(low=dict_nodes_thres[node][0], high=1)
                            
                    if i-max_anomaly-1>=0 and np.sum(data[node][i-max_anomaly-1:i-1]>=dict_nodes_thres[node][0])==max_anomaly:
                        data[node][i] = np.random.uniform(low=0, high=dict_nodes_thres[node][0])
            else:
                 continue

        data = pd.DataFrame(data)
        # ## *****************************************************************************************************
        # ## Check data
        # print(json_file_path.split('/')[1].split('.')[0])
        # print("****Check impacts of the intervention node****")
        # print('Intervention node: ' + inter_node)
        # print('Descendants of the intervention node:' + str(list(nx.descendants(graph, inter_node))))
        # check_impact_of_intervention_node(graph=graph, inter_node=inter_node,
        #                                   data=data, dict_nodes_thres=dict_nodes_thres, dict_edges_lag=dict_edges_lag, n=n)
        # if not only_inter:
        #     print("****Check impacts of the root****")
        #     root_nodes = [i for i in nodes_list if i not in children_list and i != inter_node]
        #     for root in root_nodes:
        #         print('root:' + root)
        #         check_impact_of_intervention_node(graph=graph, inter_node=root,
        #                                   data=data, dict_nodes_thres=dict_nodes_thres, dict_edges_lag=dict_edges_lag, n=n)
        # ## *****************************************************************************************************
        edges_lag = {}
        for key,value in dict_edges_lag.items():
            edges_lag[str(key)] = value

        info = {'nodes_thres': dict_nodes_thres, 'edges_lag':edges_lag}
        
        last_node = topological_order[-1]
        print(np.sum(data[last_node]>=dict_node_thres[last_node][0])/2000)
        

        data.to_csv(os.path.join(data_path, json_file_path.split('/')[1].replace('graph', 'data').replace('json', 'csv')), index=False)

        data_info_path = os.path.join(info_path, json_file_path.split('/')[1].replace('graph', 'info'))
        # Save the dictionary as a JSON file
        with open(data_info_path, 'w') as json_file:
            json.dump(info, json_file)

In [76]:
graphs_path = 'graphs'
n = 2000 # sampling points
gamma_min = 1
gamma_max = 1
thres_min = 0.7
thres_max = 0.9
epsilon = 0.5
prob_inter = 0.1
self_loops= False
max_anomaly= 5
# data_path = os.path.join('certain', 'historical_data')
# info_path = os.path.join('certain', 'data_info')
data_path = os.path.join('uncertain_' + str(epsilon), 'historical_data')
info_path = os.path.join('uncertain_' + str(epsilon), 'data_info')
data, dict_node_thres = generate_historical_data_by_folder_PC(graphs_path=graphs_path, data_path=data_path, info_path=info_path, n=n,
                                   gamma_min=gamma_min, gamma_max=gamma_max, thres_min=thres_min,
                                   thres_max=thres_max, prob_inter=prob_inter, epsilon=epsilon, self_loops=self_loops, max_anomaly=max_anomaly)

  0%|          | 0/50 [00:00<?, ?it/s]


In [None]:
def generate_simulation_data_by_folder_PC(graphs_path, info_path, save_data_path, save_info_path, n, epsilon, only_inter=False, num_inters=1, self_loops=False, max_anomaly=5, seed=3344):
    self_lag = 1
    # np.random.seed(seed=seed)
    if not os.path.exists(save_data_path):
        # If it doesn't exist, create the folder
        os.makedirs(save_data_path)

    if not os.path.exists(save_info_path):
        # If it doesn't exist, create the folder
        os.makedirs(save_info_path)

    graph_files = [os.path.join(graphs_path, f) for f in os.listdir(graphs_path) if os.path.isfile(os.path.join(graphs_path, f))]

    for json_file_path in graph_files:
        with open(json_file_path, 'r') as json_file:
            json_graph = json.load(json_file)

        data_info_path = os.path.join(info_path, json_file_path.split('/')[1].replace('graph', 'info'))
        with open(data_info_path, 'r') as json_file:
            data_info = json.load(json_file)

        dict_nodes_thres = data_info['nodes_thres']
        dict_edges_lag = data_info['edges_lag']
        # Convert the loaded JSON data into a NetworkX graph
        graph = dict_to_graph(graph_dict=json_graph, inter_nodes=[])
        if num_inters == 1:
            inter_nodes = np.random.choice(graph.nodes, size=1, replace=False)
        else:
            while True:
                inter_nodes = np.random.choice(graph.nodes, size=num_inters, replace=False)
                in_same_path = False
                for node_1 in inter_nodes:
                    for node_2 in inter_nodes:
                        if node_1 != node_2:
                            if node_1 in nx.ancestors(graph, node_2): # and node_2 not in nx.ancestors(graph, node_1)):
                                in_same_path = True
                if not in_same_path:
                    break

        graph = dict_to_graph(graph_dict=json_graph, inter_nodes=inter_nodes)

        nodes_list = list(graph.nodes())
        edges_list = list(graph.edges())
        topological_order = list(nx.topological_sort(graph))

        data = {}
        children_list = []

        for edge in edges_list:
            children_list.append(edge[1])

        children_list = list(set(children_list))

 
        for node in nodes_list:
            data[node] = []
            if node in inter_nodes:
                data[node] = np.random.uniform(low=dict_nodes_thres[node][0], high=1, size=n)
                for i in range(n):
                    if (i+1)%max_anomaly == 1:
                        data[node][i] =  np.random.uniform(low=0, high=dict_nodes_thres[node][0])
            else:
                data[node] = np.random.uniform(low=0, high=dict_nodes_thres[node][0], size=n)


        # transfer interventions
        # for node in topological_order:
        #     values = data[node]
        #     children = list(graph.successors(node))
        #     if len(children) != 0:
        #         for i in range(len(values)):
        #             if values[i] >= dict_nodes_thres[node][0]:
        #                 for child in children:
        #                     lag = dict_edges_lag[str((node,child))]
        #                     if i + lag < n and np.random.uniform() < 1-epsilon:
        #                         data[child][i+lag] = np.random.uniform(low=dict_nodes_thres[child][0], high=1)
        #     else:
        #          continue
                    
        for node in topological_order:
            parents = list(graph.predecessors(node))
            if len(parents) != 0:
                for i in range(len(data[node])):
                    # abnormal_parent = False
                    abnormal_parent = 0
                    for par in parents:
                        par_values = data[par]
                        lag = dict_edges_lag[(par, node)]
                        if i-lag >=0 and par_values[i-lag] >= dict_nodes_thres[par][0]:
                            # abnormal_parent = True
                            abnormal_parent += 1
                    # if abnormal_parent and np.random.uniform() < 1-epsilon:
                    #     data[node][i] = np.random.uniform(low=dict_nodes_thres[node][0], high=1)
                    if self_loops and i-self_lag >=0 and data[node][i-self_lag] >= dict_nodes_thres[node][0]:
                        abnormal_parent +=1
                    if abnormal_parent != 0:
                        for m in range(abnormal_parent):
                            if np.random.uniform() < 1-epsilon:
                                data[node][i] = np.random.uniform(low=dict_nodes_thres[node][0], high=1)
                            
                    if i-max_anomaly-1>=0 and np.sum(data[node][i-max_anomaly-1:i-1]>=dict_nodes_thres[node][0])==max_anomaly:
                        data[node][i] = np.random.uniform(low=0, high=dict_nodes_thres[node][0])
            else:
                 continue

        data = pd.DataFrame(data)

        ## *****************************************************************************************************
        ## Check data
        print(json_file_path.split('/')[1].split('.')[0])
        print("****Check impacts of the intervention node****")
        for node in inter_nodes:
            print('Intervention node: ' + node)
            print('Descendants of the intervention node:' + str(list(nx.descendants(graph, node))))
            check_impact_of_intervention_node(graph=graph, inter_node=node,
                                              data=data, dict_nodes_thres=dict_nodes_thres, dict_edges_lag=dict_edges_lag, n=n)
        if not only_inter:
            print("****Check impacts of the root****")
            root_nodes = [i for i in nodes_list if i not in children_list and i not in inter_nodes]
            for root in root_nodes:
                print('root:' + root)
                check_impact_of_intervention_node(graph=graph, inter_node=root,
                                          data=data, dict_nodes_thres=dict_nodes_thres, dict_edges_lag=dict_edges_lag, n=n)
        ## *****************************************************************************************************

        info = {'nodes_thres': dict_nodes_thres, 'edges_lag': dict_edges_lag, 'intervention_node': list(inter_nodes)}
        
        last_node = topological_order[-1]
        print(np.sum(data[last_node]>=dict_nodes_thres[last_node][0])/n)

        data.to_csv(os.path.join(save_data_path, json_file_path.split('/')[1].replace('graph', 'data').replace('json', 'csv')), index=False)

        data_info_path = os.path.join(save_info_path, json_file_path.split('/')[1].replace('graph', 'info'))
        # Save the dictionary as a JSON file
        with open(data_info_path, 'w') as json_file:
            json.dump(info, json_file)

In [79]:
def generate_historical_data_by_folder_SK(graphs_path, save_data_path, save_info_path, n, thres_min=0.7, thres_max=0.9, gamma_min=1, gamma_max=1, prob_inter=0.3, epsilon=0.3, self_loops=False, max_anomaly=5):
    self_lag = 1
    # np.random.seed(seed=seed)
    if not os.path.exists(save_data_path):
        # If it doesn't exist, create the folder
        os.makedirs(save_data_path)
    # check the existence of data information folder
    if not os.path.exists(save_info_path):
        # If it doesn't exist, create the folder
        os.makedirs(save_info_path)

    #################################################################
    #################################################################
    graph_files = [os.path.join(graphs_path, f) for f in os.listdir(graphs_path) if os.path.isfile(os.path.join(graphs_path, f))]

    for json_file_path in tqdm(graph_files):
        with open(json_file_path, 'r') as json_file:
            json_graph = json.load(json_file)

        # Convert the loaded JSON data into a NetworkX graph
        graph = dict_to_graph(graph_dict=json_graph, inter_nodes=[])

        nodes_list = list(graph.nodes())
        edges_list = list(graph.edges())
        topological_order = list(nx.topological_sort(graph))

        data = {}
        dict_nodes_thres = {}
        dict_nodes_base = {}
        coffes_edges = {}
        children_list = []
        dict_edges_lag = {}

        for node in nodes_list:
            dict_nodes_base[node] = [np.round(np.random.uniform(low=0, high=0.1), 2)]
            dict_nodes_thres[node] = [np.round(np.random.uniform(low=thres_min, high=thres_max), 2)]


        for edge in edges_list:
            dict_edges_lag[edge] = np.random.randint(low=gamma_min, high=gamma_max+1)
            coffes_edges[edge] = dict_nodes_thres[edge[1]][0]
            children_list.append(edge[1])

        children_list = list(set(children_list))
        
                
        for node in nodes_list:
            data[node] = []
            if node not in children_list:
                # data[node] = np.random.uniform(low=0, high=1, size=n)
                for i in range(n):
                    if self_loops and i-self_lag >= 0 and data[node][i-self_lag] >= dict_nodes_thres[node][0] and np.random.uniform() < 1-epsilon:
                        data[node].append(dict_nodes_base[node][0]+dict_nodes_thres[node][0])
                    else:         
                        if np.random.uniform() < 1 - prob_inter:
                            data[node].append(dict_nodes_base[node][0])
                        else:
                            data[node].append(np.random.uniform(low=dict_nodes_thres[node][0], high=dict_nodes_thres[node][0]+1))
                    
                    if i-max_anomaly-1>=0 and np.sum(data[node][i-max_anomaly-1:i-1]>=dict_nodes_thres[node][0])==max_anomaly:
                        data[node][i] = dict_nodes_base[node][0]  
            else:
                data[node] = np.array([dict_nodes_base[node][0] for i in range(n)])

        # propagation of interventions
        for node in topological_order:
            parents = list(graph.predecessors(node))
            if len(parents) != 0:
                for i in range(len(data[node])):
                    # abnormal_parent = False
                    abnormal_parent = 0
                    for par in parents:
                        par_values = data[par]
                        lag = dict_edges_lag[(par, node)]
                        if i-lag >=0 and par_values[i-lag] >= dict_nodes_thres[par][0]:
                            # abnormal_parent = True
                            abnormal_parent += 1
                    # if abnormal_parent and np.random.uniform() < 1-epsilon:
                    #     data[node][i] = np.random.uniform(low=dict_nodes_thres[node][0], high=1)
                    if self_loops and i-self_lag >=0 and data[node][i-self_lag] >= dict_nodes_thres[node][0]:
                        abnormal_parent +=1
                    if abnormal_parent != 0:
                        for m in range(abnormal_parent):
                            if np.random.uniform() < 1-epsilon:
                                data[node][i] += dict_nodes_thres[node][0]
                    else:
                        if np.random.uniform() < prob_inter:
                            data[node][i] = np.random.uniform(low=dict_nodes_thres[node][0], high=dict_nodes_thres[node][0]+1)
                            
                    if i-max_anomaly-1>=0 and np.sum(data[node][i-max_anomaly-1:i-1]>=dict_nodes_thres[node][0])==max_anomaly:
                        data[node][i] = dict_nodes_base[node][0]
            else:
                 continue

        data = pd.DataFrame(data)

        edges_lag = {}
        for key,value in dict_edges_lag.items():
            edges_lag[str(key)] = value

        edges_coffe = {}
        for key,value in coffes_edges.items():
            edges_coffe[str(key)] = value

        info = {'nodes_thres': dict_nodes_thres, 'nodes_base': dict_nodes_base, 'edges_lag': edges_lag, 'edges_coffe': edges_coffe}
        
        last_node = topological_order[-1]
        print(np.sum(data[last_node]>=dict_nodes_thres[last_node][0])/n)

        data.to_csv(os.path.join(save_data_path, json_file_path.split('/')[1].replace('graph', 'data').replace('json', 'csv')), index=False)

        save_data_info_path = os.path.join(save_info_path, json_file_path.split('/')[1].replace('graph', 'info'))
        # Save the dictionary as a JSON file
        with open(save_data_info_path, 'w') as json_file:
            json.dump(info, json_file)

In [90]:
# Generate historical data
graphs_path = 'graphs'
n = 2000 # sampling points
gamma_min = 1
gamma_max = 1
thres_min = 0.7
thres_max = 0.9
epsilon = 0.3
prob_inter = 0.05
self_loops = False
max_anomaly = 5
# data_path = os.path.join('certain_test', 'historical_data')
# info_path = os.path.join('certain_test', 'data_info')
data_path = os.path.join('SK_uncertain_test_' + str(epsilon), 'historical_data')
info_path = os.path.join('SK_uncertain_test_' + str(epsilon), 'data_info')
generate_historical_data_by_folder_SK(graphs_path=graphs_path, save_data_path=data_path, save_info_path=info_path, n=n,
                                   gamma_min=gamma_min, gamma_max=gamma_max, thres_min=thres_min,
                                   thres_max=thres_max, prob_inter=prob_inter, epsilon=epsilon,
                                   self_loops=self_loops, max_anomaly=max_anomaly)

  6%|▌         | 3/50 [00:00<00:04, 10.09it/s]

0.519
0.4225
0.4885


 10%|█         | 5/50 [00:00<00:04, 10.58it/s]

0.3205
0.3995
0.4375


 18%|█▊        | 9/50 [00:00<00:03, 10.79it/s]

0.392
0.4525
0.4245


 22%|██▏       | 11/50 [00:01<00:03, 10.60it/s]

0.4065
0.351


 26%|██▌       | 13/50 [00:01<00:03, 10.00it/s]

0.3295
0.4165
0.502


 34%|███▍      | 17/50 [00:01<00:03, 10.44it/s]

0.4825
0.348
0.5045


 38%|███▊      | 19/50 [00:01<00:02, 10.50it/s]

0.32
0.4865
0.4955


 46%|████▌     | 23/50 [00:02<00:02, 10.60it/s]

0.513
0.407
0.446


 50%|█████     | 25/50 [00:02<00:02, 10.59it/s]

0.4275
0.5805
0.3455


 58%|█████▊    | 29/50 [00:02<00:01, 10.54it/s]

0.3705
0.479
0.435


 62%|██████▏   | 31/50 [00:02<00:01, 10.51it/s]

0.471
0.3985
0.412


 70%|███████   | 35/50 [00:03<00:01, 10.33it/s]

0.496
0.402
0.491


 74%|███████▍  | 37/50 [00:03<00:01, 10.41it/s]

0.5035
0.4755
0.431


 82%|████████▏ | 41/50 [00:03<00:00, 10.55it/s]

0.5025
0.38
0.441


 86%|████████▌ | 43/50 [00:04<00:00, 10.59it/s]

0.5615
0.552
0.443


 94%|█████████▍| 47/50 [00:04<00:00, 10.71it/s]

0.4255
0.5645
0.4195


100%|██████████| 50/50 [00:04<00:00, 10.51it/s]

0.416
0.502
0.384





In [99]:
def generate_simulation_data_by_folder_SK(graphs_path, info_path, save_data_path, save_info_path, n, num_inters=1, epsilon=0.3, self_loops=False, max_anomaly=5):
    self_lag = 1
    # np.random.seed(seed=seed)
    if not os.path.exists(save_data_path):
        # If it doesn't exist, create the folder
        os.makedirs(save_data_path)

    if not os.path.exists(save_info_path):
        # If it doesn't exist, create the folder
        os.makedirs(save_info_path)

    graph_files = [os.path.join(graphs_path, f) for f in os.listdir(graphs_path) if os.path.isfile(os.path.join(graphs_path, f))]

    for json_file_path in tqdm(graph_files):
        with open(json_file_path, 'r') as json_file:
            json_graph = json.load(json_file)

        data_info_path = os.path.join(info_path, json_file_path.split('/')[1].replace('graph', 'info'))
        with open(data_info_path, 'r') as json_file:
            data_info = json.load(json_file)

        dict_nodes_thres = data_info['nodes_thres']
        dict_nodes_base = data_info['nodes_base']
        dict_edges_lag = data_info['edges_lag']
        dict_edges_coffe = data_info['edges_coffe']
        # Convert the loaded JSON data into a NetworkX graph
        graph = dict_to_graph(graph_dict=json_graph, inter_nodes=[])
        if num_inters == 1:
            inter_nodes = np.random.choice(graph.nodes, size=num_inters, replace=False)
        else:
            while True:
                inter_nodes = np.random.choice(graph.nodes, size=num_inters, replace=False)
                in_same_path = False
                for node_1 in inter_nodes:
                    for node_2 in inter_nodes:
                        if node_1 != node_2:
                            if node_1 in nx.ancestors(graph, node_2): # and node_2 not in nx.ancestors(graph, node_1)):
                                in_same_path = True
                if not in_same_path:
                    break

        graph = dict_to_graph(graph_dict=json_graph, inter_nodes=inter_nodes)

        nodes_list = list(graph.nodes())
        edges_list = list(graph.edges())
        topological_order = list(nx.topological_sort(graph))

        data = {}
        children_list = []

        for edge in edges_list:
            children_list.append(edge[1])

        children_list = list(set(children_list))
        
        for node in nodes_list:
            data[node] = []
            if node in inter_nodes:
                data[node] = np.random.uniform(low=dict_nodes_thres[node][0], high=dict_nodes_thres[node][0]+1, size=n)
                for i in range(n):
                    if (i+1)%max_anomaly == 1:
                        data[node][i] =  dict_nodes_base[node][0]
            else:
                data[node] = np.array([dict_nodes_base[node][0] for i in range(n)])

        # transfer interventions
        for node in topological_order:
            parents = list(graph.predecessors(node))
            if len(parents) != 0:
                for i in range(len(data[node])):
                    # abnormal_parent = False
                    abnormal_parent = 0
                    for par in parents:
                        par_values = data[par]
                        lag = dict_edges_lag[str((par, node))]
                        if i-lag >=0 and par_values[i-lag] >= dict_nodes_thres[par][0]:
                            # abnormal_parent = True
                            abnormal_parent += 1
                    # if abnormal_parent and np.random.uniform() < 1-epsilon:
                    #     data[node][i] = np.random.uniform(low=dict_nodes_thres[node][0], high=1)
                    if self_loops and i-self_lag >=0 and data[node][i-self_lag] >= dict_nodes_thres[node][0]:
                        abnormal_parent +=1
                    if abnormal_parent != 0:
                        for m in range(abnormal_parent):
                            if np.random.uniform() < 1-epsilon:
                                data[node][i] += dict_nodes_thres[node][0]
                    if i-max_anomaly-1>=0 and np.sum(data[node][i-max_anomaly-1:i-1]>=dict_nodes_thres[node][0])==max_anomaly:
                        data[node][i] = dict_nodes_base[node][0]
            else:
                 continue

        data = pd.DataFrame(data)
        
        only_inter = True
        ## *****************************************************************************************************
        ## Check data
        print(json_file_path.split('/')[1].split('.')[0])
        print("****Check impacts of the intervention node****")
        for node in inter_nodes:
            print('Intervention node: ' + node)
            print('Descendants of the intervention node:' + str(list(nx.descendants(graph, node))))
            check_impact_of_intervention_node(graph=graph, inter_node=node,
                                              data=data, dict_nodes_thres=dict_nodes_thres, dict_edges_lag=dict_edges_lag, n=n)
        if not only_inter:
            print("****Check impacts of the root****")
            root_nodes = [i for i in nodes_list if i not in children_list and i not in inter_nodes]
            for root in root_nodes:
                print('root:' + root)
                check_impact_of_intervention_node(graph=graph, inter_node=root,
                                          data=data, dict_nodes_thres=dict_nodes_thres, dict_edges_lag=dict_edges_lag, n=n)
        ## *****************************************************************************************************

        info = {'nodes_thres': dict_nodes_thres, 'nodes_base': dict_nodes_base, 'edges_lag': dict_edges_lag, 'edges_coffe': dict_edges_coffe, 'intervention_node': list(inter_nodes)}

        data.to_csv(os.path.join(save_data_path, json_file_path.split('/')[1].replace('graph', 'data').replace('json', 'csv')), index=False)

        data_info_path = os.path.join(save_info_path, json_file_path.split('/')[1].replace('graph', 'info'))
        # Save the dictionary as a JSON file
        with open(data_info_path, 'w') as json_file:
            json.dump(info, json_file)

In [103]:
graphs_path = 'graphs'
info_path = os.path.join('SK_uncertain_test_0.3', 'data_info')
n = 50 # sampling points
num_inters = 1
epsilon = 0
self_loops = False
max_anomaly = 5
# save_data_path = os.path.join('certain_', 'data')
#save_info_path = os.path.join('certain_', 'data_info')
save_data_path = os.path.join('SK_uncertain_test_0.3', 'actual_data_'+str(num_inters)+'_inters')
save_info_path = os.path.join('SK_uncertain_test_0.3', 'data_info_'+str(num_inters)+'_inters')
generate_simulation_data_by_folder_SK(graphs_path=graphs_path, info_path=info_path, save_data_path=save_data_path, save_info_path=save_info_path,
                                      n=n, num_inters=num_inters, epsilon=epsilon, self_loops=self_loops, max_anomaly=max_anomaly)

100%|██████████| 50/50 [00:00<00:00, 279.34it/s]

graph_3_2_6
****Check impacts of the intervention node****
Intervention node: c
Descendants of the intervention node:['e', 'd', 'f']
e 0
d 0
f 10
graph_2_3_6
****Check impacts of the intervention node****
Intervention node: b
Descendants of the intervention node:['e', 'f']
e 0
f 0
graph_2_2_5
****Check impacts of the intervention node****
Intervention node: f
Descendants of the intervention node:[]
graph_1_2_8
****Check impacts of the intervention node****
Intervention node: a
Descendants of the intervention node:['d', 'b', 'e', 'c', 'f']
d 0
b 0
e 0
c 0
f 0
graph_1_2_5
****Check impacts of the intervention node****
Intervention node: b
Descendants of the intervention node:['c', 'e', 'd', 'f']
c 0
e 0
d 0
f 0
graph_2_3_1
****Check impacts of the intervention node****
Intervention node: e
Descendants of the intervention node:['f']
f 0
graph_1_2_9
****Check impacts of the intervention node****
Intervention node: f
Descendants of the intervention node:[]
graph_1_2_2
****Check impacts of t


