In [1]:
import pandas as pd
import os 
import json
from tqdm import tqdm
import numpy as np
from statsmodels.stats.multitest import fdrcorrection
from scipy import stats

In [2]:
list_mechanisme = ['different_path', 'one_path']
list_process = ['E1'] #['E1', 'E2']
list_scenarios = ['certain', 'certain_SL', 'uncertain_0.3', 'uncertain_0.3_SL']

In [3]:
# graph_path = 'graphs'
gamma_max= 1
for mechanisme in list_mechanisme:
    for process in list_process:
        for scenario in list_scenarios:
            if mechanisme == 'different_path':
                actual_data_path = os.path.join(process, scenario, 'actual_data_2_inters_2000')
            elif mechanisme == 'one_path':
                actual_data_path = os.path.join(process, scenario, 'actual_data_same_path_2_inters_2000')
                
            actual_data_files = [os.path.join(actual_data_path, f) for f in os.listdir(actual_data_path) if os.path.isfile(os.path.join(actual_data_path, f))]
            
            for actual_data_file in tqdm(actual_data_files):
                if mechanisme == 'different_path':
                    historical_data = pd.read_csv(actual_data_file.replace('actual_data_2_inters_2000', 'historical_data_20000')) 
                elif mechanisme == 'one_path':
                    historical_data = pd.read_csv(actual_data_file.replace('actual_data_same_path_2_inters_2000', 'historical_data_20000'))
                actual_data = pd.read_csv(actual_data_file)
                actual_data = pd.concat([historical_data, actual_data], ignore_index=True)
                data_info = os.path.join(os.path.join(process, scenario), 'data_info_20000', actual_data_file.split('/')[-1].replace('data', 'info').replace('csv', 'json'))
                with open(data_info, 'r') as json_file:
                    data_info = json.load(json_file)
                threshold_of_data = data_info['nodes_thres']
                length_of_data = actual_data.values.shape[0]
                list_variables = actual_data.columns
                window_size = gamma_max*(len(list_variables)-1)
                log_data = {}
                log_data['case:concept:name'] = []
                log_data['observation'] = []
                log_data['time:timestamp'] = []
                case_index = 1
                for index in range(length_of_data):
                    for i in range(window_size+1):
                        if index + i < length_of_data:
                            for var in list_variables:
                                if actual_data[var][index+i] >= threshold_of_data[var]:
                                    log_data['case:concept:name'].append(case_index)
                                    log_data['observation'].append(var)
                                    log_data['time:timestamp'].append(index+i)
                    case_index+=1
                                              
                log_data = pd.DataFrame(log_data)
                log_data = log_data.sort_values(by='time:timestamp')

                if mechanisme == 'different_path':
                    output_data_path = os.path.join(process, scenario, 'LOG_actual_data_2_inters_2000')
                    if not os.path.exists(output_data_path):
                    # If it doesn't exist, create the folder
                        os.makedirs(output_data_path)  
                elif mechanisme == 'one_path':
                    output_data_path = os.path.join(process, scenario, 'LOG_actual_data_same_path_2_inters_2000')
                    if not os.path.exists(output_data_path):
                    # If it doesn't exist, create the folder
                        os.makedirs(output_data_path) 
                log_data.to_csv(os.path.join(output_data_path, actual_data_file.split('/')[-1]), index=False)
                

100%|██████████| 50/50 [03:35<00:00,  4.31s/it]
100%|██████████| 50/50 [03:58<00:00,  4.76s/it]
100%|██████████| 50/50 [03:29<00:00,  4.19s/it]
100%|██████████| 50/50 [03:48<00:00,  4.56s/it]
100%|██████████| 50/50 [03:44<00:00,  4.49s/it]
100%|██████████| 50/50 [03:54<00:00,  4.68s/it]
100%|██████████| 50/50 [03:37<00:00,  4.34s/it]
100%|██████████| 50/50 [04:07<00:00,  4.95s/it]


# Normal data generating process

In [4]:
list_mechanisme = ['EasyRCA']
list_scenarios = ['Parametric_2'] # ['Parametric', 'Structual']

In [5]:
historical_data_length = 20000
gamma_max= 1
normal_ratio = 0.9
for mechanisme in list_mechanisme:
    for scenario in list_scenarios:
        whole_data_path = os.path.join(mechanisme, scenario, 'data')
        
        whole_data_files = [os.path.join(whole_data_path, f) for f in os.listdir(whole_data_path) if os.path.isfile(os.path.join(whole_data_path, f))]

        for whole_data_file in tqdm(whole_data_files):
            whole_data = pd.read_csv(whole_data_file)
            param_data = whole_data.head(historical_data_length)
            threshold_of_data = {}
            for node in whole_data.columns:
                threshold_of_data[node] = np.sort(param_data[node])[int(normal_ratio*param_data[node].shape[0])]
            length_of_data = whole_data.values.shape[0]
            list_variables = whole_data.columns
            window_size = gamma_max*(len(list_variables)-1)
            log_data = {}
            log_data['case:concept:name'] = []
            log_data['observation'] = []
            log_data['time:timestamp'] = []
            case_index = 1
            for index in range(length_of_data):
                for i in range(window_size+1):
                    if index + i < length_of_data:
                        for var in list_variables:
                            if whole_data[var][index+i] >= threshold_of_data[var]:
                                log_data['case:concept:name'].append(case_index)
                                log_data['observation'].append(var)
                                log_data['time:timestamp'].append(index+i)
                case_index+=1

            log_data = pd.DataFrame(log_data)
            log_data = log_data.sort_values(by='time:timestamp')

            output_data_path = os.path.join(mechanisme, scenario, 'LOG')
            if not os.path.exists(output_data_path):
            # If it doesn't exist, create the folder
                os.makedirs(output_data_path)  
           
            log_data.to_csv(os.path.join(output_data_path, whole_data_file.split('/')[-1]), index=False)

100%|██████████| 50/50 [05:02<00:00,  6.05s/it]


# Dowhy data generating process

In [2]:
# Please pay attention to the name of the folder
list_mechanisme = ['Dowhy_0.5']
list_scenarios = ['different_path', 'one_path']

In [3]:
historical_data_length = 20000
gamma_max= 1
normal_ratio = 0.9
for mechanisme in list_mechanisme:
    for scenario in list_scenarios:
        whole_data_path = os.path.join(mechanisme, scenario, 'data')
        
        whole_data_files = [os.path.join(whole_data_path, f) for f in os.listdir(whole_data_path) if os.path.isfile(os.path.join(whole_data_path, f))]

        for whole_data_file in tqdm(whole_data_files):
            whole_data = pd.read_csv(whole_data_file)
            param_data = whole_data.head(historical_data_length)
            threshold_of_data = {}
            for node in whole_data.columns:
                threshold_of_data[node] = np.sort(param_data[node])[int(normal_ratio*param_data[node].shape[0])]
            length_of_data = whole_data.values.shape[0]
            list_variables = whole_data.columns
            window_size = gamma_max*(len(list_variables)-1)
            log_data = {}
            log_data['case:concept:name'] = []
            log_data['observation'] = []
            log_data['time:timestamp'] = []
            case_index = 1
            for index in range(length_of_data):
                for i in range(window_size+1):
                    if index + i < length_of_data:
                        for var in list_variables:
                            if whole_data[var][index+i] >= threshold_of_data[var]:
                                log_data['case:concept:name'].append(case_index)
                                log_data['observation'].append(var)
                                log_data['time:timestamp'].append(index+i)
                case_index+=1

            log_data = pd.DataFrame(log_data)
            log_data = log_data.sort_values(by='time:timestamp')

            output_data_path = os.path.join(mechanisme, scenario, 'LOG')
            if not os.path.exists(output_data_path):
            # If it doesn't exist, create the folder
                os.makedirs(output_data_path)  
           
            log_data.to_csv(os.path.join(output_data_path, whole_data_file.split('/')[-1]), index=False)

100%|██████████| 50/50 [15:58<00:00, 19.16s/it]
100%|██████████| 50/50 [18:34<00:00, 22.30s/it]


# Monitoring data

In [15]:
import glob
import json

simplify_node_name = {
    'Real time merger bolt de not_found sur Storm-1': 'Real time merger bolt',
    'Check message bolt de not_found sur storm-1': 'Check message bolt',
    'Message dispatcher bolt de not_found sur storm-1': 'Message dispatcher bolt',
    'Metric bolt de not_found sur Storm-1': 'Metric bolt',
    'Pre-Message dispatcher bolt de not_found sur storm-1': 'Pre-Message dispatcher bolt',
    'capacity_last_metric_bolt de Apache-Storm-bolt_capacity_topology - monitoring_ingestion sur prd-ovh-storm-01': 'Last_metric_bolt',
    'capacity_elastic_search_bolt de Apache-Storm-bolt_capacity_topology - monitoring_ingestion sur prd-ovh-storm-01': 'Elastic_search_bolt',
    'Group status information bolt de not_found sur storm-1': 'Group status information bolt'
}

column_name_transfer = {'capacity_last_metric_bolt': 'Last_metric_bolt',
                        'capacity_elastic_search_bolt': 'Elastic_search_bolt',
                        'pre_Message_dispatcher_bolt': 'Pre-Message dispatcher bolt',
                        'check_message_bolt': 'Check message bolt',
                        'message_dispatcher_bolt': 'Message dispatcher bolt',
                        'metric_bolt': 'Metric bolt',
                        'group_status_information_bolt': 'Group status information bolt',
                        'Real_time_merger_bolt': 'Real time merger bolt'
}

true_root_causes = ['Elastic_search_bolt', 'Pre-Message dispatcher bolt']

boolean_variables = []
whole_data = pd.DataFrame()
dict_anomaly = pd.DataFrame()
directoryPath = '../real_monitoring_data/'

for file_name in glob.glob(directoryPath + '*.csv'):
    if "data_with_incident_between_46683_and_46783" not in file_name:
        col_value = pd.read_csv(file_name, low_memory=False)
        with open(file_name.replace('.csv', '.json')) as json_file:
            x_descri = json.load(json_file)
        whole_data[simplify_node_name[x_descri["metric_name"]]] = col_value['value']
        dict_anomaly[simplify_node_name[x_descri["metric_name"]]] = x_descri["anomalies"]

In [16]:
anomaly_start = 46683
anomaly_end = 46783
normal_ratio = 0.9

param_data = whole_data.head(anomaly_start)

threshold_of_data = {}
for node in param_data.columns:
    threshold_of_data[node] = np.sort(param_data[node])[int(normal_ratio*param_data[node].shape[0])]
length_of_data = whole_data.values.shape[0]
list_variables = whole_data.columns
window_size = gamma_max*(len(list_variables)-1)
log_data = {}
log_data['case:concept:name'] = []
log_data['observation'] = []
log_data['time:timestamp'] = []
case_index = 1
for index in range(length_of_data):
    for i in range(window_size+1):
        if index + i < length_of_data:
            for var in list_variables:
                if whole_data[var][index+i] >= threshold_of_data[var]:
                    log_data['case:concept:name'].append(case_index)
                    log_data['observation'].append(var)
                    log_data['time:timestamp'].append(index+i)
    case_index+=1

log_data = pd.DataFrame(log_data)
log_data = log_data.sort_values(by='time:timestamp')
 
log_data.to_csv('../monitring_data.csv', index=False)

In [17]:
log_data

Unnamed: 0,case:concept:name,observation,time:timestamp
0,4,Real time merger bolt,10
21,10,Real time merger bolt,10
15,9,Real time merger bolt,10
10,8,Real time merger bolt,10
6,7,Real time merger bolt,10
...,...,...,...
652633,70585,Real time merger bolt,70587
652638,70587,Real time merger bolt,70587
652624,70583,Real time merger bolt,70587
652636,70586,Real time merger bolt,70587


In [1]:
list_mechanisme = ['different_path'] # ['different_path', 'one_path']
list_process = ['E1'] #['E1', 'E2']
list_scenarios = ['certain'] # ['certain', 'certain_SL', 'uncertain_0.3', 'uncertain_0.3_SL']

In [18]:
import os 
import sys 
sys.path.append('/home/lzan/Bureau/Dynamic causal graph/root-cause-analysis/RAITIA')

import pandas as pd

from baseline.AITIA.Inference import Inference

list_variables = ['a', 'b', 'c', 'd', 'e', 'f']


for mechanisme in list_mechanisme:
    for process in list_process:
        for scenario in list_scenarios:
            if mechanisme == 'different_path':
                actual_data_path = os.path.join(process, scenario, 'LOG_actual_data_2_inters_2000')
            elif mechanisme == 'one_path':
                actual_data_path = os.path.join(process, scenario, 'LOG_actual_data_same_path_2_inters_2000')
                

set_log_files = [os.path.join(actual_data_path, f) for f in os.listdir(actual_data_path) if os.path.isfile(os.path.join(actual_data_path, f))]

for log_file in set_log_files[1:2]:
    inference = Inference(log_file, pb = False)
    inference.generate_hypotheses_for_effects(causes = inference.alphabet, effects = inference.alphabet)
    inference.test_for_prima_facie()
    all_epsilon_averages = inference.calculate_average_epsilons()
    print(all_epsilon_averages)

other causes
['b', 'f']
other causes
['a', 'f']
other causes
['a', 'b']
other causes
['c', 'f']
other causes
['a', 'f']
other causes
['a', 'c']
other causes
['c', 'b']
other causes
['a', 'b']
other causes
['a', 'c']
other causes
['b']
other causes
['a']


In [19]:
all_epsilon_averages

{('a', 'c'): 0.41081934610908344,
 ('b', 'c'): 0.5230329868658357,
 ('f', 'c'): 0.6998298969598098,
 ('a', 'b'): 0.6046461519225521,
 ('c', 'b'): 0.6169633626298179,
 ('f', 'b'): 0.6141794203613968,
 ('a', 'f'): 0.41682059810890904,
 ('c', 'f'): 0.6895196111808372,
 ('b', 'f'): 0.5399877626544682,
 ('a', 'd'): -0.06406307830806346,
 ('b', 'd'): 0.7565368676206115}

In [23]:


alpha = 0.5

if len(all_epsilon_averages.keys())==0:
    pref_root_cause = []
else:
    list_effect = []
    pred_root_cause = []


    for edge in all_epsilon_averages.keys():
        list_effect.append(edge[1])

    list_effect = list(set(list_effect))

    for effect in list_effect:
        list_edges = []
        list_epsilons = []
        for edge in all_epsilon_averages.keys():
            if edge[1] == effect:
                list_edges.append(edge)
                list_epsilons.append(all_epsilon_averages[edge])

        list_epsilons = np.array(list_epsilons)
        if np.std(list_epsilons) == 0:
            z_scores = (list_epsilons-np.mean(list_epsilons))
        else:
            z_scores = (list_epsilons-np.mean(list_epsilons))/np.std(list_epsilons)
        p_values = stats.norm.sf(abs(z_scores))*2

        res, p_values_adapt = fdrcorrection(p_values, alpha=alpha)
        for i in np.where(res==True)[0]:
            pred_root_cause.append(list_edges[i][0])

In [27]:
pred_root_cause = list(set(pred_root_cause))
print(pred_root_cause)

['a', 'b', 'c', 'f']


In [1]:
from rpy2.robjects import numpy2ri, r, FloatVector
from rpy2.robjects.packages import importr


# Convert numpy arrays to R objects
numpy2ri.activate()

# Install and load the fdrtool package in R
# r('install.packages("fdrtool")')
fdrtool = importr('fdrtool')

In [2]:
all_epsilon_averages = {('a_t_1>0.8', 'a_t'): -0.006353078676086981, ('b_t_1>0.81', 'a_t'): -0.010453027046809268, ('d_t_1>0.77', 'a_t'): -0.0002159425228454498, ('c_t_1>0.75', 'a_t'): 0.0016278590484477967, ('e_t_1>0.78', 'a_t'): -0.0031319646113668496, ('f_t_1>0.88', 'a_t'): -0.00436052863915595, ('a_t_1>0.8', 'b_t'): 0.9035687319786583, ('b_t_1>0.81', 'b_t'): -0.009449299849309672, ('d_t_1>0.77', 'b_t'): -0.003827818653238041, ('c_t_1>0.75', 'b_t'): -0.012339257600468001, ('e_t_1>0.78', 'b_t'): 0.003685227805564789, ('f_t_1>0.88', 'b_t'): 0.011209264333918296, ('a_t_1>0.8', 'd_t'): 0.9038343715711326, ('b_t_1>0.81', 'd_t'): -0.010179292095904468, ('d_t_1>0.77', 'd_t'): -0.009286167270716938, ('c_t_1>0.75', 'd_t'): -0.014733574730934462, ('e_t_1>0.78', 'd_t'): -0.005826854997717101, ('f_t_1>0.88', 'd_t'): 0.002212161556768499, ('a_t_1>0.8', 'c_t'): -0.001847436841506489, ('b_t_1>0.81', 'c_t'): 0.8985670333521618, ('d_t_1>0.77', 'c_t'): 0.34772616465863726, ('c_t_1>0.75', 'c_t'): -0.004775653258291346, ('e_t_1>0.78', 'c_t'): -0.005506604862541309, ('f_t_1>0.88', 'c_t'): -0.002307756650838366, ('a_t_1>0.8', 'e_t'): 0.006348340122962437, ('b_t_1>0.81', 'e_t'): -0.0029780998005594084, ('d_t_1>0.77', 'e_t'): 0.002250651184571473, ('c_t_1>0.75', 'e_t'): 0.8933357071631354, ('e_t_1>0.78', 'e_t'): -0.0021874331346052656, ('f_t_1>0.88', 'e_t'): -0.005919179851838652, ('a_t_1>0.8', 'f_t'): -0.010802722868314796, ('b_t_1>0.81', 'f_t'): -0.0175974070188879, ('d_t_1>0.77', 'f_t'): -0.0035348304456794266, ('c_t_1>0.75', 'f_t'): 0.8946535481297984, ('e_t_1>0.78', 'f_t'): -0.007463291774489245, ('f_t_1>0.88', 'f_t'): -0.004502987831500072}

In [19]:
list_effect = []
pred_root_cause = []

for edge in all_epsilon_averages.keys():
    list_effect.append(edge[1])
    
list_effect = list(set(list_effect))

causes = []
for effect in list_effect:
    list_edges = []
    list_epsilons = []
    for edge in all_epsilon_averages.keys():
        if edge[1] == effect:
            list_edges.append(edge)
            list_epsilons.append(all_epsilon_averages[edge])
    
    list_epsilons = np.array(list_epsilons)
    z_scores = (list_epsilons-np.mean(list_epsilons))/np.std(list_epsilons)
    p_values = stats.norm.sf(abs(z_scores))*2

    res, p_values_adapt = fdrcorrection(p_values, alpha=alpha)
    for i in np.where(res==True)[0]:
        pred_root_cause.append(list_edges[i][0].split('_')[0])
pred_root_cause = list(set(pred_root_cause))

In [20]:
pred_root_cause

[]

In [16]:
list_edges

[('a_t_1>0.8', 'c_t'),
 ('b_t_1>0.81', 'c_t'),
 ('d_t_1>0.77', 'c_t'),
 ('c_t_1>0.75', 'c_t'),
 ('e_t_1>0.78', 'c_t'),
 ('f_t_1>0.88', 'c_t')]