In [1]:
import sys

# Move one directory up so the relative paths appended below resolve from there.
%cd ..
# add the src directory for the code
sys.path.append('src')

# NOTE(review): `%cd .` stays in the current directory (the cell output prints
# the same path twice), so this line looks like a no-op — confirm and drop.
%cd .
# Also put the complexity_10 dataset folder on the module search path.
sys.path.append('cdml-neurips2020/datasets/complexity_10')

/home/nikolas/Downloads/CS-673 (23-24)/Project/msft_causica
/home/nikolas/Downloads/CS-673 (23-24)/Project/msft_causica


In [2]:
import os
import warnings
import fsspec
import json
import networkx as nx
import numpy as np
import pandas as pd
import torch
import pickle
import cdt

from causica.lightning.modules.deci_module import DECIModule
from tigramite.data_processing import DataFrame
from tigramite.pcmci import PCMCI
from tigramite.independence_tests.parcorr import ParCorr

warnings.filterwarnings("ignore")

Detecting 1 CUDA device(s).


In [3]:
# Function to compute precision, recall, and F1-score for edges
def edge_metrics(true_graph, predicted_graph):
    """Score the predicted edge set against the ground-truth edge set.

    Both arguments only need an ``edges()`` method yielding hashable edges.

    Returns:
        A ``(precision, recall, f1_score)`` tuple. Each value falls back to 0
        when its denominator is not positive.
    """
    reference = set(true_graph.edges())
    candidate = set(predicted_graph.edges())

    hits = len(reference.intersection(candidate))
    spurious = len(candidate.difference(reference))
    missed = len(reference.difference(candidate))

    precision = 0
    if hits + spurious > 0:
        precision = hits / (hits + spurious)

    recall = 0
    if hits + missed > 0:
        recall = hits / (hits + missed)

    f1_score = 0
    if precision + recall > 0:
        f1_score = 2 * precision * recall / (precision + recall)

    return precision, recall, f1_score

def load_deci(root_path, name):
    """Deserialize the saved DECI object belonging to a dataset.

    The file is looked up at ``<root_path>/<name>deci_learned_SEM.pt``; the
    suffix is appended to ``name`` without any separator.
    """
    return torch.load(os.path.join(root_path, name + 'deci_learned_SEM.pt'))

def run_pcmci(df,filename):
    """Run PCMCI (ParCorr test) on a time-series frame and return the lagged link graph.

    Args:
        df: pandas DataFrame; its columns are used as variable names and its
            values as the data array handed to tigramite.
        filename: dataset file name containing the substrings
            ``'Max Lag <int> Min Lag <int> Target Max Parents'``; the two
            integers are used as ``tau_max`` and ``tau_min``.

    Returns:
        A ``networkx.DiGraph`` with an edge ``"<source>:<lag>" -> "<target>"``
        for every retained link between two different variables.

    Raises:
        IndexError / ValueError: if ``filename`` does not follow the expected pattern.
    """
    data_array = df.values
    tigramite_df = DataFrame(data_array, var_names=df.columns)

    # Extract max_lag and min_lag from the filename
    max_lag = int(filename.split('Max Lag ')[1].split(' Min Lag')[0])
    min_lag = int(filename.split('Min Lag ')[1].split(' Target Max Parents')[0])
    print(f'Max Lag: {max_lag}, Min Lag: {min_lag}')

    # Run PCMCI
    parcorr = ParCorr(significance='analytic')
    pcmci = PCMCI(dataframe=tigramite_df, cond_ind_test=parcorr, verbosity=1)
    results = pcmci.run_pcmci(tau_min=min_lag, tau_max=max_lag, pc_alpha=0.05, link_assumptions=None)
    p_matrix = results['p_matrix']
    # NOTE(review): the graph is filtered on the uncorrected p-values; the
    # FDR-corrected values (pcmci.get_corrected_pvalues) were computed here
    # before but never used, so that dead computation has been removed.

    # Create a filtered PCMCI graph based on significant links
    def create_filtered_pcmci_graph(var_names, link_matrix, p_matrix, alpha_level=0.05, threshold=0.2):
        """Keep links with p < alpha_level and |strength| >= threshold, for lags 1..tau_max."""
        pcmci_graph = nx.DiGraph()
        max_lag = p_matrix.shape[2] - 1  # tau_max is equal to max_lag

        for i in range(link_matrix.shape[0]):
            for j in range(link_matrix.shape[1]):
                # Links from a variable to itself are skipped.
                if i != j:
                    for lag in range(1, max_lag + 1):
                        # The test statistic is signed: compare its magnitude so that
                        # strong negative dependencies are not silently discarded.
                        if p_matrix[i, j, lag] < alpha_level and abs(link_matrix[i, j, lag]) >= threshold:
                            source_node = f"{var_names[i]}:{lag}"
                            target_node = f"{var_names[j]}"
                            pcmci_graph.add_edge(source_node, target_node, label=f"lag {lag}")
                        # Register both endpoints even when no edge was kept.
                        pcmci_graph.add_node(f"{var_names[j]}")
                        pcmci_graph.add_node(f"{var_names[i]}:{lag}")

        return pcmci_graph

    link_matrix = results['val_matrix']
    pcmci_graph = create_filtered_pcmci_graph(tigramite_df.var_names, link_matrix, p_matrix, alpha_level=0.05, threshold=0.1)

    return pcmci_graph

def evaluate_model(graph, gt_graph, extra=True):
    """Compare a predicted graph with the ground truth.

    Args:
        graph: predicted graph.
        gt_graph: ground-truth graph.
        extra: when truthy, also compute the Structural Intervention Distance;
            otherwise SID is reported as ``0.``.

    Returns:
        ``(shd, sid, precision, recall, f1_score)``.
    """
    shd = cdt.metrics.SHD(gt_graph, graph, double_for_anticausal=True)

    if extra:
        # SID is asymmetric and takes (target, pred) like SHD, so the ground
        # truth must come first.
        sid = cdt.metrics.SID(gt_graph, graph) #depends on the R-package SID - comment if not installed
    else:
        sid = 0.

    precision, recall, f1_score = edge_metrics(gt_graph, graph)
    return shd, sid, precision, recall, f1_score

In [5]:
# Dataset root and the complexity sub-folders to evaluate
root_path = '../cdml-neurips2020/datasets/'
complexities = ['complexity_0', 'complexity_10', 'complexity_30']

# One CSV of evaluation metrics per method, stored next to the datasets
deci_metrics_file = os.path.join(root_path, 'deci_evaluation_metrics.csv')
pcmci_metrics_file = os.path.join(root_path, 'pcmci_evaluation_metrics.csv')

# Start both files fresh (truncating earlier results) with the same header row
for metrics_file in (deci_metrics_file, pcmci_metrics_file):
    with open(metrics_file, 'w') as f:
        f.write('Dataset,SHD,SID,Precision,Recall,F1-Score\n')

# For every '*-lagged.csv' file of each complexity folder: write a variables
# spec, build a constraint matrix, load the trained DECI model, run PCMCI on the
# companion un-lagged CSV, load the ground-truth graph, and append the metrics
# of both methods to their CSV files.
for complexity in complexities:
    dataset_path = os.path.join(root_path, complexity)
    for filename in os.listdir(dataset_path):
        if filename.endswith('-lagged.csv'):

            # PCMCI reads the companion CSV whose name lacks the '-lagged' suffix.
            name_pcmci = filename.replace('-lagged.csv', '.csv')
            df_pcmci = pd.read_csv(os.path.join(dataset_path, name_pcmci))
            
            df = pd.read_csv(os.path.join(dataset_path, filename))
            name = filename.replace('-lagged.csv', '')
            # Declare every column as a continuous variable in its own group.
            variables_spec = []
            for col in df.columns:
                variables_spec.append({"name": col, "type": "continuous", "group_name": col})
            
            # The spec is written next to the CSV ('<...>-lagged.json') and read straight back.
            variables_path = os.path.join(dataset_path, filename.replace('.csv', '.json'))
            with fsspec.open(variables_path, mode="w", encoding="utf-8") as f:
                json.dump({"variables": variables_spec}, f, indent=2)

            with fsspec.open(variables_path, mode="r", encoding="utf-8") as f:
                variables_spec = json.load(f)["variables"]
            
            # Generate constraint matrix
            # Starts as all-NaN; selected entries are set to 0 below.
            # Presumably NaN = unconstrained and 0 = edge forbidden — TODO confirm.
            # NOTE(review): constraint_matrix and max_lag are not used again within this cell.
            node_names = df.columns.tolist()
            num_nodes = len(node_names)
            max_lag = max([int(node.split(":")[1]) for node in node_names if ":" in node])
            constraint_matrix = np.full((num_nodes, num_nodes), np.nan, dtype=np.float32)
            node_name_to_idx = {key: i for i, key in enumerate(node_names)}
            
            for node in node_names:
                for node_2 in node_names:
                    suffix = node.split(":")[1] if ":" in node else None
                    suffix_2 = node_2.split(":")[1] if ":" in node_2 else None
                    # Two lagged nodes with the same lag suffix: zero both directions.
                    if suffix is not None and suffix_2 is not None and suffix == suffix_2:
                        constraint_matrix[node_name_to_idx[node], node_name_to_idx[node_2]] = 0
                        constraint_matrix[node_name_to_idx[node_2], node_name_to_idx[node]] = 0
                    # Un-lagged node (no ':' in its name): zero its entire row.
                    if ":" not in node:
                        constraint_matrix[node_name_to_idx[node], :] = 0
                    prefix_2 = node_2.split(":")[0]
                    # Diagonal entry.
                    if node == node_2:
                        constraint_matrix[node_name_to_idx[node], node_name_to_idx[node_2]] = 0
                    # node equals the part of node_2 before ':'.
                    if prefix_2 == node:
                        constraint_matrix[node_name_to_idx[node], node_name_to_idx[node_2]] = 0
                    # NOTE(review): compares the whole node name with node_2's lag suffix;
                    # this only matches a column named exactly like a suffix — verify intent.
                    if ":" in node_2 and node == node_2.split(":")[1]:
                        constraint_matrix[node_name_to_idx[node], node_name_to_idx[node_2]] = 0
                        constraint_matrix[node_name_to_idx[node_2], node_name_to_idx[node]] = 0
                    # node has a smaller lag number than node_2: zero [node, node_2].
                    if ":" in node and ":" in node_2 and int(node.split(":")[1]) < int(node_2.split(":")[1]):
                        constraint_matrix[node_name_to_idx[node], node_name_to_idx[node_2]] = 0
                    # Un-lagged node to lagged node_2 (already zeroed by the whole-row assignment above).
                    if ":" not in node and ":" in node_2:
                        constraint_matrix[node_name_to_idx[node], node_name_to_idx[node_2]] = 0            

            # Load DECI
            sem_module = load_deci(dataset_path, name)
            # Create a structural equation model using the most likely graph
            sem = sem_module().mode
            # Run PCMCI
            pcmci_graph = run_pcmci(df_pcmci, name_pcmci)

            # Load and clean ground truth graph
            # The graph file is '<text before the first "-" of filename>-causal_graph.pkl'.
            # NOTE(review): pickle.load can execute arbitrary code; only use trusted dataset files.
            graph_filename = os.path.join(dataset_path, filename.split('-')[0] + '-causal_graph.pkl')
            with open(graph_filename, 'rb') as f:
                gt_graph = pickle.load(f)

            # Drop every node whose name starts with 'S'.
            for node in list(gt_graph.nodes):
                if node.startswith('S'):
                    gt_graph.remove_node(node)
            # NOTE(review): runs after the node removal above, which presumably already
            # dropped the incident edges — this loop likely finds nothing; confirm.
            for edge in list(gt_graph.edges):
                if edge[0].startswith('S') or edge[1].startswith('S'):
                    gt_graph.remove_edge(edge[0], edge[1])
            # Rename nodes in place: strip '_t' and turn '-' into ':'.
            # The replace('_', '_') and replace(':1', ':1') calls leave the string unchanged.
            for node in list(gt_graph.nodes):
                new_name = node.replace('_t', '').replace('_', '_').replace(':1', ':1').replace('-', ':')
                nx.relabel_nodes(gt_graph, {node: new_name}, copy=False)

            # Evaluate DECI model
            # Build a DiGraph from sem.graph and relabel integer nodes with the column names.
            deci_graph = nx.from_numpy_array(sem.graph.cpu().numpy(), create_using=nx.DiGraph)
            deci_graph = nx.relabel_nodes(deci_graph, {i: key for i, key in enumerate(df.columns)})
            deci_shd, deci_sid, deci_precision, deci_recall, deci_f1_score = evaluate_model(deci_graph, gt_graph)
            print(f'DECI - Dataset: {name}, SHD: {deci_shd}, SID: {deci_sid}, Precision: {deci_precision}, Recall: {deci_recall}, F1-Score: {deci_f1_score}')
            # Append one result row per dataset.
            with open(deci_metrics_file, 'a') as f:
                f.write(f'{name},{deci_shd},{deci_sid},{deci_precision},{deci_recall},{deci_f1_score}\n')

            # Evaluate PCMCI model
            pcmci_shd, pcmci_sid, pcmci_precision, pcmci_recall, pcmci_f1_score = evaluate_model(pcmci_graph, gt_graph)
            print(f'PCMCI - Dataset: {name}, SHD: {pcmci_shd}, SID: {pcmci_sid}, Precision: {pcmci_precision}, Recall: {pcmci_recall}, F1-Score: {pcmci_f1_score}')
            with open(pcmci_metrics_file, 'a') as f:
                f.write(f'{name},{pcmci_shd},{pcmci_sid},{pcmci_precision},{pcmci_recall},{pcmci_f1_score}\n')


Max Lag: 1, Min Lag: 1

##
## Step 1: PC1 algorithm for selecting lagged conditions
##

Parameters:
independence test = par_corr
tau_min = 1
tau_max = 1
pc_alpha = [0.05]
max_conds_dim = None
max_combinations = 1



## Resulting lagged parent (super)sets:

    Variable X1 has 1 link(s):
        (X2 -1): max_pval = 0.00000, |min_val| =  0.457

    Variable X2 has 3 link(s):
        (X1 -1): max_pval = 0.00650, |min_val| =  0.275
        (X3 -1): max_pval = 0.00963, |min_val| =  0.263
        (X2 -1): max_pval = 0.02043, |min_val| =  0.234

    Variable X3 has 1 link(s):
        (X2 -1): max_pval = 0.00000, |min_val| =  0.495

    Variable Y1 has 1 link(s):
        (X2 -1): max_pval = 0.02566, |min_val| =  0.227

##
## Step 2: MCI algorithm
##

Parameters:

independence test = par_corr
tau_min = 1
tau_max = 1
max_conds_py = None
max_conds_px = None

## Significant links at alpha = 0.05:

    Variable X1 has 1 link(s):
        (X2 -1): pval = 0.00002 | val =  0.423

    Variable X2 has 1 