In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from random import choice, randint, uniform
from scipy.stats import norm

In [2]:

# 参数定义
num_dags = 1000  # DAG数量
num_nodes_range = (5, 15)  # 每个DAG节点数量范围
sample_sizes = [100, 200, 500]  # 数据集规模
noise_levels = [0.1, 0.5, 1.0]  # 噪声强度

# 存储结果
dag_data = []

# 生成随机DAG，满足马尔可夫条件和忠实性
for i in range(num_dags):
    # 生成随机节点数量的DAG
    num_nodes = randint(*num_nodes_range)
    G = nx.DiGraph()
    G.add_nodes_from(range(num_nodes))
    
    # 随机生成有向边，确保DAG无环
    while not nx.is_directed_acyclic_graph(G):
        G.clear_edges()
        for n in range(num_nodes):
            for m in range(n+1, num_nodes):
                if np.random.rand() > 0.7:  # 30%几率添加边
                    G.add_edge(n, m)

    # 马尔可夫条件和忠实性：生成符合条件的高斯噪声
    adj_matrix = nx.adjacency_matrix(G).todense()
    coef_matrix = adj_matrix * np.random.uniform(0.1, 1.0, adj_matrix.shape)  # 随机系数
    
    # 选择干预和结果变量
    intervention_var = choice(list(G.nodes))
    outcome_var = choice([n for n in G.nodes if n != intervention_var])

    # 识别后门调整集（Markov blanket）
    backdoor_set = set()
    for node in G.predecessors(outcome_var):
        if node != intervention_var:
            backdoor_set.add(node)
    for node in G.successors(intervention_var):
        if node != outcome_var:
            backdoor_set.add(node)

    # 生成不同规模的数据及噪声
    for sample_size in sample_sizes:
        for noise_level in noise_levels:
            data = np.zeros((sample_size, num_nodes))
            for t in range(sample_size):
                # 对每个节点生成值，满足高斯噪声
                for node in range(num_nodes):
                    noise = np.random.normal(0, noise_level)
                    parent_values = sum(coef_matrix[parent, node] * data[t, parent] for parent in G.predecessors(node))
                    data[t, node] = parent_values + noise
            
            # 存储DAG数据及因果效应
            ace = coef_matrix[intervention_var, outcome_var]  # 直接因果效应（ACE）
            dag_data.append({
                'dag_id': i,
                'sample_size': sample_size,
                'noise_level': noise_level,
                'intervention_var': intervention_var,
                'outcome_var': outcome_var,
                'backdoor_set': list(backdoor_set),
                'ace': ace,
                'data': pd.DataFrame(data, columns=[f'X{j}' for j in range(num_nodes)])
            })


In [3]:
# 示例：打印生成的第一个DAG的结果
example_dag = dag_data[0]
print("DAG ID:", example_dag['dag_id'])
print("Sample Size:", example_dag['sample_size'])
print("Noise Level:", example_dag['noise_level'])
print("Intervention Variable:", example_dag['intervention_var'])
print("Outcome Variable:", example_dag['outcome_var'])
print("Backdoor Adjustment Set:", example_dag['backdoor_set'])
print("ACE:", example_dag['ace'])
print("Sample Data:\n", example_dag['data'].head())

DAG ID: 0
Sample Size: 100
Noise Level: 0.1
Intervention Variable: 6
Outcome Variable: 10
Backdoor Adjustment Set: []
ACE: 0.0
Sample Data:
          X0        X1        X2        X3        X4        X5        X6  \
0 -0.043582 -0.042715  0.097131  0.037680 -0.026303 -0.014756  0.034599   
1 -0.050680 -0.112951  0.306848 -0.236150 -0.085010 -0.001908  0.096936   
2 -0.057246  0.062035  0.010773 -0.037877 -0.048808  0.119706  0.010308   
3 -0.051077  0.053778  0.004720  0.144223 -0.033036  0.063840  0.126932   
4 -0.032059  0.199118  0.017806 -0.052781 -0.099160 -0.017898 -0.011370   

         X7        X8        X9       X10       X11       X12  
0  0.052560  0.173823 -0.077685 -0.010926 -0.096044 -0.079609  
1  0.189888 -0.126167  0.109164 -0.005447  0.065146 -0.062666  
2  0.185165 -0.084474  0.183712  0.028199  0.041580 -0.082201  
3  0.047313  0.102544  0.186686 -0.064140  0.031517  0.088366  
4 -0.074721 -0.051598 -0.014643  0.095125  0.039419  0.107377  


In [5]:
# 示例：打印生成的第一个DAG的结果
example_dag = dag_data[100]
print("DAG ID:", example_dag['dag_id'])
print("Sample Size:", example_dag['sample_size'])
print("Noise Level:", example_dag['noise_level'])
print("Intervention Variable:", example_dag['intervention_var'])
print("Outcome Variable:", example_dag['outcome_var'])
print("Backdoor Adjustment Set:", example_dag['backdoor_set'])
print("ACE:", example_dag['ace'])
print("Sample Data:\n", example_dag['data'].head())

DAG ID: 11
Sample Size: 100
Noise Level: 0.5
Intervention Variable: 1
Outcome Variable: 7
Backdoor Adjustment Set: []
ACE: 0.0
Sample Data:
          X0        X1        X2        X3        X4        X5        X6  \
0  0.223341 -0.290767 -0.232290 -0.094354  1.285832  0.499786 -0.145607   
1  0.286766  0.230805 -0.097191 -0.405695  0.555624  0.091132  0.508244   
2  0.200687  0.138359  0.256646 -0.683888 -0.392073 -0.042485  0.347206   
3 -0.563410  0.000187  0.120081  0.624541 -0.180089  0.173625  0.181209   
4  0.136099 -0.163362  0.401675 -0.508813  0.552497  0.437060  0.238138   

         X7  
0  0.319752  
1 -0.126090  
2  0.109672  
3 -0.386218  
4 -0.666634  
