到了这个版本，计算ACE和寻找最简后门调整集这两步都已经完成了。下一步要做的：
1. 确认生成数据的细节
2. 更改循环方式，确保生成的每一张图都有非0的ACE
3. 然后用数据验证RCEE
4. 定义几个指标：


In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from random import choice, randint, uniform
from itertools import chain
from statsmodels.formula.api import logit
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression

In [2]:
# Identify a backdoor adjustment set
def find_backdoor_adjustment_set(G, X, Y):
    """Return a small node set meant to block the backdoor paths from X to Y.

    Backdoor paths are enumerated as forks ``X <- ... <- A -> ... -> Y``
    rooted at every common ancestor ``A`` of X and Y.  For each such path the
    nodes that could block it are collected, and ``minimum_cover`` greedily
    picks nodes until every path contains at least one pick.

    Args:
        G: Directed graph queried through networkx (expected to be acyclic).
        X: Treatment (intervention) node.
        Y: Outcome node.

    Returns:
        A list of nodes; empty when no fork-shaped backdoor path is found.

    Note:
        Only fork-shaped paths through a common ancestor are examined.  Paths
        containing colliders and the case where Y is itself an ancestor of X
        are not handled here.
    """
    common_ancestors = nx.ancestors(G, X) & nx.ancestors(G, Y)

    # The backdoor criterion forbids adjusting for descendants of X
    # (mediators and their children), so they are never offered as candidates.
    descendants_of_X = nx.descendants(G, X)

    # One candidate-node set per backdoor path
    backdoor_paths_nodes = []

    for ancestor in common_ancestors:
        paths_to_X = list(nx.all_simple_paths(G, source=ancestor, target=X))
        paths_to_Y = list(nx.all_simple_paths(G, source=ancestor, target=Y))

        for path_X in paths_to_X:
            arm_X = set(path_X[1:])
            for path_Y in paths_to_Y:
                # The two arms must only meet at the ancestor.  Otherwise the
                # combination is not a simple path: e.g. A -> X -> M -> Y is a
                # causal path through X, not a backdoor path, and must not put
                # X or M into the adjustment set.
                if not arm_X.isdisjoint(path_Y[1:]):
                    continue

                # Ancestor plus intermediate nodes of both arms (X and Y
                # themselves excluded), minus anything downstream of X.
                candidates = set(path_X[:-1] + path_Y[1:-1]) - descendants_of_X
                if candidates:
                    backdoor_paths_nodes.append(candidates)

    # Greedy cover over all collected paths
    return minimum_cover(backdoor_paths_nodes)

# Greedy (approximate) minimum cover
def minimum_cover(sets):
    """Greedily choose elements until every non-empty set contains a chosen one.

    At each step the element contained in the most still-uncovered sets is
    added to the cover and those sets are dropped.  This is the usual greedy
    heuristic, so the result is small but not guaranteed to be minimum.

    Args:
        sets: Iterable of sets of hashable elements.  Empty sets are ignored
            because no element can ever cover them.

    Returns:
        A list of the chosen elements (order unspecified); ``[]`` when there
        is nothing to cover.
    """
    # Dropping empty sets up front prevents the loop from running out of
    # candidate elements while uncoverable sets remain (max() on an empty
    # pool would raise ValueError).
    remaining = [s for s in sets if s]
    elements = set(chain.from_iterable(remaining))
    cover = set()

    while remaining:
        # Element that hits the largest number of uncovered sets
        most_common = max(
            elements,
            key=lambda e: sum(1 for s in remaining if e in s),
        )
        cover.add(most_common)

        # Keep only the sets that the new element does not cover
        remaining = [s for s in remaining if most_common not in s]
        elements.discard(most_common)

    return list(cover)

In [3]:
# Total causal effect (ACE) of the intervention variable on the outcome variable
def calculate_ace(G, adj_matrix, X, Y):
    """Sum, over every directed simple path from X to Y, the product of edge weights.

    Args:
        G: Directed graph whose simple paths from X to Y are enumerated.
        adj_matrix: 2-D weight array indexed as ``adj_matrix[source, target]``.
        X: Source (intervention) node.
        Y: Target (outcome) node.

    Returns:
        The accumulated effect; ``0`` when there is no path from X to Y.
    """
    total_effect = 0
    for route in nx.all_simple_paths(G, source=X, target=Y):
        # In series: coefficients along one path multiply
        edge_weights = [adj_matrix[src, dst] for src, dst in zip(route, route[1:])]
        # In parallel: contributions of distinct paths add up
        total_effect += np.prod(edge_weights)
    return total_effect

In [4]:
# Parameters
num_dags = 1  # number of DAGs to generate
num_nodes_range = (5, 15)  # inclusive range for the node count of each DAG
# sample_sizes = [100, 200, 500]  # alternative: smaller dataset sizes
sample_sizes = [100000]
noise_levels = [0.1, 0.5, 1.0]  # standard deviation of the Gaussian noise
# noise_levels = [0]  # alternative: noise-free data

# One record per (DAG, sample size, noise level) combination
dag_data = []

# Generate random DAGs.
# NOTE(review): the original comment states the graphs satisfy the Markov
# condition and faithfulness; nothing below checks this explicitly — confirm.
for i in range(num_dags):
    num_nodes = randint(*num_nodes_range)
    G = nx.DiGraph()
    G.add_nodes_from(range(num_nodes))

    # Edges only go from a lower to a higher node index, which keeps the
    # graph acyclic; each candidate edge is added with 30% probability.
    adj_matrix = np.zeros((num_nodes, num_nodes))
    for u in range(num_nodes):
        for v in range(u + 1, num_nodes):
            if np.random.rand() > 0.7:
                weight = uniform(0.1, 1.0)
                adj_matrix[u, v] = weight
                G.add_edge(u, v, weight=weight)

    print(f"\nDAG {i} Adjacency Matrix with Weights:\n", adj_matrix)

    # Pick two distinct nodes as intervention and outcome variables
    intervention_var = choice(list(G.nodes))
    outcome_var = choice([n for n in G.nodes if n != intervention_var])

    # Backdoor adjustment set for this (intervention, outcome) pair
    backdoor_set = find_backdoor_adjustment_set(G, intervention_var, outcome_var)
    print(f"Backdoor Adjustment Set for DAG {i} from {intervention_var} to {outcome_var}:", backdoor_set)

    # Ground-truth total causal effect implied by the edge weights
    ace = calculate_ace(G, adj_matrix, intervention_var, outcome_var)
    print(f"ACE for DAG {i} from {intervention_var} to {outcome_var}:", ace)

    # The graph is fixed from here on, so the topological order and the
    # parent lists are computed once instead of once per sample.
    topo_order = list(nx.topological_sort(G))
    parents_of = {node: list(G.predecessors(node)) for node in topo_order}

    # Simulate data for every sample size / noise level combination
    for sample_size in sample_sizes:
        for noise_level in noise_levels:
            data = np.zeros((sample_size, num_nodes))
            # Walking the nodes in topological order guarantees that every
            # parent column is filled before its children; each node is
            # generated for all samples at once.
            for node in topo_order:
                parents = parents_of[node]
                if not parents:
                    # Root nodes: independent uniform draws on [-10, 10)
                    data[:, node] = np.random.uniform(-10, 10, size=sample_size)
                else:
                    # Non-root nodes: weighted sum of parents plus Gaussian noise
                    noise = np.random.normal(0, noise_level, size=sample_size)
                    data[:, node] = data[:, parents] @ adj_matrix[parents, node] + noise

            # Store the simulated data together with the ground truth;
            # the estimate fields are filled in by later analysis steps.
            dag_data.append({
                'dag_id': i,
                'sample_size': sample_size,
                'noise_level': noise_level,
                'weight_matrix': adj_matrix,
                'intervention_var': intervention_var,
                'outcome_var': outcome_var,
                'backdoor_set': backdoor_set,
                'ace': ace,
                'data': pd.DataFrame(data, columns=[f'X{j}' for j in range(num_nodes)]),
                'estimated_ace': None,
                'lower_bound_of_confidence_interval': None,
                'upper_bound_of_confidence_interval': None
            })


DAG 0 Adjacency Matrix with Weights:
 [[0.         0.         0.         0.23891407 0.         0.13522723
  0.9528282  0.         0.         0.38467517]
 [0.         0.         0.61022704 0.         0.         0.57480442
  0.81695265 0.97543744 0.14806087 0.37138077]
 [0.         0.         0.         0.         0.         0.
  0.         0.39286811 0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.51275003 0.89131733 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.5570435  0.42649433 0.9698469  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.34375191]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.   

In [5]:
# Example: show the first generated record (first DAG, first sample size / noise level)
example_dag = dag_data[0]
for label, key in (
    ("DAG ID:", 'dag_id'),
    ("Sample Size:", 'sample_size'),
    ("Noise Level:", 'noise_level'),
    ("Intervention Variable:", 'intervention_var'),
    ("Outcome Variable:", 'outcome_var'),
    ("Backdoor Adjustment Set:", 'backdoor_set'),
    ("ACE (Total Effect):", 'ace'),
):
    print(label, example_dag[key])
# Only the first rows of the simulated data are displayed
print("Sample Data:\n", example_dag['data'].head())


DAG ID: 0
Sample Size: 100000
Noise Level: 0.1
Intervention Variable: 1
Outcome Variable: 6
Backdoor Adjustment Set: []
ACE (Total Effect): 1.137143713539559
Sample Data:
          X0        X1        X2        X3        X4        X5         X6  \
0  9.775649  7.606990  4.528001  2.396490  4.175005  5.648451  18.487992   
1  0.255306  9.046579  5.379245  0.044222  8.072053  5.193767  10.502860   
2 -6.208228 -1.770173 -1.265993 -1.350799  1.397919 -1.784887  -8.395440   
3 -8.336175  5.453811  3.355196 -1.826548  5.149575  2.038419  -2.404294   
4  2.599526 -6.087847 -3.497500  0.555092  5.034781 -3.087120  -4.143839   

          X7         X8         X9  
0  13.698442  10.297077  11.358580  
1  17.042905  13.548670   9.204909  
2  -2.311982  -0.876596  -3.679033  
3  10.164900   7.524098   2.222232  
4  -6.069728   0.517437  -3.275462  


In [6]:
# def check(data):
    

In [7]:
# Regression formula "outcome ~ intervention [+ adjustment covariates]".
# An empty backdoor set simply contributes no additional terms.
target = "X{} ~ {}".format(
    example_dag['outcome_var'],
    " + ".join(
        f"X{node}"
        for node in [example_dag['intervention_var'], *example_dag['backdoor_set']]
    ),
)
print(target)

X6 ~ X1


In [8]:
# # if example_dag['backdoor_set']:
# #     # 建立逻辑回归模型
# #     model = logit("X" + str(example_dag['outcome_var']) + " ~ " + " + ".join([f"X{node}" for node in example_dag['backdoor_set']]), data=example_dag['data']).fit()

# #     # 显示模型结果
# #     print(model.summary())
# # Normalize the outcome variable to be within the unit interval [0, 1]
# # example_dag['data'][f"X{example_dag['outcome_var']}"] = (example_dag['data'][f"X{example_dag['outcome_var']}"] - example_dag['data'][f"X{example_dag['outcome_var']}"].min()) / (example_dag['data'][f"X{example_dag['outcome_var']}"].max() - example_dag['data'][f"X{example_dag['outcome_var']}"].min())



# # 建立逻辑回归模型
# model = logit(target, data=example_dag['data']).fit()
    
# # 显示模型结果
# print(model.summary())

In [9]:
# Regressors: the intervention variable first, then the adjustment covariates
X_num = [example_dag['intervention_var'], *example_dag['backdoor_set']]
X = example_dag['data'][['X' + str(j) for j in X_num]]
# Response: the outcome variable
y_num = example_dag['outcome_var']
y = example_dag['data']['X' + str(y_num)]

# Fit a plain least-squares regression of the outcome on the regressors
model = LinearRegression().fit(X, y)

# The intervention variable is the first column, so its coefficient is the
# estimated causal effect
ace = model.coef_[0]
# Keep the estimate alongside the ground truth in the record
example_dag['estimated_ace'] = ace
print("Estimated ACE:", ace)

# All fitted coefficients, in the column order of X
print(model.coef_)

Estimated ACE: 1.1381110515755477
[1.13811105]


In [10]:
# Fit the same regression through the statsmodels formula interface, which
# also reports standard errors and confidence intervals
model = ols(
    formula=target,
    data=example_dag['data'],
).fit()

# Print the full regression report
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                     X6   R-squared:                       0.551
Model:                            OLS   Adj. R-squared:                  0.551
Method:                 Least Squares   F-statistic:                 1.227e+05
Date:                Thu, 17 Oct 2024   Prob (F-statistic):               0.00
Time:                        14:35:51   Log-Likelihood:            -3.1993e+05
No. Observations:              100000   AIC:                         6.399e+05
Df Residuals:                   99998   BIC:                         6.399e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0082      0.019     -0.437      0.6

可以再探讨一下置信区间的缩小速度，以及置信区间的大小与样本量的关系。