In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from scipy.stats import norm

In [2]:

class CausalGenerator:
    def __init__(self, graph, data,unit_vars,subunit_vars,sizes):
        self.graph = nx.DiGraph(graph)
        self.data = data
        self.processed_data = self.preprocess_data(unit_vars,subunit_vars,sizes)
        
    def preprocess_data(self,unit_vars,subunit_vars,sizes):
        # Aggregate subunit nodes
        data = {}
        
        for var in unit_vars:
            for i in range(len(sizes)):
                data[var+str(i)] = self.data[var+str(i)]
        for var in subunit_vars:
            for i in range(len(sizes)):
                s=0
                for j in range(sizes[i]):
                    s+=self.data['_'+var+str(i)+'_'+str(j)]
            data[var] = s/sizes[i]
        return data
    
    
    def generate(self, noise_dist, transition_funcs, sizes,special_node,intervention_value):
        generated = {}
        for node in nx.topological_sort(self.graph):
            if node is special_node:
                generated[node] = intervention_value
            else:
                for i in range(len(sizes)):
                    x=np.random.uniform()
                    parents = list(self.graph.predecessors(node))
                    if not parents:
                        generated[node+str(i)] = noise_dist[node](x)
                    else:
                        parent_values = [generated[p+str(i)] for p in parents]
                        generated[node+str(i)] = transition_funcs[node](*parent_values) + noise_dist[node](x)
        
        # Convert to pandas DataFrame
        return generated
        

In [3]:

# Example usage:
graph = [('a', '_b'), ('a', 'c'), ('_b', 'c'), ('c', '_d'), ('_b', '_d'), ('_d', 'e'), ('c', 'e')]
data = {
    **{f'a{i}': i + 1 for i in range(100)},
    **{f'_b{i}_0': 2*i + 1 for i in range(100)},
    **{f'_b{i}_1': 2*i + 2 for i in range(100)},
    **{f'c{i}': i + 5 for i in range(100)},
    **{f'_d{i}_0': 2*i + 7 for i in range(100)},
    **{f'_d{i}_1': 2*i + 8 for i in range(100)},
    **{f'e{i}': i + 11 for i in range(100)}
}

unit_vars = ['a', 'c', 'e']
subunit_vars = ['d', 'b']
sizes = [2]*100

generator = CausalGenerator(graph, data,unit_vars,subunit_vars,sizes)
noise_dist = {
    'a': lambda x: norm.ppf(x),
    '_b': lambda x: norm.ppf(x, loc=0, scale=1),
    'c': lambda x: norm.ppf(x, loc=0, scale=1),
    '_d': lambda x: norm.ppf(x, loc=0, scale=1),
    'e': lambda x: norm.ppf(x, loc=0, scale=1)
}

transition_funcs = {
    '_b': lambda a: a**2,
    'c': lambda a, b: a**3 + b**2,
    '_d': lambda b, c: b + c,
    'e': lambda c, d: c**2 + d
}


In [4]:

generated_data = generator.generate(noise_dist, transition_funcs,sizes,special_node='b',intervention_value=10)


In [5]:
print(generated_data)


{'a0': 0.8019460475029023, 'a1': 0.19647189431152823, 'a2': 0.3691250528831172, 'a3': -0.8126723060271611, 'a4': 1.2288252075299015, 'a5': -0.004264445234261643, 'a6': -0.9344549281224039, 'a7': -1.0727105539211568, 'a8': 1.115980334566942, 'a9': 0.4198878812138359, 'a10': 1.3186162642167065, 'a11': -0.7940867420385671, 'a12': 0.8954463549348114, 'a13': -1.0823330977104226, 'a14': -1.710498958134208, 'a15': -0.20453874440602024, 'a16': 1.442635976944378, 'a17': 0.3578481779428491, 'a18': -0.8369474561627939, 'a19': -2.4844711422517745, 'a20': 2.026153963153642, 'a21': -0.5190857494441109, 'a22': -1.9510644272244746, 'a23': -0.5136740421969622, 'a24': -0.10523107883499837, 'a25': -2.529980944855494, 'a26': -0.6783555446546342, 'a27': 0.6761607370858389, 'a28': -1.7400218594436097, 'a29': -0.25314522989769456, 'a30': 0.36509433683571146, 'a31': 1.8650715663166781, 'a32': 0.801144918199794, 'a33': -0.5252071481934275, 'a34': 1.0220919224561522, 'a35': 0.5941296315290115, 'a36': 0.16620246

In [6]:
# Generate arrays for each variable
a_array = [generated_data[f'a{k}'] for k in range(len(sizes))]
b_array = [generated_data[f'_b{k}'] for k in range(len(sizes))]
c_array = [generated_data[f'c{k}'] for k in range(len(sizes))]
d_array = [generated_data[f'_d{k}'] for k in range(len(sizes))]
e_array = [generated_data[f'e{k}'] for k in range(len(sizes))]

# Print the arrays
print("a arrays:", a_array)
print("b arrays:", b_array)
print("c arrays:", c_array)
print("d arrays:", d_array)
print("e arrays:", e_array)


a arrays: [0.8019460475029023, 0.19647189431152823, 0.3691250528831172, -0.8126723060271611, 1.2288252075299015, -0.004264445234261643, -0.9344549281224039, -1.0727105539211568, 1.115980334566942, 0.4198878812138359, 1.3186162642167065, -0.7940867420385671, 0.8954463549348114, -1.0823330977104226, -1.710498958134208, -0.20453874440602024, 1.442635976944378, 0.3578481779428491, -0.8369474561627939, -2.4844711422517745, 2.026153963153642, -0.5190857494441109, -1.9510644272244746, -0.5136740421969622, -0.10523107883499837, -2.529980944855494, -0.6783555446546342, 0.6761607370858389, -1.7400218594436097, -0.25314522989769456, 0.36509433683571146, 1.8650715663166781, 0.801144918199794, -0.5252071481934275, 1.0220919224561522, 0.5941296315290115, 0.16620246419149778, -2.061244061353485, -0.310615819568336, -0.3037444723914639, 0.5963943107741433, 0.16173408375099432, 1.1091570280123744, -1.5373286566475968, -0.28093569880370234, -0.19169400152853985, -1.2199537305231078, 1.485373710747162, 0

In [7]:
hierarchical_e = norm.rvs(size = (100))
print(hierarchical_e)

[-7.05521319e-01 -4.96022542e-01  3.34444409e-01 -1.40488371e+00
 -1.05009191e+00 -2.16688008e-01 -2.96095788e-02 -1.36878536e+00
  9.24608123e-01 -1.17362085e+00  7.08736549e-01  9.57207826e-01
 -5.01863259e-02  6.51917628e-02  8.64700201e-02 -1.71076264e+00
  1.60788085e+00  2.93188520e-01 -2.18660419e+00  5.27176192e-01
  1.34936632e+00 -1.41536760e+00  1.25316083e-01  5.72730572e-01
  7.34552576e-01 -3.19335468e-02  5.72054179e-02 -7.18459191e-01
 -1.23609744e+00  7.05497652e-01  1.35233032e+00 -2.31570827e+00
  1.86299551e+00  4.36207632e-02 -6.16689798e-01 -1.00108987e+00
 -3.41234836e-01  3.59606864e-01  9.27917122e-01  1.66282318e-01
  1.03055099e+00  4.99723798e-01  5.16032618e-01 -4.20535209e-01
  5.00632313e-01 -1.96355803e-01  1.96665765e+00 -3.05453815e-03
  1.27465291e-03  2.27099142e+00  5.02821974e-01 -7.32750283e-02
 -1.01508489e+00 -2.88319316e-01  9.87475248e-01  3.08454718e-02
  1.29385327e+00 -5.86578773e-01  2.27424682e-01 -2.64610217e-01
 -4.41353282e-01 -1.92508

In [8]:
import numpy as np
from scipy.stats import gaussian_kde
from scipy.integrate import quad

def kl_divergence(p, q):
    def integrand(x):
        px = p(x)
        qx = q(x)
        # Avoid log(0) by adding a small epsilon
        epsilon = 1e-10
        return np.where((px > 0) & (qx > 0),
                        px * (np.log(px + epsilon) - np.log(qx + epsilon)),
                        0)
    
    result, _ = quad(integrand, -np.inf, np.inf, limit=1000)
    return result



In [9]:

# Select two arrays for comparison
array1 = e_array  # First array from e_arrays
array2 = hierarchical_e

# Fit KDE to both distributions
kde1 = gaussian_kde(array1)
kde2 = gaussian_kde(array2)


In [10]:

# Compute KL divergence
kl_div = kl_divergence(kde1, kde2)

# Print the KL divergence
print(f"KL divergence between the two arrays: {kl_div}")
 # high kl div okay

KL divergence between the two arrays: 1.4658490482673832


In [11]:
random_array = norm.rvs(size=1000)
random_array_bis = norm.rvs(size=1000)

# Fit KDE to both distributions
kde_random = gaussian_kde(random_array)
kde_random_bis = gaussian_kde(random_array_bis)

# Compute KL divergence
kl_div_random = kl_divergence(kde_random, kde_random_bis)

# Print the KL divergence
print(f"KL divergence between random arrays: {kl_div_random}")

KL divergence between random arrays: 0.013152568031716232
