In [10]:
import networkx as nx
import pandas as pd
import numpy as np
from scipy.stats import norm

In [16]:

class CausalGenerator:
    def __init__(self, graph, data,unit_vars,subunit_vars,sizes):
        self.graph = nx.DiGraph(graph)
        self.data = data
        self.processed_data = self.preprocess_data(unit_vars,subunit_vars,sizes)
        
    def preprocess_data(self,unit_vars,subunit_vars,sizes):
        # Aggregate subunit nodes
        data = {}
        
        for var in unit_vars:
            for i in range(len(sizes)):
                data[var+str(i)] = self.data[var+str(i)]
        for var in subunit_vars:
            for i in range(len(sizes)):
                s=0
                for j in range(sizes[i]):
                    s+=self.data['_'+var+str(i)+'_'+str(j)]
            data[var] = s/sizes[i]
        return data
    
    
    def generate(self, noise_dist, transition_funcs, sizes):
        generated = {}
        for node in nx.topological_sort(self.graph):
            for i in range(len(sizes)):
                x=np.random.uniform()
                parents = list(self.graph.predecessors(node))
                if not parents:
                    generated[node+str(i)] = noise_dist[node](x)
                else:
                    parent_values = [generated[p+str(i)] for p in parents]
                    generated[node+str(i)] = transition_funcs[node](*parent_values) + noise_dist[node](x)
        
        # Convert to pandas DataFrame
        return generated
        

In [30]:

# Example usage:
graph = [('a', '_b'), ('a', 'c'), ('_b', 'c'), ('c', '_d'), ('_b', '_d'), ('_d', 'e'), ('c', 'e')]
data = {
    **{f'a{i}': i + 1 for i in range(100)},
    **{f'_b{i}_0': 2*i + 1 for i in range(100)},
    **{f'_b{i}_1': 2*i + 2 for i in range(100)},
    **{f'c{i}': i + 5 for i in range(100)},
    **{f'_d{i}_0': 2*i + 7 for i in range(100)},
    **{f'_d{i}_1': 2*i + 8 for i in range(100)},
    **{f'e{i}': i + 11 for i in range(100)}
}

unit_vars = ['a', 'c', 'e']
subunit_vars = ['d', 'b']
sizes = [2]*100

generator = CausalGenerator(graph, data,unit_vars,subunit_vars,sizes)
noise_dist = {
    'a': lambda x: norm.ppf(x),
    '_b': lambda x: norm.ppf(x, loc=0, scale=1),
    'c': lambda x: norm.ppf(x, loc=0, scale=1),
    '_d': lambda x: norm.ppf(x, loc=0, scale=1),
    'e': lambda x: norm.ppf(x, loc=0, scale=1)
}

transition_funcs = {
    '_b': lambda a: a**2,
    'c': lambda a, b: a**3 + b**2,
    '_d': lambda b, c: b + c,
    'e': lambda c, d: c**2 + d
}


In [31]:

generated_data = generator.generate(noise_dist, transition_funcs,sizes)

In [32]:
print(generated_data)


{'a0': np.float64(-0.8405966554634715), 'a1': np.float64(-0.816668976552195), 'a2': np.float64(-0.09713088827186747), 'a3': np.float64(-0.21157706774136706), 'a4': np.float64(1.0899938374397242), 'a5': np.float64(0.033996448288420626), 'a6': np.float64(0.22350732900601636), 'a7': np.float64(-0.27819882858775863), 'a8': np.float64(0.1659275156770261), 'a9': np.float64(0.6491087571019644), 'a10': np.float64(-0.002580793033558013), 'a11': np.float64(-0.008641467261577938), 'a12': np.float64(0.683398782612874), 'a13': np.float64(-0.7304079480342294), 'a14': np.float64(1.5374494339038511), 'a15': np.float64(-1.7096164517289036), 'a16': np.float64(0.8646876869674799), 'a17': np.float64(-0.13498117545136065), 'a18': np.float64(0.47449861023251527), 'a19': np.float64(0.8331972713200815), 'a20': np.float64(-0.08310679823809225), 'a21': np.float64(-0.8922662198460731), 'a22': np.float64(-0.9364517962225768), 'a23': np.float64(1.104951673584388), 'a24': np.float64(-2.6203425666061313), 'a25': np.

In [36]:
# Generate arrays for each variable
a_array = [generated_data[f'a{k}'] for k in range(len(sizes))]
b_array = [generated_data[f'_b{k}'] for k in range(len(sizes))]
c_array = [generated_data[f'c{k}'] for k in range(len(sizes))]
d_array = [generated_data[f'_d{k}'] for k in range(len(sizes))]
e_array = [generated_data[f'e{k}'] for k in range(len(sizes))]

# Print the arrays
print("a arrays:", a_array)
print("b arrays:", b_array)
print("c arrays:", c_array)
print("d arrays:", d_array)
print("e arrays:", e_array)


a arrays: [np.float64(-0.8405966554634715), np.float64(-0.816668976552195), np.float64(-0.09713088827186747), np.float64(-0.21157706774136706), np.float64(1.0899938374397242), np.float64(0.033996448288420626), np.float64(0.22350732900601636), np.float64(-0.27819882858775863), np.float64(0.1659275156770261), np.float64(0.6491087571019644), np.float64(-0.002580793033558013), np.float64(-0.008641467261577938), np.float64(0.683398782612874), np.float64(-0.7304079480342294), np.float64(1.5374494339038511), np.float64(-1.7096164517289036), np.float64(0.8646876869674799), np.float64(-0.13498117545136065), np.float64(0.47449861023251527), np.float64(0.8331972713200815), np.float64(-0.08310679823809225), np.float64(-0.8922662198460731), np.float64(-0.9364517962225768), np.float64(1.104951673584388), np.float64(-2.6203425666061313), np.float64(-0.7835489423387493), np.float64(-1.05886354025282), np.float64(-0.369987949269593), np.float64(1.7192820370205046), np.float64(-0.44793068855611534), np.

In [37]:
hierarchical_e = norm.rvs(size = (100))
print(hierarchical_e)

[-1.52064821  0.97092662  1.15626117 -0.33484479  0.20898511 -1.37340526
 -0.76839208  0.04509756 -0.1227677  -1.10507944 -1.22200944  1.27011838
 -0.91736903 -3.24915957 -0.87436217  2.23132691  1.42912108  1.41455965
  1.59148482 -0.63955011 -0.89660106 -0.49092021  2.25118427 -1.20534906
 -2.33937218 -0.97067951  1.61671232 -1.25311302 -1.23201143  0.05031666
  0.31512262 -0.34822232 -0.43377763 -0.47750058  0.83683834 -0.2507122
 -0.15952878  0.59555262 -0.13654372  0.26408847 -0.17117534 -1.1663673
  0.47106672  1.12193332  1.58343366  1.04602986  1.54783904 -0.20992299
  1.99996664 -0.82755713  0.24659565 -1.24501684 -0.73992864  1.27651853
  1.66734654  0.66930317  1.65046585 -0.85378099  0.09009943 -0.0735332
  2.78606663  0.96840325  0.8449556  -0.23101827  0.71724349 -0.84893171
  0.78204404 -2.28815112  0.9000088   0.76901592  1.57917207  0.36104573
  1.4115278  -0.18452143 -0.0444946   0.73479944 -0.21766745 -1.70595126
  0.53571174 -0.84816043 -0.93806017 -1.93240853  1.98

In [49]:
import numpy as np
from scipy.stats import gaussian_kde
from scipy.integrate import quad

def kl_divergence(p, q):
    def integrand(x):
        px = p(x)
        qx = q(x)
        # Avoid log(0) by adding a small epsilon
        epsilon = 1e-10
        return np.where((px > 0) & (qx > 0),
                        px * (np.log(px + epsilon) - np.log(qx + epsilon)),
                        0)
    
    result, _ = quad(integrand, -np.inf, np.inf, limit=1000)
    return result


In [50]:

# Select two arrays for comparison
array1 = e_array  # First array from e_arrays
array2 = hierarchical_e

# Fit KDE to both distributions
kde1 = gaussian_kde(array1)
kde2 = gaussian_kde(array2)


In [51]:

# Compute KL divergence
kl_div = kl_divergence(kde1, kde2)

# Print the KL divergence
print(f"KL divergence between the two arrays: {kl_div}")


KL divergence between the two arrays: 0.20017470879573696


In [48]:
random_array = norm.rvs(size=1000)
random_array_bis = norm.rvs(size=1000)

# Fit KDE to both distributions
kde_random = gaussian_kde(random_array)
kde_random_bis = gaussian_kde(random_array_bis)

# Compute KL divergence
kl_div_random = kl_divergence(kde_random, kde_random_bis)

# Print the KL divergence
print(f"KL divergence between random arrays: {kl_div_random}")

KL divergence between random arrays: 0.009543645788831583
