In [2]:
import networkx as nx
import pandas as pd
import numpy as np
from scipy.stats import norm

In [3]:

class CausalGenerator:
    def __init__(self, graph, data,unit_vars,subunit_vars,sizes):
        self.graph = nx.DiGraph(graph)
        self.data = data
        self.processed_data = self.preprocess_data(unit_vars,subunit_vars,sizes)
        
    def preprocess_data(self,unit_vars,subunit_vars,sizes):
        # Aggregate subunit nodes
        data = {}
        
        for var in unit_vars:
            for i in range(len(sizes)):
                data[var+str(i)] = self.data[var+str(i)]
        for var in subunit_vars:
            for i in range(len(sizes)):
                s=0
                for j in range(sizes[i]):
                    s+=self.data['_'+var+str(i)+'_'+str(j)]
            data[var] = s/sizes[i]
        return data
    
    
    def generate(self, noise_dist, transition_funcs, sizes,special_node,intervention_value):
        generated = {}
        for node in nx.topological_sort(self.graph):
            if node is special_node:
                generated[node] = intervention_value
            else:
                for i in range(len(sizes)):
                    x=np.random.uniform()
                    parents = list(self.graph.predecessors(node))
                    if not parents:
                        generated[node+str(i)] = noise_dist[node](x)
                    else:
                        parent_values = [generated[p+str(i)] for p in parents]
                        generated[node+str(i)] = transition_funcs[node](*parent_values) + noise_dist[node](x)
        
        # Convert to pandas DataFrame
        return generated
        

In [4]:

# Example usage:
graph = [('a', '_b'), ('a', 'c'), ('_b', 'c'), ('c', '_d'), ('_b', '_d'), ('_d', 'e'), ('c', 'e')]
data = {
    **{f'a{i}': i + 1 for i in range(100)},
    **{f'_b{i}_0': 2*i + 1 for i in range(100)},
    **{f'_b{i}_1': 2*i + 2 for i in range(100)},
    **{f'c{i}': i + 5 for i in range(100)},
    **{f'_d{i}_0': 2*i + 7 for i in range(100)},
    **{f'_d{i}_1': 2*i + 8 for i in range(100)},
    **{f'e{i}': i + 11 for i in range(100)}
}

unit_vars = ['a', 'c', 'e']
subunit_vars = ['d', 'b']
sizes = [2]*100

generator = CausalGenerator(graph, data,unit_vars,subunit_vars,sizes)
noise_dist = {
    'a': lambda x: norm.ppf(x),
    '_b': lambda x: norm.ppf(x, loc=0, scale=1),
    'c': lambda x: norm.ppf(x, loc=0, scale=1),
    '_d': lambda x: norm.ppf(x, loc=0, scale=1),
    'e': lambda x: norm.ppf(x, loc=0, scale=1)
}

transition_funcs = {
    '_b': lambda a: a**2,
    'c': lambda a, b: a**3 + b**2,
    '_d': lambda b, c: b + c,
    'e': lambda c, d: c**2 + d
}


In [5]:

generated_data = generator.generate(noise_dist, transition_funcs,sizes,special_node='b',intervention_value=10)


In [6]:
print(generated_data)


{'a0': np.float64(0.14406805525111022), 'a1': np.float64(-0.7095851895757826), 'a2': np.float64(-0.07729443687610899), 'a3': np.float64(-0.15295215568409593), 'a4': np.float64(0.7274486391085693), 'a5': np.float64(-0.3716042095207129), 'a6': np.float64(0.5020421456787812), 'a7': np.float64(-0.2534833817522527), 'a8': np.float64(1.7250586291021488), 'a9': np.float64(-1.0227981693310215), 'a10': np.float64(0.6905409736708813), 'a11': np.float64(1.679737599156709), 'a12': np.float64(-2.480721339349634), 'a13': np.float64(2.556709805159919), 'a14': np.float64(-0.37015542340163415), 'a15': np.float64(0.20545305873313802), 'a16': np.float64(-1.377831894910225), 'a17': np.float64(0.26422181784549487), 'a18': np.float64(-0.6679235580729289), 'a19': np.float64(1.5380271395731409), 'a20': np.float64(0.06379939621835992), 'a21': np.float64(-1.112777206662602), 'a22': np.float64(-0.07384607080232589), 'a23': np.float64(0.26988068091473866), 'a24': np.float64(-0.34323241807791127), 'a25': np.float6

In [7]:
# Generate arrays for each variable
a_array = [generated_data[f'a{k}'] for k in range(len(sizes))]
b_array = [generated_data[f'_b{k}'] for k in range(len(sizes))]
c_array = [generated_data[f'c{k}'] for k in range(len(sizes))]
d_array = [generated_data[f'_d{k}'] for k in range(len(sizes))]
e_array = [generated_data[f'e{k}'] for k in range(len(sizes))]

# Print the arrays
print("a arrays:", a_array)
print("b arrays:", b_array)
print("c arrays:", c_array)
print("d arrays:", d_array)
print("e arrays:", e_array)


a arrays: [np.float64(0.14406805525111022), np.float64(-0.7095851895757826), np.float64(-0.07729443687610899), np.float64(-0.15295215568409593), np.float64(0.7274486391085693), np.float64(-0.3716042095207129), np.float64(0.5020421456787812), np.float64(-0.2534833817522527), np.float64(1.7250586291021488), np.float64(-1.0227981693310215), np.float64(0.6905409736708813), np.float64(1.679737599156709), np.float64(-2.480721339349634), np.float64(2.556709805159919), np.float64(-0.37015542340163415), np.float64(0.20545305873313802), np.float64(-1.377831894910225), np.float64(0.26422181784549487), np.float64(-0.6679235580729289), np.float64(1.5380271395731409), np.float64(0.06379939621835992), np.float64(-1.112777206662602), np.float64(-0.07384607080232589), np.float64(0.26988068091473866), np.float64(-0.34323241807791127), np.float64(-0.4063450795779868), np.float64(-1.23623429828445), np.float64(0.3814099442000925), np.float64(-0.5909353886064977), np.float64(-1.9426603918621121), np.float6

In [8]:
hierarchical_e = norm.rvs(size = (100))
print(hierarchical_e)

[ 2.5910176  -0.33553151  1.33544994  1.21294902  0.18651158 -0.30153161
  1.37544808 -0.33128703  1.94222556 -0.30987358  1.30120523 -0.5915289
 -0.21747519  0.34120509  0.70127152  0.38219313 -0.45096513 -0.49036388
 -1.12521294 -0.47390203  0.31268393 -0.86653295  1.28674481  1.72482376
 -0.29751652 -0.18703953  0.01904448 -0.94766038 -1.35116006  2.29888991
 -0.6554178  -1.12946724 -1.08868459  0.26120312 -0.43314043 -0.69570684
  1.06652196 -1.92623286 -0.72160568  1.88261558 -1.59639663  0.03622111
  0.40687747  0.16977387  1.26188043 -0.59969337 -0.52343366 -0.29236533
  0.22004824  0.19271195  0.59127578  0.5388771  -0.39021733  0.44488654
  1.54932413  0.13664447  2.53257895  1.93855329 -0.25478093  0.45754868
 -0.21522429 -0.11357799  0.03898096  1.37058858 -1.20128043  1.78095568
  0.07463284 -0.37066548  0.73124883 -1.46621728  1.77263865  1.37976634
  1.75666313 -0.86153799  0.40668132  1.95433547 -1.14466232  0.44521338
 -0.4148829  -0.71730968  0.89862534  1.18327041  1.

In [10]:
import numpy as np
from scipy.stats import gaussian_kde
from scipy.integrate import quad

def kl_divergence(p, q):
    def integrand(x):
        px = p(x)
        qx = q(x)
        # Avoid log(0) by adding a small epsilon
        epsilon = 1e-10
        return np.where((px > 0) & (qx > 0),
                        px * (np.log(px + epsilon) - np.log(qx + epsilon)),
                        0)
    
    result, _ = quad(integrand, -np.inf, np.inf, limit=1000)
    return result



In [11]:

# Select two arrays for comparison
array1 = e_array  # First array from e_arrays
array2 = hierarchical_e

# Fit KDE to both distributions
kde1 = gaussian_kde(array1)
kde2 = gaussian_kde(array2)


In [12]:

# Compute KL divergence
kl_div = kl_divergence(kde1, kde2)

# Print the KL divergence
print(f"KL divergence between the two arrays: {kl_div}")
 # high kl div okay

KL divergence between the two arrays: 0.5170803719796013


In [13]:
random_array = norm.rvs(size=1000)
random_array_bis = norm.rvs(size=1000)

# Fit KDE to both distributions
kde_random = gaussian_kde(random_array)
kde_random_bis = gaussian_kde(random_array_bis)

# Compute KL divergence
kl_div_random = kl_divergence(kde_random, kde_random_bis)

# Print the KL divergence
print(f"KL divergence between random arrays: {kl_div_random}")

KL divergence between random arrays: 0.005416157978624237
