In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from scipy.stats import norm

In [2]:

class CausalGenerator:
    def __init__(self, graph, data,unit_vars,subunit_vars,sizes):
        self.graph = nx.DiGraph(graph)
        self.data = data
        self.processed_data = self.preprocess_data(unit_vars,subunit_vars,sizes)
        
    def preprocess_data(self,unit_vars,subunit_vars,sizes):
        # Aggregate subunit nodes
        data = {}
        
        for var in unit_vars:
            for i in range(len(sizes)):
                data[var+str(i)] = self.data[var+str(i)]
        for var in subunit_vars:
            for i in range(len(sizes)):
                s=0
                for j in range(sizes[i]):
                    s+=self.data['_'+var+str(i)+'_'+str(j)]
            data[var] = s/sizes[i]
        return data
    
    
    def generate(self, noise_dist, transition_funcs, sizes):
        generated = {}
        for node in nx.topological_sort(self.graph):
            for i in range(len(sizes)):
                x=np.random.uniform()
                parents = list(self.graph.predecessors(node))
                if not parents:
                    generated[node+str(i)] = noise_dist[node](x)
                else:
                    parent_values = [generated[p+str(i)] for p in parents]
                    generated[node+str(i)] = transition_funcs[node](*parent_values) + noise_dist[node](x)
        
        # Convert to pandas DataFrame
        return generated
        

In [3]:

# Example usage:
graph = [('a', '_b'), ('a', 'c'), ('_b', 'c'), ('c', '_d'), ('_b', '_d'), ('_d', 'e'), ('c', 'e')]
data = {
    **{f'a{i}': i + 1 for i in range(100)},
    **{f'_b{i}_0': 2*i + 1 for i in range(100)},
    **{f'_b{i}_1': 2*i + 2 for i in range(100)},
    **{f'c{i}': i + 5 for i in range(100)},
    **{f'_d{i}_0': 2*i + 7 for i in range(100)},
    **{f'_d{i}_1': 2*i + 8 for i in range(100)},
    **{f'e{i}': i + 11 for i in range(100)}
}

unit_vars = ['a', 'c', 'e']
subunit_vars = ['d', 'b']
sizes = [2]*100

generator = CausalGenerator(graph, data,unit_vars,subunit_vars,sizes)
noise_dist = {
    'a': lambda x: norm.ppf(x),
    '_b': lambda x: norm.ppf(x, loc=0, scale=1),
    'c': lambda x: norm.ppf(x, loc=0, scale=1),
    '_d': lambda x: norm.ppf(x, loc=0, scale=1),
    'e': lambda x: norm.ppf(x, loc=0, scale=1)
}

transition_funcs = {
    '_b': lambda a: a**2,
    'c': lambda a, b: a**3 + b**2,
    '_d': lambda b, c: b + c,
    'e': lambda c, d: c**2 + d
}


In [4]:

generated_data = generator.generate(noise_dist, transition_funcs,sizes)

In [5]:
print(generated_data)


{'a0': 0.19577654717689152, 'a1': -2.110838067764282, 'a2': -1.7297355227372646, 'a3': -2.048127846931865, 'a4': 1.131530887701064, 'a5': 0.721015519238161, 'a6': 0.49923453518472966, 'a7': 0.6978888411437779, 'a8': -0.37927242035074754, 'a9': 1.797615369740876, 'a10': 3.034204453564139, 'a11': 0.7112249071925228, 'a12': -0.9870573581285548, 'a13': -1.3129260267259464, 'a14': 1.716179298569095, 'a15': -0.02930541684170286, 'a16': 0.6853236452262734, 'a17': -0.8584869263126287, 'a18': 1.307708133681632, 'a19': 0.17450361780865187, 'a20': -0.2161593889875354, 'a21': 0.17779650479855252, 'a22': -2.3348933004068053, 'a23': 0.014232353363771634, 'a24': -1.0085519367395972, 'a25': -0.1596564526767875, 'a26': -0.6930820537843752, 'a27': -0.44973160336751367, 'a28': -0.6213820845854524, 'a29': -0.38123615295389396, 'a30': 0.7995605285297084, 'a31': 0.8302414925407393, 'a32': 1.9399934747747, 'a33': 1.1451974616742842, 'a34': -1.0032175430991532, 'a35': -1.2616538911140585, 'a36': -0.1252879835

In [6]:
# Generate arrays for each variable
a_array = [generated_data[f'a{k}'] for k in range(len(sizes))]
b_array = [generated_data[f'_b{k}'] for k in range(len(sizes))]
c_array = [generated_data[f'c{k}'] for k in range(len(sizes))]
d_array = [generated_data[f'_d{k}'] for k in range(len(sizes))]
e_array = [generated_data[f'e{k}'] for k in range(len(sizes))]

# Print the arrays
print("a arrays:", a_array)
print("b arrays:", b_array)
print("c arrays:", c_array)
print("d arrays:", d_array)
print("e arrays:", e_array)


a arrays: [0.19577654717689152, -2.110838067764282, -1.7297355227372646, -2.048127846931865, 1.131530887701064, 0.721015519238161, 0.49923453518472966, 0.6978888411437779, -0.37927242035074754, 1.797615369740876, 3.034204453564139, 0.7112249071925228, -0.9870573581285548, -1.3129260267259464, 1.716179298569095, -0.02930541684170286, 0.6853236452262734, -0.8584869263126287, 1.307708133681632, 0.17450361780865187, -0.2161593889875354, 0.17779650479855252, -2.3348933004068053, 0.014232353363771634, -1.0085519367395972, -0.1596564526767875, -0.6930820537843752, -0.44973160336751367, -0.6213820845854524, -0.38123615295389396, 0.7995605285297084, 0.8302414925407393, 1.9399934747747, 1.1451974616742842, -1.0032175430991532, -1.2616538911140585, -0.12528798353999293, 0.13029800870544497, -0.4866907447048723, 0.9977496730029937, 1.7074729368320687, 2.38315029185394, -1.386081866580623, -0.6717719662502153, -0.28437145368685846, 2.494855889064382, -0.2660393325301001, 1.0709443543661537, 1.21376

In [7]:
hierarchical_e = norm.rvs(size = (100))
print(hierarchical_e)

[-2.83568514e+00 -4.74965979e-01  7.23694616e-03 -1.36663159e+00
  4.62185188e-01 -6.21345487e-01  2.78969129e-03 -4.71233668e-01
  8.01822184e-02  1.46271528e+00  1.85179634e+00  1.03589782e+00
 -5.43338018e-01  2.07819059e-01  1.66409967e-01 -7.96225830e-01
  1.25205643e+00 -7.09022114e-01 -2.91195374e-01  1.11348327e-01
  1.03178292e+00  1.68132562e+00 -4.13254815e-01  2.15291542e-01
 -2.66857640e+00  2.22296376e-01  2.09581286e+00 -1.42570110e-02
 -1.36623564e-02  8.17540701e-01  4.53982866e-01  2.74172658e-01
  4.42416911e-01 -8.50158386e-01 -1.07488963e-01  1.43416302e+00
 -7.89425973e-02  7.70418255e-01  1.23748079e+00  7.86304818e-01
 -5.22653316e-01 -9.48791912e-01  2.05645221e+00  5.07568765e-01
  7.99731377e-01 -2.85409538e-01  2.56497949e-01 -1.00797640e-01
  1.47071250e+00 -1.12007213e+00  4.50756481e-01 -1.14115149e+00
 -7.95515008e-01  2.57820671e+00 -3.54706247e-04  4.12135050e-01
 -4.02792845e-02 -2.09837747e-01  8.80909436e-01 -3.59697874e-02
 -1.06514742e+00  1.16978

In [8]:
import numpy as np
from scipy.stats import gaussian_kde
from scipy.integrate import quad

def kl_divergence(p, q):
    def integrand(x):
        px = p(x)
        qx = q(x)
        # Avoid log(0) by adding a small epsilon
        epsilon = 1e-10
        return np.where((px > 0) & (qx > 0),
                        px * (np.log(px + epsilon) - np.log(qx + epsilon)),
                        0)
    
    result, _ = quad(integrand, -np.inf, np.inf, limit=1000)
    return result


In [9]:

# Select two arrays for comparison
array1 = e_array  # First array from e_arrays
array2 = hierarchical_e

# Fit KDE to both distributions
kde1 = gaussian_kde(array1)
kde2 = gaussian_kde(array2)


In [10]:

# Compute KL divergence
kl_div = kl_divergence(kde1, kde2)

# Print the KL divergence
print(f"KL divergence between the two arrays: {kl_div}")


KL divergence between the two arrays: 0.2717388776334021


In [11]:
random_array = norm.rvs(size=1000)
random_array_bis = norm.rvs(size=1000)

# Fit KDE to both distributions
kde_random = gaussian_kde(random_array)
kde_random_bis = gaussian_kde(random_array_bis)

# Compute KL divergence
kl_div_random = kl_divergence(kde_random, kde_random_bis)

# Print the KL divergence
print(f"KL divergence between random arrays: {kl_div_random}")

KL divergence between random arrays: 0.008096987172709721
