In [None]:
import networkx as nx
import pandas as pd
import numpy as np
from scipy.stats import norm

In [None]:

class CausalGenerator:
    """
    Draws per-unit samples from a structural causal model defined on a DAG.

    Values are stored in flat dicts keyed by ``<node name><unit index>`` (e.g. ``'a0'``).
    Raw subunit observations are read from ``data`` under the keys
    ``'_<var><unit index>_<subunit index>'``.
    """

    def __init__(self, graph, data, unit_vars, subunit_vars, sizes):
        """
        :param graph: Anything accepted by ``nx.DiGraph`` (here: a list of (parent, child) edges)
        :param data: Dict of observed values, indexed as described in the class docstring
        :param unit_vars: Names of the variables observed once per unit
        :param subunit_vars: Names (without the leading underscore) of the variables observed per subunit
        :param sizes: Number of subunits in each unit; ``len(sizes)`` is the number of units
        """
        self.graph = nx.DiGraph(graph)
        self.data = data
        self.processed_data = self.preprocess_data(unit_vars, subunit_vars, sizes)

    def preprocess_data(self, unit_vars, subunit_vars, sizes):
        """
        Build the unit-level view of ``self.data``.

        Unit variables are copied as they are; each subunit variable is replaced by its
        mean over the subunits of the unit, stored under ``<var><unit index>``.

        :return: Dict with one entry per (variable, unit)
        """
        data = {}
        n_units = len(sizes)

        for var in unit_vars:
            for i in range(n_units):
                key = var + str(i)
                data[key] = self.data[key]

        for var in subunit_vars:
            for i in range(n_units):
                total = 0
                for j in range(sizes[i]):
                    total += self.data['_' + var + str(i) + '_' + str(j)]
                # One mean per unit: the assignment has to live inside the unit loop,
                # otherwise only the last unit's mean survives.
                data[var + str(i)] = total / sizes[i]
        return data

    def generate(self, noise_dist, transition_funcs, sizes, special_node, intervention_value):
        """
        Sample one value per node and per unit, following a topological order of the graph.

        A node without parents gets ``noise_dist[node](u)``; any other node gets
        ``transition_funcs[node](*parent_values) + noise_dist[node](u)``, where ``u`` is a
        fresh uniform draw on [0, 1). Parent values are passed positionally, in the order
        returned by ``graph.predecessors(node)``, so each transition function must declare
        its parameters in that order.

        :param noise_dist: Dict node -> callable mapping a uniform draw to a noise value
        :param transition_funcs: Dict node -> callable of the parent values (nodes with parents only)
        :param sizes: Only its length (the number of units) is used
        :param special_node: Node whose value is fixed to ``intervention_value`` in every unit
            (a name that is not in the graph means no intervention)
        :param intervention_value: Value assigned to ``special_node``
        :return: Dict ``<node><unit index>`` -> generated value
        """
        generated = {}
        n_units = len(sizes)

        for node in nx.topological_sort(self.graph):
            # Compare by value: identity (`is`) on strings only works by accident of interning.
            if node == special_node:
                # Store the intervention per unit so that children can look it up.
                for i in range(n_units):
                    generated[node + str(i)] = intervention_value
                continue

            parents = list(self.graph.predecessors(node))
            for i in range(n_units):
                x = np.random.uniform()
                if not parents:
                    generated[node + str(i)] = noise_dist[node](x)
                else:
                    parent_values = [generated[p + str(i)] for p in parents]
                    generated[node + str(i)] = transition_funcs[node](*parent_values) + noise_dist[node](x)

        # The result stays a plain dict keyed by '<node><unit index>'.
        return generated
        

In [None]:
import json

def load_data_from_json(file_path):
    """
    Read a JSON file and hand back its parsed content.

    Failures are reported on stdout instead of being raised: a missing file,
    malformed JSON or any other error all result in ``None``.

    :param file_path: Path to the JSON file
    :return: The parsed JSON content, or None if loading failed
    """
    loaded = None
    try:
        with open(file_path, 'r') as handle:
            parsed = json.load(handle)
        print(f"Data successfully loaded from {file_path}")
        loaded = parsed
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {file_path}")
    except Exception as err:
        # Best-effort loader: anything else is reported and mapped to None as well.
        print(f"An unexpected error occurred: {str(err)}")
    return loaded



In [None]:

# Example usage.
# CausalGenerator.generate passes parent values to each transition function positionally,
# in the order networkx reports the predecessors of a node, i.e. the order in which the
# incoming edges appear in this list:
#   _b <- (a)    c <- (a, _b)    _d <- (c, _b)    e <- (_d, c)
graph = [('a', '_b'), ('a', 'c'), ('_b', 'c'), ('c', '_d'), ('_b', '_d'), ('_d', 'e'), ('c', 'e')]
data = load_data_from_json('data/sampled_data.json')

unit_vars = ['a', 'c', 'e']
subunit_vars = ['d', 'b']
sizes = [50]*50  # 50 units with 50 subunits each

generator = CausalGenerator(graph, data,unit_vars,subunit_vars,sizes)

# Noise of every node: standard normal, obtained by inverse-CDF sampling from a uniform draw.
noise_dist = {
    'a': lambda x: norm.ppf(x),
    '_b': lambda x: norm.ppf(x, loc=0, scale=1),
    'c': lambda x: norm.ppf(x, loc=0, scale=1),
    '_d': lambda x: norm.ppf(x, loc=0, scale=1),
    'e': lambda x: norm.ppf(x, loc=0, scale=1)
}

# Structural equations. Parameter order follows the predecessor order listed above;
# '_d' and 'e' previously declared their parameters in the opposite order and therefore
# received their arguments swapped.
transition_funcs = {
    '_b': lambda a: a,
    'c': lambda a, b: a**3 + (b+1)**2,
    '_d': lambda c, b: b+1 - c,
    'e': lambda d, c: c + d+1
}

In [None]:

# Draw one value per node and per unit from the model.
# NOTE(review): 'b' is not a node of `graph` (the subunit node is named '_b'), so the
# intervention branch of CausalGenerator.generate is never taken and intervention_value
# is unused here. Confirm that a purely observational sample is what is intended.
generated_data = generator.generate(noise_dist, transition_funcs,sizes,special_node='b',intervention_value=0)


In [None]:
# Dump the whole generated dict (keys are '<node><unit index>').
print(generated_data)


In [None]:
# Collect the generated values of every node into one list per node, ordered by unit index
a_array, b_array, c_array, d_array, e_array = (
    [generated_data[f'{node}{unit}'] for unit in range(len(sizes))]
    for node in ('a', '_b', 'c', '_d', 'e')
)

# Show the lists
for label, values in (
    ("a arrays:", a_array),
    ("b arrays:", b_array),
    ("c arrays:", c_array),
    ("d arrays:", d_array),
    ("e arrays:", e_array),
):
    print(label, values)


In [None]:
import numpy as np
from scipy.stats import gaussian_kde
from scipy.integrate import quad

In [None]:

def kl_divergence(p, q, bandwidth='scott'):
    """
    Compute the KL divergence KL(p || q) between two empirical distributions given
    as arrays of samples, using Gaussian kernel density estimation.

    Both arrays are first cut to the length of the shorter one. The two estimated
    densities are evaluated on a common grid of 10000 points spanning the joint range
    of the samples, turned into discrete probability vectors and compared.

    :param p: First data array
    :param q: Second data array
    :param bandwidth: Bandwidth selection method ('scott', 'silverman' or a number)
    :return: Value of the KL divergence
    """
    # Make both samples the same size
    n_common = min(len(p), len(q))
    sample_p, sample_q = p[:n_common], q[:n_common]

    # Kernel density estimates
    kde_p, kde_q = (gaussian_kde(sample, bw_method=bandwidth) for sample in (sample_p, sample_q))

    # Common evaluation grid
    lower = min(np.min(sample_p), np.min(sample_q))
    upper = max(np.max(sample_p), np.max(sample_q))
    grid = np.linspace(lower, upper, 10000)

    # Evaluate the densities, add a small epsilon to avoid dividing by zero,
    # then normalise each one so that it sums to 1 over the grid
    eps = 1e-10
    masses = []
    for kde in (kde_p, kde_q):
        density = kde(grid) + eps
        masses.append(density / np.sum(density))
    mass_p, mass_q = masses

    return np.sum(mass_p * np.log(mass_p / mass_q))



In [None]:

# Arrays to compare: generated values vs. values taken from the original (loaded) data
array1 = e_array  # generated e values
array2 = [data[f'e{i}'] for i in range(len(sizes))]  # original e values

# Original data, one value per unit; subunit variables are averaged over the unit's subunits
arrayA = [data[f'a{i}'] for i in range(len(sizes))]
arrayB = np.array([np.mean(np.array([data[f'_b{i}_{j}'] for j in range(sizes[i])])) for i in range(len(sizes))])
arrayC = [data[f'c{i}'] for i in range(len(sizes))]
arrayD = np.array([np.mean(np.array([data[f'_d{i}_{j}'] for j in range(sizes[i])])) for i in range(len(sizes))])
arrayE = [data[f'e{i}'] for i in range(len(sizes))]

# Full subunit tables of the original data: row i holds the subunit values of unit i,
# so that arrayB_full[i, :] / arrayD_full[i, :] select one unit.
arrayB_full = np.array([[data[f'_b{i}_{j}'] for j in range(sizes[i])] for i in range(len(sizes))])
arrayD_full = np.array([[data[f'_d{i}_{j}'] for j in range(sizes[i])] for i in range(len(sizes))])

In [None]:

# Compute KL(original || generated) for each variable. KL divergence is not symmetric,
# so every call passes the original sample first and the generated sample second.
kl_div = kl_divergence(array2, array1)

# Print the KL divergence
print(f"KL divergence between the two arrays E: {kl_div}")
# high kl div okay

kl_div = kl_divergence(arrayA, a_array)
print(f"KL divergence between the two arrays A: {kl_div}")

kl_div = kl_divergence(arrayB, b_array)
print(f"KL divergence between the two arrays B: {kl_div}")

kl_div = kl_divergence(arrayC, c_array)
print(f"KL divergence between the two arrays C: {kl_div}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_arrays_with_kde(original, resampled, title):
    """
    Plot the kernel density estimates of two samples on one figure.

    Each sample is drawn as a filled KDE curve plus its individual points on the
    x axis (blue for the original sample, red for the resampled one).
    All previously open figures are closed first.

    :param original: Sample shown as "Original"
    :param resampled: Sample shown as "Resampled"
    :param title: Figure title
    """
    plt.close('all')
    plt.figure(figsize=(12, 6))

    for values, color, label in ((original, "blue", "Original"), (resampled, "red", "Resampled")):
        # `fill` replaces the deprecated `shade` keyword of sns.kdeplot.
        sns.kdeplot(values, fill=True, color=color, label=label)
        plt.scatter(values, [0]*len(values), color=color, alpha=0.5)

    plt.title(title)
    plt.xlabel("Value")
    plt.ylabel("Density")
    plt.legend()
    plt.show()

# One figure per variable (E, A, B, C): original sample first, generated sample second
for original_values, generated_values, plot_title in (
    (array2, array1, "Distribution of E: Original vs Resampled"),
    (arrayA, a_array, "Distribution of A: Original vs Resampled"),
    (arrayB, b_array, "Distribution of B: Original vs Resampled"),
    (arrayC, c_array, "Distribution of C: Original vs Resampled"),
):
    plot_arrays_with_kde(original_values, generated_values, plot_title)


In [None]:
def experiment(n):
    """
    Run the generate-and-compare experiment on n sampled data sets.

    For each k in range(n), loads 'data/sampled_data_{k}.json', draws one value per node
    and per unit from the structural model defined below, and computes for every variable
    the KDE-based KL divergence KL(original || generated). Subunit variables of the
    original data are averaged per unit before the comparison.

    Note: a later cell defines another function with the same name; once that cell has
    been run, it replaces this one.

    :param n: Number of data files to process
    :return: List of n dicts mapping 'c', 'b', 'a', 'd', 'e' to a KL divergence
    :raises RuntimeError: if a data file cannot be loaded
    """
    # Everything below is the same for every run, so it is built once.
    # Parent values reach a transition function positionally, in the order networkx
    # reports the predecessors of a node (order of the incoming edges in this list):
    #   _b <- (a)    c <- (a, _b)    _d <- (c, _b)    e <- (_d, c)
    graph = [('a', '_b'), ('a', 'c'), ('_b', 'c'), ('c', '_d'), ('_b', '_d'), ('_d', 'e'), ('c', 'e')]
    unit_vars = ['a', 'c', 'e']
    subunit_vars = ['d', 'b']
    sizes = [50]*50
    n_units = len(sizes)

    # Standard normal noise via inverse-CDF sampling
    noise_dist = {
        'a': lambda x: norm.ppf(x),
        '_b': lambda x: norm.ppf(x, loc=0, scale=1),
        'c': lambda x: norm.ppf(x, loc=0, scale=1),
        '_d': lambda x: norm.ppf(x, loc=0, scale=1),
        'e': lambda x: norm.ppf(x, loc=0, scale=1)
    }

    # Parameter order matches the predecessor order above ('_d' and 'e' used to declare
    # their parameters the other way round and so received swapped arguments).
    transition_funcs = {
        '_b': lambda a: (a+1)**2,
        'c': lambda a, b: a**3 + (b+1)**2,
        '_d': lambda c, b: (b+1)**2 - c**3,
        'e': lambda d, c: c + (d+1)**2
    }

    kldivs = []
    for k in range(n):
        path = f'data/sampled_data_{k}.json'
        data = load_data_from_json(path)
        print(str(k) + " is in progress")
        if data is None:
            # The loader only prints on failure; stop here with a message that names the file.
            raise RuntimeError(f"Could not load {path}")

        generator = CausalGenerator(graph, data, unit_vars, subunit_vars, sizes)
        # NOTE(review): 'b' is not a node name of the graph ('_b' is), so no intervention
        # is applied and the sample is observational; confirm this is intended.
        generated_data = generator.generate(noise_dist, transition_funcs, sizes, special_node='b', intervention_value=0)

        def generated_values(node):
            """Generated values of one node, ordered by unit index."""
            return [generated_data[f'{node}{i}'] for i in range(n_units)]

        def original_values(var):
            """Original values of one unit variable, ordered by unit index."""
            return [data[f'{var}{i}'] for i in range(n_units)]

        def original_unit_means(node):
            """Per-unit mean of the original values of one subunit node."""
            return [np.mean(np.array([data[f'{node}{i}_{j}'] for j in range(sizes[i])])) for i in range(n_units)]

        # KL(original || generated) per variable
        kldiv = {
            'c': kl_divergence(original_values('c'), generated_values('c')),
            'b': kl_divergence(original_unit_means('_b'), generated_values('_b')),
            'a': kl_divergence(original_values('a'), generated_values('a')),
            'd': kl_divergence(original_unit_means('_d'), generated_values('_d')),
            'e': kl_divergence(original_values('e'), generated_values('e')),
        }
        kldivs.append(kldiv)

    return kldivs





    

In [None]:
# Run the experiment on the 100 files data/sampled_data_0.json ... data/sampled_data_99.json
kldivs = experiment(100)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare data for plotting
variables = list(kldivs[0].keys())  # Assuming kldivs is a list of dictionaries

# Create subplots for each variable
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.flatten()

for ax, var in zip(axes, variables):
    # Extract KL divergence values for the current variable
    kl_values = [kldiv[var] for kldiv in kldivs]

    # Create a KDE plot for the current variable
    # (`fill` replaces the deprecated `shade` keyword of sns.kdeplot)
    sns.kdeplot(kl_values, ax=ax, fill=True, color='skyblue')

    # Customize the subplot
    ax.set_title(f'KL Divergence Distribution for Variable {var}')
    ax.set_xlabel('KL Divergence')
    ax.set_ylabel('Density')

# Remove every subplot that did not receive a variable
for unused_ax in axes[len(variables):]:
    fig.delaxes(unused_ax)

# Adjust layout and display the plot
plt.tight_layout()
plt.show()


In [None]:
import numpy as np

# Mean and standard deviation of the KL divergence of each variable across all runs
for var in variables:
    kl_values = [run[var] for run in kldivs]
    mean, std = np.mean(kl_values), np.std(kl_values)
    print(
        f"Variable {var}:",
        f"  Mean KL Divergence: {mean:.4f}",
        f"  Standard Deviation: {std:.4f}",
        "",
        sep="\n",
    )


In [None]:
# Inspect the full table of original '_b' subunit values
print(arrayB_full)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# arrayB_full and arrayD_full hold the original ("true") subunit values; b_array and
# d_array hold the generated ("predicted") unit-level values.
# NOTE(review): row i of arrayB_full / arrayD_full is plotted as "Unit i" below. Verify
# that the first axis of these tables really is the unit index.

# Colors for true and predicted values
true_color = 'blue'
pred_color = 'black'

for i in range(len(b_array)):
    # Create a new figure for each unit
    plt.figure(figsize=(6, 4))

    # Plot true b and d values
    plt.scatter(arrayB_full[i,:], arrayD_full[i,:], color=true_color, label='True', marker='o')

    # Plot predicted b and d values (plt.scatter accepts scalars as well as arrays,
    # so no case distinction is needed)
    plt.scatter(b_array[i], d_array[i], color=pred_color, label='Predicted', marker='x')

    # Set title and labels for the plot
    plt.title(f'Unit {i}')
    plt.xlabel('B')
    plt.ylabel('D')
    plt.legend()

    # Adjust the layout and display the plot
    plt.tight_layout()
    plt.show()

# Explanation of the code:
print("This code generates scatter plots for 50 units, comparing true and predicted values of B and D.")
print("Each plot represents a single unit, with blue circles (o) for true values and black crosses (x) for predicted values.")
print("The true values are taken from arrayB_full and arrayD_full, while predicted values come from b_array and d_array.")
print("This visualization helps in understanding how well the predictions match the true values for each unit.")


AUC ROC

In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import norm
from scipy.special import kl_div


In [None]:


def experiment(n):
    """
    Run the generate-and-compare experiment on n sampled data sets, with two metrics.

    For each k in range(n), loads 'data/sampled_data_{k}.json', draws one value per node
    and per unit from the structural model defined below, and computes

    * per variable, the KDE-based KL divergence KL(original || generated);
    * the ROC AUC of a 5-nearest-neighbour classifier trained to tell generated units
      from original units, each unit being described by its (a, b, c, d, e) values.

    Subunit variables of the original data are averaged per unit beforehand.
    This definition replaces the function of the same name defined in an earlier cell.

    :param n: Number of data files to process
    :return: Tuple (kldivs, roc_auc_scores): a list of n dicts mapping 'a'..'e' to a
        KL divergence, and a list of n ROC AUC scores
    :raises RuntimeError: if a data file cannot be loaded
    """
    # Run-independent setup.
    # Parent values reach a transition function positionally, in the order networkx
    # reports the predecessors of a node (order of the incoming edges in this list):
    #   _b <- (a)    c <- (a, _b)    _d <- (c, _b)    e <- (_d, c)
    graph = [('a', '_b'), ('a', 'c'), ('_b', 'c'), ('c', '_d'), ('_b', '_d'), ('_d', 'e'), ('c', 'e')]
    unit_vars = ['a', 'c', 'e']
    subunit_vars = ['d', 'b']
    sizes = [50]*50
    n_units = len(sizes)
    nodes = ('a', '_b', 'c', '_d', 'e')

    # Standard normal noise via inverse-CDF sampling
    noise_dist = {
        'a': lambda x: norm.ppf(x),
        '_b': lambda x: norm.ppf(x, loc=0, scale=1),
        'c': lambda x: norm.ppf(x, loc=0, scale=1),
        '_d': lambda x: norm.ppf(x, loc=0, scale=1),
        'e': lambda x: norm.ppf(x, loc=0, scale=1)
    }

    # Parameter order matches the predecessor order above ('_d' and 'e' used to declare
    # their parameters the other way round and so received swapped arguments).
    transition_funcs = {
        '_b': lambda a: (a+1)**2,
        'c': lambda a, b: a**3 + (b+1)**2,
        '_d': lambda c, b: (b+1)**2 - c**3,
        'e': lambda d, c: c + (d+1)**2
    }

    kldivs = []
    roc_auc_scores = []

    for k in range(n):
        print(f"{k} is in progress")

        path = f'data/sampled_data_{k}.json'
        data = load_data_from_json(path)
        if data is None:
            # The loader only prints on failure; stop here with a message that names the file.
            raise RuntimeError(f"Could not load {path}")

        generator = CausalGenerator(graph, data, unit_vars, subunit_vars, sizes)
        # NOTE(review): 'b' is not a node name of the graph ('_b' is), so no intervention
        # is applied and the sample is observational; confirm this is intended.
        generated_data = generator.generate(noise_dist, transition_funcs, sizes, special_node='b', intervention_value=0)

        # Generated values, one array per node (ordered by unit index)
        generated = {
            node: np.array([generated_data[f'{node}{i}'] for i in range(n_units)])
            for node in nodes
        }

        # Original values, one array per node; subunit nodes are averaged per unit
        original = {}
        for node in nodes:
            if node.startswith('_'):
                original[node] = np.array(
                    [np.mean([data[f'{node}{i}_{j}'] for j in range(sizes[i])]) for i in range(n_units)]
                )
            else:
                original[node] = np.array([data[f'{node}{i}'] for i in range(n_units)])

        # KL(original || generated) per variable
        kldiv = {
            'a': kl_divergence(original['a'], generated['a']),
            'b': kl_divergence(original['_b'], generated['_b']),
            'c': kl_divergence(original['c'], generated['c']),
            'd': kl_divergence(original['_d'], generated['_d']),
            'e': kl_divergence(original['e'], generated['e'])
        }
        kldivs.append(kldiv)

        # Classifier two-sample test. Each unit is one row with one column per variable;
        # pooling all variables into a single 1-D feature would make values of different
        # variables indistinguishable to the classifier.
        X_generated = np.array([generated[node] for node in nodes]).T
        X_original = np.array([original[node] for node in nodes]).T
        X = np.concatenate([X_generated, X_original])
        y = np.concatenate([np.ones(len(X_generated)), np.zeros(len(X_original))])

        # Stratify so that both classes are present in the test set (required by roc_auc_score)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y
        )

        # Train KNN classifier
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(X_train, y_train)

        # Predict and calculate ROC AUC score
        y_pred = knn.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred)
        roc_auc_scores.append(roc_auc)

        print(f"ROC AUC Score for iteration {k}: {roc_auc}")

    # Print average ROC AUC score
    avg_roc_auc = np.mean(roc_auc_scores)
    print(f"Average ROC AUC Score: {avg_roc_auc}")

    return kldivs, roc_auc_scores


In [None]:

# Run the experiment
n_iterations = 100  # You can adjust this number
kldivs, roc_auc_scores = experiment(n_iterations)

# ROC AUC score of every run
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
roc_ax.plot(range(n_iterations), roc_auc_scores, marker='o')
roc_ax.set_title('ROC AUC Scores across Iterations')
roc_ax.set_xlabel('Iteration')
roc_ax.set_ylabel('ROC AUC Score')
roc_ax.grid(True)
plt.show()

# KL divergence of every variable for every run, one line per variable
kl_fig, kl_ax = plt.subplots(figsize=(12, 6))
for var in ['a', 'b', 'c', 'd', 'e']:
    kl_values = [kldiv[var] for kldiv in kldivs]
    kl_ax.plot(range(n_iterations), kl_values, marker='o', label=f'Variable {var}')
kl_ax.set_title('KL Divergences across Iterations')
kl_ax.set_xlabel('Iteration')
kl_ax.set_ylabel('KL Divergence')
kl_ax.legend()
kl_ax.grid(True)
plt.show()
