# Imports & Dependencies

In [1]:
import numpy as np
from scipy.special import expit as sigmoid
import igraph as ig
import random
import pandas as pd
from typing import Dict, List, Tuple, Callable, Union
import os
import json
from datetime import datetime
import networkx as nx
from sklearn.gaussian_process import GaussianProcessRegressor

In [2]:
from data_utils import (
    set_random_seed,
    is_dag,
    simulate_dag,
    simulate_parameter,
    simulate_linear_sem,
    simulate_nonlinear_sem,
    DataSimulator,  
)

# Setting up DataSimulator and Generating Datasets

In [3]:
# Seed the global RNGs before any data is generated so that a fresh
# Restart & Run All produces the same graphs and samples every time.
# NOTE(review): assumes data_utils draws from the global `random` /
# `numpy.random` state -- confirm. If DataSimulator manages its own RNG,
# seed it there instead (the imported `set_random_seed` helper may be the
# intended entry point).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Single simulator instance shared by every dataset-generation cell below.
base_simulator = DataSimulator()

In [4]:
# 1. Simple linear-Gaussian baseline: 5 nodes, 1000 samples.
linear_gaussian_simple_cfg = {
    "n_nodes": 5,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.3,
    "prefix": "linear_gaussian_simple",
}
base_simulator.generate_and_save_dataset(**linear_gaussian_simple_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_163909_linear_gaussian_simple_nodes5_samples1000/linear_gaussian_simple_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_163909_linear_gaussian_simple_nodes5_samples1000/linear_gaussian_simple_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_163909_linear_gaussian_simple_nodes5_samples1000/linear_gaussian_simple_config.json


In [5]:
# Load the simple linear-Gaussian dataset from disk and preview it.
#
# The generation log shows datasets being written to timestamped run
# directories ("<timestamp>_linear_gaussian_simple_nodes5_samples1000"),
# while this cell historically read from an un-timestamped directory.
# Keep the legacy location when it exists; otherwise fall back to the most
# recent matching run so the cell also works on a clean checkout.
filepath = 'simulated_data/linear_gaussian_simple/linear_gaussian_simple_data.csv'
if not os.path.exists(filepath) and os.path.isdir('simulated_data'):
    # Timestamps are formatted YYYYMMDD_HHMMSS, so a lexicographic sort is
    # also chronological and the last entry is the newest run.
    run_dirs = sorted(
        name for name in os.listdir('simulated_data')
        if name.endswith('_linear_gaussian_simple_nodes5_samples1000')
    )
    if run_dirs:
        filepath = os.path.join(
            'simulated_data', run_dirs[-1], 'linear_gaussian_simple_data.csv'
        )

df = pd.read_csv(filepath)
df.head()

Unnamed: 0,X1,X2,X3,X4,X5
0,-0.148108,-0.270084,-0.217983,-1.499773,-1.019468
1,-0.787128,-0.014848,-1.122779,2.438146,0.5844
2,0.184474,0.673437,-0.884064,0.749718,0.275555
3,0.407061,-0.905479,-1.961088,0.76591,0.317197
4,2.564219,-0.179987,0.227717,-0.227827,1.648717


In [6]:
# Load the ground-truth adjacency matrix for the simple linear-Gaussian
# dataset and print it.
#
# The generation log shows graphs being written to timestamped run
# directories ("<timestamp>_linear_gaussian_simple_nodes5_samples1000"),
# while this cell historically read from an un-timestamped directory.
# Keep the legacy location when it exists; otherwise fall back to the most
# recent matching run so the cell also works on a clean checkout.
filepath = 'simulated_data/linear_gaussian_simple/linear_gaussian_simple_graph.npy'
if not os.path.exists(filepath) and os.path.isdir('simulated_data'):
    # Timestamps are formatted YYYYMMDD_HHMMSS, so a lexicographic sort is
    # also chronological and the last entry is the newest run.
    run_dirs = sorted(
        name for name in os.listdir('simulated_data')
        if name.endswith('_linear_gaussian_simple_nodes5_samples1000')
    )
    if run_dirs:
        filepath = os.path.join(
            'simulated_data', run_dirs[-1], 'linear_gaussian_simple_graph.npy'
        )

true_dag_matrix = np.load(filepath)

print("Loaded true DAG matrix:")
print(true_dag_matrix)

Loaded true DAG matrix:
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]]


In [7]:
# 2. Linear non-Gaussian: same setup as the baseline, but with uniform noise.
linear_uniform_noise_cfg = {
    "n_nodes": 5,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "uniform",  # the only change relative to dataset #1
    "edge_probability": 0.3,
    "prefix": "linear_uniform_noise",
}
base_simulator.generate_and_save_dataset(**linear_uniform_noise_cfg)



Data generation: linear, uniform, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_163909_linear_uniform_noise_nodes5_samples1000/linear_uniform_noise_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_163909_linear_uniform_noise_nodes5_samples1000/linear_uniform_noise_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_163909_linear_uniform_noise_nodes5_samples1000/linear_uniform_noise_config.json


In [8]:
# 3. Simple non-linear dataset generated with an MLP mechanism.
nonlinear_mlp_cfg = {
    "n_nodes": 10,
    "n_samples": 2000,
    "function_type": "mlp",    # non-linear structural function
    "noise_type": "gaussian",  # non-linear functions default to Gaussian noise
    "edge_probability": 0.2,
    "prefix": "nonlinear_mlp",
}
base_simulator.generate_and_save_dataset(**nonlinear_mlp_cfg)

Data generation: mlp, gaussian, scale=1.0
Non-linear function requires gaussian noise
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_163909_nonlinear_mlp_nodes10_samples2000/nonlinear_mlp_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_163909_nonlinear_mlp_nodes10_samples2000/nonlinear_mlp_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_163909_nonlinear_mlp_nodes10_samples2000/nonlinear_mlp_config.json


In [9]:
# 4. Sparse graph (low correlation): 15 nodes with a low edge probability.
sparse_linear_graph_cfg = {
    "n_nodes": 15,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.05,  # low edge probability -> sparse graph
    "prefix": "sparse_linear_graph",
}
base_simulator.generate_and_save_dataset(**sparse_linear_graph_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_163909_sparse_linear_graph_nodes15_samples1000/sparse_linear_graph_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_163909_sparse_linear_graph_nodes15_samples1000/sparse_linear_graph_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_163909_sparse_linear_graph_nodes15_samples1000/sparse_linear_graph_config.json


In [10]:
# 5. Dense graph (high correlation): 15 nodes, scale-free topology.
dense_scale_free_graph_cfg = {
    "n_nodes": 15,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.5,  # high edge probability -> dense graph
    "graph_type": "SF",       # scale-free graph: creates hubs rather than purely random edges
    "prefix": "dense_scale_free_graph",
}
base_simulator.generate_and_save_dataset(**dense_scale_free_graph_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_163909_dense_scale_free_graph_nodes15_samples1000/dense_scale_free_graph_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_163909_dense_scale_free_graph_nodes15_samples1000/dense_scale_free_graph_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_163909_dense_scale_free_graph_nodes15_samples1000/dense_scale_free_graph_config.json


In [11]:
# 6. Mixed data: a blend of continuous and discrete variables.
mixed_data_discrete_cfg = {
    "n_nodes": 10,
    "n_samples": 1500,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.3,
    "discrete_ratio": 0.5,  # 50% of all nodes will be discrete
    "max_categories": 4,
    "prefix": "mixed_data_discrete",
}
base_simulator.generate_and_save_dataset(**mixed_data_discrete_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_163910_mixed_data_discrete_nodes10_samples1500/mixed_data_discrete_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_163910_mixed_data_discrete_nodes10_samples1500/mixed_data_discrete_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_163910_mixed_data_discrete_nodes10_samples1500/mixed_data_discrete_config.json


In [12]:
# #7. Data with Missing Values 
# NOTE(review): this cell is currently disabled -- every line of the call is
# commented out -- so Restart & Run All will NOT regenerate the
# "data_with_missing" dataset. Any saved output displayed beneath this cell
# comes from an earlier run in which the call was still active. Either
# uncomment the call to restore the dataset, or delete the cell and clear its
# stale output.
# base_simulator.generate_and_save_dataset(
#     n_nodes=10, 
#     n_samples=1000,
#     function_type='linear',
#     noise_type='gaussian',
#     edge_probability=0.3,
#     add_missing_values=True,  # enabling missing values
#     missing_rate=0.1,  # 10% of data will be missing 
#     prefix="data_with_missing"
# )

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_163910_data_with_missing_nodes10_samples1000/data_with_missing_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_163910_data_with_missing_nodes10_samples1000/data_with_missing_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_163910_data_with_missing_nodes10_samples1000/data_with_missing_config.json


In [13]:
# 8. Data with measurement error added on top of the simulated values.
data_with_error_cfg = {
    "n_nodes": 10,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.3,
    "add_measurement_error": True,  # switch measurement error on
    "error_rate": 0.5,              # 50% of columns will have noise
    "error_std": 0.4,               # standard deviation of the added noise
    "prefix": "data_with_error",
}
base_simulator.generate_and_save_dataset(**data_with_error_cfg)


Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_163910_data_with_error_nodes10_samples1000/data_with_error_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_163910_data_with_error_nodes10_samples1000/data_with_error_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_163910_data_with_error_nodes10_samples1000/data_with_error_config.json


In [14]:
# 9. Heterogeneous data drawn from several domains.
heterogeneous_multi_domain_cfg = {
    "n_nodes": 10,
    "n_samples": 2000,  # 500 samples per domain
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.2,
    "n_domains": 4,     # 4 distinct domains
    "prefix": "heterogeneous_multi_domain",
}
base_simulator.generate_and_save_dataset(**heterogeneous_multi_domain_cfg)

Data generation: linear, gaussian, scale=1.0
Domain affects 3 variables
Base correlation matrix: (10, 10)
Found 8 highly correlated pairs (base)
Final correlation matrix: (10, 10)
Found 8 highly correlated pairs (final)
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_163910_heterogeneous_multi_domain_nodes10_samples2000/heterogeneous_multi_domain_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_163910_heterogeneous_multi_domain_nodes10_samples2000/heterogeneous_multi_domain_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_163910_heterogeneous_multi_domain_nodes10_samples2000/heterogeneous_multi_domain_config.json


# SAMPLE SIZE EXPERIMENTS

In [25]:
# Sample-size sweep: 500 samples on the 5-node linear-Gaussian setup.
sample_size_500_cfg = {
    "n_nodes": 5,
    "n_samples": 500,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.3,
    "prefix": "sample_size_500",
}
base_simulator.generate_and_save_dataset(**sample_size_500_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_164315_sample_size_500_nodes5_samples500/sample_size_500_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_164315_sample_size_500_nodes5_samples500/sample_size_500_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_164315_sample_size_500_nodes5_samples500/sample_size_500_config.json


In [26]:
# Sample-size sweep: 1000 samples on the 5-node linear-Gaussian setup.
sample_size_1000_cfg = {
    "n_nodes": 5,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.3,
    "prefix": "sample_size_1000",
}
base_simulator.generate_and_save_dataset(**sample_size_1000_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_164315_sample_size_1000_nodes5_samples1000/sample_size_1000_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_164315_sample_size_1000_nodes5_samples1000/sample_size_1000_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_164315_sample_size_1000_nodes5_samples1000/sample_size_1000_config.json


In [27]:
# Sample-size sweep: 2000 samples on the 5-node linear-Gaussian setup.
sample_size_2000_cfg = {
    "n_nodes": 5,
    "n_samples": 2000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.3,
    "prefix": "sample_size_2000",
}
base_simulator.generate_and_save_dataset(**sample_size_2000_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_164316_sample_size_2000_nodes5_samples2000/sample_size_2000_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_164316_sample_size_2000_nodes5_samples2000/sample_size_2000_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_164316_sample_size_2000_nodes5_samples2000/sample_size_2000_config.json


In [28]:
# Sample-size sweep: 5000 samples on the 5-node linear-Gaussian setup.
sample_size_5000_cfg = {
    "n_nodes": 5,
    "n_samples": 5000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.3,
    "prefix": "sample_size_5000",
}
base_simulator.generate_and_save_dataset(**sample_size_5000_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_164317_sample_size_5000_nodes5_samples5000/sample_size_5000_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_164317_sample_size_5000_nodes5_samples5000/sample_size_5000_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_164317_sample_size_5000_nodes5_samples5000/sample_size_5000_config.json


In [29]:
# Sample-size sweep: 10000 samples on the 5-node linear-Gaussian setup.
sample_size_10000_cfg = {
    "n_nodes": 5,
    "n_samples": 10000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.3,
    "prefix": "sample_size_10000",
}
base_simulator.generate_and_save_dataset(**sample_size_10000_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_164319_sample_size_10000_nodes5_samples10000/sample_size_10000_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_164319_sample_size_10000_nodes5_samples10000/sample_size_10000_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_164319_sample_size_10000_nodes5_samples10000/sample_size_10000_config.json


# DENSITY EXPERIMENTS

In [30]:
# Density sweep: edge probability 0.1 on the 5-node linear-Gaussian setup.
density_1_cfg = {
    "n_nodes": 5,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.1,
    "prefix": "density_1",
}
base_simulator.generate_and_save_dataset(**density_1_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_164404_density_1_nodes5_samples1000/density_1_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_164404_density_1_nodes5_samples1000/density_1_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_164404_density_1_nodes5_samples1000/density_1_config.json


In [31]:
# Density sweep: edge probability 0.3 on the 5-node linear-Gaussian setup.
density_3_cfg = {
    "n_nodes": 5,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.3,
    "prefix": "density_3",
}
base_simulator.generate_and_save_dataset(**density_3_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_164420_density_3_nodes5_samples1000/density_3_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_164420_density_3_nodes5_samples1000/density_3_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_164420_density_3_nodes5_samples1000/density_3_config.json


In [32]:
# Density sweep: edge probability 0.5 on the 5-node linear-Gaussian setup.
density_5_cfg = {
    "n_nodes": 5,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.5,
    "prefix": "density_5",
}
base_simulator.generate_and_save_dataset(**density_5_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_164432_density_5_nodes5_samples1000/density_5_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_164432_density_5_nodes5_samples1000/density_5_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_164432_density_5_nodes5_samples1000/density_5_config.json


In [33]:
# Density sweep: edge probability 0.7 on the 5-node linear-Gaussian setup.
density_7_cfg = {
    "n_nodes": 5,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.7,
    "prefix": "density_7",
}
base_simulator.generate_and_save_dataset(**density_7_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_164443_density_7_nodes5_samples1000/density_7_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_164443_density_7_nodes5_samples1000/density_7_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_164443_density_7_nodes5_samples1000/density_7_config.json


In [34]:
# Density sweep: edge probability 0.9 on the 5-node linear-Gaussian setup.
density_9_cfg = {
    "n_nodes": 5,
    "n_samples": 1000,
    "function_type": "linear",
    "noise_type": "gaussian",
    "edge_probability": 0.9,
    "prefix": "density_9",
}
base_simulator.generate_and_save_dataset(**density_9_cfg)

Data generation: linear, gaussian, scale=1.0
[92m✓ SUCCESS[0m Data saved: simulated_data/20251201_164452_density_9_nodes5_samples1000/density_9_data.csv
[92m✓ SUCCESS[0m Graph saved: simulated_data/20251201_164452_density_9_nodes5_samples1000/density_9_graph.npy
[92m✓ SUCCESS[0m Config saved: simulated_data/20251201_164452_density_9_nodes5_samples1000/density_9_config.json
