In [None]:
import sys
import os

print("Current Working Directory:", os.getcwd())

# Make the sibling "examples" directory importable. The path is resolved
# relative to the current working directory, so this cell assumes the notebook
# is run from a directory that sits next to "examples".
examples_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'examples'))
sys.path.append(examples_dir)
# This import only works after the sys.path.append above.
from causal_helpers import set_random_seed

# Resolve the "data" directory that sits next to "examples". It is only used
# as an output location for generated files; it is NOT added to sys.path.
data_dir = os.path.abspath(os.path.join(examples_dir, '..', 'data'))

# Seed before any data is generated.
# NOTE(review): presumably set_random_seed seeds NumPy's global RNG, which the
# generators in this notebook rely on -- confirm in causal_helpers.
seed = 41 # main seed
set_random_seed(seed)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

print("Current Working Directory:", os.getcwd())

In [2]:
def simulate_parameter(B, w_ranges=((-2.0, -0.5), (0.5, 2.0))):
    """Draw random edge weights for a DAG given its binary adjacency matrix.

    Every entry is first assigned one of the intervals in ``w_ranges`` uniformly
    at random; a weight is then drawn uniformly from that interval. Entries
    where ``B`` is zero stay zero because the draw is multiplied by ``B``.

    Draws come from NumPy's global RNG (one ``randint`` call followed by one
    ``uniform`` call per interval), so results depend on the global seed.

    Args:
        B (np.ndarray): [d, d] binary adjacency matrix of the DAG.
        w_ranges (tuple): sequence of ``(low, high)`` weight intervals.

    Returns:
        W (np.ndarray): [d, d] weighted adjacency matrix of the DAG.
    """
    shape = B.shape
    W = np.zeros(shape)
    # Index of the interval each entry will be drawn from.
    interval_choice = np.random.randint(len(w_ranges), size=shape)
    for interval_idx, bounds in enumerate(w_ranges):
        lower, upper = bounds
        candidates = np.random.uniform(low=lower, high=upper, size=shape)
        # Keep a candidate only on existing edges that picked this interval.
        picked_here = interval_choice == interval_idx
        W += B * picked_here * candidates
    return W

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied elementwise via NumPy."""
    return 1/(1+np.exp(-x))

def generate_custom_mixed_data(num_samples, w_ranges=((-2.0, -0.5), (0.5, 2.0)), noise_scale=1.0, softplus=True):
    """
    Sample mixed continuous/binary data from a fixed 10-node linear SEM.

    The DAG is hard-coded. x3 is the only root and is drawn from
    ``np.random.uniform``; every other node is a weighted sum of its parents
    plus noise. After all nodes have been generated, x0, x1, x2, x5 and x7 are
    replaced by Bernoulli draws with success probability ``(tanh(v) + 1) / 2``,
    where ``v`` is the node's continuous value. Children are therefore computed
    from the continuous (pre-binarization) values of their parents.

    All randomness comes from NumPy's global RNG.

    Parameters:
        num_samples (int): Number of samples to generate.
        w_ranges (tuple): Ranges for sampling linear weights.
        noise_scale (float): Standard deviation of the Gaussian noise (before
            the optional softplus transformation).
        softplus (bool): Whether to apply softplus transformation to the noise.

    Returns:
        W (np.ndarray): Weighted adjacency matrix of the DAG.
        adjacency_matrix (np.ndarray): Adjacency matrix of the DAG.
        data (pd.DataFrame): Simulated data with columns x0..x9.
    """
    num_nodes = 10

    # Directed edges (parent, child) of the fixed DAG.
    edges = (
        (3, 0), (3, 2),
        (2, 1), (2, 4),
        (0, 1), (0, 5), (0, 4),
        (4, 6), (5, 7), (6, 8), (7, 9),
    )
    adjacency_matrix = np.zeros((num_nodes, num_nodes))
    for parent, child in edges:
        adjacency_matrix[parent, child] = 1

    # Random linear coefficients on the existing edges.
    W = simulate_parameter(adjacency_matrix, w_ranges)

    def simulate_linear(parent_vars, weights, noise_scale, softplus):
        """Linear combination of the parent columns plus (optionally softplus-ed) Gaussian noise."""
        eps = np.random.normal(scale=noise_scale, size=parent_vars.shape[0])
        if softplus:
            eps = np.log(1 + np.exp(eps))  # softplus keeps the noise positive
        return parent_vars @ weights + eps

    # The single root node.
    samples = {"x3": np.random.uniform(size=num_samples)}

    # (child, parents) in the order the nodes are sampled. Each entry consumes
    # one noise draw, so this order fixes the RNG stream as well as the topology.
    generation_order = (
        (0, [3]),
        (2, [3]),
        (1, [0, 2]),
        (5, [0]),
        (4, [0, 2]),
        (6, [4]),
        (7, [5]),
        (8, [6]),
        (9, [7]),
    )
    for child, parents in generation_order:
        # One column per parent, in the same order as the selected weights.
        design = np.column_stack([samples[f"x{p}"] for p in parents])
        samples[f"x{child}"] = simulate_linear(design, W[parents, child], noise_scale, softplus)

    # Binarize selected nodes only after every child has been generated.
    for name in ("x0", "x1", "x2", "x5", "x7"):
        p_one = (np.tanh(samples[name]) + 1) / 2
        samples[name] = np.random.binomial(1, p_one)

    # Assemble the frame with columns ordered x0..x9.
    column_order = [f"x{i}" for i in range(num_nodes)]
    data_df = pd.DataFrame(samples)[column_order]

    return W, adjacency_matrix, data_df

In [None]:

# Example usage: draw 10,000 samples with softplus noise and preview the result.
# Note that W and data are module-level names that the next example cell rebinds.
W, adjacency_matrix, data = generate_custom_mixed_data(num_samples=10000, softplus=True)
print("Adjacency Matrix:\n", adjacency_matrix)
print("\nSample Data:\n", data.head())


In [None]:
# Example usage: generate the dataset that the following cell writes to disk.
num_samples = 4000
softplus = True
w_ranges=((-2.0, -1.0), (1.0, 2.0))
# Use the generator defined in this notebook. The name previously called here,
# generate_mixed_data_binary, is never defined or imported, so the cell raised
# NameError on a fresh kernel.
W, adj_matrix, data = generate_custom_mixed_data(num_samples=num_samples, w_ranges=w_ranges, softplus=softplus)

# Display the adjacency matrix and a preview of the data
print("Adjacency Matrix:")
print(adj_matrix)
print("\nGenerated Data:")
print(data.head())
print("\nTrue Weights:")
print(W)

In [4]:
# Output directory under data_dir; the softplus variant gets its own folder.
dir_name = os.path.join(data_dir, "custom_mixed_confounding")
if softplus:
    dir_name += "_softplus"

# Always create the directory that is written to below. Previously the
# non-softplus branch created "custom_mixed_confounding" relative to the
# current working directory instead, so the writes could hit a missing folder.
os.makedirs(dir_name, exist_ok=True)

# store the adjacency matrix as a csv file named "adj_matrix.csv" without header
adj_matrix_df = pd.DataFrame(adj_matrix)
adj_matrix_df.to_csv(f"{dir_name}/adj_matrix.csv", header=False, index=False)

# store the data as a csv file named "train.csv" without header
data.to_csv(f"{dir_name}/train.csv", header=False, index=False)

# store the weighted adjacency matrix as a csv file named "W_adj_matrix.csv" without header
W_df = pd.DataFrame(W)
W_df.to_csv(f"{dir_name}/W_adj_matrix.csv", header=False, index=False)

In [None]:
# Draw a seaborn pairplot of every column in `data` against every other column
# and save the current matplotlib figure as "pairplot.pdf" inside dir_name.
# NOTE(review): this import would conventionally live with the other imports in
# the first cell; it is left here so this cell keeps working on its own.
import seaborn as sns
sns.pairplot(data)
plt.savefig(f"{dir_name}/pairplot.pdf")

In [6]:
################################## MISC ##################################