In [1]:
# import pandas as pd

# # Load and save with proper encoding for Excel compatibility
# for file_name in ["result2.csv", "result3.csv"]:
#     # Load the CSV file
#     df = pd.read_csv(file_name)
    
#     # Save with UTF-8 encoding that Excel recognizes properly
#     df.to_csv(file_name, index=False, encoding="utf-8-sig")

# print("Files saved with proper encoding: result2.csv, result3.csv")

# file_1 = "result2.csv"
# file_2 = "result3.csv"

# # Read both CSVs
# df1 = pd.read_csv(file_1)
# df2 = pd.read_csv(file_2)

# # Merge them by concatenating rows
# merged_df = pd.concat([df1, df2], ignore_index=True)

# merged_df.head()

# merged_df.columns

# # Desired column order
# column_order = ['ntrials', 'n', 'd', 's0', 'gt', 'st', 'nh', 'l1', 'l2', 'wt', 'lr', 'ct', 'fdr', 'shd', 'tpr', 'nnz']

# # Reorder columns in the merged DataFrame
# merged_df = merged_df[column_order]

# merged_df.head()

# # Sort the DataFrame based on the specified columns
# sort_columns = ['ntrials', 'n', 'd', 's0', 'gt', 'st', 'nh', 'l1', 'l2', 'wt', 'lr', 'ct']
# sorted_df = merged_df.sort_values(by=sort_columns, ignore_index=True)

# sorted_df.head(20)

# # Save the sorted DataFrame with the name result2.csv
# sorted_df.to_csv("result2.csv", index=False, encoding="utf-8-sig")

# print("Sorted DataFrame saved as result2.csv")

In [2]:
## notears: input to output with fc1 and fc2
## [n, d] = [n, 10] 
##        = [n x 10] x [10 x 100] # this is fc1_pos also fc1_neg they have dimension [d, d*dims[1]] which is [10, 100]
##        = [n x 100]
##        = [n x 10 x 10] x [10 x 10 x 1] # this is fc2, has dimension [10, 10, 1], so for each feature it has separate [10, 1] weights
##        = [n x 10 x 1]
##        = [n x 10]
##        = [n, 10] = [n, d]

In [3]:
## notears + cglearn: how to separate 'd' predictors, so we have to separate the weights for each of 'd' targets
##
## for i in range(model.dims[0]):  # here we separate weight for 'i'-th target
##     pos_g = model.fc1_pos.weight.grad.view(model.dims[0], -1, model.dims[0])[i, :, :] # [target, m1, input]
##     neg_g = model.fc1_neg.weight.grad.view(model.dims[0], -1, model.dims[0])[i, :, :] # [target, m1, input]
##
## so now pos_g has weight shaped [m1, input] for each target, so now we have separated the target, so now focusing on only one target that is the i-th
## 
##     l2_norms_pos = torch.norm(pos_g, dim=0, p=2).detach() 
##     l2_norms_neg = torch.norm(neg_g, dim=0, p=2).detach() 
##
## taking L2-norm over 'm1', so now for each target we have the L2-norm for each input
##

In [4]:
###################################
## comment
###################################
# few things to check -
#         1. data generation => n, d, s0, graph_type, sem_type = 200, 10, 40, 'ER', 'mlp'
#         2. env generation =>  noise_scales = [0.2, 1, 2, 5, 10]
#         3. nhidden=10, lambda1=0.01, lambda2=0.01, w_threshold=0.3, std_lambda=60, learning_rate=0.001
#         4. in utils.count_accuracy() => is_dag() is disabled/enabled?

In [5]:
###################################
## install and import
###################################

In [6]:
# !pip install python-igraph
from notears.locally_connected import LocallyConnected
from notears.lbfgsb_scipy import LBFGSBScipy
from notears.trace_expm import trace_expm
import torch
import torch.nn as nn
import numpy as np
import math
import notears.utils as ut
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize
import pandas as pd

  from pandas.core import (


In [7]:
###################################
## class
###################################

In [8]:
class NotearsMLP(nn.Module):
    def __init__(self, dims, bias=True):
        super(NotearsMLP, self).__init__()
        assert len(dims) >= 2
        assert dims[-1] == 1
        d = dims[0]
        self.dims = dims

        # fc1: variable splitting for l1
        self.fc1_pos = nn.Linear(d, d * dims[1], bias=bias)
        self.fc1_neg = nn.Linear(d, d * dims[1], bias=bias)
        self.fc1_pos.weight.bounds = self._bounds()
        self.fc1_neg.weight.bounds = self._bounds()

        # fc2: Locally connected layers with BatchNorm
        layers = []
        for l in range(len(dims) - 2):
            layers.append(LocallyConnected(d, dims[l + 1], dims[l + 2], bias=bias))
        self.fc2 = nn.ModuleList(layers)

    def _bounds(self):
        d = self.dims[0]
        bounds = []
        for j in range(d):
            for m in range(self.dims[1]):
                for i in range(d):
                    if i == j:
                        bound = (0, 0)
                    else:
                        bound = (0, None)
                    bounds.append(bound)
        return bounds

    def forward(self, x):  # [n, d] -> [n, d]
        # Apply fc1 and normalize
        x = self.fc1_pos(x) - self.fc1_neg(x)  # [n, d * m1]
        x = x.view(-1, self.dims[0], self.dims[1])  # Reshape to [n, d, m1]

        # Apply fc2 layers and normalization
        for fc in self.fc2:
            x = torch.sigmoid(x)  # Activation
            x = fc(x)  # Locally connected layer output [n, d, m2]

        x = x.squeeze(dim=2)  # [n, d]
        return x

    def h_func(self):
        """Constrain 2-norm-squared of fc1 weights along m1 dim to be a DAG"""
        d = self.dims[0]
        fc1_weight = self.fc1_pos.weight - self.fc1_neg.weight  # [j * m1, i]
        fc1_weight = fc1_weight.view(d, -1, d)  # [j, m1, i]
        A = torch.sum(fc1_weight * fc1_weight, dim=1).t()  # [i, j]
        h = trace_expm(A) - d  # (Zheng et al. 2018)
        # A different formulation, slightly faster at the cost of numerical stability
        # M = torch.eye(d) + A / d  # (Yu et al. 2019)
        # E = torch.matrix_power(M, d - 1)
        # h = (E.t() * M).sum() - d
        return h

    def l2_reg(self):
        """Take 2-norm-squared of all parameters"""
        reg = 0.
        fc1_weight = self.fc1_pos.weight - self.fc1_neg.weight  # [j * m1, i]
        reg += torch.sum(fc1_weight ** 2)
        for fc in self.fc2:
            reg += torch.sum(fc.weight ** 2)
        return reg

    def fc1_l1_reg(self):
        """Take l1 norm of fc1 weight"""
        reg = torch.sum(self.fc1_pos.weight + self.fc1_neg.weight)
        return reg

    @torch.no_grad()
    def fc1_to_adj(self) -> np.ndarray:  # [j * m1, i] -> [i, j]
        """Get W from fc1 weights, take 2-norm over m1 dim"""
        d = self.dims[0]
        fc1_weight = self.fc1_pos.weight - self.fc1_neg.weight  # [j * m1, i]
        fc1_weight = fc1_weight.view(d, -1, d)  # [j, m1, i]
        A = torch.sum(fc1_weight * fc1_weight, dim=1).t()  # [i, j]
        W = torch.sqrt(A)  # [i, j]
        W = W.cpu().detach().numpy()  # [i, j]
        return W

class NotearsMLP2(nn.Module):
    def __init__(self, dims, bias=True):
        super(NotearsMLP2, self).__init__()
        assert len(dims) >= 2
        assert dims[-1] == 1
        d = dims[0]
        self.dims = dims

        # fc1: variable splitting for l1
        self.fc1_pos = nn.Linear(d, d * dims[1], bias=bias)
        self.fc1_neg = nn.Linear(d, d * dims[1], bias=bias)
        self.fc1_pos.weight.bounds = self._bounds()
        self.fc1_neg.weight.bounds = self._bounds()

        # Layer normalization after fc1
        self.ln1 = nn.LayerNorm(d * dims[1])  # Normalize across feature dimensions

        # fc2: Locally connected layers with BatchNorm
        layers = []
        for l in range(len(dims) - 2):
            layers.append(LocallyConnected(d, dims[l + 1], dims[l + 2], bias=bias))
        self.fc2 = nn.ModuleList(layers)

    def _bounds(self):
        d = self.dims[0]
        bounds = []
        for j in range(d):
            for m in range(self.dims[1]):
                for i in range(d):
                    if i == j:
                        bound = (0, 0)
                    else:
                        bound = (0, None)
                    bounds.append(bound)
        return bounds
    
    def _apply_bounds(self):
        """Clip weights of fc1_pos and fc1_neg to stay within bounds."""
        with torch.no_grad():
            for layer, bounds in [(self.fc1_pos, self.fc1_pos.weight.bounds), (self.fc1_neg, self.fc1_neg.weight.bounds)]:
                # Apply bounds per weight
                for idx, (min_bound, max_bound) in enumerate(bounds):
                    if max_bound is not None:  # Clip only if max_bound is defined
                        layer.weight.data.view(-1)[idx].clamp_(min_bound, max_bound)    

    def forward(self, x):  # [n, d] -> [n, d]
        # Apply fc1 and normalize
        x = self.fc1_pos(x) - self.fc1_neg(x)  # [n, d * m1]
        self._apply_bounds()  # Enforce bounds on fc1 weights
        x = self.ln1(x)  # Apply LayerNorm after fc1
        x = x.view(-1, self.dims[0], self.dims[1])  # Reshape to [n, d, m1]

        # Apply fc2 layers and normalization
        for fc in self.fc2:
            x = torch.sigmoid(x)  # Activation
            x = fc(x)  # Locally connected layer output [n, d, m2]

        x = x.squeeze(dim=2)  # [n, d]
        return x

    def h_func(self):
        """Constrain 2-norm-squared of fc1 weights along m1 dim to be a DAG"""
        d = self.dims[0]
        fc1_weight = self.fc1_pos.weight - self.fc1_neg.weight  # [j * m1, i]
        fc1_weight = fc1_weight.view(d, -1, d)  # [j, m1, i]
        A = torch.sum(fc1_weight * fc1_weight, dim=1).t()  # [i, j]
        h = trace_expm(A) - d  # (Zheng et al. 2018)
        # A different formulation, slightly faster at the cost of numerical stability
        # M = torch.eye(d) + A / d  # (Yu et al. 2019)
        # E = torch.matrix_power(M, d - 1)
        # h = (E.t() * M).sum() - d
        return h

    def l2_reg(self):
        """Take 2-norm-squared of all parameters"""
        reg = 0.
        fc1_weight = self.fc1_pos.weight - self.fc1_neg.weight  # [j * m1, i]
        reg += torch.sum(fc1_weight ** 2)
        for fc in self.fc2:
            reg += torch.sum(fc.weight ** 2)
        return reg

    def fc1_l1_reg(self):
        """Take l1 norm of fc1 weight"""
        reg = torch.sum(self.fc1_pos.weight + self.fc1_neg.weight)
        return reg

    @torch.no_grad()
    def fc1_to_adj(self) -> np.ndarray:  # [j * m1, i] -> [i, j]
        """Get W from fc1 weights, take 2-norm over m1 dim"""
        d = self.dims[0]
        fc1_weight = self.fc1_pos.weight - self.fc1_neg.weight  # [j * m1, i]
        fc1_weight = fc1_weight.view(d, -1, d)  # [j, m1, i]
        A = torch.sum(fc1_weight * fc1_weight, dim=1).t()  # [i, j]
        W = torch.sqrt(A)  # [i, j]
        W = W.cpu().detach().numpy()  # [i, j]
        return W

class NotearsSobolev(nn.Module):
    def __init__(self, d, k):
        """d: num variables k: num expansion of each variable"""
        super(NotearsSobolev, self).__init__()
        self.d, self.k = d, k
        self.fc1_pos = nn.Linear(d * k, d, bias=False)  # ik -> j
        self.fc1_neg = nn.Linear(d * k, d, bias=False)
        self.fc1_pos.weight.bounds = self._bounds()
        self.fc1_neg.weight.bounds = self._bounds()
        nn.init.zeros_(self.fc1_pos.weight)
        nn.init.zeros_(self.fc1_neg.weight)
        self.l2_reg_store = None

    def _bounds(self):
        # weight shape [j, ik]
        bounds = []
        for j in range(self.d):
            for i in range(self.d):
                for _ in range(self.k):
                    if i == j:
                        bound = (0, 0)
                    else:
                        bound = (0, None)
                    bounds.append(bound)
        return bounds

    def sobolev_basis(self, x):  # [n, d] -> [n, dk]
        seq = []
        for kk in range(self.k):
            mu = 2.0 / (2 * kk + 1) / math.pi  # sobolev basis
            psi = mu * torch.sin(x / mu)
            seq.append(psi)  # [n, d] * k
        bases = torch.stack(seq, dim=2)  # [n, d, k]
        bases = bases.view(-1, self.d * self.k)  # [n, dk]
        return bases

    def forward(self, x):  # [n, d] -> [n, d]
        bases = self.sobolev_basis(x)  # [n, dk]
        x = self.fc1_pos(bases) - self.fc1_neg(bases)  # [n, d]
        self.l2_reg_store = torch.sum(x ** 2) / x.shape[0]
        return x

    def h_func(self):
        fc1_weight = self.fc1_pos.weight - self.fc1_neg.weight  # [j, ik]
        fc1_weight = fc1_weight.view(self.d, self.d, self.k)  # [j, i, k]
        A = torch.sum(fc1_weight * fc1_weight, dim=2).t()  # [i, j]
        h = trace_expm(A) - d  # (Zheng et al. 2018)
        # A different formulation, slightly faster at the cost of numerical stability
        # M = torch.eye(self.d) + A / self.d  # (Yu et al. 2019)
        # E = torch.matrix_power(M, self.d - 1)
        # h = (E.t() * M).sum() - self.d
        return h

    def l2_reg(self):
        reg = self.l2_reg_store
        return reg

    def fc1_l1_reg(self):
        reg = torch.sum(self.fc1_pos.weight + self.fc1_neg.weight)
        return reg

    @torch.no_grad()
    def fc1_to_adj(self) -> np.ndarray:
        fc1_weight = self.fc1_pos.weight - self.fc1_neg.weight  # [j, ik]
        fc1_weight = fc1_weight.view(self.d, self.d, self.k)  # [j, i, k]
        A = torch.sum(fc1_weight * fc1_weight, dim=2).t()  # [i, j]
        W = torch.sqrt(A)  # [i, j]
        W = W.cpu().detach().numpy()  # [i, j]
        return W

In [9]:
###################################
## function
###################################

In [10]:
def squared_loss(output, target):
    n = target.shape[0]
    loss = 0.5 / n * torch.sum((output - target) ** 2)
    return loss

def notears_nonlinear(model: nn.Module,
                      X: np.ndarray,
                      lambda1: float = 0.,
                      lambda2: float = 0.,
                      max_iter: int = 100,
                      h_tol: float = 1e-8,
                      rho_max: float = 1e+16,
                      w_threshold: float = 0.3):
    model.w_threshold = w_threshold
    rho, alpha, h = 1.0, 0.0, np.inf
    for _ in range(max_iter):
        rho, alpha, h = dual_ascent_step(model, X, lambda1, lambda2,
                                         rho, alpha, h, rho_max)
        if h <= h_tol or rho >= rho_max:
            break
    W_est = model.fc1_to_adj()
    W_est[np.abs(W_est) < w_threshold] = 0
    return W_est

def notears_nonlinear_with_loss_std(model: nn.Module,
                                    X_list: list,  
                                    lambda1: float = 0.0,
                                    lambda2: float = 0.0,
                                    max_iter: int = 100,
                                    h_tol: float = 1e-8,
                                    rho_max: float = 1e+16,
                                    w_threshold: float = 0.3,
                                    std_lambda: float = 1.0,
                                    lr: float = 1e-3
                                   ):  
    model.w_threshold = w_threshold    
    rho, alpha, h = 1.0, 0.0, np.inf
    for iter_no in range(max_iter):
        rho, alpha, h = dual_ascent_step_with_loss_std(
            model, X_list, lambda1, lambda2, std_lambda, rho, alpha, h, rho_max, iter_no, lr=lr)
        if h <= h_tol or rho >= rho_max:
            break
    W_est = model.fc1_to_adj()
    W_est[np.abs(W_est) < w_threshold] = 0
    return W_est

def dual_ascent_step(model, X, lambda1, lambda2, rho, alpha, h, rho_max):
    """Perform one step of dual ascent in augmented Lagrangian."""
    h_new = None
    optimizer = LBFGSBScipy(model.parameters())
    X_torch = torch.from_numpy(X)
    while rho < rho_max:
        def closure():
            optimizer.zero_grad()
            X_hat = model(X_torch)
            loss = squared_loss(X_hat, X_torch)
            h_val = model.h_func()
            penalty = 0.5 * rho * h_val * h_val + alpha * h_val
            l2_reg = 0.5 * lambda2 * model.l2_reg()
            l1_reg = lambda1 * model.fc1_l1_reg()
            primal_obj = loss + penalty + l2_reg + l1_reg
            # primal_obj = loss + penalty
            primal_obj.backward()
            return primal_obj
        optimizer.step(closure)  # NOTE: updates model in-place
        with torch.no_grad():
            h_new = model.h_func().item()
        if h_new > 0.25 * h:
            rho *= 10
        else:
            break
    alpha += rho * h_new
    return rho, alpha, h_new

def bak_dual_ascent_step_with_loss_std(model, X_list, lambda1, lambda2, std_lambda, rho, alpha, h, rho_max, iter_no, lr=0.001):
    """Perform one step of dual ascent in augmented Lagrangian, with consistent gradient-based learning (CGLearn) for each predictor."""
    
    crp = std_lambda  # Consistency ratio percentile
    h_new = None
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # Using Adam optimizer instead of LBFGSBScipy
    X_tensors = [torch.from_numpy(X).float().to(torch.double).to(model.fc1_pos.weight.device) for X in X_list]  # Ensure tensors are on the correct device
    
    while rho < rho_max:
        feature_l2_norms_pos_per_predictor = [[] for _ in range(model.dims[0])]  # Separate L2 norms for fc1_pos
        feature_l2_norms_neg_per_predictor = [[] for _ in range(model.dims[0])]  # Separate L2 norms for fc1_neg
        list_all_grads = []  # To store gradients for the entire model (all layers)

        # Step 1: Compute loss and gradients for each environment
        for X in X_tensors:
            # Loss calculation
            X_hat = model(X)
            loss_mse = squared_loss(X_hat, X)
            h_val = model.h_func()
            penalty = 0.5 * rho * h_val * h_val + alpha * h_val
            l2_reg = 0.5 * lambda2 * model.l2_reg()
            l1_reg = lambda1 * model.fc1_l1_reg()
            final_loss = (loss_mse + penalty + l2_reg + l1_reg) / len(X_tensors)  # Averaged across environments

            # Grad calculation
            optimizer.zero_grad()
            final_loss.backward()

            # Collect all gradients for the entire model (for all layers)
            grads = []
            for param in model.parameters():
                grads.append(param.grad.clone().flatten())
            all_grads = torch.cat(grads)  # Flatten and concatenate all gradients for all parameters
            list_all_grads.append(all_grads)  # Store gradients for this dataset

            # Compute and store gradients per predictor for the first hidden layer
            for i in range(model.dims[0]):  # Iterate over each predictor (input feature)
                # Get the gradients for the first hidden layer (fc1_pos and fc1_neg)
                pos_g = model.fc1_pos.weight.grad.view(model.dims[0], -1, model.dims[0])[i, :, :]  # Shape [5, 10]
                neg_g = model.fc1_neg.weight.grad.view(model.dims[0], -1, model.dims[0])[i, :, :]  # Shape [5, 10]
                
                ## 
                
                # Compute L2 norms of the gradients across the hidden neurons for each feature
                l2_norms_pos = torch.norm(pos_g, dim=0, p=2).detach()  # L2 norm across hidden neurons, result shape: [10]
                l2_norms_neg = torch.norm(neg_g, dim=0, p=2).detach()  # Same for fc1_neg

                # Store L2 norms for each predictor
                feature_l2_norms_pos_per_predictor[i].append(l2_norms_pos)
                feature_l2_norms_neg_per_predictor[i].append(l2_norms_neg)

        # Step 2: Compute mean of all gradients for the entire model across all datasets
        list_all_grads = torch.stack(list_all_grads)  # Stack all gradients for all datasets
        mean_all_grads = torch.mean(list_all_grads, dim=0)  # Mean gradient for the entire model

        # Step 3: Gradient consistency calculations (CGLearn) per predictor
        consistency_masks_pos = []
        consistency_masks_neg = []
        
        for i in range(model.dims[0]):  # Iterate over each predictor (input feature)
            # Consistency check for fc1_pos weights of predictor i
            feature_l2_norms_pos = torch.stack(feature_l2_norms_pos_per_predictor[i])  # Stack L2 norms across datasets
            mean_norms_pos = torch.mean(feature_l2_norms_pos, dim=0)  # Mean of L2 norms for each feature
            std_norms_pos = torch.std(feature_l2_norms_pos, dim=0) + 1e-8  # Standard deviation of L2 norms for each feature
            cr_pos = torch.abs(mean_norms_pos) / std_norms_pos  # Consistency ratio
            ct_pos = np.percentile(cr_pos.cpu().numpy(), crp)  # Threshold based on the percentile
            consistency_mask_pos = torch.where(cr_pos >= ct_pos, torch.tensor(1., device=model.fc1_pos.weight.device), torch.tensor(0., device=model.fc1_pos.weight.device))            
            consistency_masks_pos.append(consistency_mask_pos.repeat(model.dims[1], 1))  # Broadcast the mask
            
            # Consistency check for fc1_neg weights of predictor i
            feature_l2_norms_neg = torch.stack(feature_l2_norms_neg_per_predictor[i])  # Same for fc1_neg
            mean_norms_neg = torch.mean(feature_l2_norms_neg, dim=0)
            std_norms_neg = torch.std(feature_l2_norms_neg, dim=0) + 1e-8
            cr_neg = torch.abs(mean_norms_neg) / std_norms_neg  # Consistency ratio
            ct_neg = np.percentile(cr_neg.cpu().numpy(), crp)
            consistency_mask_neg = torch.where(cr_neg >= ct_neg, torch.tensor(1., device=model.fc1_neg.weight.device), torch.tensor(0., device=model.fc1_neg.weight.device))
            consistency_masks_neg.append(consistency_mask_neg.repeat(model.dims[1], 1))
            
        # Step 4: Apply the masks to gradients before updating parameters
        cmp = torch.stack(consistency_masks_pos).view(-1, model.dims[0])     
        cmn = torch.stack(consistency_masks_neg).view(-1, model.dims[0])
        
        start_index = 0
        for name, param in model.named_parameters():
            param_numel = param.numel()
            mean_grad = mean_all_grads[start_index: start_index + param_numel].view_as(param)
            if 'fc1_pos.weight' in name:
                param.grad = mean_grad * cmp       
            elif 'fc1_neg.weight' in name:
                param.grad = mean_grad * cmn    
            else:
                param.grad = mean_grad
            start_index += param_numel

        # Step 5: Now that the gradients have been modified, perform the optimization step
        optimizer.step()

        # Step 6: Check convergence and adjust rho
        with torch.no_grad():
            h_new = model.h_func().item()
        if h_new > 0.25 * h:
            rho *= 10
        else:
            break
            
        # ww = model.fc1_to_adj()
        # ww[np.abs(ww) < model.w_threshold] = 0
        # print(ww)

    alpha += rho * h_new
    return rho, alpha, h_new

def dual_ascent_step_with_loss_std(model, X_list, lambda1, lambda2, std_lambda, rho, alpha, h, rho_max, iter_no, lr=0.001):
    """Perform one step of dual ascent in augmented Lagrangian, with consistent gradient-based learning (CGLearn) for each predictor."""
    
    crp = std_lambda  # Consistency ratio percentile
    h_new = None
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # Using Adam optimizer instead of LBFGSBScipy
    X_tensors = [torch.from_numpy(X).float().to(torch.double).to(model.fc1_pos.weight.device) for X in X_list]  # Ensure tensors are on the correct device
    
    while rho < rho_max:
        list_all_grads = []  # To store gradients for the entire model (all layers)

        # Step 1: Compute loss and gradients for each environment
        for X in X_tensors:
            # Loss calculation
            X_hat = model(X)
            loss_mse = squared_loss(X_hat, X)
            h_val = model.h_func()
            penalty = 0.5 * rho * h_val * h_val + alpha * h_val
            l2_reg = 0.5 * lambda2 * model.l2_reg()
            l1_reg = lambda1 * model.fc1_l1_reg()
            final_loss = (loss_mse + penalty + l2_reg + l1_reg) / len(X_tensors)  # Averaged across environments
            # Grad calculation
            optimizer.zero_grad()
            final_loss.backward()
            # Collect all gradients for the entire model (for all layers)
            grads = []
            for param in model.parameters():
                grads.append(param.grad.clone().flatten())
            all_grads = torch.cat(grads)  # Flatten and concatenate all gradients for all parameters
            list_all_grads.append(all_grads)  # Store gradients for this dataset
            
        # Step 2: Compute consistency mask
        list_all_grads = torch.stack(list_all_grads)  # Stack all gradients for all datasets
        mean_all_grads = torch.mean(list_all_grads, dim=0)  # Mean gradient for the entire model
        std_all_grads = torch.std(list_all_grads, dim=0) + 1e-8  # StdDev gradient for the entire model
        cr_all_grads = torch.abs(mean_all_grads) / std_all_grads 
        ct_all_grads = np.percentile(cr_all_grads.cpu().numpy(), crp) 
        cm_all_grads = torch.where(cr_all_grads >= ct_all_grads, torch.tensor(1., device=model.fc1_pos.weight.device), torch.tensor(0., device=model.fc1_pos.weight.device))            

        # step 3: update the mean gradient based on consistency
        updated_mean_all_grads = mean_all_grads * cm_all_grads 

        # step 4: update the parameters with updated mean gradient
        start_index = 0
        for name, param in model.named_parameters():
            param_numel = param.numel()
            mean_grad = updated_mean_all_grads[start_index: start_index + param_numel].view_as(param)
            if 'fc1_pos.weight' in name:
                param.grad = mean_grad       
            elif 'fc1_neg.weight' in name:
                param.grad = mean_grad    
            else:
                param.grad = mean_grad
            start_index += param_numel

        # Step 5: Now that the gradients have been modified, perform the optimization step
        optimizer.step()

        # Step 6: Check convergence and adjust rho
        with torch.no_grad():
            h_new = model.h_func().item()
        if h_new > 0.25 * h:
            rho *= 10
        else:
            break
            
        # ww = model.fc1_to_adj()
        # ww[np.abs(ww) < model.w_threshold] = 0
        # print(ww)

    alpha += rho * h_new
    return rho, alpha, h_new


In [11]:
## multiple runs

In [12]:
torch.set_default_dtype(torch.double)
np.set_printoptions(precision=3)
list_res = []
column_titles = ["ntrials", "sample", "node", "edge", "graph", "SEM", "hiddenU", "l1", "l2", "wthresh", "lr", "cthresh", "fdr", "shd", "tpr", "nnz"]
df = pd.DataFrame(list_res, columns=column_titles)
df.to_csv("result2.csv", index=False, encoding="utf-8-sig")  
print("CSV saved as result2.csv")
##############################################################
########## set parameters for experiments
ntrials = 5
list_opt = [ ## + check number of trials
    (200, 10, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.001),
    (200, 10, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.01),
    (200, 10, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.05),
    (200, 10, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.1),
    (200, 10, 4, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.001),
    (200, 10, 4, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.01),
    (200, 10, 4, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.05),
    (200, 10, 4, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.1),
    (200, 20, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.001),
    (200, 20, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.01),
    (200, 20, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.05),
    (200, 20, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.1),
    (200, 20, 4, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.001),
    (200, 20, 4, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.01),
    (200, 20, 4, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.05),
    (200, 20, 4, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.1),    
]
##############################################################
for opt in list_opt:
    n, d, _s0, graph_type, sem_type, nh, l1, l2, wt, lr = opt
    s0 = d*_s0
    list_fdr, list_shd, list_tpr, list_nnz = [], [], [], []
    list_fdr2, list_shd2, list_tpr2, list_nnz2 = [], [], [], []    
    list_fdr3, list_shd3, list_tpr3, list_nnz3 = [], [], [], []    
    list_fdr4, list_shd4, list_tpr4, list_nnz4 = [], [], [], []    
    list_fdr5, list_shd5, list_tpr5, list_nnz5 = [], [], [], []        
    for tn in range(ntrials):
        ##
        ## initialize
        ##
        ut.set_random_seed(123+tn)
        B_true = ut.simulate_dag(d, s0, graph_type)
        np.savetxt('inputs/W_true.csv', B_true, delimiter=',')
        # noise_scale = np.ones(d)
        # X = ut.simulate_nonlinear_sem(B_true, n, sem_type, noise_scale)
        # np.savetxt('X.csv', X, delimiter=',')
        noise_scales = [0.2, 1, 2, 5, 10]
        for i, noise_scale_value in enumerate(noise_scales):
            noise_scale = np.full(d, noise_scale_value)  
            X = ut.simulate_nonlinear_sem(B_true, n, sem_type, noise_scale)
            np.savetxt(f'inputs/X_{i}.csv', X, delimiter=',')  

        X_0 = np.loadtxt('inputs/X_0.csv', delimiter=',')
        X_1 = np.loadtxt('inputs/X_1.csv', delimiter=',')
        X_2 = np.loadtxt('inputs/X_2.csv', delimiter=',')
        X_3 = np.loadtxt('inputs/X_3.csv', delimiter=',')
        X_4 = np.loadtxt('inputs/X_4.csv', delimiter=',')
        X_list = [X_0, X_1, X_2, X_3, X_4]  # List of datasets
        scaler = StandardScaler()
        X_list_standardized = [scaler.fit_transform(X) for X in X_list] ## separate fit_transform cause each dataset with different scale (diff noise scale)
        X_combined = np.vstack([X for X in X_list_standardized]) 
        ##
        ## notears
        ##
        model = NotearsMLP(dims=[d, nh, 1], bias=True)
        W_est = notears_nonlinear(model, X_combined, lambda1=l1, lambda2=l2, w_threshold=wt)
        # assert ut.is_dag(W_est)
        np.savetxt('outputs/W_est.csv', W_est, delimiter=',')
        acc = ut.count_accuracy(B_true, W_est != 0)
        list_fdr.append(acc['fdr'])
        list_shd.append(acc['shd'])
        list_tpr.append(acc['tpr'])
        list_nnz.append(acc['nnz'])   
        ##
        ## inotears
        ## 2
        model = NotearsMLP2(dims=[d, nh, 1], bias=True)
        W_est = notears_nonlinear_with_loss_std(model, X_list_standardized, lambda1=l1, lambda2=l2, std_lambda=20, w_threshold=wt, lr=lr)
        # assert ut.is_dag(W_est)
        np.savetxt('outputs/W_est2.csv', W_est, delimiter=',')
        acc = ut.count_accuracy(B_true, W_est != 0)
        list_fdr2.append(acc['fdr'])
        list_shd2.append(acc['shd'])
        list_tpr2.append(acc['tpr'])
        list_nnz2.append(acc['nnz'])        
        ## 3
        model = NotearsMLP2(dims=[d, nh, 1], bias=True)
        W_est = notears_nonlinear_with_loss_std(model, X_list_standardized, lambda1=l1, lambda2=l2, std_lambda=40, w_threshold=wt, lr=lr)
        # assert ut.is_dag(W_est)
        np.savetxt('outputs/W_est3.csv', W_est, delimiter=',')
        acc = ut.count_accuracy(B_true, W_est != 0)
        list_fdr3.append(acc['fdr'])
        list_shd3.append(acc['shd'])
        list_tpr3.append(acc['tpr'])
        list_nnz3.append(acc['nnz'])        
        ## 4
        model = NotearsMLP2(dims=[d, nh, 1], bias=True)
        W_est = notears_nonlinear_with_loss_std(model, X_list_standardized, lambda1=l1, lambda2=l2, std_lambda=60, w_threshold=wt, lr=lr)
        # assert ut.is_dag(W_est)
        np.savetxt('outputs/W_est4.csv', W_est, delimiter=',')
        acc = ut.count_accuracy(B_true, W_est != 0)
        list_fdr4.append(acc['fdr'])
        list_shd4.append(acc['shd'])
        list_tpr4.append(acc['tpr'])
        list_nnz4.append(acc['nnz'])        
        ## 5
        model = NotearsMLP2(dims=[d, nh, 1], bias=True)
        W_est = notears_nonlinear_with_loss_std(model, X_list_standardized, lambda1=l1, lambda2=l2, std_lambda=80, w_threshold=wt, lr=lr)
        # assert ut.is_dag(W_est)
        np.savetxt('outputs/W_est5.csv', W_est, delimiter=',')
        acc = ut.count_accuracy(B_true, W_est != 0)
        list_fdr5.append(acc['fdr'])
        list_shd5.append(acc['shd'])
        list_tpr5.append(acc['tpr'])
        list_nnz5.append(acc['nnz'])        

    print()
    print()
    print('####################################################################')
    print(opt)
    print(1)
    print(f'FDR: {np.mean(list_fdr):.4f} ± {np.std(list_fdr):.4f}')
    print(f'SHD: {np.mean(list_shd):.4f} ± {np.std(list_shd):.4f}')
    print(f'TPR: {np.mean(list_tpr):.4f} ± {np.std(list_tpr):.4f}')
    print(f'NNZ: {np.mean(list_nnz):.4f} ± {np.std(list_nnz):.4f}')
    res = (
        str(ntrials), str(n), str(d), str(s0), str(graph_type), str(sem_type), str(nh), str(l1), str(l2), str(wt), str('NA'), 
        str('reg Notears'), 
        f'{np.mean(list_fdr):.4f} ± {np.std(list_fdr):.4f}', 
        f'{np.mean(list_shd):.4f} ± {np.std(list_shd):.4f}',
        f'{np.mean(list_tpr):.4f} ± {np.std(list_tpr):.4f}',
        f'{np.mean(list_nnz):.4f} ± {np.std(list_nnz):.4f}'
    )
    list_res.append(res)
    print()
    
    print(2)
    print(f'FDR: {np.mean(list_fdr2):.4f} ± {np.std(list_fdr2):.4f}')
    print(f'SHD: {np.mean(list_shd2):.4f} ± {np.std(list_shd2):.4f}')
    print(f'TPR: {np.mean(list_tpr2):.4f} ± {np.std(list_tpr2):.4f}')
    print(f'NNZ: {np.mean(list_nnz2):.4f} ± {np.std(list_nnz2):.4f}')
    res = (
        str(ntrials), str(n), str(d), str(s0), str(graph_type), str(sem_type), str(nh), str(l1), str(l2), str(wt), str(lr), 
        str(20), 
        f'{np.mean(list_fdr2):.4f} ± {np.std(list_fdr2):.4f}', 
        f'{np.mean(list_shd2):.4f} ± {np.std(list_shd2):.4f}',
        f'{np.mean(list_tpr2):.4f} ± {np.std(list_tpr2):.4f}',
        f'{np.mean(list_nnz2):.4f} ± {np.std(list_nnz2):.4f}'
    )
    list_res.append(res)    
    print()
    
    print(3)
    print(f'FDR: {np.mean(list_fdr3):.4f} ± {np.std(list_fdr3):.4f}')
    print(f'SHD: {np.mean(list_shd3):.4f} ± {np.std(list_shd3):.4f}')
    print(f'TPR: {np.mean(list_tpr3):.4f} ± {np.std(list_tpr3):.4f}')
    print(f'NNZ: {np.mean(list_nnz3):.4f} ± {np.std(list_nnz3):.4f}')
    res = (
        str(ntrials), str(n), str(d), str(s0), str(graph_type), str(sem_type), str(nh), str(l1), str(l2), str(wt), str(lr), 
        str(40), 
        f'{np.mean(list_fdr3):.4f} ± {np.std(list_fdr3):.4f}', 
        f'{np.mean(list_shd3):.4f} ± {np.std(list_shd3):.4f}',
        f'{np.mean(list_tpr3):.4f} ± {np.std(list_tpr3):.4f}',
        f'{np.mean(list_nnz3):.4f} ± {np.std(list_nnz3):.4f}'
    )
    list_res.append(res)    
    print()
    
    print(4)
    print(f'FDR: {np.mean(list_fdr4):.4f} ± {np.std(list_fdr4):.4f}')
    print(f'SHD: {np.mean(list_shd4):.4f} ± {np.std(list_shd4):.4f}')
    print(f'TPR: {np.mean(list_tpr4):.4f} ± {np.std(list_tpr4):.4f}')
    print(f'NNZ: {np.mean(list_nnz4):.4f} ± {np.std(list_nnz4):.4f}')
    res = (
        str(ntrials), str(n), str(d), str(s0), str(graph_type), str(sem_type), str(nh), str(l1), str(l2), str(wt), str(lr), 
        str(60), 
        f'{np.mean(list_fdr4):.4f} ± {np.std(list_fdr4):.4f}', 
        f'{np.mean(list_shd4):.4f} ± {np.std(list_shd4):.4f}',
        f'{np.mean(list_tpr4):.4f} ± {np.std(list_tpr4):.4f}',
        f'{np.mean(list_nnz4):.4f} ± {np.std(list_nnz4):.4f}'
    )
    list_res.append(res)    
    print()
    
    print(5)
    print(f'FDR: {np.mean(list_fdr5):.4f} ± {np.std(list_fdr5):.4f}')
    print(f'SHD: {np.mean(list_shd5):.4f} ± {np.std(list_shd5):.4f}')
    print(f'TPR: {np.mean(list_tpr5):.4f} ± {np.std(list_tpr5):.4f}')
    print(f'NNZ: {np.mean(list_nnz5):.4f} ± {np.std(list_nnz5):.4f}')
    res = (
        str(ntrials), str(n), str(d), str(s0), str(graph_type), str(sem_type), str(nh), str(l1), str(l2), str(wt), str(lr), 
        str(80), 
        f'{np.mean(list_fdr5):.4f} ± {np.std(list_fdr5):.4f}', 
        f'{np.mean(list_shd5):.4f} ± {np.std(list_shd5):.4f}',
        f'{np.mean(list_tpr5):.4f} ± {np.std(list_tpr5):.4f}',
        f'{np.mean(list_nnz5):.4f} ± {np.std(list_nnz5):.4f}'
    )
    list_res.append(res)    
    print()
    print('####################################################################')
    print()
    print()
    df = pd.DataFrame(list_res, columns=column_titles)
    df.to_csv("result2.csv", index=False, encoding="utf-8-sig") 
    print("CSV saved as result2.csv")    

CSV saved as result2.csv


####################################################################
(200, 10, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.001)
1
FDR: 0.5670 ± 0.0442
SHD: 10.4000 ± 0.8000
TPR: 0.3200 ± 0.1470
NNZ: 7.4000 ± 3.1369

2
FDR: 0.8884 ± 0.0010
SHD: 45.0000 ± 0.0000
TPR: 1.0000 ± 0.0000
NNZ: 89.6000 ± 0.8000

3
FDR: 0.8884 ± 0.0010
SHD: 45.0000 ± 0.0000
TPR: 1.0000 ± 0.0000
NNZ: 89.6000 ± 0.8000

4
FDR: 0.8889 ± 0.0000
SHD: 45.0000 ± 0.0000
TPR: 1.0000 ± 0.0000
NNZ: 90.0000 ± 0.0000

5
FDR: 0.8886 ± 0.0005
SHD: 45.0000 ± 0.0000
TPR: 1.0000 ± 0.0000
NNZ: 89.8000 ± 0.4000

####################################################################


CSV saved as result2.csv


  eAw = eAw @ eAw
  eAw = eAw @ eAw




####################################################################
(200, 10, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.01)
1
FDR: 0.5593 ± 0.0513
SHD: 10.2000 ± 0.7483
TPR: 0.3200 ± 0.1470
NNZ: 7.2000 ± 2.7857

2
FDR: 0.0000 ± 0.0000
SHD: 10.0000 ± 0.0000
TPR: 0.0000 ± 0.0000
NNZ: 0.0000 ± 0.0000

3
FDR: 0.2000 ± 0.4000
SHD: 10.2000 ± 0.4000
TPR: 0.0000 ± 0.0000
NNZ: 0.2000 ± 0.4000

4
FDR: 0.5000 ± 0.4472
SHD: 9.8000 ± 0.9798
TPR: 0.0600 ± 0.0800
NNZ: 1.4000 ± 0.8000

5
FDR: 0.8654 ± 0.0104
SHD: 43.2000 ± 1.1662
TPR: 0.9400 ± 0.0490
NNZ: 70.0000 ± 1.8974

####################################################################


CSV saved as result2.csv


####################################################################
(200, 10, 1, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.05)
1
FDR: 0.5760 ± 0.0420
SHD: 10.4000 ± 0.8000
TPR: 0.3000 ± 0.1095
NNZ: 7.2000 ± 2.7857

2
FDR: 1.0000 ± 0.0000
SHD: 14.8000 ± 1.6000
TPR: 0.0000 ± 0.0000
NNZ: 4.8000 ± 1.6000

3
FDR: 0.2000 ± 0.4000
SHD: 10.8000 ± 1.

  eAw = eAw @ eAw
  eAw = eAw @ eAw




####################################################################
(200, 10, 4, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.01)
1
FDR: 0.5188 ± 0.1301
SHD: 34.2000 ± 1.9391
TPR: 0.1850 ± 0.0515
NNZ: 15.6000 ± 2.7276

2
FDR: 0.2000 ± 0.4000
SHD: 40.0000 ± 0.0000
TPR: 0.0000 ± 0.0000
NNZ: 0.2000 ± 0.4000

3
FDR: 0.0000 ± 0.0000
SHD: 40.0000 ± 0.0000
TPR: 0.0000 ± 0.0000
NNZ: 0.0000 ± 0.0000

4
FDR: 0.0000 ± 0.0000
SHD: 40.0000 ± 0.0000
TPR: 0.0000 ± 0.0000
NNZ: 0.0000 ± 0.0000

5
FDR: 0.5353 ± 0.0663
SHD: 36.4000 ± 4.9639
TPR: 0.8300 ± 0.1155
NNZ: 71.6000 ± 4.3635

####################################################################


CSV saved as result2.csv


####################################################################
(200, 10, 4, 'ER', 'mlp', 10, 0.01, 0.01, 0.3, 0.05)
1
FDR: 0.5250 ± 0.1130
SHD: 34.2000 ± 1.9391
TPR: 0.1850 ± 0.0515
NNZ: 15.6000 ± 2.2450

2
FDR: 1.0000 ± 0.0000
SHD: 41.6000 ± 0.8000
TPR: 0.0000 ± 0.0000
NNZ: 1.6000 ± 0.8000

3
FDR: 0.0000 ± 0.0000
SHD: 40.0000 ±

In [13]:
print('done')

done
