In [1]:
import numpy as np
import pickle as pkl
import torch
from torch import nn

# Seed every random number generator used by the notebook with one shared
# value so that a fresh "Restart & Run All" regenerates the same dataset.
SEED = 2020
for seed_fn in (
    np.random.seed,              # NumPy's global (legacy) generator
    torch.manual_seed,           # PyTorch default generator
    torch.cuda.manual_seed,      # current CUDA device
    torch.cuda.manual_seed_all,  # every CUDA device
):
    seed_fn(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Generate synthetic data using a line graph

Here, we generate the synthetic data on a line graph consisting of $N+2$ points: the $N$ units of interest plus $2$ additional endpoints, which are needed because each unit's neighborhood contains its $2$ adjacent points (one on each side). The covariates $\boldsymbol{X}$, unobserved confounders $U$, treatments $T$, and outcomes $Y$ are generated as follows:

\begin{align*}
\boldsymbol{X}_i &\sim \mathcal{N}(\boldsymbol{0}, \sigma_{X}^2 \boldsymbol{I}) \\
\{U_i\}_{i=1}^{N} &\sim \mathcal{N}_{N}(\boldsymbol{0}_{N}, \boldsymbol{D}_N) \\
T_i | \boldsymbol{X}_i, U_i &\sim f_{\theta} (\boldsymbol{X}_i, U_i) > 0 \\
Y_i | \boldsymbol{T}_i, \boldsymbol{X}_i, U_i &\sim \beta T_i + g_{\theta_T}(d_i \odot \boldsymbol{T}_{-i}) + g_{\theta_X}(\boldsymbol{X}_i) + U_i + \epsilon
\end{align*}

where $\boldsymbol{0}_{N}$ is a zero vector of length $N$, $\boldsymbol{D}_N$ is an $N \times N$ matrix with entries $D_{ij} = \frac{1}{\sigma_U \sqrt{2\pi}} \exp\left(-\frac{d_{ij}}{2l^2}\right)$, $d_{ij}$ is the distance between units $i$ and $j$ on the line, $l$ is the lengthscale, $\epsilon$ is Gaussian observation noise, and $d_i$ is the $i^{th}$ row of $\boldsymbol{D}_N$. $T_i$ must be greater than zero to satisfy the positivity assumption. $\boldsymbol{T}_{-i}$ represents the treatment assignments of the neighbors of unit $i$ (excluding unit $i$ itself); how many neighbors are included depends on the neighborhood size.

In [2]:
# --- Configuration -----------------------------------------------------------
N = 500                     # number of units kept in the final dataset
x_dim = 4                   # dimension of the covariate vector X_i
sigma_x, lengthscale_u, sigma_u = 1., 0.5, 0.5
noise_scale = 0.1           # std. dev. of the additive outcome noise
neighborhood_size = 1       # number of neighbors used on EACH side of a unit
if neighborhood_size < 1:
    raise ValueError("neighborhood_size must be a positive integer")

# Every retained unit needs `neighborhood_size` neighbors on both sides, so the
# line is padded with that many extra points at each end (N + 2 points in total
# for the default neighborhood_size of 1). Deriving the padding from
# neighborhood_size keeps the neighbor windows below rectangular for any size.
pad = neighborhood_size
n_total = N + 2*pad

# --- Latent structure: positions, kernel matrix, covariates, confounder ------
s1, s2 = np.linspace(0, 1, n_total), np.linspace(0, 1, n_total)
s_grid = np.meshgrid(s1, s2)
# D_n[i, j] decays exponentially in the distance |s_i - s_j| and is scaled by
# 1 / (sqrt(2*pi) * sigma_u); it serves both as the covariance of U and as the
# source of the neighbor weights further down.
D_n = np.exp(-np.abs(s_grid[1]-s_grid[0])/(2*lengthscale_u**2)) / (np.sqrt(2*np.pi)*sigma_u)
X = np.random.multivariate_normal(np.zeros(x_dim), np.eye(x_dim)*sigma_x**2, size=n_total)
# One joint draw over all points, transposed into a column of shape (n_total, 1).
U = np.random.multivariate_normal(np.zeros(n_total), D_n, size=1).T

# --- Treatment: nonlinear map of (X, U) --------------------------------------
hidden_dim_f = 10
f = nn.Sequential(
    nn.Linear(x_dim+1, hidden_dim_f),
    nn.LeakyReLU(),
    nn.Linear(hidden_dim_f, 1),
    nn.Softplus()           # final Softplus keeps the generated treatment positive
)
with torch.no_grad():
    T = f(torch.tensor(np.concatenate([X,U],axis=1)).float()).numpy()

# --- Outcome components: nonlinear maps of neighbor exposure and covariates ---
hidden_dim_g_T, hidden_dim_g_X = 16, 16
g_T = nn.Sequential(
    nn.Linear(2*neighborhood_size, hidden_dim_g_T),
    nn.LeakyReLU(),
    nn.Linear(hidden_dim_g_T, hidden_dim_g_T),
    nn.LeakyReLU(),
    nn.Linear(hidden_dim_g_T, 1)
)
g_X = nn.Sequential(
    nn.Linear(x_dim, hidden_dim_g_X),
    nn.LeakyReLU(),
    nn.Linear(hidden_dim_g_X, hidden_dim_g_X),
    nn.LeakyReLU(),
    nn.Linear(hidden_dim_g_X, 1)
)

# Indices (in the padded arrays) of the N units that are kept.
centers = range(pad, pad + N)
# Kernel-weighted treatments of the `pad` neighbors to the left / right of each
# kept unit. Shape of T_bar is (N, neighborhood_size*2).
T_bar_left = np.array([D_n[c, c-pad:c]*T[c-pad:c, 0] for c in centers])
T_bar_right = np.array([D_n[c, c+1:c+1+pad]*T[c+1:c+1+pad, 0] for c in centers])
T_bar = np.concatenate([T_bar_left,T_bar_right], axis=1)
# Kernel weights over each unit's full window (left neighbors, the unit itself,
# right neighbors). Shape is (N, neighborhood_size*2 + 1).
W = np.array([D_n[c, c-pad:c+1+pad] for c in centers])

with torch.no_grad():
    Y_t_bar = g_T(torch.tensor(T_bar).float()).numpy()
    # Same network evaluated at all-zero neighbor exposure: the reference level
    # for "no treatment among the neighbors".
    Y_t_bar_0 = g_T(torch.zeros_like(torch.tensor(T_bar)).float()).numpy()
    Y_x = g_X(torch.tensor(X).float()).numpy()

# Drop the padding so T, X, U and Y_x line up row-for-row with T_bar.
interior = slice(pad, pad + N)
T, X, U, Y_x = T[interior], X[interior], U[interior], Y_x[interior]

# --- Outcomes ----------------------------------------------------------------
beta = np.random.rand()     # coefficient of the unit's own treatment
noise = np.random.normal(loc=0.0, scale=noise_scale, size=N).reshape(-1,1)
# Y_ab: a = 1 includes the unit's own treatment term, b = 1 uses the actual
# neighbor exposure (b = 0 uses the zero-exposure reference). The same noise
# draw is shared by all four so it cancels in the contrasts below.
Y_00 = Y_t_bar_0 + Y_x + U + noise
Y_01 = Y_t_bar + Y_x + U + noise
Y_10 = beta*T + Y_t_bar_0 + Y_x + U + noise
Y_11 = beta*T + Y_t_bar + Y_x + U + noise

# Calculate ground truth of direct and indirect effects
de_0, de_1 = np.mean(Y_10 - Y_00), np.mean(Y_11 - Y_01)
ie_0, ie_1 = np.mean(Y_01 - Y_00), np.mean(Y_11 - Y_10)
te = np.mean(Y_11 - Y_00)

# --- Persist -----------------------------------------------------------------
# "Y" is the fully treated outcome Y_11; "s" holds the positions of the kept
# units as a column vector.
data = {
    "neighborhood_size": neighborhood_size,
    "T": T, "T_bar": T_bar, "X": X, "Y": Y_11, "W": W, "s": s1[interior, np.newaxis],
    "de_0": de_0, "de_1": de_1, "ie_0": ie_0, "ie_1": ie_1, "te": te,
    "Y_t": beta*T, "Y_t_bar": Y_t_bar, "Y_x": Y_x, "U": U
}
with open("./synthetic_data.pkl", "wb") as fp:
    pkl.dump(data, fp)