In [1]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal
import scipy.sparse as sp	
from statsmodels.api import Logit

import gurobipy as gp
from gurobipy import GRB

This is an implementation of the simulation section of Yu's Balancing Weights Causal Inference in Observational Factorial Studies paper (Yu 5.1).

### Data Initialization

In [131]:
rseed = 420
np.random.seed(rseed)
n = 500 # set as 500, 1000, 2000
rho = 0 # set as 0, 0.2, 0.4 for each scenario

Following the specification in Yu: "the treatment assignment mechanism for $Z_{ik}$ is independent across $k$'s and satisfies a logistic regression that $P(Z_{ik} = 1) = \frac{1}{(1+exp(-\beta_k^T X_i))}$ where "$\beta_1 = (\frac{1}{4}, \frac{2}{4}, 0, \frac{3}{4}, 1)$, $\beta_2 = (\frac{3}{4}, \frac{1}{4}, 1, 0, \frac{2}{4})$, $\beta_3 = (1, 0, \frac{3}{4}, \frac{2}{4}, \frac{1}{4})$."

This treatment assignment ensures all $2^3=8$ treatment combination groups are non-empty and observed so that the paper's proposed weighting estimators are applicable. We additionally assume conditional independence of factors given covariates, and that only the main factors of the three treatments are non-negligible.

In [132]:
# Define mean vector (mu)
mu = np.array([0.1, 0.1, 0.1, 0, 0]).T

# Define covariance matrix (Sigma) with 5 covariates, defined as according to paper specifications
#   - Diagonal filled with ones, rest with correlation coefficient rho
Sigma = np.full((5, 5), rho)
np.fill_diagonal(Sigma, 1)  

# Generate covariates (X) from multivariate normal
np.random.seed(rseed)
X = multivariate_normal.rvs(mean=mu, cov=Sigma, size=n)

In [133]:
# Define beta coefficients for treatment assignments to fulfill all treatment outcomes observed condition
beta_1 = np.array([1/4, 2/4, 0, 3/4, 1])
beta_2 = np.array([3/4, 1/4, 1, 0, 2/4])
beta_3 = np.array([1, 0, 3/4, 2/4, 1/4])

# Logistic function to generate treatment assignments
def logistic_prob(X, beta):
    return 1 / (1 + np.exp(-X @ beta))

# Generate treatment assignments independently
Z1 = np.random.binomial(1, logistic_prob(X, beta_1), size=n)
Z2 = np.random.binomial(1, logistic_prob(X, beta_2), size=n)
Z3 = np.random.binomial(1, logistic_prob(X, beta_3), size=n)

For our problem setup, we next consider three outcome models, with errors following standard normal distribution.
    
An additive outcome: 
$Y_{i1} = 2\sum_{k=1}^5 X_{ik} + \sum_{j=1}^3 Z_{ij} + \epsilon_{i1}$ 
    
A heterogeneous treatment effect outcome: 
$Y_{i2} = 2\sum_{k=1}^5 X_{ik} + \sum_{k=1}^5 X_{ik} \sum_{j=1}^3 Z_{ij} + \epsilon_{i2}$ 
    
A misspecified outcome:
$Y_{i3} = sin(X_{i1}) + cos(X_{i2}) + (min(1, X_{i1}) + X_{i2})Z_{i1} + \sum_{k=1}^5 X_{ik} \sum_{j=2}^3 Z_{ij} + \epsilon_{i3}$

Additionally, four estimators are implemented for each main effect $\tau_k, k=1, 2, 3$ under each outcome model:
- The additive regression estimator
- The interaction regression estimator
- The weighting estimator under general additive model assumption and covariate basis functions $h_s(X) = X_s, s=1,... 5$
- The weighting estimator with balance constraints under the outcome model specification with treatment effect heterogeneity, and the same basis functions

https://par.nsf.gov/servlets/purl/10337012 - Supplementary reading to learn more about regression based methods causal inference with factorial experiments

### Outcome Model Simulation

First, making the "true" outcomes Y1, Y2, Y3 according to the outcome models specified in each simulation.

In [134]:
np.random.seed(rseed)
# From top to bottom: defining general, heterogeneous treatment effect, and misspecified outcome models
Y1 = 2*np.sum(X, axis=1) + Z1 + Z2 + Z3 + np.random.normal(0, 1, n)
Y2 = 2*np.sum(X, axis=1) + np.sum(X, axis=1)*(Z1 + Z2 + Z3) + np.random.normal(0, 1, n)
Y3 = np.sin(X[:,0]) + np.cos(X[:,1]) + (np.minimum(1, X[:,0]) + X[:,1]) * Z1 + np.sum(X, axis=1) * (Z2 + Z3) + np.random.normal(0, 1, n)

Next, we define three separate $N \times 2^K$ potential outcomes matrices $Y_p$ for each outcome model , where $K=3$, the number of main effects under analysis. Then each row for a unit $i$ is representative of the potential outcomes for 8 different treatment combinations of the factors Z1, Z2, and Z3 under the specified outcome model. 

As part of this process we use an assignment matrix $W$ to encode the 8 possible different treatment combinations of Z1, Z2, Z3 main effects. This is a purely theoretical construction.

In [135]:
# define the assignment matrix W for the 8 different unique treatment combinations under 3 main effect factors
W = np.array([[1, 1, 1, 1, 0, 0, 0, 0],  # Each column is a unique treatment combination from 3 binary factors, each row is a factor
             [1, 1, 0, 0, 1, 1, 0, 0],
             [1, 0, 1, 0, 1, 0, 1, 0]])
treat_combs = W.shape[1]
Y1_out = np.zeros((n, treat_combs))
Y2_out = np.zeros((n, treat_combs))
Y3_out = np.zeros((n, treat_combs))
z1, z2, z3 = W
for i in range(n):
    Y1_out[i] = 2*np.sum(X[i]) + z1 + z2 + z3 + np.random.normal(0, 1)
    Y2_out[i] = 2*np.sum(X[i]) + np.sum(X[i])*(z1 + z2 + z3) + np.random.normal(0, 1)
    Y3_out[i] = np.sin(X[i,0]) + np.cos(X[i,1]) + (np.minimum(1, X[i,0]) + X[i,1]) * z1 + np.sum(X[i]) * (z2 + z3) + np.random.normal(0, 1)

Following Dasgupta et al. [2015], all potential outcomes for a unit $i$ are comprised of a vector $Y_i$ of dimension $J$, with $J=2^K$ the number of possible values for $z$, the treatment assignment. Then $Y_i(z)$ denotes the potential outcome of the $i$ th unit when exposed to treatment $z$.

We then define an $N \times 2^K$ potential outcomes matrix $Y$ such that the $i$ th row is a $J$-vector.

We've encoded this relationship into the three separate potential outcomes matrices Y1_out, Y2_out, and Y3_out for each outcome model, such that each row Yk_out(i) is a vector of length 8, for the 8 possible combinations of 3 treatments.

We then define a contrast vector g. The purpose of the contrast vector is to create a difference between "one half of the potential outcomes with the other half of potential outcomes" i.e, each main effect then being the "difference of the averages of the potential outcomes when (a factor) is at its high level and at its low level".

In [136]:
# Define contrasts vector as all possible treatment assignments
g = np.array([[1, 1, 1, 1, -1, -1, -1, -1],  # Each row is a unique treatment combination from 3 binary factors, each column is a factor
             [1, 1, -1, -1, 1, 1, -1, -1],
             [1, -1, 1, -1, 1, -1, 1, -1]])

The main effect of each treatment $z_k$, $\tau_k$ is a comparison between the average potential outcomes of receiving treatment $z_k$ and the average potential outcomes of note receiving it, averaged over all treatment combinations of other factors, expressed as:

$\tau_k = \frac{1}{2^{K-1}} g_k^T E[Y]$. 

Where $E[Y] = (E[Y(z)])_{z\in Z}$.

For the purposes of this simulation, we don't attempt to derive the interaction effects $\tau_{k, k'}$ which measure the interaction between two factors $z_k$ and $z_k'$. We estimate 9 main treatment effects, 3 treatment effects for each of the three outcome models.

In [137]:
# Define column vector of expectations for each outcome model
EY1_out = np.mean(Y1_out, axis=0)
EY2_out = np.mean(Y2_out, axis=0)
EY3_out = np.mean(Y3_out, axis=0)

# Define three the main treatment effects tau_k for each outcome model through matrix multiplication
tau_Y1 = g @ EY1_out / 4  # Divide by 2^(K-1) = 2^2
tau_Y2 = g @ EY2_out / 4
tau_Y3 = g @ EY3_out / 4
tau = [tau_Y1, tau_Y2, tau_Y3]
print(tau_Y1)
print(tau_Y2)
print(tau_Y3)
tau

[1. 1. 1.]
[0.35408177 0.35408177 0.35408177]
[0.15492846 0.35408177 0.35408177]


[array([1., 1., 1.]),
 array([0.35408177, 0.35408177, 0.35408177]),
 array([0.15492846, 0.35408177, 0.35408177])]

Using OLS for implementing additive regression $\hat{\beta} = (X^T X)^-1 X^T Y$ on all three outcome models. Specifically, we yield the additive regression main effect estimates for $\tau_k, k=1, 2, 3$ by regressing our outcome variable $Y$, specified by outcome model, on the covariates $X$ and treatments $Z$ for $j=1,...,5$, $k=1, 2, 3$ and multiplying the coefficient for Z by 2.

In [138]:
# Implementing the additive regression model:
X_design = np.column_stack(((np.ones(n)), X, Z1, Z2, Z3))
reg1_res = []
for i, y in enumerate([Y1, Y2, Y3]):
    beta_hat = np.linalg.inv(X_design.T @ X_design) @ X_design.T @ y
    print(f"\nEstimated factorial effects (additive model) for Y{i+1}", 2*beta_hat[6:])
    reg1_res.append(2*beta_hat[6:9])


Estimated factorial effects (additive model) for Y1 [2.23157395 2.10632007 2.16867336]

Estimated factorial effects (additive model) for Y2 [0.20024674 0.64032887 0.36124899]

Estimated factorial effects (additive model) for Y3 [0.30381222 0.5059655  0.35592449]


In [139]:
print("Showing results for additive regression: \n")
for i in range(len(reg1_res)):
    print(f"Y{i+1} result: {reg1_res[i]}")
    print(f"Tau result: {tau[i]}")
    rmse = np.mean(np.power(reg1_res[i] - tau[i], 2))
    print(f"RMSE Y{i+1}: {rmse}\n")

Showing results for additive regression: 

Y1 result: [2.23157395 2.10632007 2.16867336]
Tau result: [1. 1. 1.]
RMSE Y1: 1.3688386293846

Y2 result: [0.20024674 0.64032887 0.36124899]
Tau result: [0.35408177 0.35408177 0.35408177]
RMSE Y2: 0.035217994627275144

Y3 result: [0.30381222 0.5059655  0.35592449]
Tau result: [0.15492846 0.35408177 0.35408177]
RMSE Y3: 0.015079479986604775



And now adding interaction terms between covariates and treatment assignments, although we only recover the main effects as non-negligible coefficients.

In [140]:
# Implementing the interaction regression model:
XZ_interaction_terms = np.hstack([(X[:, j:j+1] * Z1.reshape(-1, 1)) for j in range(5)] +
                              [(X[:, j:j+1] * Z2.reshape(-1, 1)) for j in range(5)] +
                              [(X[:, j:j+1] * Z3.reshape(-1, 1)) for j in range(5)])

X_design = np.column_stack(((np.ones(n)), X, Z1, Z2, Z3, XZ_interaction_terms))

# Displaying estimated coefficients
reg2_res = []
for i, y in enumerate([Y1, Y2, Y3]):
    beta_hat = np.linalg.inv(X_design.T @ X_design) @ X_design.T @ y
    print(f"\nEstimated factorial effects (interaction model) for Y{i+1}", 2*beta_hat[6:9])
    reg2_res.append(2*beta_hat[6:9])


Estimated factorial effects (interaction model) for Y1 [2.2122818  2.07178055 2.12701063]

Estimated factorial effects (interaction model) for Y2 [-0.08854273  0.3104213  -0.10827456]

Estimated factorial effects (interaction model) for Y3 [ 0.05709784  0.2341859  -0.02592347]


In [141]:
print("Showing results for interactive regression: \n")
for i in range(len(reg2_res)):
    print(f"Y{i+1} result: {reg2_res[i]}")
    print(f"Tau result: {tau[i]}")
    rmse = np.mean(np.power(reg2_res[i] - tau[i], 2))
    print(f"RMSE Y{i+1}: {rmse}\n")

Showing results for interactive regression: 

Y1 result: [2.2122818  2.07178055 2.12701063]
Tau result: [1. 1. 1.]
RMSE Y1: 1.2961645614888395

Y2 result: [-0.08854273  0.3104213  -0.10827456]
Tau result: [0.35408177 0.35408177 0.35408177]
RMSE Y2: 0.13719868382470526

Y3 result: [ 0.05709784  0.2341859  -0.02592347]
Tau result: [0.15492846 0.35408177 0.35408177]
RMSE Y3: 0.05611660928871698



Now using Gurobi to implement the proposed weighting method, but first we have to set up the balancing constraints. We simulate using both the additive balance constraints and the interaction balance constraints. We use basis functions $h_s(X) = X_s$, meaning basis functions as just the unmodified, corresponding covariate column to each $s\in S $; the raw value of the covariate. From this we define the following balancing constraints:

We've already defined a contrast vector $g_k=(g_{kz})_{z \in Z}$, a $2^K$ dimensional vector with half +1s and half -1s, indicating if a combination has $z_k = +1$ or $z_k = -1$. Given that we're only considering main effects we only devise a contrast vector accounting for all $2^3 = 8$ possible treatment combinations, with 3 binary factors.

For our balancing constraints, we specifically decompose $g_{Kz}$, the contrast coefficient of the expected potential outcome under treatment combination $z$, as $g_Kz = g_{Kz}^{+} - g_{Kz}^{-}$. $g_{Kz}^{+} = max(g_{Kz}, 0)$ and $g_{Kz}^{-} = max(-g_{Kz}, 0)$.


In [114]:
g_p = np.maximum(g, 0)  # Computes element wise maximum to decompose the contrast vector
g_m = np.maximum(-g, 0)

Finally, let $I(Z_i = z)$ be an indicator for whether an individual $i$ received treatment combination $z$.

We then let $A_{iK}^{\Omega} = \sum_{z\in Z} g_{Kz}^{\Omega} I(Z_i = z)$ denote whether an individual belongs to the positive or negative part of the contrast $g_K$, with $\Omega=+, -$.

In [10]:
# Define indicator vector -- since we only have the single contrast vector for main effects, have two A vectors
p = g.shape[0]
A_p = np.zeros((n, p))
A_m = np.zeros((n, p))
for i in range(n): # iterate through all individuals
    # obs = (Z1[i], Z2[i], Z3[i])
    ind = -1
    for j in range(g.shape[1]):
        if Z1[i] == g_p[0, j] and Z2[i] == g_p[1, j] and Z3[i] == g_p[2, j]:
            ind = j
            break
    if ind != -1:
        for k in range(g.shape[0]):
            if g_p[k, ind] == 1:
                A_p[i, k] = 1  # assign to indicate positive part of contrast vector
            else:
                A_m[i, k] = 1  # assign to indicate negative part of contrast vector

We use a basis function $q_{sJ}(X_i, Z_i) = h_s(X_i) \Pi_{j \in J} Z_{ij}, s = 1,..., S, J \in [K]_{K'}$ Because we only consider the main effects non-negligible / non-zero, then $K' = 1$. The specific covariate basis function is $h_s(X) = X_s, s=1, ..., 5$, meaning the basis function used is just each of the five covariates themselves.

In [11]:
# we can just define them outright
h = np.copy(X)