# Proximal Causal Inference Implementation
The following code recovers the average causal effect (ACE) of a treatment A on an outcome Y in the proximal causal inference setup, where W and Z are proxies of U, by fitting two regressions in sequence. Here, U, W, and Z are binary variables, so the first model (for W) is a logistic regression. Y, on the other hand, is a continuous variable, so the second model (for Y) is an ordinary Gaussian linear regression.

In [24]:
import pandas as pd
import numpy as np
from scipy.special import expit
import statsmodels.api as sm

# Fix the global NumPy seed so every draw below is reproducible.
np.random.seed(0)
size = 10000
verbose = True

# Binary variable U; the proxies W and Z, the treatment A and the outcome Y
# all depend on it.
# (continuous alternative: U = np.random.normal(0, 1, size))
U = np.random.binomial(n=1, p=0.48, size=size)

# Binary proxy W, drawn with success probability expit(1.3*U).
# (continuous alternative: W = np.random.normal(0, 1, size) + 1.3*U)
W = np.random.binomial(n=1, p=expit(1.3*U), size=size)

# Binary proxy Z, drawn with success probability expit(2*U).
# (continuous alternative: Z = np.random.normal(0, 1, size) + 2*U)
Z = np.random.binomial(n=1, p=expit(2*U), size=size)

# Binary treatment A, drawn with success probability expit(0.8*U).
A = np.random.binomial(n=1, p=expit(0.8*U), size=size)
if verbose:
    # share of treated units in the sample
    print(A.mean())

# Continuous outcome: standard normal noise plus 1.3*A plus 1.4*U.
Y = np.random.normal(loc=0, scale=1, size=size) + 1.3*A + 1.4*U

data = pd.DataFrame(dict(U=U, W=W, A=A, Y=Y, Z=Z))

0.58924


Code that estimates the average causal effect in the linear case.

In [20]:
def proximal_find_ace(A, Y, W, Z, data):
    """
    Estimate the average causal effect (ACE) of A on Y with two chained regressions.

    Stage 1 fits a binomial GLM ``W ~ A + Z`` and computes its predictions for
    every row of ``data``. Stage 2 fits a Gaussian GLM ``Y ~ A + What``, where
    ``What`` holds those stage-1 predictions. The coefficient on A in the
    stage-2 model is returned as the ACE estimate.

    Parameters
    ----------
    A, Y, W, Z : str
        Names of the treatment, outcome and two proxy columns in ``data``;
        they are spliced directly into the regression formulas.
    data : pandas.DataFrame
        Data set containing those columns. It is not modified.

    Returns
    -------
    The second entry of the stage-2 parameter vector, i.e. the coefficient
    that directly follows the intercept (the one for A).
    """
    # fit a model W~A+Z
    stage1_formula = f"{W}~{A}+{Z}"
    stage1 = sm.GLM.from_formula(formula=stage1_formula, data=data, family=sm.families.Binomial()).fit()

    # attach the predictions for W to a copy, so the caller's frame does not
    # silently gain a "What" column
    augmented = data.assign(What=stage1.predict(data))

    # fit a model Y~A+What
    stage2_formula = f"{Y}~{A}+What"
    stage2 = sm.GLM.from_formula(formula=stage2_formula, data=augmented, family=sm.families.Gaussian()).fit()

    # the ACE is the coefficient for A in the second model; params is a
    # label-indexed Series, so the positional lookup has to go through .iloc
    # (a bare integer key is deprecated positional fallback in pandas)
    return stage2.params.iloc[1]

# Point estimate of the ACE on the simulated binary-proxy data set.
print(proximal_find_ace(A="A", Y="Y", W="W", Z="Z", data=data))

1.3468085291324319


In [21]:
def compute_confidence_intervals(A, Y, W, Z, data, num_bootstraps=200, alpha=0.05):
    """
    Bootstrap a percentile confidence interval for the proximal ACE estimate.

    The data set is resampled with replacement ``num_bootstraps`` times, each
    resample being as large as ``data`` itself, and ``proximal_find_ace`` is
    re-run on every resample. The interval is read off the empirical
    ``alpha/2`` and ``1 - alpha/2`` quantiles of those estimates.

    Returns tuple (q_low, q_up) for the lower and upper quantiles of the confidence interval.
    """
    n_rows = len(data)

    # one ACE estimate per bootstrap replicate; each resample gets a fresh
    # 0..n-1 index before it is handed to the estimator
    estimates = [
        proximal_find_ace(
            A, Y, W, Z,
            data.sample(n_rows, replace=True).reset_index(drop=True),
        )
        for _ in range(num_bootstraps)
    ]

    # percentile interval: cut alpha/2 of the mass off each tail
    lower_level, upper_level = alpha/2, 1 - alpha/2
    q_low, q_up = np.quantile(estimates, q=[lower_level, upper_level])

    return (q_low, q_up)

In [22]:
def generate_data(num_samples=None):
    """
    Simulate one data set with a binary variable U, four covariates, a treatment and an outcome.

    U is Bernoulli(0.48). X1 and X3 are U scaled (by 2 and 1.3) plus standard
    normal noise; X2 and X4 are non-linear functions of X1 and X3 respectively,
    plus U and noise. A is a binary treatment drawn with probability
    expit(0.8*U), and Y is 1.3*A + 1.4*U plus standard normal noise.

    Parameters
    ----------
    num_samples : int, optional
        Number of rows to draw. When omitted, the notebook-level ``size`` is
        used, which is what the original zero-argument call did.

    Returns
    -------
    pandas.DataFrame
        Columns "U", "X1", "X2", "X3", "X4", "A", "Y", one row per sample.
        Draws come from NumPy's global random state.
    """
    # look up the global only when no explicit sample count is given
    n = size if num_samples is None else num_samples

    U = np.random.binomial(1, 0.48, n)

    X1 = np.random.normal(0, 1, n) + 2*U

    # make sure that X2 is some non-linear function
    X2 = np.random.normal(0, 1, n) + np.exp(X1) + U

    X3 = np.random.normal(0, 1, n) + 1.3*U

    # make sure that X4 is some non-linear function
    X4 = np.random.normal(0, 1, n) + X3**2 + 0.5*X3**3 + U

    A = np.random.binomial(1, expit(0.8*U), n)

    Y = np.random.normal(0, 1, n) + 1.3*A + 1.4*U

    data = pd.DataFrame({"U": U, "X1": X1, "X2": X2, "X3": X3, "X4": X4, "A": A, "Y": Y})

    return data

### Synthetic Procedure Setup
We first generate a dataset so that we can train two different models that each predict U from a different set of covariates (X1, X2 for one model and X3, X4 for the other); their predictions play the roles of the proxies Z and W. We then generate a second dataset according to the same DGP and use the trained models to make predictions for Z and W on that dataset. Using those predictions for Z and W, we use proximal causal inference to recover the ACE in the linear case.

In [25]:
# Training set: used only to fit the two models that produce the proxies.
data1 = generate_data()

# fit a model U~X1+X2+X1*X2, its predictions will be used as the proxy Z
# we add the interaction term to make sure that the model is non-linear
modelZ = sm.GLM.from_formula(formula="U~X1+X2+X1*X2", data=data1, family=sm.families.Binomial()).fit()

# fit a model U~X3+X4+X3*X4, its predictions will be used as the proxy W
modelW = sm.GLM.from_formula(formula="U~X3+X4+X3*X4", data=data1, family=sm.families.Binomial()).fit()

# Second, independent draw from the same DGP: the ACE is estimated on this one.
data2 = generate_data()

# make predictions for Z and W based off of our previously trained models on a different dataset
Z = modelZ.predict(data2)
W = modelW.predict(data2)

# Compare by position rather than by index label, so the check does not
# depend on how data2 happens to be indexed.
true_U = data2["U"].to_numpy()

# evaluate the accuracy of the predictions for U when using modelZ
# (a prediction counts as 1 when the predicted value exceeds 0.5)
Z_hits = (np.asarray(Z) > 0.5).astype(int) == true_U
print("Z accuracy: ", int(Z_hits.sum())/len(Z))

# evaluate the accuracy of the predictions for U when using modelW
W_hits = (np.asarray(W) > 0.5).astype(int) == true_U
print("W accuracy: ", int(W_hits.sum())/len(W))

# add the predictions into our dataframe
data2["Z"] = Z
data2["W"] = W

print(proximal_find_ace("A", "Y", "W", "Z", data2))
print(compute_confidence_intervals("A", "Y", "W", "Z", data2))

Z accuracy:  0.86074
W accuracy:  0.77368
1.2920989264717164
