In [None]:
import numpy as np
import sklearn
import sklearn.linear_model
import sklearn.neighbors
from statsmodels.api import GLM,families

from src.synthetic_data import ex4_sample
from src.superlearner import SuperLearner

In [None]:
# Experiment configuration.
seed = 1
np.random.seed(seed)  # seed NumPy's global RNG before any data is drawn
n = 10000  # number of observations; also used for the standard error in Step 5
d = 5      # second argument to ex4_sample -- presumably the covariate dimension
k = 5      # first argument to SuperLearner -- presumably the number of CV folds
data = ex4_sample(n,d)


def _candidate_learners():
    """Return a new list of unfitted candidate classifiers.

    Each call builds fresh estimator instances, so the outcome model and the
    treatment model never share a learner object.
    """
    return [
        sklearn.linear_model.LogisticRegression(),
        sklearn.neighbors.KNeighborsClassifier(),
    ]


# Same candidate library for both nuisance models, but separate instances.
outcome_learners = _candidate_learners()
pt_learners = _candidate_learners()

In [None]:
# Step 1. Estimate Q0
# Unpack treatment, outcome and covariates from the sampled data.
trt, y, x = data['t'], data['y'], data['x']


def _with_treatment(treatment_column):
    """Append a treatment column to the covariates to form the outcome-model inputs."""
    return np.column_stack((x, treatment_column))


# Outcome model is trained on covariates plus the observed treatment.
x_outcome = _with_treatment(trt)
outcome_model = SuperLearner(k, seed, outcome_learners)
outcome_model.train_binary(x_outcome, y)

# Predictions with treatment forced to 0, forced to 1, and as observed.
Q00 = outcome_model.predict_binary(_with_treatment(np.zeros_like(trt)))
Q01 = outcome_model.predict_binary(_with_treatment(np.ones_like(trt)))
Q0 = outcome_model.predict_binary(x_outcome)

In [None]:
# Step 2. Estimate g0
# Treatment model: trained on the covariates alone, with treatment as the target.
treatment_model = SuperLearner(k, seed, pt_learners)
treatment_model.train_binary(x, trt)

# g0 holds the treatment model's predictions for the observed covariates;
# later steps use it as the probability of receiving treatment.
g0 = treatment_model.predict_binary(x)

In [None]:
# Step 3. Calculate clever covariate and epsilon
# Keep probabilities strictly inside (0, 1). If a prediction reaches exactly
# 0 or 1 (possible, e.g., with the k-NN candidate), the inverse-probability
# weights divide by zero and the logit offset becomes infinite, which breaks
# the GLM fit. This is a numerical guard only, not a positivity truncation.
PROB_TOL = 1e-6
g0_bounded = np.clip(g0, PROB_TOL, 1 - PROB_TOL)
Q0_bounded = np.clip(Q0, PROB_TOL, 1 - PROB_TOL)

# Clever covariate at the observed treatment (HA) and at each fixed
# treatment level (H1: treated, H0: control).
HA = trt/g0_bounded - (1-trt)/(1-g0_bounded)
H1 = 1/g0_bounded
H0 = -1/(1-g0_bounded)

# Fluctuation model: binomial GLM of y on HA alone (no constant is added),
# with the initial prediction entering as a fixed offset on the logit scale.
logit_Q0 = np.log(Q0_bounded/(1-Q0_bounded))
model = GLM(y, HA, family=families.Binomial(), offset=logit_Q0).fit()

# np.asarray makes the positional lookup valid whether params comes back as
# a plain array or as a labelled Series.
eps = float(np.asarray(model.params)[0])
print(eps)


In [None]:
# Step 4. Update outcome
# Bound the initial predictions away from 0 and 1 before taking the logit;
# an exact 0 or 1 would give an infinite logit that the epsilon shift could
# never move, and would raise divide-by-zero / overflow warnings.
PROB_TOL = 1e-6
Q00_bounded = np.clip(Q00, PROB_TOL, 1 - PROB_TOL)
Q01_bounded = np.clip(Q01, PROB_TOL, 1 - PROB_TOL)

# Shift each counterfactual prediction on the logit scale by eps times the
# matching clever covariate.
logit_Q00 = np.log(Q00_bounded/(1-Q00_bounded))
logit_Q01 = np.log(Q01_bounded/(1-Q01_bounded))
logit_Q10 = logit_Q00 + eps*H0
logit_Q11 = logit_Q01 + eps*H1

# Map back to the probability scale (inverse logit).
Q10 = 1/(1+np.exp(-logit_Q10))
Q11 = 1/(1+np.exp(-logit_Q11))

# Updated prediction at the treatment each unit actually received.
Q1A = trt*Q11 + (1-trt)*Q10

In [None]:
# Step 5. Calculate ATE using the updated outcomes
# Point estimate: difference between the mean updated prediction under
# treatment and under control.
ATE = np.mean(Q11) - np.mean(Q10)

# Influence curve: weighted residual at the observed treatment plus the
# centred difference of the counterfactual predictions.
weighted_residual = HA*(y - Q1A)
IC = weighted_residual + Q11 - Q10 - ATE

# Variance of the influence curve (divisor n), then the standard error of the ATE.
IC_centered = IC - np.mean(IC)
ATE_var = np.mean(IC_centered**2)
ATE_sd = np.sqrt(ATE_var/n)

# 1.96 is the normal quantile used for the 95% interval.
print(f"ATE (95% CI)= {ATE:.4f} ({ATE-1.96*ATE_sd:.4f},{ATE+1.96*ATE_sd:.4f})")