In [1]:
from datasets.whi import DataModuleWHI
from scipy.stats import bootstrap
import numpy as np
from ATE.ate_bounds import BootstrapSensitivityAnalysis

from ATE.methods.QB import QBSensitivityAnalysis
from ATE.methods.ZSB import ZSBSensitivityAnalysis
from test import  run_multiple_ate_hypothesis_test
from model import Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [2]:
df_ctos = pd.read_csv('datasets/whi_processed/ctos_table.csv')
df_ctos['TOTPTIME'].fillna(0, inplace=True)


We look at the WHI dataset, selecting patients that started treatment at most 2 years ago. This is to make sure that transportability of CATE holds. 

In [3]:
whi = DataModuleWHI(root='datasets/')
whi.process_whi()
df_rct, df_obs, df_merged = whi.get_datasets()
df_obs = df_obs[0]
df_obs = df_merged[(df_ctos['OS']==1) & (df_ctos['TOTPTIME']<=20)]
df_merged = df_merged[ (df_ctos['OS']==0) | (df_ctos['TOTPTIME'] <=20)]

We first look at coronary heart disease, here from previous studies we expect small amount of confounding. We choose as covariates the standard ones from the epidemiology studies on WHI.

In [4]:
outcome = 'CHD_E'
#covariates = ['AGE', 'ETHNIC_Black or African-American', 'ETHNIC_Hispanic/Latino','ETHNIC_White (not of Hispanic origin)', 'BMI',
#'EDUC_x_College graduate or Baccalaureate Degree', 'EDUC_x_Didn\'t go to school', 'EDUC_x_Doctoral Degree (Ph.D,M.D.,J.D.,etc.)', 'EDUC_x_Grade school (1-4 years)',
#'EDUC_x_Grade school (5-8 years)','EDUC_x_Some high school (9-11 years)', 
#'EDUC_x_Some post-graduate or professional', 'EDUC_x_Vocational or training school', 'SMOKING_Never Smoked','SMOKING_Past Smoker','SMOKING_Current Smoker',
#'ETHNIC_American Indian or Alaskan Native','ETHNIC_Asian or Pacific Islander']

covariates = ['AGE','ETHNIC_White (not of Hispanic origin)', 
'BMI','SMOKING_Past Smoker','SMOKING_Current Smoker','EDUC_x_College graduate or Baccalaureate Degree',
'EDUC_x_Some post-graduate or professional', 'MENO', 'PHYSFUN']


others = [outcome,'HRTARM']
df_rct = df_rct[covariates + others]
df_obs = df_obs[covariates + others]
df_merged_covariates = df_merged[covariates]
df_merged = df_merged[covariates + others + ['OS']]

RCT suggest increased risk of CHD after taking HRT, in agreement with the epid. studies.

In [5]:
y1 = df_rct[df_rct['HRTARM'] == 1][outcome].mean()
y0 = df_rct[df_rct['HRTARM'] == 0][outcome].mean()
print(f"(RCT) E[Y(1)]: {y1}, E[Y(0)]: {y0}, ATE: {y1-y0}")

(RCT) E[Y(1)]: 0.019868328238890197, E[Y(0)]: 0.015921994569242162, ATE: 0.003946333669648035


In [6]:
y_rct = df_rct[outcome].to_numpy()
y_obs = df_obs[outcome].to_numpy()
x_rct = df_rct.iloc[:,:-2].to_numpy()
x_obs = df_obs.iloc[:,:-2].to_numpy()
t_obs = df_obs['HRTARM'].to_numpy()
t_rct= df_rct['HRTARM'].to_numpy()

In [7]:
clf = LogisticRegression()
clf.fit(x_obs, t_obs)
e_x=clf.predict_proba(x_obs)[:, 1] 

w1 = 1/e_x
w0 = 1/(1-e_x)
y1 = (y_obs*t_obs*w1).sum()/(t_obs*w1).sum()
y0 = (y_obs*(1-t_obs)*w0).sum()/((1-t_obs)*w0).sum()
print(f"(OBS) E[Y|T=1]: {y1},  E[Y|T=0]: {y0}, ATE: {y1-y0}")

(OBS) E[Y|T=1]: 0.0370562780805697,  E[Y|T=0]: 0.04455145931501153, ATE: -0.007495181234441831


We know proceed with the ATE test.

In [8]:
x = df_merged_covariates.to_numpy()
s = df_merged['OS'].to_numpy()

alpha_trim= 0.001
clf_pi =  RandomForestClassifier(max_depth=15, random_state=0)
#clf_pi = LogisticRegression()
clf_pi.fit(x, s)
pi_s = 1-clf_pi.predict_proba(x)[:,1]
O_idx =  np.logical_and(pi_s > alpha_trim, pi_s < 1-alpha_trim)
df_overlap = df_merged[O_idx]
df_overlap_obs = df_overlap[df_overlap['OS']==1] 
df_overlap_rct = df_overlap[df_overlap['OS']==0] 


In [9]:
y_rct = df_overlap_rct[outcome].to_numpy()
y_obs = df_overlap_obs[outcome].to_numpy()

x_rct = df_overlap_rct.iloc[:,:-3].to_numpy()
x_obs = df_overlap_obs.iloc[:,:-3].to_numpy()
t_obs = df_overlap_obs['HRTARM'].to_numpy()
t_rct= df_overlap_rct['HRTARM'].to_numpy()


In [10]:
mask = np.logical_and(O_idx, 1-s)  # \pi_S over RCT and \OO
rct_to_obs_ratio = (s[O_idx].sum() / (s[O_idx].size - s[O_idx].sum()))**-1
ys = 2 * (y_rct * t_rct - y_rct * (1 - t_rct)) * (1 - pi_s[mask]) / pi_s[mask]
bootstrap_rct = bootstrap((ys,), np.mean, n_resamples=500, axis=0)
std_rct = bootstrap_rct.standard_error
var_rct = np.power(std_rct, 2) * (rct_to_obs_ratio**2)
mean_rct = rct_to_obs_ratio * ys.mean()
print(f'mean_rct:{mean_rct}')

mean_rct:0.0029141646715864235


In [11]:
clf = LogisticRegression( )
clf.fit(x_obs, t_obs)
e_x=clf.predict_proba(x_obs)[:, 1] 

w1 = 1/e_x
w0 = 1/(1-e_x)
y1 = (y_obs*t_obs*w1).sum()/(t_obs*w1).sum()
y0 = (y_obs*(1-t_obs)*w0).sum()/((1-t_obs)*w0).sum()
print(f"(OBS) E[Y|T=1]: {y1},  E[Y|T=0]: {y0}, ATE: {y1-y0}")

(OBS) E[Y|T=1]: 0.0370562780805697,  E[Y|T=0]: 0.04455145931501153, ATE: -0.007495181234441831


In [24]:
from CATE.utils_cate_test import compute_bootstrap_variance
from CATE.cate_bounds import  MultipleCATEBoundEstimators
from test import run_multiple_cate_hypothesis_test, run_multiple_ate_hypothesis_test, construct_cate_test_statistic

ate = y_rct[t_rct == 1].mean() - y_rct[t_rct == 0].mean()
ate_variance = compute_bootstrap_variance(y_rct, t_rct, 100, arm=None)

bounds_estimator = MultipleCATEBoundEstimators(gammas=[1.3], n_bootstrap=30, binary=True, mu=LogisticRegression())

bounds_estimator.fit(x_obs, t_obs, y_obs, sample_weight=False)
dictionary_bounds_estimators = bounds_estimator.dict_bound_estimators


All CATE estimators are now instantiated.
All CATE bounds estimators are now trained. Elapsed time: 3.42 seconds


In [29]:
ate_lb, ate_ub = dictionary_bounds_estimators[str(1.3)].compute_ate_bounds(x_obs)
#var_lb, var_ub, quantile_lb, quantile_ub = dictionary_bounds_estimators[str(1.2)].estimate_bootstrap_variances(x_obs)



KeyError: '1.3'

In [14]:
(ate_ub-mean_rct)/(np.sqrt(var_rct+var_ub))

-2.2364050129648962

In [28]:
bootstrap_sa = BootstrapSensitivityAnalysis("QB", x_obs, t_obs, y_obs, [1.3], e_x_func=None, binary=True)
bounds_dist = bootstrap_sa.bootstrap(num_samples=50)

Outcome functions are now trained for QB. Starting bootstrap.
Elapsed time for 50 bootstrap samples: 9.38 seconds


In [27]:
ate_ub

0.0068641361046064085

In [23]:
np.mean(bounds_dist['1.4'][1])

0.005487749541212863

In [46]:
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
mu1 = LogisticRegression(
                    
                )
mu0 =  LogisticRegression(
                   
                )
mu1.fit(x_obs[t_obs==1], y_obs[t_obs==1])
mu0.fit(x_obs[t_obs==0], y_obs[t_obs==0])


In [47]:
(mu1.predict_proba(x_obs)[:,1] - mu0.predict_proba(x_obs)[:,1] ).mean()

-0.006183868873960963