## Import

In [1]:
import numpy as np
import scipy
import matplotlib.pyplot as plt
import matplotlib.mlab   as mlab

from kte import kernel_two_sample_test_nonuniform
from xkte import kernel_two_sample_test_agnostic
from xkte import kernel_dr_two_sample_test_agnostic
from dr_kte import kernel_dr_nonuniform

from baselines import vanilla_dr_baseline_test
from baselines import BART_baseline_test
from baselines import CausalForest_baseline_test

from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from scipy.spatial.distance import cdist
from scipy.special import expit
from scipy.stats import bernoulli


from tqdm import tqdm

import seaborn as sns
import pandas as pd
import time


## Preprocess data

In [2]:
# Load the dataset; the first CSV column is used as the row index.
# NOTE(review): the file is spelled "idhp.csv" while the results folder used
# further down is 'data_ihdp/' — confirm the filename on disk.
df = pd.read_csv("idhp.csv", index_col=0)
# Preview the first rows.
df.head()

In [4]:
# Show the available column names together with the (rows, columns) shape.
df.columns, df.shape

(Index(['iqsb.36', 'dose400', 'treat', 'bw', 'momage', 'nnhealth', 'birth.o',
        'parity', 'moreprem', 'cigs', 'alcohol', 'ppvt.imp', 'bwg', 'female',
        'mlt.birt', 'b.marry', 'livwho', 'language', 'whenpren', 'drugs',
        'othstudy', 'mom.lths', 'mom.hs', 'mom.coll', 'mom.scoll', 'site1',
        'site2', 'site3', 'site4', 'site5', 'site6', 'site7', 'site8',
        'momblack', 'momhisp', 'momwhite', 'workdur.imp', 'bwg.1', 'female.1',
        'mlt.birtF', 'b.marryF', 'livwhoF', 'languageF', 'whenprenF', 'drugs.1',
        'othstudy.1', 'momed4F', 'siteF', 'momraceF', 'workdur.imp.1'],
       dtype='object'),
 (985, 50))

In [5]:
# Columns that are standardised later in the preprocessing cell.
covs_cont = [
    "bw", "momage", "nnhealth", "birth.o", "parity",
    "moreprem", "cigs", "alcohol", "ppvt.imp",
]

# Full list of remaining covariates, kept as-is (no scaling).
# NOTE: the next cell overwrites `covs_cat` with a shorter list.
covs_cat = [
    "bwg", "female", "mlt.birt", "b.marry", "livwho",
    "language", "whenpren", "drugs", "othstudy",
    "mom.lths", "mom.hs", "mom.coll", "mom.scoll",
    "site1", "site2", "site3", "site4",
    "site5", "site6", "site7", "site8",
    "momblack", "momhisp", "momwhite", "workdur.imp",
]

# Outcome column followed by the treatment-indicator column.
ty = ['iqsb.36', 'treat']

In [6]:
# Covariate lists actually used downstream (this cell replaces any earlier
# definition of the same names).
covs_cont = [
    "bw", "momage", "nnhealth", "birth.o", "parity",
    "moreprem", "cigs", "alcohol", "ppvt.imp",
]
covs_cat = [
    "bwg", "female", "mlt.birt", "b.marry", "livwho",
    "language", "whenpren", "drugs", "othstudy",
]

In [7]:
# Sanity check: number of columns in each covariate list.
len(covs_cont), len(covs_cat)

(9, 9)

In [8]:
# Restrict to the selected covariates plus outcome/treatment and drop every row
# that has a missing value in any of those columns.
df1 = df.loc[:, covs_cont + covs_cat + ty].dropna()

# Covariate frame (copied so that df1 itself is left untouched by the scaling).
dX = df1.loc[:, covs_cont + covs_cat].copy().dropna()

# Standardise only the columns listed in covs_cont; covs_cat columns keep
# their original values.
scaler = StandardScaler()
dX[covs_cont] = scaler.fit_transform(dX[covs_cont])
X_original = np.array(dX[covs_cont + covs_cat])

# Treatment indicator and outcome; the outcome vector is rescaled to unit
# Euclidean norm.
T_original = np.array(df1['treat'])
Y_original = np.array(df1['iqsb.36'])
Y_original = Y_original / np.linalg.norm(Y_original)

In [10]:
# Number of retained rows and the sum of the treatment column.
len(T_original), T_original.sum()

(908, 347)

## Define experiments

In [9]:
# Scale of the additive Gaussian noise in the simulated outcomes. It multiplies
# standard-normal draws in the experiment cell, so despite the name it acts as
# a standard deviation rather than a variance.
noise_var = 0.5

# One unit coefficient per covariate column: the simulated outcome is the sum
# of the covariates plus noise.
beta_vec = np.ones(len(dX.columns))

In [11]:
# Scenarios I-V: for every (scenario, method) pair, repeat the test on random
# subsamples and store run time, p-value and test statistic in one CSV file.
name_folder = 'data_ihdp/'
size_subset = 500        # rows drawn without replacement in each repetition
num_experiments = 100    # repetitions per (scenario, method) pair
iterations = 100         # forwarded to the KTE test and to the baseline constructors

b_list = ['I', 'II', 'III', 'IV', 'V']
# Every entry must match a dispatch branch below exactly. Other available
# names: 'KTE', 'Vanilla_DR', 'BART', 'CausalForest'.
method_list = ['DR-xKTE', 'IPW-xKTE']
# Appended to the method name in the output filename only, so result files are
# still written as e.g. 'bIDR-xKTENEW.csv'. Keeping the suffix out of the
# method name is what lets the dispatch below find the method.
file_suffix = 'NEW'
experiment = False       # True: constant propensity 0.5; False: fitted by logistic regression

# Fail fast on a misspelled method instead of writing zero-filled result files.
known_methods = ('DR-xKTE', 'IPW-xKTE', 'KTE', 'Vanilla_DR', 'BART', 'CausalForest')
unknown_methods = [m for m in method_list if m not in known_methods]
if unknown_methods:
    raise ValueError('Method not recognized: ' + ', '.join(unknown_methods))

np.random.seed(0)

for b in b_list:
    for method in method_list:
        p_values = np.zeros(num_experiments)
        values = np.zeros(num_experiments)
        times = np.zeros(num_experiments)
        for n in tqdm(range(num_experiments)):

            # Random subsample of the preprocessed data.
            idx = np.random.choice(np.arange(dX.shape[0]), size=size_subset, replace=False)
            X = X_original[idx, :].copy()
            T = T_original[idx].copy()

            # Propensity scores handed to the tests.
            if experiment:
                w = np.full(X.shape[0], 0.5)
            else:
                w = LogisticRegression(C=1e6, max_iter=1000).fit(X, T).predict_proba(X)[:, 1]

            # Simulated outcome: linear in the covariates plus Gaussian noise.
            # It is drawn for scenario V as well (and then replaced) so that the
            # random stream is consumed identically in every scenario.
            Y = np.dot(beta_vec, X.T) + noise_var*np.random.randn(X.shape[0])

            treated = T == 1
            n_treated = int(treated.sum())

            if b == 'I':
                # No shift for the treated units.
                b1 = 0
                Y[treated] += b1
            elif b == 'II':
                # Constant shift for the treated units.
                b1 = 1
                Y[treated] += b1
            elif b == 'III':
                # Shift of +beta or -beta, each with probability 0.5.
                Z = bernoulli.rvs(0.5, size=n_treated)
                beta = 2.
                b1 = (2*Z - 1)*beta
                Y[treated] += b1
            elif b == 'IV':
                # Shift drawn uniformly from [-beta, beta).
                beta = 4
                b1 = np.random.uniform(-beta, beta, n_treated)
                Y[treated] += b1
            elif b == 'V':
                # Use the observed (normalised) outcomes instead of simulated ones.
                Y = Y_original[idx]
            else:
                print('b not recognized! Setting b1 = 0.')
                b1 = 0

            # Column-vector views expected by the test functions.
            YY0 = Y[T == 0][:, np.newaxis]
            YY1 = Y[treated][:, np.newaxis]
            Y = Y[:, np.newaxis]
            T_col = T[:, np.newaxis]   # the baselines are given T as a column

            # Gaussian RBF kernel: bandwidth from the median distance between
            # control and treated outcomes.
            sigma2 = np.median(pairwise_distances(YY0, YY1, metric='euclidean'))**2
            gamma = 1.0/sigma2

            t0 = time.time()
            if method == 'DR-xKTE':
                value, p_value = kernel_dr_two_sample_test_agnostic(Y, X, T, w,
                                                                    kernel_function='rbf',
                                                                    gamma=gamma,
                                                                    verbose=False)
            elif method == 'IPW-xKTE':
                value, p_value = kernel_two_sample_test_agnostic(Y, T, w,
                                                                 kernel_function='rbf',
                                                                 gamma=gamma,
                                                                 verbose=False)
            elif method == 'KTE':
                mmd2u_rbf, mmd2u_null_rbf, p_value = kernel_two_sample_test_nonuniform(YY0, YY1, T, w,
                                                                    kernel_function='rbf',
                                                                    gamma=gamma,
                                                                    verbose=False,
                                                                    iterations=iterations)
                # NOTE(review): no statistic has ever been stored for KTE, so
                # 'stat_values' stays 0. mmd2u_rbf is presumably the statistic —
                # confirm before recording it.
                value = 0.0
            elif method == 'Vanilla_DR':
                vanilla_dr = vanilla_dr_baseline_test(X, T_col, Y, iterations)
                p_value, value = vanilla_dr.permutation_test()
            elif method == 'BART':
                bart = BART_baseline_test(X, T_col, Y, iterations)
                p_value, value = bart.permutation_test()
            elif method == 'CausalForest':
                causal_forest = CausalForest_baseline_test(X, T_col, Y, iterations)
                p_value, value = causal_forest.permutation_test()
            else:
                # Unreachable after the check above; kept as a safety net.
                raise ValueError('Method not recognized.')
            times[n] = time.time() - t0
            p_values[n] = p_value
            values[n] = value

            # The file is rewritten after every repetition, so an interrupted
            # run leaves the results obtained so far on disk (rows not yet
            # computed are 0).
            res = pd.DataFrame({'times': times,
                                'p_values': p_values,
                                'stat_values': values})
            res.to_csv(name_folder + 'b' + b + method + file_suffix + '.csv')



100%|██████████| 100/100 [00:03<00:00, 30.26it/s]
100%|██████████| 100/100 [00:02<00:00, 45.31it/s]
100%|██████████| 100/100 [00:03<00:00, 29.78it/s]
100%|██████████| 100/100 [00:02<00:00, 39.07it/s]
100%|██████████| 100/100 [00:03<00:00, 26.27it/s]
100%|██████████| 100/100 [00:02<00:00, 36.95it/s]
100%|██████████| 100/100 [00:03<00:00, 26.03it/s]
100%|██████████| 100/100 [00:02<00:00, 37.21it/s]
100%|██████████| 100/100 [00:03<00:00, 26.74it/s]
100%|██████████| 100/100 [00:02<00:00, 38.91it/s]


## Define Scenario VI, where the ATE is subtracted from the treated outcomes.

In [12]:
# Fix the global NumPy seed before building the baseline object.
np.random.seed(0)

# Build the causal-forest baseline on the full (non-subsampled) data and read
# its `ref_stat` attribute, which this notebook uses as the ATE estimate.
# NOTE(review): `ref_stat` is presumably the statistic computed on the observed
# data — confirm in baselines.py. The experiment loops pass the treatment as a
# column (T[:, np.newaxis]) and `iterations` as the fourth argument; here
# T_original is 1-D and the fourth argument is the literal 100.
causal_forest = CausalForest_baseline_test(X_original, T_original, Y_original[:,np.newaxis], 100)
ate = causal_forest.ref_stat
# Display the estimate.
ate

0.003471902998471982

In [15]:
# Scenario VI: observed outcomes with the estimated ATE subtracted from the
# treated units. For every method, repeat the test on random subsamples and
# store run time, p-value and test statistic in one CSV file.
name_folder = 'data_ihdp/'
size_subset = 500        # rows drawn without replacement in each repetition
num_experiments = 100    # repetitions per method
iterations = 100         # forwarded to the KTE test and to the baseline constructors

# Every entry must match a dispatch branch below exactly ('KTE' is also available).
method_list = ['DR-xKTE', 'IPW-xKTE', 'Vanilla_DR', 'BART', 'CausalForest']
experiment = False       # True: constant propensity 0.5; False: fitted by logistic regression

# Fail fast on a misspelled method instead of writing zero-filled result files.
known_methods = ('DR-xKTE', 'IPW-xKTE', 'KTE', 'Vanilla_DR', 'BART', 'CausalForest')
unknown_methods = [m for m in method_list if m not in known_methods]
if unknown_methods:
    raise ValueError('Method not recognized: ' + ', '.join(unknown_methods))

np.random.seed(0)

# ATE estimate taken from the causal-forest baseline built on the full data.
# It is recomputed here, after seeding, and deliberately left in place: removing
# it could change the random stream seen by the loop below.
causal_forest = CausalForest_baseline_test(X_original, T_original, Y_original[:,np.newaxis], 100)
ate = causal_forest.ref_stat

for method in method_list:
    p_values = np.zeros(num_experiments)
    values = np.zeros(num_experiments)
    times = np.zeros(num_experiments)
    for n in tqdm(range(num_experiments)):

        # Random subsample of the preprocessed data.
        idx = np.random.choice(np.arange(dX.shape[0]), size=size_subset, replace=False)
        X = X_original[idx, :].copy()
        T = T_original[idx].copy()

        # Propensity scores handed to the tests.
        if experiment:
            w = np.full(X.shape[0], 0.5)
        else:
            w = LogisticRegression(C=1e6, max_iter=1000).fit(X, T).predict_proba(X)[:, 1]

        # Fancy indexing returns a copy, so the in-place shift below does not
        # modify Y_original.
        Y = Y_original[idx]
        treated = T == 1
        Y[treated] -= ate

        # Column-vector views expected by the test functions.
        YY0 = Y[T == 0][:, np.newaxis]
        YY1 = Y[treated][:, np.newaxis]
        Y = Y[:, np.newaxis]
        T_col = T[:, np.newaxis]   # the baselines are given T as a column

        # Gaussian RBF kernel: bandwidth from the median distance between
        # control and treated outcomes.
        sigma2 = np.median(pairwise_distances(YY0, YY1, metric='euclidean'))**2
        gamma = 1.0/sigma2

        t0 = time.time()
        if method == 'DR-xKTE':
            value, p_value = kernel_dr_two_sample_test_agnostic(Y, X, T, w,
                                                                kernel_function='rbf',
                                                                gamma=gamma,
                                                                verbose=False)
        elif method == 'IPW-xKTE':
            value, p_value = kernel_two_sample_test_agnostic(Y, T, w,
                                                             kernel_function='rbf',
                                                             gamma=gamma,
                                                             verbose=False)
        elif method == 'KTE':
            mmd2u_rbf, mmd2u_null_rbf, p_value = kernel_two_sample_test_nonuniform(YY0, YY1, T, w,
                                                                kernel_function='rbf',
                                                                gamma=gamma,
                                                                verbose=False,
                                                                iterations=iterations)
            # NOTE(review): no statistic has ever been stored for KTE, so
            # 'stat_values' stays 0. mmd2u_rbf is presumably the statistic —
            # confirm before recording it.
            value = 0.0
        elif method == 'Vanilla_DR':
            vanilla_dr = vanilla_dr_baseline_test(X, T_col, Y, iterations)
            p_value, value = vanilla_dr.permutation_test()
        elif method == 'BART':
            bart = BART_baseline_test(X, T_col, Y, iterations)
            p_value, value = bart.permutation_test()
        elif method == 'CausalForest':
            causal_forest = CausalForest_baseline_test(X, T_col, Y, iterations)
            p_value, value = causal_forest.permutation_test()
        else:
            # Unreachable after the check above; kept as a safety net.
            raise ValueError('Method not recognized.')
        times[n] = time.time() - t0
        p_values[n] = p_value
        values[n] = value

        # The file is rewritten after every repetition, so an interrupted run
        # leaves the results obtained so far on disk (rows not yet computed
        # are 0).
        res = pd.DataFrame({'times': times,
                            'p_values': p_values,
                            'stat_values': values})
        res.to_csv(name_folder + 'b' + 'VI' + method + '.csv')

100%|██████████| 100/100 [00:02<00:00, 34.67it/s]
100%|██████████| 100/100 [00:01<00:00, 52.12it/s]
