In [1]:
import numpy as np
import matplotlib.pyplot as plt


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from bound_funcs import *
from utils import *
from dgp import *
from vset_construction import *


In [2]:
Dx, Du = 5, 0
nD = Dx+Du

dgp = {
    'N': 10000,
    'Dx': Dx,
    'Du': Du,
    'nz': 10,                # Number of finite pre-treatment values
    'beta_zy': 0,            # Z -> Y loading (=0 ==> exclusion restriction is satisfied)
    'e1_coeffs': 4*np.random.rand(nD) - 2,
    'z_coeffs':  4*np.random.rand(nD) - 2,
    'mu1_coeffs': 4*np.random.rand(nD) - 1,
    'mu0_coeffs': 4*np.random.rand(nD) - 2,
    'lambda': 2
}

msm_dgp = set_dgp_config(dgp, 'MSM')
data = generate_data(msm_dgp)


In [3]:


def plugin_nuisance_probs(in_dgp, out_dgp, in_data, out_data):
            
    # Train model via data from held-out folds
    XU, D, Y, Z, p_mu1, p_e1 = out_data['XU'], out_data['D'], out_data['Y'], out_data['Z'], out_data['p_mu_1'], out_data['p_e1']
    
    # We don't have access to confounders when computing bounds.
    mask = np.ones(XU.shape[1])
    mask[out_dgp['Dx']:] = 0
    X = XU[:,mask==1].copy()
    XZ = np.concatenate((X, Z.reshape(-1,1)), axis=1)

    mu_hat = LogisticRegression()
    e1_hat = LogisticRegression()

    mu_hat.fit(XZ[D==1], Y[D==1])
    e1_hat.fit(XZ, D)
    
    # Regress models on data from fold k
    XU_k, D_k, Y_k, Z_k, p_mu1_k, p_e1_k = in_data['XU'], in_data['D'], in_data['Y'], in_data['Z'], in_data['p_mu_1'], in_data['p_e1']
    
    # We don't have access to confounders when computing bounds.
    mask = np.ones(XU_k.shape[1])
    mask[in_dgp['Dx']:] = 0
    X_k = XU_k[:,mask==1].copy()
    XZ_k = np.concatenate((X_k, Z_k.reshape(-1,1)), axis=1)

    p_mu1_z = np.zeros((in_dgp['nz'], in_dgp['N']))
    p_e1_z = np.zeros((in_dgp['nz'], in_dgp['N']))

    for z in range(in_dgp['nz']):
        Zn = (z*np.ones_like(Z_k)).reshape(-1,1)
        Xz_k = np.concatenate((X_k, Zn), axis=1)
        p_mu1_z[z] = mu_hat.predict_proba(Xz_k)[:,1]
        p_e1_z[z] = e1_hat.predict_proba(Xz_k)[:,1]
        

    p_mu1_hat = mu_hat.predict_proba(XZ_k)[:,1]
    p_e1_hat = e1_hat.predict_proba(XZ_k)[:,1]
    
    print('outcome regression error:', (np.abs(p_mu1_hat - p_mu1_k)).mean())
    print('propensitiy error:', (np.abs(p_e1_hat - p_e1_k)).mean())

    return {
        'p_mu1': p_mu1_hat,
        'p_e1': p_e1_hat,
        'p_mu1_z': p_mu1_z,
        'p_e1_z': p_e1_z
    }
    

## Test learning outcome probabilities

In [25]:

id_methods = ['NA', 'MSM', 'IV']
est_methdods = ['oracle', 'plugin']


def estimate_bounds(dgp, data, id_method, est_method, K=5):
    '''This function will call the appropriate nuisance estimation function and identification approach'''
    
    if est_method == 'oracle':        
        probs = oracle_nuisance_probs(dgp, data)
        Vpf_down, Vpf_up = get_vset(dgp, data, probs, id_method)
        bdf = get_bounds(data, Vpf_down, Vpf_up, verbose=False)

    if est_method == 'plugin':
        bdf = sample_split_crossfit(dgp, data, id_method, est_method, K)
        
    bdf['id_method'] = id_method
    bdf['est_method'] = est_method
    
    return bdf
    
def sample_split_crossfit(dgp, data, id_method, est_method, K):
    
    in_folds, out_folds = k_fold_split_and_complement(data, K)

    for k in range(K):

        # Set-up datasets for fold K
        in_data, out_data = in_folds[k], out_folds[k]
        in_dgp, out_dgp = dgp.copy(), dgp.copy()
        in_dgp['N'] = in_data['XU'].shape[0]
        out_dgp['N'] = out_data['XU'].shape[0]

        # Learn models, then run inference via data from fold k
        in_probs = plugin_nuisance_probs(in_dgp, out_dgp, in_data, out_data)
        Vpf_down, Vpf_up = get_vset(in_dgp, in_data, in_probs, id_method)
        fold_bdfs.append(get_bounds(data, Vpf_down, Vpf_up, verbose=False)) 

    return average_numeric_dataframes(fold_bdfs)


In [21]:
Dx, Du = 5, 0
nD = Dx+Du

dgp = {
    'N': 200,
    'Dx': Dx,
    'Du': Du,
    'nz': 10,                # Number of finite pre-treatment values
    'beta_zy': 0,            # Z -> Y loading (=0 ==> exclusion restriction is satisfied)
    'e1_coeffs': 4*np.random.rand(nD) - 2,
    'z_coeffs':  4*np.random.rand(nD) - 2,
    'mu1_coeffs': 4*np.random.rand(nD) - 1,
    'mu0_coeffs': 4*np.random.rand(nD) - 2,
    'lambda': 2,
    'id_assumption': 'MSM'
}



In [23]:
msm_dgp = set_dgp_config(dgp, 'MSM')
data = generate_data(msm_dgp)
estimate_bounds(msm_dgp, data, id_method='MSM', est_method='oracle', K=5)

Unnamed: 0,Rs_down,Rs_up,Rd_down,Rd_up,R_oracle,metric,id_method,est_method
0,-0.528405,0.07884,-0.433492,-0.0653,-0.179487,m_y=1,MSM,oracle
1,-0.454066,0.183233,-0.313544,0.006644,-0.216867,m_y=0,MSM,oracle
2,-0.699734,0.492694,-0.298489,0.091449,0.029279,m_a=0,MSM,oracle
3,-0.226724,0.168863,-0.226724,0.168863,-0.008586,m_a=1,MSM,oracle
4,-0.454547,0.344547,-0.173676,0.063676,-0.015,m_u,MSM,oracle


In [24]:
estimate_bounds(msm_dgp, data, id_method='MSM', est_method='plugin', K=5)

outcome regression error: 0.09002014850944906
propensitiy error: 0.0566678861802201


NameError: name 'id_method' is not defined

In [18]:
iv_dgp = set_dgp_config(dgp, 'IV')
data = generate_data(iv_dgp)

estimate_bounds(iv_dgp, data, id_method='IV', est_method='oracle', K=5, crossfit=True)

Unnamed: 0,Rs_down,Rs_up,Rd_down,Rd_up,R_oracle,metric,id_method,est_method
0,-0.335271,-0.263991,-0.320232,-0.279389,-0.43617,m_y=1,IV,oracle
1,-0.364918,-0.297379,-0.351109,-0.311496,-0.481132,m_y=0,IV,oracle
2,-0.455268,-0.240426,-0.414482,-0.281211,0.021739,m_a=0,IV,oracle
3,0.110129,0.148487,0.110129,0.148487,0.032468,m_a=1,IV,oracle
4,-0.005033,0.075033,0.023109,0.046891,0.05,m_u,IV,oracle


In [19]:
estimate_bounds(iv_dgp, data, id_method='IV', est_method='plugin', K=5, crossfit=True)

outcome regression error: 0.05176541612632213
propensitiy error: 0.05538888820703547
outcome regression error: 0.1169573155047626
propensitiy error: 0.03581346114160754
outcome regression error: 0.08597618103278762
propensitiy error: 0.05990041365376706
outcome regression error: 0.04516182766757439
propensitiy error: 0.05960243873430049
outcome regression error: 0.09450084060275317
propensitiy error: 0.08161816974835559


Unnamed: 0,Rs_down,Rs_up,Rd_down,Rd_up,R_oracle,metric,id_method,est_method
0,-0.473686,-0.327376,-0.445582,-0.356746,-0.43617,m_y=1,IV,plugin
1,-0.4954,-0.359345,-0.469875,-0.385927,-0.481132,m_y=0,IV,plugin
2,-0.255785,0.095384,-0.188168,0.027768,0.021739,m_a=0,IV,plugin
3,0.014322,0.074197,0.014322,0.074197,0.032468,m_a=1,IV,plugin
4,-0.030217,0.100217,0.016439,0.053561,0.05,m_u,IV,plugin
