In [1]:
import numpy as np
import matplotlib.pyplot as plt


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from bound_funcs import *
from utils import *
from dgp import *
from vset_construction import *




In [2]:
Dx, Du = 1, 2
nD = Dx+Du

dgp = {
    'N': 50,
    'Dx': Dx,
    'Du': Du,
    'nz': 10,                # Number of finite pre-treatment values
    'nw': 10,                # Number of finite post-treatment values
    'beta_zy': 0,            # Z -> Y loading (=0 ==> exclusion restriction is satisfied)
    'e1_coeffs': 4*np.random.rand(nD) - 2,
    'z_coeffs':  4*np.random.rand(nD) - 2,
    'mu1_coeffs': 4*np.random.rand(nD) - 1,
    'mu0_coeffs': 4*np.random.rand(nD) - 2,
    'lambda': 2,
    'id_assumption': 'MSM'
}

# Measured and unmeasured covariate loadings
if dgp['id_assumption'] == 'MSM':
    dgp['lambda_star'] = np.random.uniform(1, dgp['lambda'])
    dgp['beta_zd'] = 0
    dgp['beta_zy'] = 0
    dgp['z_coeffs'] = np.zeros_like(dgp['z_coeffs'])
    
if dgp['id_assumption'] == 'IV':
    dgp['beta_zd'] = 2
    dgp['beta_zy'] = 0
    dgp['z_coeffs'][Dx:] = 0 # IV Unconfoundedness



data = generate_data(dgp)
# Hmmm something is fishy the bounds for m_y=1 shouldn't match those for m_y==0. 

# Next up, need to learn nusiance functions on one fold then construct estimates over another fold



In [21]:


# A wrapper around these should assert: 
# - IV/MSM intervals are contained within the worst case interval for each v term





In [3]:
def compute_nuisance_probs(dgp, data, setting):
    
    XU, D, Y, Z, pY, pD = data['XU'], data['D'], data['Y'], data['Z'], data['pY'], data['pD']
    
    # We don't have access to confounders when computing bounds.
    mu1_coeffs = dgp['mu1_coeffs'].copy()
    e1_coeffs = dgp['e1_coeffs'].copy()
    mu1_coeffs[Dx:] = 0
    e1_coeffs[Dx:] = 0

    if setting=='oracle':
        p_mu1 = mu(dgp, mu1_coeffs, XU, Z)
        p_e1 = e1(dgp, e1_coeffs, XU, Z)
        
        p_mu1_z = mu_down_z = np.zeros((dgp['nz'], dgp['N']))
        p_e1_z = mu_down_z = np.zeros((dgp['nz'], dgp['N']))
        
        for z in range(dgp['nz']):
            p_mu1_z[z] = mu(dgp, mu1_coeffs, XU, z)
            p_e1_z[z] = e1(dgp, e1_coeffs, XU, z)
            
            
    return {
        'p_mu1': p_mu1,
        'p_e1': p_e1,
        'p_mu1_z': p_mu1_z,
        'p_e1_z': p_e1_z
    }
            
    

In [4]:
nuisance_probs = compute_nuisance_probs(dgp, data, setting='oracle')

Vpf_down, Vpf_up = compute_iv_bounds(dgp, data, nuisance_probs)
check_bounds(data, Vpf_down, Vpf_up)
# print()
Vpf_down, Vpf_up = compute_na_bounds(dgp, data, nuisance_probs)
check_bounds(data, Vpf_down, Vpf_up)
print()

# dgp['lambda'] = 1.5
Vpf_down, Vpf_up = compute_msm_bounds(dgp, data, nuisance_probs)
check_bounds(data, Vpf_down, Vpf_up)

metric: m_y=1
Standard bounds [-0.926, 0.128]
Delta bounds: [-0.846, -0.0873]
Oracle: -0.1429

metric: m_y=0
Standard bounds [-0.881, 0.319]
Delta bounds: [-0.7, 0.0365]
Oracle: -0.1818

metric: m_a=0
Standard bounds [-0.741, 0.705]
Delta bounds: [-0.234, 0.2]
Oracle: 0.0127

metric: m_a=1
Standard bounds [-0.429, 0.0777]
Delta bounds: [-0.432, 0.0777]
Oracle: 0.03478

metric: m_u
Standard bounds [-0.587, 0.427]
Delta bounds: [-0.233, 0.0729]
Oracle: 2.776e-17

metric: m_y=1
Standard bounds [-0.924, 0.196]
Delta bounds: [-0.846, -0.025]
Oracle: -0.1429

metric: m_y=0
Standard bounds [-0.877, 0.378]
Delta bounds: [-0.7, 0.15]
Oracle: -0.1818

metric: m_a=0
Standard bounds [-0.799, 0.682]
Delta bounds: [-0.314, 0.2]
Oracle: 0.0127

metric: m_a=1
Standard bounds [-0.429, 0.235]
Delta bounds: [-0.432, 0.235]
Oracle: 0.03478

metric: m_u
Standard bounds [-0.618, 0.458]
Delta bounds: [-0.28, 0.12]
Oracle: 2.776e-17


metric: m_y=1
Standard bounds [-0.568, 0.0158]
Delta bounds: [-0.489, -0.11

In [33]:
probs

{'p_mu1': array([0.4975586 , 0.57100876, 0.59555741, 0.49840996, 0.49762377,
        0.54078312, 0.47856062, 0.51517908, 0.48987039, 0.51231657,
        0.50715271, 0.52288778, 0.48216423, 0.49211392, 0.48236522,
        0.52973043, 0.48739307, 0.50555004, 0.47376474, 0.55165621,
        0.48505285, 0.48758302, 0.54375009, 0.51688132, 0.49392194,
        0.49881689, 0.49629645, 0.5023215 , 0.46947764, 0.5406683 ,
        0.50930167, 0.50356413, 0.49687256, 0.49625419, 0.50298396,
        0.50905725, 0.48835704, 0.54062483, 0.51536851, 0.49415371,
        0.52456058, 0.48188737, 0.49394582, 0.51704629, 0.54129685,
        0.4933949 , 0.50683641, 0.54113672, 0.49479823, 0.52170422]),
 'p_e1': array([0.60895634, 0.47770179, 0.26384554, 0.43468641, 0.46439725,
        0.56138552, 0.49494227, 0.57804566, 0.57136588, 0.47461194,
        0.48848152, 0.57876857, 0.48594435, 0.56882768, 0.46745087,
        0.25389461, 0.53905258, 0.42118473, 0.55143154, 0.31148392,
        0.54190337, 0.5921714

## Finite sample estimation


In [10]:

XU, D, Y, Z, pY, pD = data['XU'], data['D'], data['Y'], data['Y'], data['pY'], data['pD']




ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

In [11]:
XU.shape

(50, 3)

In [12]:
Z.shape



(50,)

In [13]:
XU, D, Y, Z, pY, pD = data['XU'], data['D'], data['Y'], data['Y'], data['pY'], data['pD']

XUZ = np.concatenate((XU, Z.reshape(-1,1)), axis=1)

mu_hat = LogisticRegression()
e1_hat = LogisticRegression()
ez_hat = LogisticRegression()
muz_hat = LogisticRegression()

mu_hat.fit(XU, Y)
e1_hat.fit(XU, D)
ez_hat.fit

p_mu1_hat = mu_hat.predict_proba(XU)[:,1]
p_e1_hat = e1_hat.predict_proba(XU)[:,1]



# pe_hat = 

In [23]:
    return {
        'p_mu1': p_mu1,
        'p_e1': p_e1,
        'p_mu1_z': p_mu1_z,
        'p_e1_z': p_e1_z
    }

-0.014149477505380634

In [24]:
pY

array([0.47357168, 0.62404155, 0.93283826, 0.79501904, 0.7126454 ,
       0.47285323, 0.63861425, 0.51476193, 0.50534012, 0.64988553,
       0.63966823, 0.46938945, 0.67765213, 0.4865287 , 0.71084364,
       0.94426088, 0.55155514, 0.80261808, 0.5982219 , 0.90509897,
       0.60608364, 0.47242543, 0.8024547 , 0.70787356, 0.59552894,
       0.8856158 , 0.65444172, 0.5999661 , 0.47900221, 0.88987209,
       0.88258746, 0.74870865, 0.58334722, 0.64242389, 0.54243127,
       0.57131945, 0.34318459, 0.33953417, 0.76702862, 0.52040196,
       0.58838318, 0.61773199, 0.49942016, 0.56151938, 0.7298703 ,
       0.46971805, 0.85863666, 0.5404549 , 0.64929197, 0.45288775])

In [16]:
(predictions - pY).mean()

ValueError: operands could not be broadcast together with shapes (10000,2) (10000,) 