In [1]:
import numpy as np
import pandas as pd

from benchmarks.synthetic import *

In [2]:
x = np.random.uniform(low=-1, high=1, size=(5000, 2))

eta(x, environment='2D_linsep_baseline').shape

pi(x, func='6cov_linear').shape
    

(5000,)

In [3]:
x = np.random.uniform(low=-1, high=1, size=(5000, 3))
error_params = {
    'alpha_0': 0.4,
    'alpha_1': 0.1,
    'beta_0': 0.05,
    'beta_1': 0.3 
}

syn_sinusoidal = {
    'name': 'synthetic',
    'NS': 5000,
    'config': {
        'function_class': '1D_sinusoidal',
        'Y0_PDF': 'piecewise_sinusoid',
        'Y1_PDF': 'low_base_rate_sinusoid',
        'PI_PDF': 'linear'
    }
}

syn_linsep = {
    'name': 'synthetic',
    'NS': 5000,
    'config': {
        'function_class': '2D_linsep',
        'Y0_PDF': '2D_linsep_baseline',
        'Y1_PDF': '2D_linsep_intervention',
        'PI_PDF': '6cov_linear'
    }
}

syn_shalt = {
    'name': 'synthetic',
    'NS': 5000,
    'config': {
        'function_class': '6D_shalt',
        'Y0_PDF': 'shalt_6cov_baseline',
        'Y1_PDF': 'shalt_6cov_intervention',
        'PI_PDF': '6cov_linear'
    }
}

In [6]:
X, Y = generate_syn_data(syn_shalt, error_params)

X, Y = generate_syn_data(syn_linsep, error_params)

X, Y = generate_syn_data(syn_sinusoidal, error_params)


In [6]:
def generate_syn_data(env, NS, error_params, shuffle=True):  

    # Define class probability functions

    if env == '1D_sinusoidal':
        x = np.random.uniform(low=-1, high=1, size=(NS, 1))
    
    elif env == '2D_linsep':
        x = np.random.uniform(low=0, high=1, size=(NS, 2))

    elif env == '6D_shalt':
        x = np.random.uniform(low=0, high=1, size=(NS, 6))
        
    eta_star_0 = eta(x, environment=env['Y0_PDF'])
    eta_star_1 = eta(x, environment=env['Y1_PDF'])

    # Sample from target potential outcome class probability distributions
    YS_0 = np.random.binomial(1, eta_star_0, size=NS)
    YS_1 = np.random.binomial(1, eta_star_1, size=NS)
    
    Y_0 = np.zeros_like(YS_0)
    Y_1 = np.zeros_like(YS_0)

    alpha_0_errors = np.random.binomial(1, error_params['alpha_0'], size=NS)
    alpha_1_errors = np.random.binomial(1, error_params['alpha_1'], size=NS)
    beta_0_errors = np.random.binomial(1, error_params['beta_0'], size=NS)
    beta_1_errors = np.random.binomial(1, error_params['beta_1'], size=NS)

    Y_0[alpha_0_errors == 1] = 1
    Y_0[beta_0_errors == 1] = 0
    Y_1[alpha_1_errors == 1] = 1
    Y_1[beta_1_errors == 1] = 0

    # Apply consistency assumption to observe potential outcomes
    YS = np.zeros(NS, dtype=np.int64)
    Y = np.zeros_like(Y_0)

    pD = pi(x, func=env['PI_PDF'])
    D = np.random.binomial(1, pD, size=NS)
    YS[D==0] = YS_0[D==0]
    YS[D==1] = YS_1[D==1]

    Y[D==0] = Y_0[D==0]
    Y[D==1] = Y_1[D==1]
        
    dataset_y = {
        'pYS_0': eta_star_0,
        'pYS_1': eta_star_1,
        'YS_0': YS_0,
        'YS_1': YS_1,
        'Y_0': Y_0,
        'Y_1': Y_1,
        'Y': Y,
        'pD': pD,
        'D': D,
        'YS': YS
    }

    X, Y = pd.DataFrame(x), pd.DataFrame(dataset_y)
   
    if shuffle: 
        suffle_ix = permutation(X.index)
        X = X.iloc[suffle_ix]
        Y = Y.iloc[suffle_ix]

    return X, Y

In [39]:
alpha = .5
beta = .2

X, Y, error_params = generate_syn_data(
    NS=30000,
    K=1,
    y0_pdf='piecewise_sinusoid',
    y1_pdf='sinusoid',
    pi_pdf='linear',
    alpha_0=alpha,
    alpha_1=alpha,
    beta_0=beta,
    beta_1=beta,
    shuffle=True)

def loss(y_hat, y):
    return -(y*np.log(y_hat))

def surrogate_loss(py_hat, y):

    phat_y0 = py_hat[y==0]
    phat_y1 = py_hat[y==1]

    y1_losses = ((1-alpha)*loss(phat_y1, np.ones_like(phat_y1)) - beta*loss(phat_y1, np.zeros_like(phat_y1))) / (1-beta-alpha)
    y0_losses = ((1-beta)*loss(phat_y0, np.zeros_like(phat_y0)) - alpha*loss(phat_y0, np.ones_like(phat_y0))) / (1-beta-alpha)
    
    return np.concatenate([y1_losses, y0_losses])



In [40]:
data = {
    'X': X.squeeze(),
    'Ys': Y['YS_0'],
    'Y': Y['Y0_0'],
    'Y_hat': (1-alpha-beta)*Y['pYS_0'] + alpha
}
syndata = pd.DataFrame(data)

syndata['Ys_loss'] = loss(syndata['Y_hat'], syndata['Ys'])
syndata['Ysur_loss'] = surrogate_loss(syndata['Y_hat'], syndata['Y'])


In [41]:
syndata['Ys_loss'].mean()

0.1766150795689253

In [42]:
syndata['Ysur_loss'].mean()

0.08166220625639595

In [11]:
ones = np.ones_like(syndata['Y_hat'])
zeros = np.zeros_like(syndata['Y_hat'])


In [23]:



# .expand_dims()

(1, 30000)

In [33]:

np.concatenate([np.expand_dims(surrogate_loss(syndata['Y_hat'], zeros), axis=1),
                np.expand_dims(surrogate_loss(syndata['Y_hat'], ones), axis=1)], axis=1).mean(axis=1)





array([0., 0., 0., ..., 0., 0., 0.])

In [34]:
surrogate_loss(np.array([.2]), 0)

array([[-2.68239652]])

In [35]:
surrogate_loss(np.array([.2]), 1)

array([[2.68239652]])

In [12]:
surrogate_loss(syndata['Y_hat'], zeros)

array([-0.49777417, -1.06144296, -0.68372321, ..., -0.79064118,
       -0.50783722, -0.58929075])

In [5]:
syndata['Ys_loss'].mean()

0.17914392484114144

In [6]:
syndata['Ysur_loss'].mean()

0.09916401772262792

In [None]:
print(syndata['Ys_loss'][syndata['Ys'] == 1].mean())
print(y1s.mean())

print(syndata['Ys_loss'][syndata['Ys'] == 0].mean())
print(y0s.mean())



In [29]:
print(syndata['Ys_loss'].mean())
print(syndata['Ysur_loss'].mean())



print(syndata['Ysur_loss'][syndata['Y'] == 1].mean())




0.17664500345546946
0.08965007656122444
0.2366096586599707
0.09416733227880483


In [17]:
surrogate_loss(syndata['Y_hat'], syndata['Y']).mean()

0.08693264342603393

In [7]:

print(syndata['Ysur_loss'].mean())

print(syndata['Ys_loss'].mean())


0.08785599832626462
0.17667244402293678


In [104]:
loss(phat_y1, np.ones_like(phat_y1))


836    0.956231
558    0.220192
585    0.225161
980    0.227003
71     0.386832
         ...   
607    0.198884
853    0.048323
594    0.371156
969    1.848414
909    0.148952
Name: Y_hat, Length: 589, dtype: float64

In [105]:
loss(phat_y0, np.ones_like(phat_y0))

642    1.306313
568    2.336598
815    0.226574
884    0.074747
447    2.325891
         ...   
677    2.750130
933    1.134441
393    2.282744
17     0.891121
674    0.708880
Name: Y_hat, Length: 411, dtype: float64

In [78]:



cross_entropy(synthdata['Ys'], synthdata['Y_hat'])



281    1.073466
534    0.000000
835    0.000000
819    0.000000
921    1.153383
         ...   
677    2.542613
112    0.000000
883    0.111959
111    0.000000
272    0.000000
Length: 1000, dtype: float64

361.2209508827656

In [17]:
pd.concat([X, ], ignore_index=True)



Unnamed: 0,0,YS_0,Y0
0,0.319736,,
1,0.804279,,
2,0.766431,,
3,-0.527925,,
4,0.679959,,
...,...,...,...
1995,,0.0,0.0
1996,,0.0,0.0
1997,,1.0,1.0
1998,,0.0,0.0


In [15]:
X

Unnamed: 0,0
618,0.319736
601,0.804279
513,0.766431
470,-0.527925
297,0.679959
...,...
463,-0.843725
508,-0.396933
781,-0.529049
140,-0.341898


In [13]:

pd.horzcat(X, )

AttributeError: module 'pandas' has no attribute 'horzcat'

In [7]:
X

Unnamed: 0,0
618,0.319736
601,0.804279
513,0.766431
470,-0.527925
297,0.679959
...,...
463,-0.843725
508,-0.396933
781,-0.529049
140,-0.341898
