In [1]:
import numpy as np
import pandas as pd
from numpy.random import permutation

# Copy every array stored in the train/test .npz archives into plain dicts so
# the data remains accessible after each archive is closed.
with np.load('jobs_DW_bin.new.10.train.npz') as train_archive:
    train_data = {name: train_archive[name] for name in train_archive.files}

with np.load('jobs_DW_bin.new.10.test.npz') as test_archive:
    test_data = {name: test_archive[name] for name in test_archive.files}

Dataset: 
- x: covariates
- t: treatment
- yf: factual outcome
- e: original randomized sample
- N = 2490 + 297 + 425 = 3,212

In [2]:
# Sanity check on column 0 of each split: no record flagged e == 0 may be
# treated (t == 1), i.e. every treated record belongs to the e == 1 sample.
assert not np.any((train_data['e'][:, 0] == 0) & (train_data['t'][:, 0] == 1))
assert not np.any((test_data['e'][:, 0] == 0) & (test_data['t'][:, 0] == 1))


In [5]:
# Headline numbers taken from column 0 of each array: the ATE stored in the
# dataset, plus the size, mean treatment and mean factual outcome of each split.
ATE = train_data['ate'][0]
train_t, test_t = train_data['t'][:, 0], test_data['t'][:, 0]
train_yf, test_yf = train_data['yf'][:, 0], test_data['yf'][:, 0]

print(f"Average treatment effect: {ATE[0]:.4}")
print(f"N (train/test): {train_t.shape[0]}/{test_t.shape[0]}")
print(f"Treatment rate (train/test): {train_t.mean():.2}/{test_t.mean():.2}")
print(f"Employment rate (train/test): {train_yf.mean():.2}/{test_yf.mean():.2}")

Average treatment effect: 0.07794
N (train/test): 2570/642
Treatment rate (train/test): 0.092/0.093
Employment rate (train/test): 0.85/0.84


## Sanity check
- This analysis checks out: the share of people who are unemployed at the end of the study is 7.8 percentage points higher among those who did not receive job training.

In [67]:
# Pool column 0 of the train and test splits into single 1-D arrays.
# Original note on YS: "Positive outcome: unemployment (15%)".
# NOTE(review): the summary cell prints mean(yf) ~ 0.85 as "Employment rate";
# confirm which label yf == 1 actually encodes.
YS = np.concatenate([train_data['yf'][:, 0], test_data['yf'][:, 0]])
D = np.concatenate([train_data['t'][:, 0], test_data['t'][:, 0]])
E = np.concatenate([train_data['e'][:, 0], test_data['e'][:, 0]])

# Within the E == 1 records, split by treatment and compare mean outcomes.
treated_mask = (E == 1) & (D == 1)
control_mask = (E == 1) & (D == 0)

print(f'{treated_mask.sum()} treated, {control_mask.sum()} control')
ate_replicate = YS[treated_mask].mean() - YS[control_mask].mean()
print(f'replicated ate: {ate_replicate:.2}')

297 treated, 425 control
replicated ate: 0.078


In [68]:
# Mean outcome over records with E == 1 and D == 1.
np.mean(YS[(E == 1) & (D == 1)])

0.7744107744107744

In [70]:
# Mean outcome over every record with D == 0, regardless of E.
np.mean(YS[D == 0])

0.8576329331046312

In [64]:
# Mean outcome over records with E == 1 and D == 0.
np.mean(YS[(E == 1) & (D == 0)])

0.6964705882352941

In [61]:

# Difference in mean outcome between all D == 1 and all D == 0 records,
# without restricting to E == 1 (unlike ate_replicate).
np.mean(YS[D == 1]) - np.mean(YS[D == 0])


-0.08322215869385674

In [58]:
# Settings handed to generate_jobs_data.
# NOTE(review): the meaning of the alpha_*/beta_* entries is not visible in
# this notebook; presumably per-group error rates — confirm against
# generate_jobs_data.
error_params = dict(
    alpha_0=0.4,
    alpha_1=0.2,
    beta_0=0.05,
    beta_1=0.1,
)

In [79]:
# Build X and Y from error_params.
# NOTE(review): generate_jobs_data is neither defined nor imported in the
# visible cells, so this raises NameError on a fresh kernel unless it is
# defined elsewhere — confirm and add the definition/import above this cell.
X, Y = generate_jobs_data(error_params)

In [None]:
# Gather the outcome arrays and treatment information into one dict.
# NOTE(review): YS_0, YS_1, Y_0 and Y_1 are not defined in the visible cells;
# presumably they are produced elsewhere (e.g. alongside generate_jobs_data) —
# confirm before running this cell on a fresh kernel.
dataset_y = {
    'YS': YS,
    'YS_0': YS_0,
    'YS_1': YS_1,
    'Y_0': Y_0,
    'Y_1': Y_1,
    'Y': Y,
    # Constant array shaped like D, filled with the overall mean of D.
    'pD': np.ones_like(D) * D.mean(),
    'D': D
}

In [30]:
# Reduce every array in train_data to index 0 of its last axis (2-D -> 1-D,
# 3-D -> 2-D), then report each key and its resulting shape.
# NOTE(review): this mutates train_data in place and is not idempotent — a
# second run would slice the now 2-D 'x' down to its first column.
for k, v in train_data.items():
    if v.ndim == 2:
        v = v[:, 0]
    elif v.ndim == 3:
        v = v[:, :, 0]
    train_data[k] = v

    print(k)
    print(v.shape)
        

ate
(1,)
e
(2570,)
I
(2570,)
yadd
(1,)
yf
(2570,)
t
(2570,)
x
(2570, 17)
ymul
(1,)


In [26]:
# Display the full train_data dict.
# NOTE(review): this dumps every array's repr; showing only keys and shapes
# would keep the output readable.
train_data

{'ate': array([0.07794019]),
 'e': array([0., 0., 0., ..., 1., 1., 1.]),
 'I': array([[   2,    0,    0, ...,    0,    1,    1],
        [   3,    1,    1, ...,    1,    2,    2],
        [   4,    2,    2, ...,    2,    3,    3],
        ...,
        [3208, 3209, 3208, ..., 3208, 3209, 3208],
        [3210, 3210, 3209, ..., 3210, 3210, 3209],
        [3211, 3211, 3210, ..., 3211, 3211, 3211]], dtype=int32),
 'yadd': array([[0]], dtype=uint8),
 'yf': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 't': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 0., ..., 1., 1., 0.],
        [0., 0., 1., ..., 0., 0., 0.]]),
 'x': array([[[ 1.07934372,  1.36161469,  1

In [8]:




# Keep only column 0 of 'e'. The ndim guard makes the cell safe to re-run:
# once 'e' is 1-D, indexing it with [:, 0] again would raise IndexError.
if train_data['e'].ndim > 1:
    train_data['e'] = train_data['e'][:, 0]


In [9]:
# Inspect the 'e' array of the training split.
train_data['e']

array([0., 0., 0., ..., 1., 1., 1.])

In [3]:
# Shape of the test-split treatment array.
test_data['t'].shape

(642, 10)

In [25]:
# Totals over the full train and test 't' arrays combined: the sum of t (the
# number of treated entries if t is 0/1) and the number of entries equal to 0.
n_treated_entries = train_data['t'].sum() + test_data['t'].sum()
n_untreated_entries = (train_data['t'] == 0).sum() + (test_data['t'] == 0).sum()

print(n_treated_entries)
print(n_untreated_entries)


2970.0
29150


In [27]:
# Shape of the test-split treatment array.
# NOTE(review): the same inspection appears in an earlier cell; one of the
# two is likely redundant.
test_data['t'].shape

(642, 10)