In [1]:
# Code is based on: https://github.com/amerch/causal_inference_evaluation

# Libraries

In [2]:
import os
import numpy as np
from tqdm import tqdm
from scipy.special import expit


# Random generators initialization
import random
# Single seed shared by every random source used in this notebook.
seed = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only reaches child processes -- confirm.
os.environ["PYTHONHASHSEED"] = str(seed)

# Seed the stdlib generator and NumPy's global generator with the same value.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed)

print('[INFO] Random generators were initialized')

[INFO] Random generators were initialized


In [3]:
def get_samples(samples, n, scale=1, sigma_z0=3, sigma_z1=5, rng=None):
    """Draw one synthetic causal-inference dataset with a binary outcome.

    Every random draw is made element-wise on a ``(samples, n)`` grid:

    * ``z ~ Bernoulli(0.5)``: binary latent variable.
    * ``x ~ Normal(z, sqrt(sigma_z1**2 * z + sigma_z0**2 * (1 - z)))``.
    * ``t ~ Bernoulli(0.75 * z + 0.25 * (1 - z))``: treatment indicator.
    * ``y ~ Bernoulli(expit(scale * (z + 4 * t - 2)))``: factual outcome.
    * ``ycf``: same as ``y`` but drawn with ``1 - t`` in place of ``t``.
    * ``mu0 = expit(scale * (z - 2))`` and ``mu1 = expit(scale * (z + 2))``:
      outcome probabilities for ``t = 0`` and ``t = 1``.

    Only column 0 of ``t``, ``y``, ``ycf``, ``mu0`` and ``mu1`` is returned,
    so only ``z[:, 0]`` drives them; ``x[:, j]`` for ``j >= 1`` is generated
    from its own, independently drawn ``z[:, j]``.
    NOTE(review): confirm that independent latents per covariate are intended.

    Parameters
    ----------
    samples : int
        Number of rows to generate.
    n : int
        Number of covariate columns in ``x``.
    scale : float, optional
        Multiplier applied inside ``expit`` for the outcome probabilities.
    sigma_z0, sigma_z1 : float, optional
        Values entering the standard deviation of ``x`` where ``z == 0`` and
        ``z == 1`` respectively.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. ``None`` (default) uses NumPy's global state,
        i.e. whatever ``np.random.seed`` configured.

    Returns
    -------
    tuple
        ``(x, t, y, ycf, mu0, mu1)`` where ``x`` has shape ``(samples, n)``
        and every other array has shape ``(samples,)``.
    """
    # Default to the global NumPy state so seeded runs that do not pass `rng`
    # consume exactly the same random stream as they always did.
    sampler = np.random if rng is None else rng

    z = sampler.binomial(1, 0.5, (samples, n))
    x = sampler.normal(z, np.sqrt(sigma_z1 ** 2 * z + sigma_z0 ** 2 * (1 - z)))

    # Treatment and outcomes are drawn on the full grid even though only
    # column 0 is returned; this keeps the number of consumed draws stable.
    t = sampler.binomial(1, 0.75 * z + 0.25 * (1 - z))
    y = sampler.binomial(1, expit(scale * (z + 4 * t - 2)))
    ycf = sampler.binomial(1, expit(scale * (z + 4 * (1 - t) - 2)))
    mu0 = expit(scale * (z - 2))
    mu1 = expit(scale * (z + 2))

    return x, t[:, 0], y[:, 0], ycf[:, 0], mu0[:, 0], mu1[:, 0]

In [4]:
# # nProblems = 100
# # n = 100 # Number of Covariates
# # test_sample  = 1000

# x, t, y, ycf, mu0, mu1 = get_samples(10, 3)

# x.shape, t.shape

In [5]:
# Output root; one sub-folder per training-set size is created underneath it.
path = 'Synthetic'
n = 10  # number of covariates (the author also noted 100 as a value)
sample_sizes = [5000]       # training-set sizes to generate
number_of_problems = 100    # train/test pairs written for each size
test_sample = 1000          # rows in every test set


for sample in sample_sizes:
    print('Sample size: ', sample)

    data_path = '{}/{}'.format(path, sample)

    # Create the output folder only when it is not there yet.
    if not os.path.exists(data_path):
        os.makedirs(data_path)
        print('[INFO] Path: {} created'.format(data_path))

    for idx in tqdm(range(number_of_problems)):
        # The train split is drawn first, then the test split, so the global
        # random stream is consumed in a fixed order for every problem index.
        for name_template, rows in (('/train{}.npz', sample),
                                    ('/test{}.npz', test_sample)):
            x, t, y, ycf, mu0, mu1 = get_samples(rows, n)
            np.savez(data_path + name_template.format(idx),
                     x=x, t=t, yf=y, ycf=ycf, mu1=mu1, mu0=mu0)

Sample size:  5000


100%|██████████| 100/100 [00:01<00:00, 51.95it/s]
