In [1]:
import econml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from econml.dml import DML, LinearDML, SparseLinearDML, NonParamDML
from econml.metalearners import XLearner, TLearner, SLearner, DomainAdaptationLearner
from econml.dr import DRLearner

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression

import plotly.express as px

from sklearn.metrics import mean_squared_error
import sklearn.preprocessing
from scipy.stats import pearsonr


# Sim 1

In [2]:
def scm(n_environments:int=10, n_samples:int=100, train=True, seed = None):
    """Draw one synthetic data set from the Sim 1 structural causal model.

    X1 and X4 are standard-normal covariates, T is a Bernoulli(0.5) treatment
    and the outcome follows ``Y = X1 + X4 + T + 4*X1*T + X4*T + noise``.
    X3 is ``Y_obs`` plus noise in both regimes; X2 is ``Y_obs`` plus noise in
    the training regime and ``-Y_obs`` plus noise otherwise.

    Parameters
    ----------
    n_environments : int
        Number of environments; each one contributes ``n_samples`` consecutive rows.
    n_samples : int
        Rows per environment.
    train : bool
        True draws one ``delta`` per environment and lets X2 follow ``Y_obs``;
        False draws a single ``delta`` and lets X2 follow ``-Y_obs``.
    seed : int or None
        Handed to ``np.random.seed`` (this reseeds NumPy's global generator).

    Returns
    -------
    pandas.DataFrame
        Columns ``env, delta, X1, X2, X3, X4, T, Y_obs, Y1, Y0`` where Y1/Y0
        are the potential outcomes under treatment/control.  ``delta`` is only
        recorded; it does not enter any other column in this version.
    """
    np.random.seed(seed)
    total_rows = n_environments * n_samples

    # The draw order (noise matrix -> treatment -> delta) fixes the random stream.
    noise = np.random.normal(0.0, 1.0, (total_rows, 5))
    X1, N_X2, N_X3, X4, N_Y = (noise[:, col] for col in range(5))
    T = np.random.binomial(1, 0.5, total_rows)

    def outcome(t):
        # Structural equation for Y evaluated at treatment value(s) t.
        return X1 + X4 + t + 4 * X1 * t + X4 * t + N_Y

    Y_obs = outcome(T)
    Y1 = outcome(1)
    Y0 = outcome(0)

    if train:
        per_env_delta = np.random.uniform(0.0, 1.0, n_environments)
        delta = np.repeat(per_env_delta, n_samples)
        X2 = Y_obs + N_X2
    else:
        delta = np.repeat(np.random.uniform(0.0, 1.0), total_rows)
        X2 = - Y_obs + N_X2
    X3 = Y_obs + N_X3

    columns = {
        'env': np.repeat(np.arange(n_environments), n_samples),
        'delta': delta,
        'X1': X1,
        'X2': X2,
        'X3': X3,
        'X4': X4,
        'T': T,
        'Y_obs': Y_obs,
        'Y1': Y1,
        'Y0': Y0,
    }
    return pd.DataFrame(columns)


# Example draws from the training (df1) and test (df2) regimes with a shared seed,
# kept for ad-hoc inspection of the generated columns.
df1, df2 = (scm(seed=3, train=is_train) for is_train in (True, False))

# Optional diagnostics (disabled): Pearson correlations within each draw.
#for frame in (df1, df2):
#    print(pearsonr(frame['X1'].values, frame['X2'].values)[0])
#    print(pearsonr(frame['X2'].values, frame['X3'].values)[0])
#    print(pearsonr(frame['X2'].values, (frame['Y1']-frame['Y0']).values)[0])


def create_interactions(X: np.ndarray, T: np.ndarray, one_hot_labeler=None) -> tuple:
    """Build the design matrix ``[X, T_enc, X*T_enc]`` for an interacted regression.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Covariates.  A 1-D input is treated as a single feature column.
    T : array-like, shape (n_samples,)
        Treatment labels; they are encoded with the labeler's ``transform``.
    one_hot_labeler : object with ``transform``, optional
        Already-fitted encoder to reuse.  When None, a new
        ``sklearn.preprocessing.LabelBinarizer`` is fitted on ``T``.

    Returns
    -------
    tuple
        ``(X_full, lb_fit)``: the stacked matrix (covariates, encoded treatment,
        then every covariate-by-treatment product with the treatment index
        varying fastest) and the encoder that was used.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        # A single feature passed as a flat vector: promote it to one column.
        X = X.reshape(-1, 1)

    if one_hot_labeler is None:
        lb_fit = sklearn.preprocessing.LabelBinarizer().fit(T)
    else:
        lb_fit = one_hot_labeler
    T_enc = np.asarray(lb_fit.transform(T))

    n_rows, n_features = X.shape
    n_treat_cols = T_enc.shape[1]
    # Broadcasting yields an (n_rows, n_features, n_treat_cols) cube; flattening the
    # last two axes reproduces the feature-major / treatment-minor column order.
    # The float cast keeps the result dtype independent of integer inputs.
    XT = (X[:, :, None] * T_enc[:, None, :]).reshape(n_rows, n_features * n_treat_cols).astype(float)

    X_full = np.column_stack((X, T_enc, XT))
    return X_full, lb_fit


def sim(B=100, model=1, seed=42):
    """Monte-Carlo estimate of CATE error for an interacted linear regression.

    Each repetition draws a training-regime data set and a test-regime data set
    from ``scm``, splits the former 50/50 into train/validation, fits
    ``LinearRegression`` on ``[X, T, X*T]`` and scores the predicted treatment
    effect against ``Y1 - Y0``.

    Parameters
    ----------
    B : int
        Number of repetitions.
    model : int
        Feature set used as X: 1 -> ['X4'], 2 -> ['X4'], 3 -> ['X2', 'X4'].
    seed : int
        Seed for the list of per-repetition seeds handed to ``scm``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mse_ID, mse_OOD)``, each of length ``B``: CATE mean squared error on
        the validation half (same regime as training) and on the test-regime draw.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported feature-set ids.
    """
    # NOTE(review): models 1 and 2 select the same columns here, so their results
    # coincide - confirm whether model 2 was meant to use a different feature set.
    feature_sets = {
        1: ['X4'],
        2: ['X4'],
        3: ['X2', 'X4'],
    }
    if model not in feature_sets:
        raise ValueError(f"model must be one of {sorted(feature_sets)}, got {model!r}")
    features = feature_sets[model]

    mse_ID = np.empty([B])
    mse_OOD = np.empty([B])

    np.random.seed(seed)
    r = [np.random.randint(1,10000) for _ in range(B)]

    for b in range(B):

        train_val = scm(n_environments=10, n_samples=1000, train=True, seed = r[b])
        test = scm(n_environments=10, n_samples=1000, train=False, seed = r[b]+1)

        # No random_state is passed: the shuffle draws from NumPy's global RNG, which
        # the scm() call just above has reseeded, so the split is still repeatable.
        train, val = train_test_split(train_val, test_size=0.5)

        X_train = train[features].values
        X_val = val[features].values
        X_test = test[features].values
        T_train = train['T'].values
        Y_train = train['Y_obs'].values

        XT_train, lb_fit = create_interactions(X_train, T_train)
        m = LinearRegression().fit(XT_train, Y_train)

        def predict_cate(X):
            # Predicted outcome under T=1 minus predicted outcome under T=0.
            treated, _ = create_interactions(X, np.repeat(1, X.shape[0]), lb_fit)
            control, _ = create_interactions(X, np.repeat(0, X.shape[0]), lb_fit)
            return m.predict(treated) - m.predict(control)

        cate_actual_ID = val['Y1'].values - val['Y0'].values
        mse_ID[b] = mean_squared_error(cate_actual_ID, predict_cate(X_val))

        cate_actual_OOD = test['Y1'].values - test['Y0'].values
        mse_OOD[b] = mean_squared_error(cate_actual_OOD, predict_cate(X_test))

    return mse_ID, mse_OOD


# Run the three feature-set variants with the default seed.
res_model1, res_model2, res_model3 = (sim(B=100, model=k) for k in (1, 2, 3))

# Mean CATE MSE per model: first the in-distribution block (index 0),
# then the out-of-distribution block (index 1).
for split in (0, 1):
    for res in (res_model1, res_model2, res_model3):
        print(np.mean(res[split]))

16.085574011421965
16.085574011421965
11.57676624356803
15.999051085000955
15.999051085000955
28.72825527116168


# Sim 2

In [4]:
def scm(n_environments:int=10, n_samples:int=100, train=True, seed = None):
    """Draw one synthetic data set from the Sim 2 structural causal model.

    Same outcome equation as before,
    ``Y = X1 + X4 + T + 4*X1*T + X4*T + noise``, but X3 now depends only weakly
    on the outcome: ``X3 = 0.1 * Y_obs + noise``.  X2 is ``Y_obs`` plus noise in
    the training regime and ``-Y_obs`` plus noise otherwise.

    Parameters
    ----------
    n_environments : int
        Number of environments; each one contributes ``n_samples`` consecutive rows.
    n_samples : int
        Rows per environment.
    train : bool
        True draws one ``delta`` per environment and lets X2 follow ``Y_obs``;
        False draws a single ``delta`` and lets X2 follow ``-Y_obs``.
    seed : int or None
        Handed to ``np.random.seed`` (this reseeds NumPy's global generator).

    Returns
    -------
    pandas.DataFrame
        Columns ``env, delta, X1, X2, X3, X4, T, Y_obs, Y1, Y0``.  ``delta`` is
        only recorded; it does not enter any other column in this version.
    """
    np.random.seed(seed)
    total_rows = n_environments * n_samples

    # The draw order (noise matrix -> treatment -> delta) fixes the random stream.
    noise = np.random.normal(0.0, 1.0, (total_rows, 5))
    X1, N_X2, N_X3, X4, N_Y = (noise[:, col] for col in range(5))
    T = np.random.binomial(1, 0.5, total_rows)

    def outcome(t):
        # Structural equation for Y evaluated at treatment value(s) t.
        return X1 + X4 + t + 4 * X1 * t + X4 * t + N_Y

    Y_obs = outcome(T)
    Y1 = outcome(1)
    Y0 = outcome(0)

    if train:
        per_env_delta = np.random.uniform(0.0, 1.0, n_environments)
        delta = np.repeat(per_env_delta, n_samples)
        X2 = Y_obs + N_X2
    else:
        delta = np.repeat(np.random.uniform(0.0, 1.0), total_rows)
        X2 = - Y_obs + N_X2
    X3 = 0.1 * Y_obs + N_X3

    columns = {
        'env': np.repeat(np.arange(n_environments), n_samples),
        'delta': delta,
        'X1': X1,
        'X2': X2,
        'X3': X3,
        'X4': X4,
        'T': T,
        'Y_obs': Y_obs,
        'Y1': Y1,
        'Y0': Y0,
    }
    return pd.DataFrame(columns)


# Example draws from the training (df1) and test (df2) regimes with a shared seed,
# kept for ad-hoc inspection of the generated columns.
df1, df2 = (scm(seed=3, train=is_train) for is_train in (True, False))

# Optional diagnostics (disabled): Pearson correlations within each draw.
#for frame in (df1, df2):
#    print(pearsonr(frame['X1'].values, frame['X2'].values)[0])
#    print(pearsonr(frame['X2'].values, frame['X3'].values)[0])
#    print(pearsonr(frame['X2'].values, (frame['Y1']-frame['Y0']).values)[0])


def create_interactions(X: np.ndarray, T: np.ndarray, one_hot_labeler=None) -> tuple:
    """Build the design matrix ``[X, T_enc, X*T_enc]`` for an interacted regression.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Covariates.  A 1-D input is treated as a single feature column.
    T : array-like, shape (n_samples,)
        Treatment labels; they are encoded with the labeler's ``transform``.
    one_hot_labeler : object with ``transform``, optional
        Already-fitted encoder to reuse.  When None, a new
        ``sklearn.preprocessing.LabelBinarizer`` is fitted on ``T``.

    Returns
    -------
    tuple
        ``(X_full, lb_fit)``: the stacked matrix (covariates, encoded treatment,
        then every covariate-by-treatment product with the treatment index
        varying fastest) and the encoder that was used.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        # A single feature passed as a flat vector: promote it to one column.
        X = X.reshape(-1, 1)

    if one_hot_labeler is None:
        lb_fit = sklearn.preprocessing.LabelBinarizer().fit(T)
    else:
        lb_fit = one_hot_labeler
    T_enc = np.asarray(lb_fit.transform(T))

    n_rows, n_features = X.shape
    n_treat_cols = T_enc.shape[1]
    # Broadcasting yields an (n_rows, n_features, n_treat_cols) cube; flattening the
    # last two axes reproduces the feature-major / treatment-minor column order.
    # The float cast keeps the result dtype independent of integer inputs.
    XT = (X[:, :, None] * T_enc[:, None, :]).reshape(n_rows, n_features * n_treat_cols).astype(float)

    X_full = np.column_stack((X, T_enc, XT))
    return X_full, lb_fit


def sim(B=100, model=1, seed=42):
    """Monte-Carlo estimate of CATE error for an interacted linear regression.

    Each repetition draws a training-regime data set and a test-regime data set
    from ``scm``, splits the former 50/50 into train/validation, fits
    ``LinearRegression`` on ``[X, T, X*T]`` and scores the predicted treatment
    effect against ``Y1 - Y0``.

    Parameters
    ----------
    B : int
        Number of repetitions.
    model : int
        Feature set used as X: 1 -> ['X4'], 2 -> ['X3', 'X4'],
        3 -> ['X2', 'X3', 'X4'].
    seed : int
        Seed for the list of per-repetition seeds handed to ``scm``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mse_ID, mse_OOD)``, each of length ``B``: CATE mean squared error on
        the validation half (same regime as training) and on the test-regime draw.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported feature-set ids.
    """
    feature_sets = {
        1: ['X4'],
        2: ['X3', 'X4'],
        3: ['X2', 'X3', 'X4'],
    }
    if model not in feature_sets:
        raise ValueError(f"model must be one of {sorted(feature_sets)}, got {model!r}")
    features = feature_sets[model]

    mse_ID = np.empty([B])
    mse_OOD = np.empty([B])

    np.random.seed(seed)
    r = [np.random.randint(1,10000) for _ in range(B)]

    for b in range(B):

        train_val = scm(n_environments=10, n_samples=1000, train=True, seed = r[b])
        test = scm(n_environments=10, n_samples=1000, train=False, seed = r[b]+1)

        # No random_state is passed: the shuffle draws from NumPy's global RNG, which
        # the scm() call just above has reseeded, so the split is still repeatable.
        train, val = train_test_split(train_val, test_size=0.5)

        X_train = train[features].values
        X_val = val[features].values
        X_test = test[features].values
        T_train = train['T'].values
        Y_train = train['Y_obs'].values

        XT_train, lb_fit = create_interactions(X_train, T_train)
        m = LinearRegression().fit(XT_train, Y_train)

        def predict_cate(X):
            # Predicted outcome under T=1 minus predicted outcome under T=0.
            treated, _ = create_interactions(X, np.repeat(1, X.shape[0]), lb_fit)
            control, _ = create_interactions(X, np.repeat(0, X.shape[0]), lb_fit)
            return m.predict(treated) - m.predict(control)

        cate_actual_ID = val['Y1'].values - val['Y0'].values
        mse_ID[b] = mean_squared_error(cate_actual_ID, predict_cate(X_val))

        cate_actual_OOD = test['Y1'].values - test['Y0'].values
        mse_OOD[b] = mean_squared_error(cate_actual_OOD, predict_cate(X_test))

    return mse_ID, mse_OOD


# Run the three feature-set variants with the default seed.
res_model1, res_model2, res_model3 = (sim(B=100, model=k) for k in (1, 2, 3))

# Mean CATE MSE per model: first the in-distribution block (index 0),
# then the out-of-distribution block (index 1).
for split in (0, 1):
    for res in (res_model1, res_model2, res_model3):
        print(np.mean(res[split]))

16.085574011421965
15.621090083164471
11.61072225127948
15.999051085000955
15.521776961096991
28.449680147791536


# Sim 3

In [7]:
def scm(n_environments:int=10, n_samples:int=100, train=True, seed = None):
    """Draw one synthetic data set from the Sim 3 structural causal model.

    The outcome follows ``Y = X1 + X4 + T + 4*X1*T + X4*T + noise`` with
    standard-normal X1, X4 and a Bernoulli(0.5) treatment T.
    ``X3 = 0.1 * Y_obs + noise`` in both regimes; X2 is ``Y_obs`` plus noise in
    the training regime and ``-Y_obs`` plus noise otherwise.

    Parameters
    ----------
    n_environments : int
        Number of environments; each one contributes ``n_samples`` consecutive rows.
    n_samples : int
        Rows per environment.
    train : bool
        True draws one ``delta`` per environment and lets X2 follow ``Y_obs``;
        False draws a single ``delta`` and lets X2 follow ``-Y_obs``.
    seed : int or None
        Handed to ``np.random.seed`` (this reseeds NumPy's global generator).

    Returns
    -------
    pandas.DataFrame
        Columns ``env, delta, X1, X2, X3, X4, T, Y_obs, Y1, Y0``.  ``delta`` is
        only recorded; it does not enter any other column in this version.
    """
    np.random.seed(seed)
    total_rows = n_environments * n_samples

    # The draw order (noise matrix -> treatment -> delta) fixes the random stream.
    noise = np.random.normal(0.0, 1.0, (total_rows, 5))
    X1, N_X2, N_X3, X4, N_Y = (noise[:, col] for col in range(5))
    T = np.random.binomial(1, 0.5, total_rows)

    def outcome(t):
        # Structural equation for Y evaluated at treatment value(s) t.
        return X1 + X4 + t + 4 * X1 * t + X4 * t + N_Y

    Y_obs = outcome(T)
    Y1 = outcome(1)
    Y0 = outcome(0)

    if train:
        per_env_delta = np.random.uniform(0.0, 1.0, n_environments)
        delta = np.repeat(per_env_delta, n_samples)
        X2 = Y_obs + N_X2
    else:
        delta = np.repeat(np.random.uniform(0.0, 1.0), total_rows)
        X2 = - Y_obs + N_X2
    X3 = 0.1 * Y_obs + N_X3

    columns = {
        'env': np.repeat(np.arange(n_environments), n_samples),
        'delta': delta,
        'X1': X1,
        'X2': X2,
        'X3': X3,
        'X4': X4,
        'T': T,
        'Y_obs': Y_obs,
        'Y1': Y1,
        'Y0': Y0,
    }
    return pd.DataFrame(columns)


# Example draws from the training (df1) and test (df2) regimes with a shared seed,
# kept for ad-hoc inspection of the generated columns.
df1, df2 = (scm(seed=3, train=is_train) for is_train in (True, False))

# Optional diagnostics (disabled): Pearson correlations within each draw.
#for frame in (df1, df2):
#    print(pearsonr(frame['X1'].values, frame['X2'].values)[0])
#    print(pearsonr(frame['X2'].values, frame['X3'].values)[0])
#    print(pearsonr(frame['X2'].values, (frame['Y1']-frame['Y0']).values)[0])

def sim(B=100, model=1, seed=42,
         reg = lambda: LinearRegression(),
         #reg = lambda: RandomForestRegressor(min_samples_leaf=10)
       ):
    """Monte-Carlo comparison of three econml meta-learners on CATE error.

    Each repetition draws a training-regime and a test-regime data set from
    ``scm``, splits the former 50/50 into train/validation, fits the learners on
    the training half and scores ``effect(X)`` against ``Y1 - Y0``.

    Parameters
    ----------
    B : int
        Number of repetitions.
    model : int
        Feature set used as X: 1 -> ['X4'], 2 -> ['X3', 'X4'],
        3 -> ['X2', 'X3', 'X4'].
    seed : int
        Seed for the list of per-repetition seeds handed to ``scm``.
    reg : callable
        Zero-argument factory returning a fresh base regressor for each learner.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mse_ID, mse_OOD)``, each of shape ``(B, 3)``.  Columns follow the
        learner list below: XLearner, DomainAdaptationLearner, TLearner.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported feature-set ids.
    """
    feature_sets = {
        1: ['X4'],
        2: ['X3', 'X4'],
        3: ['X2', 'X3', 'X4'],
    }
    if model not in feature_sets:
        raise ValueError(f"model must be one of {sorted(feature_sets)}, got {model!r}")
    features = feature_sets[model]

    mse_ID = np.empty([B, 3])
    mse_OOD = np.empty([B, 3])

    np.random.seed(seed)
    r = [np.random.randint(1,10000) for _ in range(B)]

    for b in range(B):

        train_val = scm(n_environments=10, n_samples=1000, train=True, seed = r[b])
        test = scm(n_environments=10, n_samples=1000, train=False, seed = r[b]+1)

        # No random_state is passed: the shuffle draws from NumPy's global RNG, which
        # the scm() call just above has reseeded, so the split is still repeatable.
        train, val = train_test_split(train_val, test_size=0.5)

        X_train = train[features].values
        X_val = val[features].values
        X_test = test[features].values
        T_train = train['T'].values
        Y_train = train['Y_obs'].values

        # NOTE(review): the entry labelled 'slearner' is a DomainAdaptationLearner
        # (the SLearner line is disabled); confirm the label used when reporting.
        learners = [('xlearner', XLearner(models=reg())),
                    #('slearner', SLearner(overall_model=reg())),
                    ('slearner', DomainAdaptationLearner(models=reg(), final_models=reg())),
                    ('tlearner', TLearner(models=reg()))]

        def fit_model(name, learner):
            return name, learner.fit(Y_train, T_train, X=X_train)

        fitted = Parallel(n_jobs=-1, verbose=1)(delayed(fit_model)(name, lrn) for name, lrn in learners)

        # Ground-truth effects do not depend on the learner, so compute them once.
        cate_actual_ID = val['Y1'].values - val['Y0'].values
        cate_actual_OOD = test['Y1'].values - test['Y0'].values

        for i, (_, learner) in enumerate(fitted):
            mse_ID[b, i] = mean_squared_error(cate_actual_ID, learner.effect(X_val))
            mse_OOD[b, i] = mean_squared_error(cate_actual_OOD, learner.effect(X_test))

    return mse_ID, mse_OOD


# Run the three feature-set variants with the default base learner.
res_model1, res_model2, res_model3 = (sim(B=100, model=k) for k in (1, 2, 3))

# Raw per-repetition in-distribution MSEs, one row per repetition.
for tag, res in (("model1 - ID", res_model1),
                 ("model2 - ID", res_model2),
                 ("model3 - ID", res_model3)):
    print(tag, res[0])


def _learner_frame(mse, model_label):
    """Wrap a (B, 3) MSE array as a wide frame with one column per learner.

    The second column is whatever sim() registers under 'slearner', which is
    currently a DomainAdaptationLearner.
    """
    return pd.DataFrame({'XLearner': mse[:, 0],
                         'SLearner': mse[:, 1],
                         'TLearner': mse[:, 2],
                         'Model': model_label})


def _to_long(frames):
    """Stack the per-model frames and melt them to (Model, Learner, CATE_MSE)."""
    stacked = pd.concat(frames, axis=0)
    return pd.melt(stacked, id_vars=['Model'], var_name='Learner', value_name='CATE_MSE')


_runs = (res_model1, res_model2, res_model3)

# In-distribution results (first element of each sim() return value).
df_ID1, df_ID2, df_ID3 = (_learner_frame(res[0], k) for k, res in enumerate(_runs, start=1))
df_ID = _to_long([df_ID1, df_ID2, df_ID3])

# Out-of-distribution results (second element of each sim() return value).
df_OOD1, df_OOD2, df_OOD3 = (_learner_frame(res[1], k) for k, res in enumerate(_runs, start=1))
df_OOD = _to_long([df_OOD1, df_OOD2, df_OOD3])


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3

model1 - ID [[16.17009774 16.16891047 16.17009774]
 [16.0841369  16.08417464 16.0841369 ]
 [16.25025875 16.25028164 16.25025875]
 [16.30430336 16.30548861 16.30430336]
 [15.80782348 15.80854631 15.80782348]
 [15.94936942 15.9493317  15.94936942]
 [16.16984592 16.16988452 16.16984592]
 [16.00018676 16.00029912 16.00018676]
 [16.13099389 16.13194743 16.13099389]
 [16.13207185 16.13195633 16.13207185]
 [16.21662725 16.21668236 16.21662725]
 [15.84325825 15.84346354 15.84325825]
 [16.22741976 16.22685347 16.22741976]
 [16.0262651  16.02832128 16.0262651 ]
 [16.43700144 16.43702694 16.43700144]
 [16.64545075 16.64547315 16.64545075]
 [15.789794   15.78981277 15.789794  ]
 [15.55916524 15.55917914 15.55916524]
 [16.03994194 16.03994842 16.03994194]
 [16.38862749 16.38857103 16.38862749]
 [16.08719973 16.08721861 16.08719973]
 [16.45277949 16.45282604 16.45277949]
 [16.21536346 16.2159863  16.21536346]
 [16.1030354  16.10305634 16.1030354 ]
 [16.68566413 16.68579145 16.68566413]
 [16.32726189

In [8]:

# In-distribution CATE MSE (validation half of the training-regime data),
# one box per learner and feature-set model.
fig = px.box(
    df_ID, x="Learner", y="CATE_MSE", color="Model",
    labels={'Learner': ""},
    title="In-distribution CATE MSE by learner and feature set",
)
fig.show()





In [9]:

# Out-of-distribution CATE MSE (data generated with train=False),
# one box per learner and feature-set model.
fig = px.box(
    df_OOD, x="Learner", y="CATE_MSE", color="Model",
    labels={'Learner': ""},
    title="Out-of-distribution CATE MSE by learner and feature set",
)
fig.show()

# Sim 4 - best so far

In [None]:
def scm(n_environments:int=10, n_samples:int=100, train=True, seed = None):
    """Draw one synthetic data set from the Sim 4 structural causal model.

    The outcome follows ``Y = X1 + X4 + T + 1.5*X1*T + 0.5*X4*T + noise`` with
    standard-normal X1, X4 and a Bernoulli(0.5) treatment T.  In this version
    ``delta`` scales the outcome's contribution to X2
    (``X2 = delta * Y_obs + noise``) and ``X3 = 0.3 * Y_obs + noise``.

    Parameters
    ----------
    n_environments : int
        Number of environments; each one contributes ``n_samples`` consecutive rows.
    n_samples : int
        Rows per environment.
    train : bool
        True draws one ``delta`` per environment from U(0, 1); False draws a
        single ``delta`` from U(-1, 1) shared by every row.
    seed : int or None
        Handed to ``np.random.seed`` (this reseeds NumPy's global generator).

    Returns
    -------
    pandas.DataFrame
        Columns ``env, delta, X1, X2, X3, X4, T, Y_obs, Y1, Y0`` where Y1/Y0
        are the potential outcomes under treatment/control.
    """
    np.random.seed(seed)
    total_rows = n_environments * n_samples

    # The draw order (noise matrix -> treatment -> delta) fixes the random stream.
    noise = np.random.normal(0.0, 1.0, (total_rows, 5))
    X1, N_X2, N_X3, X4, N_Y = (noise[:, col] for col in range(5))
    T = np.random.binomial(1, 0.5, total_rows)

    def outcome(t):
        # Structural equation for Y evaluated at treatment value(s) t.
        return X1 + X4 + t + 1.5 * X1 * t + 0.5 * X4 * t + N_Y

    Y_obs = outcome(T)
    Y1 = outcome(1)
    Y0 = outcome(0)

    if train:
        per_env_delta = np.random.uniform(0.0, 1.0, n_environments)
        delta = np.repeat(per_env_delta, n_samples)
    else:
        delta = np.repeat(np.random.uniform(-1.0, 1.0), total_rows)
    # Both regimes share the same X2 equation; only the distribution of delta differs.
    X2 = delta * Y_obs + N_X2
    X3 = 0.3 * Y_obs + N_X3

    columns = {
        'env': np.repeat(np.arange(n_environments), n_samples),
        'delta': delta,
        'X1': X1,
        'X2': X2,
        'X3': X3,
        'X4': X4,
        'T': T,
        'Y_obs': Y_obs,
        'Y1': Y1,
        'Y0': Y0,
    }
    return pd.DataFrame(columns)


# Example draws from the training (df1) and test (df2) regimes with a shared seed,
# kept for ad-hoc inspection of the generated columns.
df1, df2 = (scm(seed=3, train=is_train) for is_train in (True, False))

# Optional diagnostics (disabled): Pearson correlations within each draw.
#for frame in (df1, df2):
#    print(pearsonr(frame['X1'].values, frame['X2'].values)[0])
#    print(pearsonr(frame['X2'].values, frame['X3'].values)[0])
#    print(pearsonr(frame['X2'].values, (frame['Y1']-frame['Y0']).values)[0])

def sim(B=100, model=1, seed=42,
         reg = lambda: LinearRegression(),
         #reg = lambda: RandomForestRegressor(min_samples_leaf=10)
       ):
    """Monte-Carlo comparison of three econml meta-learners on CATE error.

    Each repetition draws a training-regime and a test-regime data set from
    ``scm``, splits the former 50/50 into train/validation, fits the learners on
    the training half and scores ``effect(X)`` against ``Y1 - Y0``.

    Parameters
    ----------
    B : int
        Number of repetitions.
    model : int
        Feature set used as X: 1 -> ['X4'], 2 -> ['X3', 'X4'],
        3 -> ['X2', 'X3', 'X4'].
    seed : int
        Seed for the list of per-repetition seeds handed to ``scm``.
    reg : callable
        Zero-argument factory returning a fresh base regressor for each learner.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mse_ID, mse_OOD)``, each of shape ``(B, 3)``.  Columns follow the
        learner list below: XLearner, DomainAdaptationLearner, TLearner.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported feature-set ids.
    """
    feature_sets = {
        1: ['X4'],
        2: ['X3', 'X4'],
        3: ['X2', 'X3', 'X4'],
    }
    if model not in feature_sets:
        raise ValueError(f"model must be one of {sorted(feature_sets)}, got {model!r}")
    features = feature_sets[model]

    mse_ID = np.empty([B, 3])
    mse_OOD = np.empty([B, 3])

    np.random.seed(seed)
    r = [np.random.randint(1,10000) for _ in range(B)]

    for b in range(B):

        train_val = scm(n_environments=10, n_samples=1000, train=True, seed = r[b])
        test = scm(n_environments=10, n_samples=1000, train=False, seed = r[b]+1)

        # No random_state is passed: the shuffle draws from NumPy's global RNG, which
        # the scm() call just above has reseeded, so the split is still repeatable.
        train, val = train_test_split(train_val, test_size=0.5)

        X_train = train[features].values
        X_val = val[features].values
        X_test = test[features].values
        T_train = train['T'].values
        Y_train = train['Y_obs'].values

        # NOTE(review): the entry labelled 'slearner' is a DomainAdaptationLearner
        # (the SLearner line is disabled); confirm the label used when reporting.
        learners = [('xlearner', XLearner(models=reg())),
                    #('slearner', SLearner(overall_model=reg())),
                    ('slearner', DomainAdaptationLearner(models=reg(), final_models=reg())),
                    ('tlearner', TLearner(models=reg()))]

        def fit_model(name, learner):
            return name, learner.fit(Y_train, T_train, X=X_train)

        fitted = Parallel(n_jobs=-1, verbose=1)(delayed(fit_model)(name, lrn) for name, lrn in learners)

        # Ground-truth effects do not depend on the learner, so compute them once.
        cate_actual_ID = val['Y1'].values - val['Y0'].values
        cate_actual_OOD = test['Y1'].values - test['Y0'].values

        for i, (_, learner) in enumerate(fitted):
            mse_ID[b, i] = mean_squared_error(cate_actual_ID, learner.effect(X_val))
            mse_OOD[b, i] = mean_squared_error(cate_actual_OOD, learner.effect(X_test))

    return mse_ID, mse_OOD


# Run the three feature-set variants with the default base learner.
res_model1, res_model2, res_model3 = (sim(B=100, model=k) for k in (1, 2, 3))

# Raw per-repetition in-distribution MSEs, one row per repetition.
for tag, res in (("model1 - ID", res_model1),
                 ("model2 - ID", res_model2),
                 ("model3 - ID", res_model3)):
    print(tag, res[0])


def _learner_frame(mse, model_label):
    """Wrap a (B, 3) MSE array as a wide frame with one column per learner.

    The second column is whatever sim() registers under 'slearner', which is
    currently a DomainAdaptationLearner.
    """
    return pd.DataFrame({'XLearner': mse[:, 0],
                         'SLearner': mse[:, 1],
                         'TLearner': mse[:, 2],
                         'Model': model_label})


def _to_long(frames):
    """Stack the per-model frames and melt them to (Model, Learner, CATE_MSE)."""
    stacked = pd.concat(frames, axis=0)
    return pd.melt(stacked, id_vars=['Model'], var_name='Learner', value_name='CATE_MSE')


# Legend labels for the three feature-set variants, in model order.
_labelled_runs = (("M1: S", res_model1),
                  ("M2: S,C (*)", res_model2),
                  ("M3: S,C,L", res_model3))

# In-distribution results (first element of each sim() return value).
df_ID1, df_ID2, df_ID3 = (_learner_frame(res[0], label) for label, res in _labelled_runs)
df_ID = _to_long([df_ID1, df_ID2, df_ID3])

# Out-of-distribution results (second element of each sim() return value).
df_OOD1, df_OOD2, df_OOD3 = (_learner_frame(res[1], label) for label, res in _labelled_runs)
df_OOD = _to_long([df_OOD1, df_OOD2, df_OOD3])


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
# Model 1 => S
# Model 2 => S, C (Proposed Approach)
# Model 3 => S, C, L

In [None]:
# In-distribution CATE MSE (validation half of the training-regime data),
# one box per learner and feature-set model.
fig = px.box(
    df_ID, x="Learner", y="CATE_MSE", color="Model",
    labels={'Learner': ""},
    title="In-distribution CATE MSE by learner and feature set",
)
# Same fixed y-range as the out-of-distribution plot so the two are comparable.
fig.update_yaxes(range=[1.7, 5.1])
fig.show()


In [None]:
# Out-of-distribution CATE MSE (data generated with train=False),
# one box per learner and feature-set model.
fig = px.box(
    df_OOD, x="Learner", y="CATE_MSE", color="Model",
    labels={'Learner': ""},
    title="Out-of-distribution CATE MSE by learner and feature set",
)
# Same fixed y-range as the in-distribution plot so the two are comparable.
fig.update_yaxes(range=[1.7, 5.1])
fig.show()