In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from ipw_estimator import IPWATEEstimator
from sklearn.naive_bayes import GaussianNB

In [2]:
def load_df_dict_complete(path, threshold=None):
    """
    Load a pickled data object and optionally binarize its treatment columns.

    The unpickled object is returned as-is unless a threshold is given, in
    which case its 'T1' and 'T2' entries are replaced by 0/1 indicators:
    values <= threshold become 0, everything else becomes 1.
    Callers in this notebook additionally read the 'X' and 'Y' entries.

    :param path: path to .pkl
    :param threshold: threshold to binarize the treatment variables; None
        (the default) leaves them untouched. A threshold of 0 is honoured.
    :return: the loaded data object, with 'T1'/'T2' binarized when requested
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    data = pd.read_pickle(path)

    # Compare against None explicitly: a plain truthiness test would silently
    # skip binarization for the legitimate threshold value 0.
    if threshold is not None:
        for treatment_key in ('T1', 'T2'):
            data[treatment_key] = data[treatment_key].apply(
                lambda value: 0 if value <= threshold else 1
            )

    return data

In [3]:
# Propensity-score model shared by every IPW estimator in this notebook.
#
# Logistic regression (StandardScaler + LogisticRegression(C=1, max_iter=1000))
# was a relatively consistent model for propensity score estimation and remains
# the natural baseline; GaussianNB was also considered. The pipeline below
# uses an SVC instead.
#
# probability=True makes the SVC expose predict_proba (presumably what
# IPWATEEstimator uses to obtain propensity scores -- confirm). That calibration
# relies on internally shuffled cross-validation, so random_state is pinned to
# keep the fitted probabilities reproducible across runs.
# NOTE(review): the bootstrap resampling inside IPWATEEstimator is not seeded
# from here, so the reported intervals may still vary between runs -- confirm.
PROPENSITY_MODEL_SEED = 42

model = Pipeline([
    ('scaler', StandardScaler()),
    # The step is named after the estimator it actually wraps.
    ('svc', SVC(probability=True, C=10, random_state=PROPENSITY_MODEL_SEED)),
])

# Mature no children

In [4]:
# Mature / no-children cohort: read the preprocessed pickle and binarize both
# treatment columns (values <= 2 become 0, larger values become 1).
path = './preprocessed_data/df_complete_mature_no_children.pickle'
mnc_data = load_df_dict_complete(path=path, threshold=2)


## Expected

In [5]:
# IPW estimate of the ATE for the mature / no-children cohort, with the
# binarized 'T1' column (labelled "Expected #") as the treatment.
mnc_exp_ps_estimator = IPWATEEstimator(propensity_model=model)
print("Mature No Children, Treatment = Expected #")

# Kept as the cell's last expression so the returned value is displayed.
mnc_exp_ps_estimator.estimate_ATE(
    mnc_data['X'],
    mnc_data['Y'],
    mnc_data['T1'],
)

Mature No Children, Treatment = Expected #


Bootstrapping: 100%|██████████| 1000/1000 [09:00<00:00,  1.85it/s]


(0.2419327360602756, array([-0.203238  ,  0.68776659]))

## Ideal

In [6]:
# IPW estimate of the ATE for the mature / no-children cohort, with the
# binarized 'T2' column (labelled "Ideal #") as the treatment.
mnc_id_ps_estimator = IPWATEEstimator(propensity_model=model)
print("Mature No Children, Treatment = Ideal #")

# Kept as the cell's last expression so the returned value is displayed.
mnc_id_ps_estimator.estimate_ATE(
    mnc_data['X'],
    mnc_data['Y'],
    mnc_data['T2'],
)

Mature No Children, Treatment = Ideal #


Bootstrapping: 100%|██████████| 1000/1000 [09:21<00:00,  1.78it/s]


(0.20643737648960342, array([-0.26651913,  0.63089475]))

# Mature with children

In [7]:
# Mature / with-children cohort: read the preprocessed pickle and binarize both
# treatment columns (values <= 2 become 0, larger values become 1).
path = './preprocessed_data/df_complete_mature_with_children.pickle'
mwc_data = load_df_dict_complete(path=path, threshold=2)


## Expected

In [8]:
# IPW estimate of the ATE for the mature / with-children cohort, with the
# binarized 'T1' column (labelled "Expected #") as the treatment.
mwc_exp_ps_estimator = IPWATEEstimator(propensity_model=model)
print("Mature With Children, Treatment = Expected #")

# Kept as the cell's last expression so the returned value is displayed.
mwc_exp_ps_estimator.estimate_ATE(
    mwc_data['X'],
    mwc_data['Y'],
    mwc_data['T1'],
)

Mature With Children, Treatment = Expected #


Bootstrapping: 100%|██████████| 1000/1000 [01:40<00:00,  9.95it/s]


(0.4447326075278527, array([-0.32820222,  1.19365662]))

## Ideal

In [9]:
# IPW estimate of the ATE for the mature / with-children cohort, with the
# binarized 'T2' column (labelled "Ideal #") as the treatment.
mwc_id_ps_estimator = IPWATEEstimator(propensity_model=model)
print("Mature With Children, Treatment = Ideal #")

# Kept as the cell's last expression so the returned value is displayed.
mwc_id_ps_estimator.estimate_ATE(
    mwc_data['X'],
    mwc_data['Y'],
    mwc_data['T2'],
)

Mature With Children, Treatment = Ideal #


Bootstrapping: 100%|██████████| 1000/1000 [01:37<00:00, 10.30it/s]


(0.4458674319213451, array([-0.35582123,  1.29015426]))

# Young no children

In [10]:
# Young / no-children cohort: read the preprocessed pickle and binarize both
# treatment columns (values <= 2 become 0, larger values become 1).
path = './preprocessed_data/df_complete_young_no_children.pickle'
ync_data = load_df_dict_complete(path=path, threshold=2)


## Expected

In [11]:
# IPW estimate of the ATE for the young / no-children cohort, with the
# binarized 'T1' column (labelled "Expected #") as the treatment.
ync_exp_ps_estimator = IPWATEEstimator(propensity_model=model)
print("Young No Children, Treatment = Expected #")

# Kept as the cell's last expression so the returned value is displayed.
ync_exp_ps_estimator.estimate_ATE(
    ync_data['X'],
    ync_data['Y'],
    ync_data['T1'],
)

Young No Children, Treatment = Expected #


Bootstrapping: 100%|██████████| 1000/1000 [11:44<00:00,  1.42it/s]


(0.1744582248663289, array([-0.28371767,  0.69999481]))

## Ideal

In [12]:
# IPW estimate of the ATE for the young / no-children cohort, with the
# binarized 'T2' column (labelled "Ideal #") as the treatment.
ync_id_ps_estimator = IPWATEEstimator(propensity_model=model)
print("Young No Children, Treatment = Ideal #")

# Kept as the cell's last expression so the returned value is displayed.
ync_id_ps_estimator.estimate_ATE(
    ync_data['X'],
    ync_data['Y'],
    ync_data['T2'],
)

Young No Children, Treatment = Ideal #


Bootstrapping: 100%|██████████| 1000/1000 [11:49<00:00,  1.41it/s]


(0.10983104747487843, array([-0.426882  ,  0.61794235]))