In [1]:
import numpy as np
import pandas as pd
import doubleml as dml
from doubleml.datasets import fetch_401K

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier, XGBRegressor

import matplotlib.pyplot as plt
import seaborn as sns
import causalml
import random

In [2]:
data = fetch_401K(return_type='DataFrame')

# Step 1

Split the dataset into two equal groups ensuring that e401 is equally distributed

In [3]:
random.seed(1234)
df1 = data.sample(frac = 0.5)
df2 = data.drop(df1.index)

print('df1: \n', df1['e401'].value_counts())
print('df2: \n', df2['e401'].value_counts())

df1: 
 0    3131
1    1827
Name: e401, dtype: int64
df2: 
 0    3102
1    1855
Name: e401, dtype: int64


# Step 2

Get subdataframes separated by eligibility

In [4]:
df1_0 = df1.loc[df1['e401'] == 0]
df1_1 = df1.loc[df1['e401'] == 1]
df2_0 = df2.loc[df2['e401'] == 0]
df2_1 = df2.loc[df2['e401'] == 1]


In [5]:
for i in [df1_0, df1_1, df2_0, df2_1]:
    print(i.isna().any())

nifa       False
net_tfa    False
tw         False
age        False
inc        False
fsize      False
educ       False
db         False
marr       False
twoearn    False
e401       False
p401       False
pira       False
hown       False
dtype: bool
nifa       False
net_tfa    False
tw         False
age        False
inc        False
fsize      False
educ       False
db         False
marr       False
twoearn    False
e401       False
p401       False
pira       False
hown       False
dtype: bool
nifa       False
net_tfa    False
tw         False
age        False
inc        False
fsize      False
educ       False
db         False
marr       False
twoearn    False
e401       False
p401       False
pira       False
hown       False
dtype: bool
nifa       False
net_tfa    False
tw         False
age        False
inc        False
fsize      False
educ       False
db         False
marr       False
twoearn    False
e401       False
p401       False
pira       False
hown       False
dtype: bool


# Step 3

Train two models, one for D1_1, and another for D1_0, f1 and f0, respectively

## Initialize DoubleML backend

In [6]:
# Set up basic model: Specify variables for data-backend
features_base = ['age', 'inc', 'educ', 'fsize', 'marr',
                 'twoearn', 'db', 'pira', 'hown']

# Initialize DoubleMLData (data-backend of DoubleML)
data_dml_base = dml.DoubleMLData(data,
                                 y_col='net_tfa',
                                 d_cols='e401',
                                 x_cols=features_base)

## df1_0 model

In [9]:
# Set up a model according to regression formula with polynomials
features = df1_0.copy()[['marr', 'twoearn', 'db', 'pira', 'hown']]

poly_dict = {'age': 2,
             'inc': 2,
             'educ': 2,
             'fsize': 2}
for key, degree in poly_dict.items():
    poly = PolynomialFeatures(degree, include_bias=False)
    data_transf = poly.fit_transform(df1_0[[key]])
    x_cols = poly.get_feature_names_out([key])
    data_transf = pd.DataFrame(data_transf, columns=x_cols)

    features = pd.concat((features, data_transf),
                          axis=1, sort=False)

df1_0_model_data = pd.concat((df1_0.copy()[['net_tfa', 'e401']], features.copy()),
                        axis=1, sort=False)


In [8]:
features.isna().any()

marr       False
twoearn    False
db         False
pira       False
hown       False
dtype: bool

## df1_1 model