# DoubleML

Exploration of household water risk using DoubleML on MICS data.

## EDA and preprocessing


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from doubleml import DoubleMLData, DoubleMLPLR
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load raw data
mics = pd.read_csv("mics.csv", low_memory=False)
mics.head()


Unnamed: 0,HH1,HH2,HINT,HH3,HH4,HH5D,HH5M,HH5Y,HH6,HH7,...,NoRiskHome_01_2,RiskHome_0_12,RiskSource_0_12,water_treatment3,Any_U5,Region,windex_ur,windex5_categ,wq27_decile,SomeRiskHome
0,1,5,12.0,12,11,2,6. JUNE,2017,2. Rural,1. EAST,...,1,1,1,0,1,1,2,Poor,7,1
1,1,14,15.0,15,11,3,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,0,1,1,2,Poor,1,1
2,1,22,15.0,15,11,4,6. JUNE,2017,2. Rural,1. EAST,...,0,1,1,0,1,1,2,Middle,8,1
3,2,3,12.0,12,11,5,6. JUNE,2017,2. Rural,1. EAST,...,0,1,1,0,1,1,2,Middle,8,1
4,2,11,12.0,12,11,5,6. JUNE,2017,2. Rural,1. EAST,...,1,1,1,0,0,1,1,Poor,8,1


In [3]:
# Keep only the columns used downstream
required_cols = [
    "windex_ur", "windex5", "helevel", "country_cat", "urban",
    "WS1_g", "wq27_decile", "WQ15_g", "RiskSource",
    "water_treatment", "VeryHighRiskHome", "SomeRiskHome",
]

mics = mics[required_cols].copy()
mics[required_cols].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54340 entries, 0 to 54339
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   windex_ur         54340 non-null  int64 
 1   windex5           54340 non-null  object
 2   helevel           54340 non-null  object
 3   country_cat       54340 non-null  object
 4   urban             54340 non-null  object
 5   WS1_g             54340 non-null  object
 6   wq27_decile       54340 non-null  int64 
 7   WQ15_g            54340 non-null  object
 8   RiskSource        54340 non-null  object
 9   water_treatment   54340 non-null  int64 
 10  VeryHighRiskHome  54340 non-null  int64 
 11  SomeRiskHome      54340 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 5.0+ MB


In [4]:
# Map string categories to numeric codes for model consumption
HE_LEVEL = {
    "No education": 0,
    "Primary": 1,
    "Secondary or higher": 2,
}

URBAN = {
    "Rural": 0,
    "Urban": 1,
}

RISK_SOURCE = {
    "No risk": 0,
    "Moderate to high risk": 1,
    "Very high risk": 2,
}

mics["helevel"] = mics["helevel"].map(HE_LEVEL)
mics["urban"] = mics["urban"].map(URBAN)
mics["RiskSource"] = mics["RiskSource"].map(RISK_SOURCE)


In [6]:
"""Encode categorical variables.
- WQ15_g: one-hot with reference level dropped.
- windex5: ordinal to preserve welfare ordering.
- country_cat, WS1_g, water_treatment: one-hot with reference.
Other columns pass through unchanged.
"""
wq15_categories = [[
    "Treat: Nothing",
    "Treat: Strain/Settle",
    "Treat: Chlorine/Aquatabs/PUR",
    "Treat: Boil",
    "Treat: Other",
]]

windex5_cat = [[
    "Poorest",
    "Poor",
    "Middle",
    "Rich",
    "Richest",
]]

cat_default = ["country_cat", "WS1_g", "water_treatment"]
cat_wq15 = ["WQ15_g"]
ord_windex5 = ["windex5"]

ct = ColumnTransformer(
    [
        (
            "wq15",
            OneHotEncoder(
                categories=wq15_categories,
                drop="first",
                sparse_output=False,
                handle_unknown="ignore",
            ),
            cat_wq15,
        ),
        (
            "windex5",
            OrdinalEncoder(categories=windex5_cat),
            ord_windex5,
        ),
        (
            "other_cat",
            OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
            cat_default,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

ct.set_output(transform="pandas")
mics = ct.fit_transform(mics)
mics.head()


Unnamed: 0,WQ15_g_Treat: Strain/Settle,WQ15_g_Treat: Chlorine/Aquatabs/PUR,WQ15_g_Treat: Boil,WQ15_g_Treat: Other,windex5,country_cat_Benin,country_cat_Central African Republic,country_cat_Chad,country_cat_DR Congo,country_cat_Dominican Republic,...,WS1_g_Tube/Well/Borehole,WS1_g_Unprotected well/spring,water_treatment_1,windex_ur,helevel,urban,wq27_decile,RiskSource,VeryHighRiskHome,SomeRiskHome
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2,0,0,7,1,0,1
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2,0,0,1,0,0,1
2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2,0,0,8,2,1,1
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,2,0,0,8,2,1,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1,0,0,8,1,0,1


# Binary treatment
## Outcome: VeryHighRiskHome


In [7]:
# Define outcome, treatment, and controls
binary_y = "VeryHighRiskHome"
binary_d = ["water_treatment_1"]
binary_x = [col for col in mics.columns if col not in [binary_y, "SomeRiskHome"] + binary_d]

# Build DoubleML data object
binary_data_vhr = DoubleMLData(
    data=mics,
    y_col=binary_y,
    d_cols=binary_d,
    x_cols=binary_x,
)

# Base learners
ml_l_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

ml_m_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

# Double machine learning model
binary_model_vhr = DoubleMLPLR(
    binary_data_vhr,
    ml_l=ml_l_xgb,
    ml_m=ml_m_xgb,
)


In [8]:
# Hyperparameter search with Optuna (keeps the same space as original)
def ml_l_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }


def ml_m_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }


param_space = {"ml_l": ml_l_params, "ml_m": ml_m_params}
optuna_settings = {
    "n_trials": 100,
    "show_progress_bar": True,
    "verbosity": optuna.logging.WARNING,
}

binary_model_vhr.tune_ml_models(
    ml_param_space=param_space,
    optuna_settings=optuna_settings,
)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

<doubleml.plm.plr.DoubleMLPLR at 0x2201570d160>

In [9]:
# Fit and summarize
binary_model_vhr.fit()
binary_model_vhr.summary


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
water_treatment_1,-0.248442,0.148278,-1.675508,0.093835,-0.539062,0.042179


In [10]:
# Group-wise treatment effects (GATE)
groups = pd.DataFrame({
    "Education level": mics["helevel"].map({0: "No education", 1: "Primary", 2: "Secondary or higher"}),
})
binary_model_vhr.gate(groups=groups).summary


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group_No education,0.035164,0.014908,2.358706,0.01833877,0.005945,0.064384
Group_Primary,-0.464279,0.204036,-2.275477,0.0228773,-0.864181,-0.064376
Group_Secondary or higher,-0.100706,0.015403,-6.538222,6.22544e-11,-0.130895,-0.070518


In [11]:
# Group-wise treatment effects by area
groups = pd.DataFrame({
    "Area": mics["urban"].map({0: "Rural", 1: "Urban"}),
})
binary_model_vhr.gate(groups=groups).summary


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group_Rural,-0.074243,0.070411,-1.05442,0.291691,-0.212245,0.06376
Group_Urban,-0.422782,0.22816,-1.85301,0.063881,-0.869967,0.024403


In [12]:
# Group-wise treatment effects by wealth index
groups = pd.DataFrame({
    "Wealth Index": mics["windex5"].map({
        0: "Poorest",
        1: "Poor",
        2: "Middle",
        3: "Rich",
        4: "Richest",
    }),
})
binary_model_vhr.gate(groups=groups).summary


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group_Middle,-0.748184,0.01165,-64.222019,0.0,-0.771018,-0.725351
Group_Poor,0.020821,0.014088,1.477948,0.1394218,-0.00679,0.048432
Group_Poorest,3.038094,11.077811,0.27425,0.7838922,-18.674016,24.750204
Group_Rich,-0.166478,0.010998,-15.137308,9.189958e-52,-0.188033,-0.144922
Group_Richest,-0.104436,0.01113,-9.383241,6.39756e-21,-0.12625,-0.082621


# Binary treatment
## Outcome: SomeRiskHome


In [13]:
# Define outcome, treatment, and controls for the alternative outcome
binary_y = "SomeRiskHome"
binary_d = ["water_treatment_1"]
binary_x = [col for col in mics.columns if col not in [binary_y, "VeryHighRiskHome"] + binary_d]

binary_data_some = DoubleMLData(
    data=mics,
    y_col=binary_y,
    d_cols=binary_d,
    x_cols=binary_x,
)

ml_l_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

ml_m_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

binary_model_some = DoubleMLPLR(
    binary_data_some,
    ml_l=ml_l_xgb,
    ml_m=ml_m_xgb,
)


In [14]:
# Hyperparameter search for the alternative outcome

def ml_l_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }


def ml_m_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }


param_space = {"ml_l": ml_l_params, "ml_m": ml_m_params}
optuna_settings = {
    "n_trials": 100,
    "show_progress_bar": True,
    "verbosity": optuna.logging.WARNING,
}

binary_model_some.tune_ml_models(
    ml_param_space=param_space,
    optuna_settings=optuna_settings,
)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

<doubleml.plm.plr.DoubleMLPLR at 0x220155342d0>

In [15]:
# Fit and summarize the alternative outcome
binary_model_some.fit()
binary_model_some.summary


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
water_treatment_1,-0.139592,0.105387,-1.324558,0.185318,-0.346148,0.066964


In [16]:
# Group-wise treatment effects (GATE) for the alternative outcome
groups = pd.DataFrame({
    "Education level": mics["helevel"].map({0: "No education", 1: "Primary", 2: "Secondary or higher"}),
})
binary_model_some.gate(groups=groups).summary


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group_No education,0.000242,0.009657,0.025018,0.9800409,-0.018686,0.019169
Group_Primary,-0.093097,0.159811,-0.582546,0.5601987,-0.40632,0.220126
Group_Secondary or higher,-0.372114,0.020548,-18.109089,2.701827e-73,-0.412388,-0.33184


In [17]:
# Group-wise treatment effects by area for the alternative outcome
groups = pd.DataFrame({
    "Area": mics["urban"].map({0: "Rural", 1: "Urban"}),
})
binary_model_some.gate(groups=groups).summary


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group_Rural,0.05987,0.050783,1.178948,0.238419,-0.039662,0.159402
Group_Urban,-0.339227,0.020104,-16.874011,6.988543e-64,-0.378629,-0.299825


In [18]:
# Group-wise treatment effects by wealth index for the alternative outcome
groups = pd.DataFrame({
    "Wealth Index": mics["windex5"].map({
        0: "Poorest",
        1: "Poor",
        2: "Middle",
        3: "Rich",
        4: "Richest",
    }),
})
binary_model_some.gate(groups=groups).summary


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group_Middle,-0.32122,0.015149,-21.203906,8.789054999999999e-100,-0.350912,-0.291529
Group_Poor,-0.011082,0.016848,-0.657728,0.5107129,-0.044104,0.021941
Group_Poorest,1.55346,9.366168,0.165859,0.8682682,-16.803893,19.910812
Group_Rich,0.131654,0.014652,8.98536,2.578892e-19,0.102937,0.160372
Group_Richest,-0.360039,0.012002,-29.997618,1.054111e-197,-0.383563,-0.336515


In [19]:
# Group-wise treatment effects by education + area + wealth for the alternative outcome
groups = pd.DataFrame({
    "Edu_Area": (
        mics["helevel"].map({0: "No education", 1: "Primary", 2: "Secondary or higher"})
        + " | "
        + mics["urban"].map({0: "Rural", 1: "Urban"})
        + " | "
        + mics["windex5"].map({
            0: "Poorest",
            1: "Poor",
            2: "Middle",
            3: "Rich",
            4: "Richest",
        })
    )
})
binary_model_some.gate(groups=groups).summary


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group_No education | Rural | Middle,0.059931,27.65298,0.002167,0.9982708,-54.138914,54.258775
Group_No education | Rural | Poor,-0.000273,0.004666,-0.058472,0.9533724,-0.009418,0.008872
Group_No education | Rural | Poorest,10.122289,13.580495,0.745355,0.4560572,-16.494993,36.73957
Group_No education | Rural | Rich,-24.772336,38.151129,-0.649321,0.5161308,-99.547175,50.002504
Group_No education | Rural | Richest,-11.648724,71.62368,-0.162638,0.8708036,-152.028558,128.73111
Group_No education | Urban | Middle,-43.942685,39.58295,-1.110142,0.266938,-121.523842,33.638471
Group_No education | Urban | Poor,15.165182,52.45167,0.289127,0.7724844,-87.638203,117.968567
Group_No education | Urban | Poorest,32.844291,35.022613,0.937802,0.348346,-35.798769,101.487351
Group_No education | Urban | Rich,12.375643,38.459307,0.321785,0.7476153,-63.003214,87.7545
Group_No education | Urban | Richest,-23.390634,52.405117,-0.446343,0.6553498,-126.102775,79.321507


# Multinomial treatment
## Outcome: VeryHighRiskHome


In [20]:
# Define multinomial treatment columns (one column per category)
multi_y = "VeryHighRiskHome"
multi_d = [col for col in mics.columns if col.startswith("WS1_g_")]
# Drop the binary treatment from controls to avoid duplication
multi_x = [col for col in mics.columns if col not in [multi_y, "SomeRiskHome"] + multi_d + ["water_treatment_1"]]

multi_data_vhr = DoubleMLData(
    data=mics,
    y_col=multi_y,
    d_cols=multi_d,
    x_cols=multi_x,
)

ml_l_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
    n_jobs=-1,
)

ml_m_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="multi:softprob",
    eval_metric="mlogloss",
    num_class=len(multi_d),
    eta=0.1,
    n_estimators=34,
    n_jobs=-1,
)

multi_model_vhr = DoubleMLPLR(
    multi_data_vhr,
    ml_l=ml_l_xgb,
    ml_m=ml_m_xgb,
)


In [21]:
# Optional hyperparameter search (commented to save time)
# def ml_l_params(trial):
#     return {
#         "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#     }
#
#
# def ml_m_params(trial):
#     return {
#         "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#     }
#
# param_space = {"ml_l": ml_l_params, "ml_m": ml_m_params}
# optuna_settings = {
#     "n_trials": 100,
#     "show_progress_bar": True,
#     "verbosity": optuna.logging.WARNING,
# }
#
# multi_model_vhr.tune_ml_models(
#     ml_param_space=param_space,
#     optuna_settings=optuna_settings,
# )


In [22]:
# Fit and summarize
multi_model_vhr.fit()
multi_model_vhr.summary


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
WS1_g_Packaged/Bottled water,-0.07018,0.015489,-4.530985,5.870924e-06,-0.100538,-0.039823
WS1_g_Piped water,-0.005642,0.0137,-0.411848,0.6804507,-0.032493,0.021209
WS1_g_Protected well/spring,-0.006038,0.015199,-0.397259,0.6911763,-0.035827,0.023751
WS1_g_Surface/Rain water,0.026649,0.015521,1.717033,0.08597313,-0.00377,0.057069
WS1_g_Tube/Well/Borehole,0.097758,0.01359,7.193605,6.31026e-13,0.071123,0.124392
WS1_g_Unprotected well/spring,0.036852,0.017154,2.148323,0.03168814,0.003231,0.070473


In [23]:
# Group-wise treatment effects (GATE) for multinomial treatment
groups = pd.DataFrame({
    "Education level": mics["helevel"].map({0: "No education", 1: "Primary", 2: "Secondary or higher"}),
})
multi_model_vhr.gate(groups=groups).summary


NotImplementedError: Only implemented for single treatment. Number of treatments is 6.

In [None]:
# Group-wise treatment effects by area
groups = pd.DataFrame({
    "Area": mics["urban"].map({0: "Rural", 1: "Urban"}),
})
multi_model_vhr.gate(groups=groups).summary


In [None]:
# Group-wise treatment effects by wealth index
groups = pd.DataFrame({
    "Wealth Index": mics["windex5"].map({
        0: "Poorest",
        1: "Poor",
        2: "Middle",
        3: "Rich",
        4: "Richest",
    }),
})
multi_model_vhr.gate(groups=groups).summary


In [None]:
# Group-wise treatment effects by education + area + wealth
groups = pd.DataFrame({
    "Edu_Area": (
        mics["helevel"].map({0: "No education", 1: "Primary", 2: "Secondary or higher"})
        + " | "
        + mics["urban"].map({0: "Rural", 1: "Urban"})
        + " | "
        + mics["windex5"].map({
            0: "Poorest",
            1: "Poor",
            2: "Middle",
            3: "Rich",
            4: "Richest",
        })
    )
})
multi_model_vhr.gate(groups=groups).summary


# Multinomial treatment
## Outcome: SomeRiskHome


In [None]:
multi_y = "SomeRiskHome"
multi_d = [col for col in mics.columns if col.startswith("WS1_g_")]
multi_x = [col for col in mics.columns if col not in [multi_y, "VeryHighRiskHome"] + multi_d + ["water_treatment_1"]]

multi_data_some = DoubleMLData(
    data=mics,
    y_col=multi_y,
    d_cols=multi_d,
    x_cols=multi_x,
)

ml_l_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
    n_jobs=-1,
)

ml_m_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="multi:softprob",
    eval_metric="mlogloss",
    num_class=len(multi_d),
    eta=0.1,
    n_estimators=34,
    n_jobs=-1,
)

multi_model_some = DoubleMLPLR(
    multi_data_some,
    ml_l=ml_l_xgb,
    ml_m=ml_m_xgb,
)


In [None]:
# Optional hyperparameter search (commented to save time)
# def ml_l_params(trial):
#     return {
#         "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#     }
#
#
# def ml_m_params(trial):
#     return {
#         "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#     }
#
# param_space = {"ml_l": ml_l_params, "ml_m": ml_m_params}
# optuna_settings = {
#     "n_trials": 100,
#     "show_progress_bar": True,
#     "verbosity": optuna.logging.WARNING,
# }
#
# multi_model_some.tune_ml_models(
#     ml_param_space=param_space,
#     optuna_settings=optuna_settings,
# )


In [None]:
# Fit and summarize
multi_model_some.fit()
multi_model_some.summary


In [None]:
# Group-wise treatment effects (GATE) for multinomial treatment
groups = pd.DataFrame({
    "Education level": mics["helevel"].map({0: "No education", 1: "Primary", 2: "Secondary or higher"}),
})
multi_model_some.gate(groups=groups).summary


In [None]:
# Group-wise treatment effects by area
groups = pd.DataFrame({
    "Area": mics["urban"].map({0: "Rural", 1: "Urban"}),
})
multi_model_some.gate(groups=groups).summary


In [None]:
# Group-wise treatment effects by wealth index
groups = pd.DataFrame({
    "Wealth Index": mics["windex5"].map({
        0: "Poorest",
        1: "Poor",
        2: "Middle",
        3: "Rich",
        4: "Richest",
    }),
})
multi_model_some.gate(groups=groups).summary


In [None]:
# Group-wise treatment effects by education + area + wealth
groups = pd.DataFrame({
    "Edu_Area": (
        mics["helevel"].map({0: "No education", 1: "Primary", 2: "Secondary or higher"})
        + " | "
        + mics["urban"].map({0: "Rural", 1: "Urban"})
        + " | "
        + mics["windex5"].map({
            0: "Poorest",
            1: "Poor",
            2: "Middle",
            3: "Rich",
            4: "Richest",
        })
    )
})
multi_model_some.gate(groups=groups).summary
