# DoubleML

Exploration of household water risk using DoubleML on MICS data.

## EDA and preprocessing


In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from doubleml import DoubleMLData, DoubleMLPLR
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")


In [26]:
# Load raw data
mics = pd.read_csv("mics.csv", low_memory=False)
mics.head()


Unnamed: 0,HH1,HH2,HINT,HH3,HH4,HH5D,HH5M,HH5Y,HH6,HH7,...,RiskHome_0_12,RiskSource_0_12,water_treatment3,Any_U5,Region,windex_ur,windex5_categ,helevel_temp,wq27_decile,SomeRiskHome
0,1,5,12.0,12,11,2,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,1,1,2,Poor,,7,1
1,1,14,15.0,15,11,3,6. JUNE,2017,2. Rural,1. EAST,...,1,0,0,1,1,2,Poor,,1,1
2,1,22,15.0,15,11,4,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,1,1,2,Middle,,8,1
3,2,3,12.0,12,11,5,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,1,1,2,Middle,,8,1
4,2,11,12.0,12,11,5,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,0,1,1,Poor,,8,1


In [48]:
# Keep only the columns used downstream
required_cols = [
    "windex_ur", "helevel", "country_cat", "urban",
    "WS1_g", "wq27_decile",
    "water_treatment", "VeryHighRiskHome", "SomeRiskHome",
]

mics = mics[required_cols].copy()
mics[required_cols].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54340 entries, 0 to 54339
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   windex_ur         54340 non-null  int64 
 1   windex5           54340 non-null  object
 2   helevel           54340 non-null  object
 3   country_cat       54340 non-null  object
 4   urban             54340 non-null  object
 5   WS1_g             54340 non-null  object
 6   wq27_decile       54340 non-null  int64 
 7   water_treatment   54340 non-null  int64 
 8   VeryHighRiskHome  54340 non-null  int64 
 9   SomeRiskHome      54340 non-null  int64 
dtypes: int64(5), object(5)
memory usage: 4.1+ MB


In [49]:
# Map string categories to numeric codes for model consumption
HE_LEVEL = {
    "No education": 0,
    "Primary": 1,
    "Secondary or higher": 2,
}

URBAN = {
    "Rural": 0,
    "Urban": 1,
}

RISK_SOURCE = {
    "No risk": 0,
    "Moderate to high risk": 1,
    "Very high risk": 2,
}

mics["helevel"] = mics["helevel"].map(HE_LEVEL)
mics["urban"] = mics["urban"].map(URBAN)
# mics["RiskSource"] = mics["RiskSource"].map(RISK_SOURCE)


In [50]:
"""Encode categorical variables.
- WQ15_g: one-hot with reference level dropped.
- windex5: ordinal to preserve welfare ordering.
- country_cat, WS1_g, water_treatment: one-hot with reference.
Other columns pass through unchanged.
"""
wq15_categories = [[
    "Treat: Nothing",
    "Treat: Strain/Settle",
    "Treat: Chlorine/Aquatabs/PUR",
    "Treat: Boil",
    "Treat: Other",
]]

windex5_cat = [[
    "Poorest",
    "Poor",
    "Middle",
    "Rich",
    "Richest",
]]

cat_default = ["country_cat", "WS1_g", "water_treatment"]
# cat_wq15 = ["WQ15_g"]
ord_windex5 = ["windex5"]

ct = ColumnTransformer(
    [
        # (
        #     "wq15",
        #     OneHotEncoder(
        #         categories=wq15_categories,
        #         drop="first",
        #         sparse_output=False,
        #         handle_unknown="ignore",
        #     ),
        #     cat_wq15,
        # ),
        (
            "windex5",
            OrdinalEncoder(categories=windex5_cat),
            ord_windex5,
        ),
        (
            "other_cat",
            OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
            cat_default,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

ct.set_output(transform="pandas")
mics = ct.fit_transform(mics)
mics.head()


Unnamed: 0,windex5,country_cat_Benin,country_cat_Central African Republic,country_cat_Chad,country_cat_DR Congo,country_cat_Dominican Republic,country_cat_Eswatini,country_cat_Fiji,country_cat_Gambia,country_cat_Ghana,...,WS1_g_Surface/Rain water,WS1_g_Tube/Well/Borehole,WS1_g_Unprotected well/spring,water_treatment_1,windex_ur,helevel,urban,wq27_decile,VeryHighRiskHome,SomeRiskHome
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2,0,0,7,0,1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2,0,0,1,0,1
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2,0,0,8,1,1
3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,2,0,0,8,1,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1,0,0,8,0,1


# Binary treatment
## Outcome: VeryHighRiskHome


In [64]:
len([col for col in mics.columns if col.startswith('country')])

23

In [52]:
# Define outcome, treatment, and controls
binary_y = "VeryHighRiskHome"
binary_d = ["water_treatment_1"]
binary_x = [col for col in mics.columns if col not in [binary_y, "SomeRiskHome"] + binary_d]

# Build DoubleML data object
binary_data_vhr = DoubleMLData(
    data=mics,
    y_col=binary_y,
    d_cols=binary_d,
    x_cols=binary_x,
)

# Base learners
ml_l_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

ml_m_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

# Double machine learning model
binary_model_vhr = DoubleMLPLR(
    binary_data_vhr,
    ml_l=ml_l_xgb,
    ml_m=ml_m_xgb,
)


In [54]:
# Hyperparameter search with Optuna (keeps the same space as original)
def ml_l_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }

def ml_m_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }


param_space = {"ml_l": ml_l_params, "ml_m": ml_m_params}
optuna_settings = {
    "n_jobs_optuna" : -1, 
    "show_progress_bar": True,
    "verbosity": optuna.logging.WARNING,
}

binary_model_vhr.tune_ml_models(
    ml_param_space=param_space,
    optuna_settings=optuna_settings,
)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

<doubleml.plm.plr.DoubleMLPLR at 0x259c46020d0>

In [55]:
# Fit and summarize
binary_model_vhr.fit()
binary_model_vhr.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
water_treatment_1,-0.06886,0.005593,-12.312232,7.784021999999999e-35,-0.079821,-0.057898


In [57]:
RISK ={
    0:"No risk",
    1:"Moderate to high risk",
    2:"Very high risk",
}

# Convert the mapped Series to a DataFrame
groups_df = mics['RiskSource'].map(RISK).to_frame(name='Group')

binary_model_vhr.gate(groups=groups_df).summary

KeyError: 'RiskSource'

# Binary treatment
## Outcome: SomeRiskHome


In [57]:
# Define outcome, treatment, and controls for the alternative outcome
binary_y = "SomeRiskHome"
binary_d = ["water_treatment_1"]
binary_x = [col for col in mics.columns if col not in [binary_y, "VeryHighRiskHome"] + binary_d]

binary_data_some = DoubleMLData(
    data=mics,
    y_col=binary_y,
    d_cols=binary_d,
    x_cols=binary_x,
)

ml_l_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

ml_m_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

binary_model_some = DoubleMLPLR(
    binary_data_some,
    ml_l=ml_l_xgb,
    ml_m=ml_m_xgb,
)


In [58]:
# Hyperparameter search with Optuna (keeps the same space as original)
def ml_l_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }

def ml_m_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }


param_space = {"ml_l": ml_l_params, "ml_m": ml_m_params}
optuna_settings = {
    "n_jobs_optuna" : -1, 
    "show_progress_bar": True,
    "verbosity": optuna.logging.WARNING,
}

binary_model_some.tune_ml_models(
    ml_param_space=param_space,
    optuna_settings=optuna_settings,
)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

<doubleml.plm.plr.DoubleMLPLR at 0x1c245310b90>

In [59]:
# Fit and summarize the alternative outcome
binary_model_some.fit()
binary_model_some.summary


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
water_treatment_1,-0.142838,0.100803,-1.417004,0.156482,-0.340408,0.054732


# Multinomial treatment
## Outcome: VeryHighRiskHome


In [60]:
# Define multinomial treatment columns (one column per category)
multi_y = "VeryHighRiskHome"
multi_d = [col for col in mics.columns if col.startswith("WS1_g_")]
# Drop the binary treatment from controls to avoid duplication
multi_x = [col for col in mics.columns if col not in [multi_y, "SomeRiskHome"] + multi_d + ["water_treatment_1"]]

multi_data_vhr = DoubleMLData(
    data=mics,
    y_col=multi_y,
    d_cols=multi_d,
    x_cols=multi_x,
)

ml_l_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
    n_jobs=-1,
)

ml_m_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="multi:softprob",
    eval_metric="mlogloss",
    num_class=len(multi_d),
    eta=0.1,
    n_estimators=34,
    n_jobs=-1,
)

multi_model_vhr = DoubleMLPLR(
    multi_data_vhr,
    ml_l=ml_l_xgb,
    ml_m=ml_m_xgb,
)


In [61]:
# Hyperparameter search with Optuna (keeps the same space as original)
def ml_l_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }

def ml_m_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }


param_space = {"ml_l": ml_l_params, "ml_m": ml_m_params}
optuna_settings = {
    "n_jobs_optuna" : -1, 
    "show_progress_bar": True,
    "verbosity": optuna.logging.WARNING,
}

multi_model_vhr.tune_ml_models(
    ml_param_space=param_space,
    optuna_settings=optuna_settings,
)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

<doubleml.plm.plr.DoubleMLPLR at 0x1c2453134d0>

In [62]:
# Fit and summarize
multi_model_vhr.fit()
multi_model_vhr.summary


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
WS1_g_Packaged/Bottled water,-0.012406,0.018894,-0.656631,0.511418,-0.049437,0.024625
WS1_g_Piped water,-0.003689,0.016403,-0.224902,0.822055,-0.035839,0.028461
WS1_g_Protected well/spring,-0.013311,0.018089,-0.735865,0.461813,-0.048764,0.022142
WS1_g_Surface/Rain water,0.005045,0.018329,0.275226,0.783143,-0.03088,0.040969
WS1_g_Tube/Well/Borehole,-0.004125,0.014595,-0.28262,0.777468,-0.03273,0.024481
WS1_g_Unprotected well/spring,0.001761,0.022672,0.077689,0.938075,-0.042676,0.046199


# Multinomial treatment
## Outcome: SomeRiskHome


In [63]:
multi_y = "SomeRiskHome"
multi_d = [col for col in mics.columns if col.startswith("WS1_g_")]
multi_x = [col for col in mics.columns if col not in [multi_y, "VeryHighRiskHome"] + multi_d + ["water_treatment_1"]]

multi_data_some = DoubleMLData(
    data=mics,
    y_col=multi_y,
    d_cols=multi_d,
    x_cols=multi_x,
)

ml_l_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
    n_jobs=-1,
)

ml_m_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="multi:softprob",
    eval_metric="mlogloss",
    num_class=len(multi_d),
    eta=0.1,
    n_estimators=34,
    n_jobs=-1,
)

multi_model_some = DoubleMLPLR(
    multi_data_some,
    ml_l=ml_l_xgb,
    ml_m=ml_m_xgb,
)


In [None]:
# Hyperparameter search with Optuna (keeps the same space as original)
def ml_l_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }

def ml_m_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }


param_space = {"ml_l": ml_l_params, "ml_m": ml_m_params}
optuna_settings = {
    "n_jobs_optuna" : -1, 
    "show_progress_bar": True,
    "verbosity": optuna.logging.WARNING,
}

multi_model_some.tune_ml_models(
    ml_param_space=param_space,
    optuna_settings=optuna_settings,
)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Fit and summarize
multi_model_some.fit()
multi_model_some.summary


# VeryHighRisk Subsample RiskSource==0

In [None]:
# Define outcome, treatment, and controls
binary_y = "VeryHighRiskHome"
binary_d = ["water_treatment_1"]
binary_x = [col for col in mics.columns if col not in [binary_y, "SomeRiskHome"] + binary_d]

mics_sub0 = mics[mics['RiskSource'] == 0].copy()

# Build DoubleML data object
binary_data_vhr = DoubleMLData(
    data=mics_sub0,
    y_col=binary_y,
    d_cols=binary_d,
    x_cols=binary_x,
)

# Base learners
ml_l_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

ml_m_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

# Double machine learning model
binary_model_vhr = DoubleMLPLR(
    binary_data_vhr,
    ml_l=ml_l_xgb,
    ml_m=ml_m_xgb,
)


In [None]:
# Hyperparameter search with Optuna (keeps the same space as original)
def ml_l_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }

def ml_m_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }


param_space = {"ml_l": ml_l_params, "ml_m": ml_m_params}
optuna_settings = {
    "n_jobs_optuna" : -1, 
    "show_progress_bar": True,
    "verbosity": optuna.logging.WARNING,
}

binary_model_vhr.tune_ml_models(
    ml_param_space=param_space,
    optuna_settings=optuna_settings,
)

In [25]:
# Fit and summarize
binary_model_vhr.fit()
binary_model_vhr.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
water_treatment_1,-0.4518,0.246419,-1.833466,0.066733,-0.934772,0.031172


In [28]:
mics_sub0['VeryHighRiskHome'].mean()

np.float64(0.12873174747606048)

# VeryHighRisk Subsample RiskSource==2

In [23]:
# Define outcome, treatment, and controls
binary_y = "VeryHighRiskHome"
binary_d = ["water_treatment_1"]
binary_x = [col for col in mics.columns if col not in [binary_y, "SomeRiskHome"] + binary_d]

mics_sub0 = mics[mics['RiskSource'] == 2].copy()

# Build DoubleML data object
binary_data_vhr = DoubleMLData(
    data=mics_sub0,
    y_col=binary_y,
    d_cols=binary_d,
    x_cols=binary_x,
)

# Base learners
ml_l_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

ml_m_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

# Double machine learning model
binary_model_vhr = DoubleMLPLR(
    binary_data_vhr,
    ml_l=ml_l_xgb,
    ml_m=ml_m_xgb,
)


In [None]:
# Hyperparameter search with Optuna (keeps the same space as original)
def ml_l_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }

def ml_m_params(trial):
    return {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }


param_space = {"ml_l": ml_l_params, "ml_m": ml_m_params}
optuna_settings = {
    "n_jobs_optuna" : -1, 
    "show_progress_bar": True,
    "verbosity": optuna.logging.WARNING,
}

binary_model_vhr.tune_ml_models(
    ml_param_space=param_space,
    optuna_settings=optuna_settings,
)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Fit and summarize
binary_model_vhr.fit()
binary_model_vhr.summary