# DoubleML

Exploration of household water risk using DoubleML on MICS data.

## EDA and preprocessing


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from doubleml import DoubleMLData, DoubleMLPLR, DoubleMLIRM
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")


In [2]:
# Read the raw survey file into `mics`. low_memory=False turns off chunked
# parsing, so each column's dtype is inferred from the whole file at once.
mics = pd.read_csv(filepath_or_buffer="mics.csv", low_memory=False)
mics.head()


Unnamed: 0,HH1,HH2,HINT,HH3,HH4,HH5D,HH5M,HH5Y,HH6,HH7,...,RiskHome_0_12,RiskSource_0_12,water_treatment3,Any_U5,Region,windex_ur,windex5_categ,helevel_temp,wq27_decile,SomeRiskHome
0,1,5,12.0,12,11,2,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,1,1,2,Poor,,7,1
1,1,14,15.0,15,11,3,6. JUNE,2017,2. Rural,1. EAST,...,1,0,0,1,1,2,Poor,,1,1
2,1,22,15.0,15,11,4,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,1,1,2,Middle,,8,1
3,2,3,12.0,12,11,5,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,1,1,2,Middle,,8,1
4,2,11,12.0,12,11,5,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,0,1,1,Poor,,8,1


In [3]:
# Column groups, organised by the kind of preprocessing each one needs.

# Columns grouped as binary variables, concatenated theme by theme.
bin_vars = (
    # Household composition and setting
    ["Any_U5", "Girls_less_than15", "Boys_15or_less", "urban"]
    # Drinking-water source
    + [
        "ImprovedWaterSource",
        "PipedWater",
        "WellandSpringWater",
        "RainandSurfaceWater",
        "PurchasedWater",
    ]
    # Water service level
    + ["Basic_water_service", "Limited_water_service", "Unimproved_water_service"]
    # Sanitation facility
    + ["improved_latrine", "Flush", "Pit_latrine", "Open_defecation"]
    # Sanitation service level
    + [
        "Basic_sanitation_service",
        "Limited_sanitation_service",
        "Unimproved_sani_service",
    ]
    # Hygiene
    + [
        "SoapandWater",
        "Basic_hygiene_facility",
        "Limited_hygiene_facility",
        "No_hygeine_service_facility",
    ]
    # Season
    + ["rainy_season"]
)

# Columns grouped as ordered categories.
ord_vars = [
    "helevel",
    "water_carrier_edu",
]

# Columns grouped as unordered categories.
cat_vars = [
    "Region",
    "country_cat",
    "WS1",
]

# Missing-value handling is currently disabled. To enable it, define
# `num_vars` first and then uncomment:
# mics[num_vars] = mics[num_vars].fillna(mics[num_vars].median())
# mics[bin_vars] = mics[bin_vars].fillna(0)
# mics[ord_vars + cat_vars] = mics[ord_vars + cat_vars].fillna("Missing")


In [4]:
# Encode education (`helevel`) as the ordinal feature `helevel_ord`.
# Labels that are not keys of helevel_map, and missing values, become -1 so
# they remain visible as their own category.
helevel_map = {
    "No education": 0,
    "Primary": 1,
    "Secondary or higher": 2,
}

# Idempotence guard: this cell drops its own input column, so running it a
# second time used to raise KeyError. It is skipped only in the "already
# encoded" state (source column gone, derived column present); on a fresh
# kernel it behaves exactly as before, including failing loudly if `helevel`
# is absent.
if "helevel" in mics.columns or "helevel_ord" not in mics.columns:
    mics["helevel_ord"] = (
        mics["helevel"]
        .map(helevel_map)
        .fillna(-1)  # explicit missing category
        .astype(int)
    )

    # The raw label column is no longer needed once the ordinal code exists.
    mics = mics.drop(columns=["helevel"])


In [5]:
# Clean and encode water carrier education as `water_carrier_edu_ord`.
# Codes 98 and 99 are recoded to missing; every missing value then becomes -1.
#
# Idempotence guard: this cell drops its own input column, so running it a
# second time used to raise KeyError. It is skipped only in the "already
# encoded" state (source column gone, derived column present); on a fresh
# kernel the resulting frame is the same as before.
if "water_carrier_edu" in mics.columns or "water_carrier_edu_ord" not in mics.columns:
    mics["water_carrier_edu_ord"] = (
        mics["water_carrier_edu"]
        .replace([98, 99], pd.NA)
        .fillna(-1)  # explicit missing category
        .astype(int)
    )

    # The raw column is dropped, so the cleaned intermediate is not written
    # back to it first.
    mics = mics.drop(columns=["water_carrier_edu"])


In [6]:
# Encode the household wealth quintile (`windex5`) as the ordinal feature
# `windex5_ord`. Labels that are not keys of windex5_map, and missing values,
# become -1.
windex5_map = {
    "Poorest": 0,
    "Poor": 1,
    "Middle": 2,
    "Rich": 3,
    "Richest": 4,
}

# Idempotence guard: this cell drops its own input column, so running it a
# second time used to raise KeyError. It is skipped only in the "already
# encoded" state (source column gone, derived column present); on a fresh
# kernel it behaves exactly as before.
if "windex5" in mics.columns or "windex5_ord" not in mics.columns:
    mics["windex5_ord"] = (
        mics["windex5"]
        .map(windex5_map)
        .fillna(-1)  # keep missing explicit
        .astype(int)
    )

    # The raw label column is no longer needed once the ordinal code exists.
    mics = mics.drop(columns=["windex5"])


In [7]:
# Binary urban indicator `urban_bin`: "Urban" -> 1, "Rural" -> 0.
# NOTE(review): any other label, and any missing value, is also coded 0 by the
# fillna below and so cannot be told apart from "Rural". The other encoding
# cells in this notebook use -1 for missing — confirm that 0 is intended here.
#
# Idempotence guard: this cell drops its own input column, so running it a
# second time used to raise KeyError. It is skipped only in the "already
# encoded" state (source column gone, derived column present); on a fresh
# kernel it behaves exactly as before.
if "urban" in mics.columns or "urban_bin" not in mics.columns:
    mics["urban_bin"] = (
        mics["urban"]
        .map({"Urban": 1, "Rural": 0})
        .fillna(0)
        .astype(int)
    )

    # The raw label column is no longer needed once the indicator exists.
    mics = mics.drop(columns=["urban"])


In [8]:
# Covariates used to build the design matrix, listed theme by theme.
X_cols = [
    # Household composition
    "Any_U5",
    "Girls_less_than15",
    "Boys_15or_less",
    # Education (ordinal codes created in earlier cells)
    "helevel_ord",
    "water_carrier_edu_ord",
    # Wealth and setting
    "windex5_ord",
    "urban_bin",
    # Water source
    "ImprovedWaterSource",
    "PipedWater",
    "WellandSpringWater",
    "RainandSurfaceWater",
    "PurchasedWater",
    # Water service level
    "Basic_water_service",
    "Limited_water_service",
    "Unimproved_water_service",
    # Sanitation
    "improved_latrine",
    "Flush",
    "Pit_latrine",
    "Open_defecation",
    "Basic_sanitation_service",
    "Limited_sanitation_service",
    "Unimproved_sani_service",
    # Hygiene columns are currently left out of the design matrix:
    # "SoapandWater", "Basic_hygiene_facility",
    # "Limited_hygiene_facility", "No_hygeine_service_facility",
    # Context
    "rainy_season",
    "Region",
]

# Region is one-hot encoded with its first level dropped; every other selected
# column passes through untouched. Output is requested as a pandas DataFrame
# with un-prefixed feature names.
ct = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
            ["Region"],
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

X = ct.fit_transform(mics[X_cols])


# Partially Linear Model (PLM)
## Outcome: SomeRiskHome


In [9]:
# Modelling frame for the PLM: the encoded covariates plus the treatment and
# outcome columns, looked up in `mics` by X's index so the rows line up.
dml_df_vhr_plm = X.assign(
    water_treatment=mics.loc[X.index, "water_treatment"].values,
    SomeRiskHome=mics.loc[X.index, "SomeRiskHome"].values,
)


In [10]:
# Seed NumPy's global RNG before building the model. DoubleML draws its
# cross-fitting sample splits when the model object is constructed, so without
# a seed the folds — and therefore the estimate — differ on every run.
np.random.seed(42)

# Rows with any missing value are dropped before the frame is handed to
# DoubleML. Outcome: SomeRiskHome; treatment: water_treatment; covariates:
# every column of X.
dml_data_vhr_plm = DoubleMLData(
    data=dml_df_vhr_plm.dropna(),
    y_col="SomeRiskHome",
    d_cols="water_treatment",
    x_cols=X.columns.tolist(),
)

# Learner passed as ml_l (outcome given covariates).
ml_l_xgb_plm = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
)

# Learner passed as ml_m (treatment given covariates).
ml_m_xgb_plm = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
)

plm_model_vhr = DoubleMLPLR(
    dml_data_vhr_plm,
    ml_l=ml_l_xgb_plm,
    ml_m=ml_m_xgb_plm,
)


In [11]:
# Optuna search spaces for the PLM nuisance learners.
def plm_ml_l_params(trial):
    """Sample one XGBoost configuration for the PLM ``ml_l`` learner.

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float``; each
            hyperparameter is drawn through it.

    Returns:
        dict: Sampled values keyed by hyperparameter name.
    """
    sampled = {}
    sampled["n_estimators"] = trial.suggest_int("n_estimators", 100, 400, step=50)
    sampled["max_depth"] = trial.suggest_int("max_depth", 3, 6)
    sampled["min_child_weight"] = trial.suggest_int("min_child_weight", 5, 20)
    sampled["subsample"] = trial.suggest_float("subsample", 0.6, 0.9)
    sampled["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 0.9)
    # Learning rate is sampled on a log scale.
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.1, log=True)
    return sampled


def plm_ml_m_params(trial):
    """Sample one XGBoost configuration for the PLM ``ml_m`` learner.

    Compared with the ``ml_l`` space in this notebook, trees are shallower
    (``max_depth`` 2-5) and ``min_child_weight`` is larger (10-30).

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float``; each
            hyperparameter is drawn through it.

    Returns:
        dict: Sampled values keyed by hyperparameter name.
    """
    sampled = {}
    sampled["n_estimators"] = trial.suggest_int("n_estimators", 100, 400, step=50)
    sampled["max_depth"] = trial.suggest_int("max_depth", 2, 5)
    sampled["min_child_weight"] = trial.suggest_int("min_child_weight", 10, 30)
    sampled["subsample"] = trial.suggest_float("subsample", 0.6, 0.9)
    sampled["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 0.9)
    # Learning rate is sampled on a log scale.
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.1, log=True)
    return sampled


# One search-space function per PLM nuisance learner, keyed by learner name.
param_space_plm = dict(ml_l=plm_ml_l_params, ml_m=plm_ml_m_params)

# Settings handed to the tuner.
# NOTE(review): n_jobs_optuna=-1 presumably maps to Optuna's "use all cores";
# parallel trials are generally not reproducible run-to-run — confirm this is
# acceptable.
optuna_settings_plm = dict(
    n_jobs_optuna=-1,
    show_progress_bar=True,
    verbosity=optuna.logging.WARNING,
)

# The call's return value (the model object) is displayed as the cell output.
plm_model_vhr.tune_ml_models(
    optuna_settings=optuna_settings_plm,
    ml_param_space=param_space_plm,
)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

<doubleml.plm.plr.DoubleMLPLR at 0x2362ecf67b0>

In [None]:
# Estimate the PLM and display its coefficient table (fit() returns the model).
plm_model_vhr.fit().summary


# Interactive Regression Model
## Outcome: SomeRiskHome


In [None]:
# Modelling frame for the IRM: the encoded covariates plus the treatment and
# outcome columns, looked up in `mics` by X's index so the rows line up.
dml_df_vhr_irm = X.assign(
    water_treatment=mics.loc[X.index, "water_treatment"].values,
    SomeRiskHome=mics.loc[X.index, "SomeRiskHome"].values,
)


In [None]:
# Seed NumPy's global RNG before building the model. DoubleML draws its
# cross-fitting sample splits when the model object is constructed, so without
# a seed the folds — and therefore the estimate — differ on every run.
np.random.seed(42)

# Rows with any missing value are dropped before the frame is handed to
# DoubleML. Outcome: SomeRiskHome; treatment: water_treatment; covariates:
# every column of X.
dml_data_vhr_irm = DoubleMLData(
    data=dml_df_vhr_irm.dropna(),
    y_col="SomeRiskHome",
    d_cols="water_treatment",
    x_cols=X.columns.tolist(),
)

# Learner passed as ml_g (outcome given covariates and treatment).
ml_g_xgb_irm = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
)

# Learner passed as ml_m (treatment given covariates, i.e. the propensity score).
ml_m_xgb_irm = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
)

irm_model_vhr = DoubleMLIRM(
    dml_data_vhr_irm,
    ml_g=ml_g_xgb_irm,
    ml_m=ml_m_xgb_irm,
)


In [None]:
# Optuna search spaces for the IRM nuisance learners.
def irm_ml_g_params(trial):
    """Sample one XGBoost configuration for the IRM ``ml_g`` learner.

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float``; each
            hyperparameter is drawn through it.

    Returns:
        dict: Sampled values keyed by hyperparameter name.
    """
    sampled = {}
    sampled["n_estimators"] = trial.suggest_int("n_estimators", 100, 400, step=50)
    sampled["max_depth"] = trial.suggest_int("max_depth", 3, 6)
    sampled["min_child_weight"] = trial.suggest_int("min_child_weight", 5, 20)
    sampled["subsample"] = trial.suggest_float("subsample", 0.6, 0.9)
    sampled["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 0.9)
    # Learning rate is sampled on a log scale.
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.1, log=True)
    return sampled


def irm_ml_m_params(trial):
    """Sample one XGBoost configuration for the IRM ``ml_m`` learner.

    Compared with the ``ml_g`` space in this notebook, trees are shallower
    (``max_depth`` 2-5) and ``min_child_weight`` is larger (10-30).

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float``; each
            hyperparameter is drawn through it.

    Returns:
        dict: Sampled values keyed by hyperparameter name.
    """
    sampled = {}
    sampled["n_estimators"] = trial.suggest_int("n_estimators", 100, 400, step=50)
    sampled["max_depth"] = trial.suggest_int("max_depth", 2, 5)
    sampled["min_child_weight"] = trial.suggest_int("min_child_weight", 10, 30)
    sampled["subsample"] = trial.suggest_float("subsample", 0.6, 0.9)
    sampled["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 0.9)
    # Learning rate is sampled on a log scale.
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.1, log=True)
    return sampled


# One search-space function per IRM nuisance learner, keyed by learner name.
param_space_irm = dict(ml_g=irm_ml_g_params, ml_m=irm_ml_m_params)

# Settings handed to the tuner.
# NOTE(review): n_jobs_optuna=-1 presumably maps to Optuna's "use all cores";
# parallel trials are generally not reproducible run-to-run — confirm this is
# acceptable.
optuna_settings_irm = dict(
    n_jobs_optuna=-1,
    show_progress_bar=True,
    verbosity=optuna.logging.WARNING,
)

# As the last expression, the call's return value becomes the cell output.
irm_model_vhr.tune_ml_models(
    optuna_settings=optuna_settings_irm,
    ml_param_space=param_space_irm,
)


In [None]:
# Estimate the IRM and display its coefficient table (fit() returns the model).
irm_model_vhr.fit().summary


# Results Table

In [None]:
# Stack the PLM and IRM coefficient tables into one frame for side-by-side
# comparison. Each row is tagged with the model it came from; ignore_index
# replaces the original row labels with a fresh 0..n-1 index.
plm_summary = plm_model_vhr.summary
irm_summary = irm_model_vhr.summary

results_table = pd.concat(
    [
        summary.assign(model=label)
        for label, summary in (("PLM", plm_summary), ("IRM", irm_summary))
    ],
    ignore_index=True,
)

results_table
