# MICS U5 — DoubleML (PLR + IRM + APOS) with Hyperparameter Tuning (Optuna)
This notebook loads `mics_u5.csv` and runs:
- **PLR** (Partial Linear Regression) with multiple treatments
- **IRM** (Interactive Regression Model) for binary treatments (one at a time)
- **APOS** (Average Potential Outcomes) for discrete multi-level treatments (e.g., `water_treatment3`)
- Optional **GATE**/**CATE** post-estimation

It is written to be **robust to high-dimensional one-hot covariates** and includes **Optuna-based tuning** using DoubleML's recommended APIs when available.


In [1]:

# --- Install compatible versions (recommended) ---
# NOTE: DoubleML (as of many stable releases) is not compatible with NumPy>=2.
# If you get import errors, run the following cell once, then restart the kernel.
# %pip install -U "numpy<2" "scikit-learn>=1.2" pandas optuna patsy doubleml


In [None]:

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# DoubleML + ML
import doubleml as dml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import optuna


## 1) Load data

In [None]:

# Location of the input CSV; a relative path is resolved against the working directory.
DATA_PATH = "mics_u5.csv"  # expects the file in the current working directory OR update path

# Read the raw table once; every later cell derives its inputs from `df_raw`.
df_raw = pd.read_csv(DATA_PATH, low_memory=False)

# Quick look at the size and the first 25 column names.
print(f"Shape: {df_raw.shape}")
print(f"Example columns: {list(df_raw.columns[:25])}")


## 2) Choose outcome (Y), treatments (D), and features (X)
Defaults below are inferred from the dataset (you can edit them).

In [None]:

# --- Suggested defaults based on the uploaded dataset ---
# Outcome column.
y_col = "diarrhea"

# Binary treatments (IRM applicable)
d_cols_binary = [
    "water_treatment",
    "BetterWaterPostTreatment",
    "StrictlyBetterWaterPostTreatment"
]

# Multi-level discrete treatment (APOS applicable)
d_col_multilevel = "water_treatment3"   # levels observed: 0/1/2 in this dataset

# Optional: choose which treatments to include in PLR (can include multiple).
# This is an independent copy: appending extra columns to `d_cols_plr` must not
# silently change the list of binary treatments used by the IRM section.
d_cols_plr = list(d_cols_binary)  # you can add more columns here if you want a vector D

# Group variable for GATEs (must be mutually exclusive groups for clean interpretation)
group_col = "windex5"  # wealth quintile (no missing in this dataset)

# Sanity checks: report every configured column that the loaded table does not contain.
for c in [y_col, *d_cols_binary, d_col_multilevel, group_col]:
    if c not in df_raw.columns:
        print(f"WARNING: column '{c}' not found.")

# Echo the effective configuration so it is recorded next to the results.
print("Y:", y_col)
print("Binary D for IRM:", d_cols_binary)
print("Multi-level D for APOS:", d_col_multilevel)
print("PLR D columns:", d_cols_plr)
print("Group column:", group_col)


## 3) Build modeling table: one-hot encode categoricals, impute missing
DoubleML expects numeric design matrices. We'll:
- Keep `Y` and `D` columns
- One-hot encode remaining categorical columns
- Median-impute numeric missing values


In [None]:

# Keep only the columns we need + all potential controls.
# Here we use all other columns as controls by default (high-dimensional W/X).

# The outcome and the PLR treatments are mandatory: fail early with a clear message.
required_cols = [y_col, *d_cols_plr]
missing_required = [c for c in required_cols if c not in df_raw.columns]
if missing_required:
    raise KeyError(f"Required column(s) not found in df_raw: {missing_required}")

# The multi-level treatment and the group column are optional: the APOS and GATE
# cells skip themselves when these are absent, so this cell must not fail on them.
optional_cols = [c for c in (d_col_multilevel, group_col) if c in df_raw.columns]

# Order: y, D columns, optional columns, then every remaining column (no duplicates).
leading_cols = list(dict.fromkeys(required_cols + optional_cols))
keep_cols = leading_cols + [c for c in df_raw.columns if c not in leading_cols]
df = df_raw[keep_cols].copy()

# Coerce Y and D columns to numeric where possible (unparseable values become NaN)
df[y_col] = pd.to_numeric(df[y_col], errors="coerce")

for c in d_cols_plr:
    df[c] = pd.to_numeric(df[c], errors="coerce")

if d_col_multilevel in df.columns:
    df[d_col_multilevel] = pd.to_numeric(df[d_col_multilevel], errors="coerce")

# Build X from all remaining columns excluding y and the specific D columns you include in the model
exclude_for_x = set([y_col] + d_cols_plr + [d_col_multilevel])
x_cols_raw = [c for c in df.columns if c not in exclude_for_x]

# Separate X into categorical vs numeric.
# Anything that is not a numeric/bool dtype (object, string, category) is one-hot encoded.
cat_cols = [c for c in x_cols_raw if not pd.api.types.is_numeric_dtype(df[c])]
num_cols = [c for c in x_cols_raw if c not in cat_cols]

# One-hot encode categoricals (including NaNs as a category); float dummies keep X homogeneous.
X_cat = (
    pd.get_dummies(df[cat_cols], dummy_na=True, dtype=float)
    if len(cat_cols)
    else pd.DataFrame(index=df.index)
)
X_num = df[num_cols].copy()

# Coerce numerics
for c in X_num.columns:
    X_num[c] = pd.to_numeric(X_num[c], errors="coerce")

# Combine
X = pd.concat([X_num, X_cat], axis=1)

# Columns with no observed value carry no information, and SimpleImputer would drop
# them silently, which makes the `columns=X.columns` assignment below fail.
all_missing_cols = list(X.columns[X.isna().all(axis=0)])
if all_missing_cols:
    print(f"Dropping {len(all_missing_cols)} all-missing control column(s): {all_missing_cols[:10]}")
    X = X.drop(columns=all_missing_cols)

# Impute missing values in X
imputer = SimpleImputer(strategy="median")
X_imp = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
assert not X_imp.isna().any().any(), "X still contains missing values after imputation"

print("Raw X cols:", len(x_cols_raw))
print("Expanded X cols after one-hot:", X_imp.shape[1])

# Final modeling dataframe for DoubleML: must include y, d, and x columns
df_model = pd.concat([df[[y_col] + d_cols_plr].reset_index(drop=True),
                      X_imp.reset_index(drop=True)],
                     axis=1)

# Drop rows with missing y or any missing D in the PLR D columns.
# `mask` is positional over the rows of df_raw and is reused by later cells
# to re-align columns taken from df_raw with the rows kept in df_model.
mask = df_model[y_col].notna()
for c in d_cols_plr:
    mask &= df_model[c].notna()

df_model = df_model.loc[mask].reset_index(drop=True)

print("Final model shape:", df_model.shape)


## 4) Define learners (Random Forest) + Optuna hyperparameter spaces
We'll tune:
- `ml_l` (outcome nuisance) for PLR
- `ml_m` (treatment nuisance) for PLR
- `ml_g` (outcome nuisance) for IRM/APOS
- `ml_m` (propensity nuisance) for IRM/APOS


In [None]:

# Learner factories. No imputation step is wrapped around the forests: X was already imputed above.
def make_rf_reg(params=None):
    """Return a fresh RandomForestRegressor.

    Parameters
    ----------
    params : dict or None
        Optional overrides for ``n_estimators``, ``max_depth`` and
        ``min_samples_leaf``; other keys are ignored.

    Returns
    -------
    RandomForestRegressor
        Unfitted estimator with ``random_state=123`` and ``n_jobs=-1``.
    """
    overrides = params if params else {}
    forest_kwargs = {
        "n_estimators": overrides.get("n_estimators", 200),
        "max_depth": overrides.get("max_depth", None),
        "min_samples_leaf": overrides.get("min_samples_leaf", 1),
    }
    return RandomForestRegressor(random_state=123, n_jobs=-1, **forest_kwargs)

def make_rf_clf(params=None):
    """Return a fresh RandomForestClassifier.

    Parameters
    ----------
    params : dict or None
        Optional overrides for ``n_estimators``, ``max_depth`` and
        ``min_samples_leaf``; other keys are ignored.

    Returns
    -------
    RandomForestClassifier
        Unfitted estimator with ``random_state=123`` and ``n_jobs=-1``.
    """
    overrides = params if params else {}
    forest_kwargs = {
        "n_estimators": overrides.get("n_estimators", 200),
        "max_depth": overrides.get("max_depth", None),
        "min_samples_leaf": overrides.get("min_samples_leaf", 1),
    }
    return RandomForestClassifier(random_state=123, n_jobs=-1, **forest_kwargs)

def ml_l_params(trial):
    """Sample random-forest hyperparameters for the outcome nuisance learner.

    ``trial`` must provide ``suggest_int(name, low, high, step=...)``.
    Returns a dict with ``n_estimators`` (50..300 in steps of 50),
    ``max_depth`` (3..12) and ``min_samples_leaf`` (1..10).
    """
    n_trees = trial.suggest_int("n_estimators", 50, 300, step=50)
    depth = trial.suggest_int("max_depth", 3, 12)
    leaf_size = trial.suggest_int("min_samples_leaf", 1, 10)
    return dict(n_estimators=n_trees, max_depth=depth, min_samples_leaf=leaf_size)

def ml_m_params(trial):
    """Sample random-forest hyperparameters for the treatment/propensity nuisance learner.

    ``trial`` must provide ``suggest_int(name, low, high, step=...)``.
    Returns a dict with ``n_estimators`` (50..300 in steps of 50),
    ``max_depth`` (3..12) and ``min_samples_leaf`` (1..10).
    """
    n_trees = trial.suggest_int("n_estimators", 50, 300, step=50)
    depth = trial.suggest_int("max_depth", 3, 12)
    leaf_size = trial.suggest_int("min_samples_leaf", 1, 10)
    return dict(n_estimators=n_trees, max_depth=depth, min_samples_leaf=leaf_size)

# Map each nuisance-learner name to the function that samples its hyperparameters.
param_space_plr = dict(ml_l=ml_l_params, ml_m=ml_m_params)
param_space_irm = dict(ml_g=ml_l_params, ml_m=ml_m_params)  # same shapes; different roles

# Settings passed through to the Optuna-based tuning call.
optuna_settings = dict(
    n_trials=30,
    show_progress_bar=True,
    verbosity=optuna.logging.WARNING,
)


## 5) PLR: Partial Linear Model (multiple treatments)
This estimates a **partial linear causal parameter** (vector if multiple D).

In [None]:

# Build DoubleMLData for PLR
x_cols = list(X_imp.columns)  # the one-hot expanded features
dml_data_plr = dml.DoubleMLData(df_model, y_col=y_col, d_cols=d_cols_plr, x_cols=x_cols)

# Initial learners (will be tuned)
ml_l = make_rf_reg()
ml_m = make_rf_reg()  # for continuous-ish D; if D is binary you can also try a classifier, but PLR usually uses regression

# Fix NumPy's global random state before the model object is created so the
# cross-fitting sample splits (and hence the estimates) are reproducible on re-run.
np.random.seed(123)
dml_plr = dml.DoubleMLPLR(dml_data_plr, ml_l=ml_l, ml_m=ml_m, n_folds=5)

print(dml_plr)


### 5.1 Tune nuisance models (Optuna)
We try DoubleML's `tune_ml_models()` if available; otherwise we fall back to `tune()` (randomized search over a fixed grid) as a safe alternative.

In [None]:

# --- Tuning helper ---
def tune_with_optuna(dml_obj, param_space, optuna_settings):
    """Tune the nuisance learners of ``dml_obj`` with whichever API it exposes.

    Parameters
    ----------
    dml_obj : object
        Model object; ``tune_ml_models`` is preferred, ``tune`` is the fallback.
    param_space : dict
        Learner name -> Optuna sampling function. In the fallback only the
        keys are used; each learner gets the same fixed random-forest grid.
    optuna_settings : dict
        Forwarded unchanged to ``tune_ml_models``.

    Returns
    -------
    str
        ``"tune_ml_models"``, ``"tune(randomized_search)"`` or
        ``"no_tuning_api_found"``, naming the path that was taken.
    """
    # Preferred path: Optuna-based tuning.
    if hasattr(dml_obj, "tune_ml_models"):
        dml_obj.tune_ml_models(ml_param_space=param_space, optuna_settings=optuna_settings)
        return "tune_ml_models"

    # Nothing to tune with.
    if not hasattr(dml_obj, "tune"):
        return "no_tuning_api_found"

    # Fallback: randomized search with a reasonable grid approximation
    # (If you want Optuna specifically, upgrade DoubleML to a version supporting tune_ml_models.)
    par_grids = {}
    for learner_name in param_space:
        par_grids[learner_name] = {
            "n_estimators": [100, 200, 300],
            "max_depth": [3, 5, 8, 12],
            "min_samples_leaf": [1, 2, 5, 10],
        }
    dml_obj.tune(par_grids, search_mode="randomized_search", n_iter_randomized_search=20)
    return "tune(randomized_search)"

# Tune the PLR nuisance learners and report which tuning API was actually used.
used = tune_with_optuna(dml_plr, param_space_plr, optuna_settings)
print(f"Tuning method used: {used}")


### 5.2 Fit + inference

In [None]:

# Estimate the PLR model; the return value is discarded, results live on `dml_plr`.
_ = dml_plr.fit()
print(dml_plr.summary)

# Per-coefficient confidence intervals.
print("\nPointwise CI:")
print(dml_plr.confint())

# Joint confidence intervals across the treatments; bootstrap() is called first
# because the joint interval is requested only after the bootstrap step has run.
print("\nBootstrap + joint CI:")
_ = dml_plr.bootstrap()
print(dml_plr.confint(joint=True))

# Sensitivity analysis (choose cf_y, cf_d based on your domain)
# NOTE(review): 0.04 / 0.03 are illustrative values, not calibrated to this dataset — confirm.
dml_plr.sensitivity_analysis(cf_y=0.04, cf_d=0.03)
print(dml_plr.sensitivity_summary)


### 5.3 PLR GATEs and CATEs (optional)
- **GATE**: requires mutually exclusive groups (e.g., wealth quintile)
- **CATE**: provide a low-dimensional basis (e.g., splines on one variable)


In [None]:

# --- GATE example (wealth quintile) ---
if group_col not in df_raw.columns:
    print("Group col not found; skipping GATE.")
else:
    # Build groups for the filtered sample used in df_model.
    # `mask` marks (positionally) the df_raw rows that were kept, in the original order,
    # so applying it to df_raw and resetting the index aligns row-for-row with df_model.
    df_tmp = df_raw.loc[mask.values].reset_index(drop=True)  # align with df_model
    groups = pd.DataFrame(df_tmp[group_col].astype(str).values, columns=["Group"])

    if len(d_cols_plr) == 1:
        # Single treatment: the already fitted (and tuned) PLR model can be used directly.
        gate_obj = dml_plr.gate(groups=groups)
        print("PLR-GATE CI:")
        print(gate_obj.confint())
    else:
        # DoubleMLPLR.gate()/cate() are implemented for a single treatment only, so calling
        # them on the multi-treatment `dml_plr` raises NotImplementedError. Fit one
        # single-treatment PLR per treatment instead, with the other treatments added to
        # the controls. These helper models use the default (untuned) forests to limit
        # run time; tune them with tune_with_optuna(...) if the GATEs are reported.
        plr_gate_models = {}
        for d_gate in d_cols_plr:
            other_treatments = [c for c in d_cols_plr if c != d_gate]
            data_single = dml.DoubleMLData(
                df_model, y_col=y_col, d_cols=d_gate, x_cols=x_cols + other_treatments
            )
            np.random.seed(123)  # reproducible sample splitting
            plr_single = dml.DoubleMLPLR(
                data_single, ml_l=make_rf_reg(), ml_m=make_rf_reg(), n_folds=5
            )
            plr_single.fit()
            plr_gate_models[d_gate] = plr_single

            gate_obj = plr_single.gate(groups=groups)
            print(f"\nPLR-GATE CI for {d_gate}:")
            print(gate_obj.confint())


## 6) IRM: Interactive model (binary treatments only)
We fit one IRM per binary treatment and then (optionally) compute GATE/CATE.

In [None]:

# We'll reuse X_imp columns. Build a base dataframe with y + X only, and then swap in each binary D.
base_cols = [y_col] + x_cols
df_base = df_model[base_cols].reset_index(drop=True)

# Fitted IRM model per treatment column, reused by the IRM-GATE cell.
irm_results = {}

for d_col in d_cols_binary:
    if d_col not in df_raw.columns:
        print(f"Skipping {d_col}: not found.")
        continue

    # Build D aligned to df_model sample (`mask` selects the same df_raw rows, in order)
    d_series = pd.to_numeric(df_raw.loc[mask.values, d_col], errors="coerce").reset_index(drop=True)

    # Check binary: exactly the two values 0 and 1 must be observed (0.0/1.0 compare equal)
    vals = sorted(d_series.dropna().unique().tolist())
    if vals != [0, 1]:
        print(f"Skipping {d_col}: not binary (unique={vals[:10]}...).")
        continue

    df_irm = df_base.copy()
    df_irm[d_col] = d_series

    # Drop missing D
    df_irm = df_irm[df_irm[d_col].notna()].reset_index(drop=True)

    # A treatment that is not part of the PLR set was left among the controls;
    # it must not be both treatment and control in the same model.
    x_cols_irm = [c for c in x_cols if c != d_col]
    dml_data_irm = dml.DoubleMLData(df_irm, y_col=y_col, d_cols=d_col, x_cols=x_cols_irm)

    ml_g = make_rf_reg()
    ml_m = make_rf_clf()

    # Seed NumPy's global random state so the cross-fitting splits are reproducible.
    np.random.seed(123)
    dml_irm = dml.DoubleMLIRM(dml_data_irm, ml_g=ml_g, ml_m=ml_m, n_folds=5)

    used = tune_with_optuna(dml_irm, param_space_irm, optuna_settings)
    print(f"[{d_col}] tuning used:", used)

    _ = dml_irm.fit()
    irm_results[d_col] = dml_irm

    print(f"\n=== IRM results for {d_col} ===")
    print(dml_irm.summary)
    print(dml_irm.confint())

    _ = dml_irm.bootstrap()
    print("Joint CI:")
    print(dml_irm.confint(joint=True))

    # NOTE(review): cf_y / cf_d are illustrative values — choose them from domain knowledge.
    dml_irm.sensitivity_analysis(cf_y=0.04, cf_d=0.03)
    print(dml_irm.sensitivity_summary)


### 6.1 IRM GATEs (optional)

In [None]:

# Group average treatment effects for every IRM model fitted above.
if group_col not in df_raw.columns:
    print("Group col not found; skipping IRM GATE.")
else:
    # Rows of df_raw that made it into df_model, re-indexed from zero.
    df_tmp = df_raw.loc[mask.values].reset_index(drop=True)
    groups_all = pd.DataFrame(df_tmp[group_col].astype(str).values, columns=["Group"])

    for d_col, dml_irm in irm_results.items():
        # The IRM dataset dropped rows with a missing treatment; drop the same rows here
        # so the group labels line up with the model's observations.
        d_series = pd.to_numeric(df_tmp[d_col], errors="coerce")
        keep = d_series.notna().values
        groups = groups_all.loc[keep].reset_index(drop=True)

        gate_obj = dml_irm.gate(groups=groups)
        print(f"\nIRM-GATE CI for {d_col}:")
        print(gate_obj.confint())


## 7) APOS: Average Potential Outcomes (discrete multi-level treatment)
Useful for treatments like `water_treatment3` with levels 0/1/2. It estimates average potential outcomes by level, then you can form contrasts.

In [None]:

if d_col_multilevel in df_raw.columns:
    # Build aligned dataset: y + D + X (`mask` selects the df_raw rows kept in df_model)
    d_series = pd.to_numeric(df_raw.loc[mask.values, d_col_multilevel], errors="coerce").reset_index(drop=True)

    df_apos = df_base.copy()
    df_apos[d_col_multilevel] = d_series
    # Rows without an observed treatment level cannot be used.
    df_apos = df_apos[df_apos[d_col_multilevel].notna()].reset_index(drop=True)

    # Identify observed levels
    levels = sorted(pd.Series(df_apos[d_col_multilevel].unique()).tolist())
    print("Observed treatment levels:", levels)

    dml_data_apos = dml.DoubleMLData(df_apos, y_col=y_col, d_cols=d_col_multilevel, x_cols=x_cols)

    ml_g = make_rf_reg()
    ml_m = make_rf_clf()

    # Seed NumPy's global random state so the cross-fitting splits are reproducible.
    np.random.seed(123)
    dml_apos = dml.DoubleMLAPOS(dml_data_apos, ml_g=ml_g, ml_m=ml_m, treatment_levels=levels, n_folds=5)

    used = tune_with_optuna(dml_apos, param_space_irm, optuna_settings)
    print("APOS tuning used:", used)

    _ = dml_apos.fit()
    print(dml_apos.summary)

    # Contrast vs reference level 0 if present; otherwise use the smallest observed level
    ref = 0 if 0 in levels else levels[0]
    contrast = dml_apos.causal_contrast(reference_levels=ref)
    print("\nCausal contrast summary (ref =", ref, "):")
    print(contrast.summary)

else:
    print("Multi-level treatment column not found; skipping APOS.")


## 8) Diagnostics: Evaluate nuisance learners (optional but recommended for papers)

In [None]:

from sklearn.metrics import mean_absolute_error

def mae(y_true, y_pred):
    """Mean absolute error over the entries where ``y_true`` is not NaN.

    Both arguments are indexed with the same boolean selector, so they must
    have matching shapes.
    """
    observed = ~np.isnan(y_true)
    return mean_absolute_error(y_true[observed], y_pred[observed])

# Example: score the PLR outcome-nuisance learner ("ml_l") with the MAE metric.
# Best effort: a failure is reported instead of stopping the notebook.
try:
    eval_res = dml_plr.evaluate_learners(learners=["ml_l"], metric=mae)
    print(eval_res)
except Exception as err:
    print(f"evaluate_learners not available or failed: {err}")
