In [149]:
!pip install dowhy -q 

In [150]:
import numpy as np
import pandas as pd 
from dowhy import CausalModel
import dowhy.datasets

### Input

In [151]:
# Seed NumPy's global RNG so the synthetic dataset, and therefore every
# estimate and refutation below, is identical on each Restart & Run All.
# NOTE(review): assumes dowhy.datasets.linear_dataset draws from np.random's
# global state -- confirm for the installed dowhy version.
np.random.seed(42)

# Simulate a linear dataset with a binary treatment.
# NOTE(review): beta is presumably the treatment effect built into the
# simulation (on the raw, un-normalized outcome) -- verify in the dowhy docs.
data = dowhy.datasets.linear_dataset(
    beta=10,
    num_common_causes=5,
    num_instruments=2,
    num_effect_modifiers=1,
    num_samples=5000,
    treatment_is_binary=True,
    stddev_treatment_noise=10,
    num_discrete_common_causes=1,
)
# The returned object is indexed by key; "df" holds the generated DataFrame.
df = data["df"]

In [152]:
# Preview the first rows of the generated frame (still using dowhy's generic
# column names such as X0, Z0, W0, v0 and y).
df.head()

Unnamed: 0,X0,Z0,Z1,W0,W1,W2,W3,W4,v0,y
0,1.659205,1.0,0.412551,-1.271891,-1.213866,-3.36324,0.74001,0,True,-7.727757
1,1.927444,1.0,0.158883,0.471406,-0.419258,0.772836,-1.505698,2,True,14.957155
2,-0.905056,0.0,0.273616,0.398389,-0.125861,-0.269796,-0.022341,2,True,8.907052
3,0.498213,1.0,0.402183,-0.257929,2.354469,-0.668974,0.746608,0,True,16.32374
4,0.22371,1.0,0.34434,-1.016008,1.401469,-1.217189,-0.23682,3,True,7.886028


In [153]:
# Relabel dowhy's generic column names with the vitamin/energy story used in
# the rest of the notebook. Roles follow the CausalModel definition below.
df = df.rename(
    columns=dict(
        v0='takes_vitamins_daily',      # treatment
        y='energy_level',               # raw outcome
        W0='workout_freq',              # common causes (W0-W4)
        W1='income',
        W2='fruit_consumption',
        W3='sleep_hours',
        W4='extra_confounder',
        Z0='has_vitamins_home',         # instruments (Z0-Z1)
        Z1='has_vitamin_prescription',
        X0='age_group',                 # effect modifier
    )
)

In [154]:
# Min-max scale the outcome onto [0, 1], so an effect on this column reads as
# a fraction of the observed energy-level range.
energy = df['energy_level']
energy_min, energy_max = energy.min(), energy.max()
df['energy_level_norm'] = (energy - energy_min) / (energy_max - energy_min)

### Causal Model

In [155]:
# Build the causal model by naming each variable's role explicitly (no graph
# is passed). CausalModel is already imported in the notebook's import cell,
# so it is not re-imported here.
model = CausalModel(
    data=df,
    treatment='takes_vitamins_daily',
    outcome='energy_level_norm',  # the min-max scaled outcome, not raw energy_level
    common_causes=[
        'workout_freq', 'income', 'fruit_consumption', 'sleep_hours', 'extra_confounder'
    ],
    instruments=[
        'has_vitamins_home', 'has_vitamin_prescription'
    ],
    effect_modifiers=['age_group'],
)

In [156]:
# Show the frame with the renamed columns and the added energy_level_norm
# column (pandas truncates the display to the first and last rows).
df

Unnamed: 0,age_group,has_vitamins_home,has_vitamin_prescription,workout_freq,income,fruit_consumption,sleep_hours,extra_confounder,takes_vitamins_daily,energy_level,energy_level_norm
0,1.659205,1.0,0.412551,-1.271891,-1.213866,-3.363240,0.740010,0,True,-7.727757,0.303595
1,1.927444,1.0,0.158883,0.471406,-0.419258,0.772836,-1.505698,2,True,14.957155,0.633222
2,-0.905056,0.0,0.273616,0.398389,-0.125861,-0.269796,-0.022341,2,True,8.907052,0.545310
3,0.498213,1.0,0.402183,-0.257929,2.354469,-0.668974,0.746608,0,True,16.323740,0.653079
4,0.223710,1.0,0.344340,-1.016008,1.401469,-1.217189,-0.236820,3,True,7.886028,0.530474
...,...,...,...,...,...,...,...,...,...,...,...
4995,2.364309,1.0,0.892639,0.747743,-1.917427,-1.364247,0.568051,3,True,9.318811,0.551293
4996,0.070210,1.0,0.998595,-0.943788,1.453228,-1.589848,-0.005528,0,True,3.699993,0.469648
4997,0.448175,0.0,0.904819,-0.893321,-0.408418,-2.329442,0.757352,3,True,1.192763,0.433216
4998,1.006945,1.0,0.204989,-0.163910,1.371780,0.280800,0.954270,3,True,23.717957,0.760522


### Estimands

In [157]:
# Derive the estimands implied by the model; the printed result lists a
# backdoor estimand and an instrumental-variable (iv) estimand.
# NOTE(review): proceed_when_unidentifiable=True presumably skips dowhy's
# interactive prompt about possible unobserved confounding -- confirm in docs.
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
print(identified_estimand)

Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
           d                                                                   ↪
───────────────────────(E[energy_level_norm|workout_freq,sleep_hours,fruit_con ↪
d[takes_vitamins_daily]                                                        ↪

↪                                   
↪ sumption,income,extra_confounder])
↪                                   
Estimand assumption 1, Unconfoundedness: If U→{takes_vitamins_daily} and U→energy_level_norm then P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder,U) = P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder)

### Estimand : 2
Estimand name: iv
Estimand expression:
 ⎡                                                                             ↪
 ⎢                      d                                         ⎛           

### Causal Estimates - Backdoor

In [158]:
# Estimate the effect through the backdoor estimand using propensity-score
# stratification. No target_units is passed; the printed result reports
# "Target units: ate".
causal_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_stratification",
)
print(causal_estimate)

*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
           d                                                                   ↪
───────────────────────(E[energy_level_norm|workout_freq,sleep_hours,fruit_con ↪
d[takes_vitamins_daily]                                                        ↪

↪                                   
↪ sumption,income,extra_confounder])
↪                                   
Estimand assumption 1, Unconfoundedness: If U→{takes_vitamins_daily} and U→energy_level_norm then P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder,U) = P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder)

## Realized estimand
b: energy_level_norm~takes_vitamins_daily+workout_freq+sleep_hours+fruit_consumption+income+extra_confounder
Target units: ate

## Estimate
Mean v

### Results & Interpretation

📈 Result

• Causal Estimate: 0.1783

• This means that taking vitamins daily increases normalized energy level by ~0.178 units on average, holding the confounders constant.

• Outcome scale: Since you normalized energy_level (e.g., between 0 and 1), the effect is now interpreted on that scale.

📌 Interpretation

energy_level_norm was scaled from 0 to 1:

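Taking vitamins daily is associated with an average increase of about 0.178 in normalized energy level — roughly 17.8% of the observed energy-level range (an absolute shift on the 0–1 scale, not a relative percentage increase) — controlling for confounders.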

🧪 Assumptions

The estimate relies on the backdoor criterion, which assumes:

• All confounders that affect both the treatment (takes_vitamins_daily) and the outcome (energy_level) are observed and included (like workout, sleep, etc.).

• There are no unobserved confounders (i.e., the assumption of unconfoundedness holds).

⚠️ Important Caveats

• This does not mean that everyone who takes vitamins will feel exactly 0.18 units more energized. It’s an average effect across the population.

• If some confounders are unobserved or measured poorly, the estimate could be biased.


✅ Conclusion

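This is a practically interpretable causal estimate, although no confidence interval or significance test was requested in the estimation call, so its statistical precision has not been quantified here. If your model assumptions hold (especially unconfoundedness), then you can interpret this as a causal effect, not just correlation.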

### Control Group

📖 What ATC means:

- ATC (Average Treatment effect on the Controls) answers the question: “What would have happened to the control group (people who didn’t take vitamins) if they had taken the treatment (i.e., started taking vitamins)?”
- You’re essentially asking what effect the treatment would have had on the untreated group: how much their outcome would have improved, on average, if they had been treated.

In [159]:
# Causal effect on the control group (ATC)
causal_estimate_att = model.estimate_effect(identified_estimand,
        method_name="backdoor.propensity_score_stratification",
        target_units = "atc")
print(causal_estimate_att)
print("Causal Estimate is " + str(causal_estimate_att.value))

*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
           d                                                                   ↪
───────────────────────(E[energy_level_norm|workout_freq,sleep_hours,fruit_con ↪
d[takes_vitamins_daily]                                                        ↪

↪                                   
↪ sumption,income,extra_confounder])
↪                                   
Estimand assumption 1, Unconfoundedness: If U→{takes_vitamins_daily} and U→energy_level_norm then P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder,U) = P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder)

## Realized estimand
b: energy_level_norm~takes_vitamins_daily+workout_freq+sleep_hours+fruit_consumption+income+extra_confounder
Target units: atc

## Estimate
Mean v

In [160]:
# Repeat the identification step (same call as earlier in the notebook) so the
# estimation and refutation cells below start from a freshly built estimand.
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)

In [161]:
# ATE via propensity-score stratification. `estimate` is the object handed to
# every refuter in the Refutation section.
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_stratification",
)
print(estimate)
print(f"Causal Estimate is {estimate.value!s}")

*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
           d                                                                   ↪
───────────────────────(E[energy_level_norm|workout_freq,sleep_hours,fruit_con ↪
d[takes_vitamins_daily]                                                        ↪

↪                                   
↪ sumption,income,extra_confounder])
↪                                   
Estimand assumption 1, Unconfoundedness: If U→{takes_vitamins_daily} and U→energy_level_norm then P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder,U) = P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder)

## Realized estimand
b: energy_level_norm~takes_vitamins_daily+workout_freq+sleep_hours+fruit_consumption+income+extra_confounder
Target units: ate

## Estimate
Mean v

### Refutation

In [162]:
# Refuter 1: add a random common cause and re-estimate. The printed result
# compares the original effect with the new one; they should stay close.
res_random = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="random_common_cause",
    show_progress_bar=True,
)
print(res_random)

Refuting Estimates: 100%|[32m██████████[0m| 100/100 [00:10<00:00,  9.40it/s]

Refute: Add a random common cause
Estimated effect:0.17830193852983767
New effect:0.1783019385298377
p value:1.0






In [163]:
# Refuter 2: replace the treatment with a placebo and re-estimate.
# NOTE(review): placebo_type="permute" presumably shuffles the treatment
# column; a sound estimate should then drop to roughly zero -- confirm in docs.
res_placebo = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="placebo_treatment_refuter",
    show_progress_bar=True,
    placebo_type="permute",
)
print(res_placebo)

Refuting Estimates: 100%|[32m██████████[0m| 100/100 [00:09<00:00, 10.29it/s]

Refute: Use a Placebo Treatment
Estimated effect:0.17830193852983767
New effect:0.0006697727678838146
p value:0.8400000000000001






In [164]:
# Refuter 3: re-estimate on subsets of the data (subset_fraction=0.9) and
# compare the new effect with the original one.
res_subset = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="data_subset_refuter",
    show_progress_bar=True,
    subset_fraction=0.9,
)
print(res_subset)

Refuting Estimates: 100%|[32m██████████[0m| 100/100 [00:09<00:00, 10.40it/s]

Refute: Use a subset of data
Estimated effect:0.17830193852983767
New effect:0.17743032411115078
p value:0.78






In [165]:
# Same subset refuter again, now with a fixed random_seed and parallel
# execution. n_jobs=-1 uses all available workers and verbose=10 emits the
# "[Parallel(...)]" progress lines seen in the output.
# Note: this overwrites `res_subset` from the previous cell.
res_subset = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="data_subset_refuter",
    show_progress_bar=True,
    subset_fraction=0.9,
    random_seed=1,
    n_jobs=-1,
    verbose=10,
)
print(res_subset)

Refuting Estimates:   0%|[32m          [0m| 0/100 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.4s
Refuting Estimates:  24%|[32m██▍       [0m| 24/100 [00:03<00:10,  7.09it/s][Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    3.4s
Refuting Estimates:  36%|[32m███▌      [0m| 36/100 [00:03<00:05, 11.05it/s][Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.7s
Refuting Estimates:  48%|[32m████▊     [0m| 48/100 [00:03<00:03, 16.62it/s][Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    3.9s
Refuting Estimates:  60%|[32m██████    [0m| 60/100 [00:03<00:01, 22.42it/s][Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    4.0s
Refuting Estimates:  72%|[32m███████▏  [0m| 72/100 [00:04<00:00, 30.19it/s][Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    4.2s
Refuting Estimates: 100%|[32m██████

Refute: Use a subset of data
Estimated effect:0.17830193852983767
New effect:0.17772513261873757
p value:0.76



[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.7s finished


📌 What It Means:

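• High p-values (>0.05) from the refutation tests mean each refuted estimate is statistically consistent with what a valid estimate should produce: after adding a random common cause or re-estimating on data subsets, the new effect stays close to the original 0.178, and with a placebo (permuted) treatment the new effect is close to zero (about 0.0007).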

• This indicates that your estimate is stable and not easily altered by small changes in data or assumptions.

✅ You Can Say:

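Since the p-values of all refutations are above 0.05, we fail to reject each refuter's null hypothesis: that the estimated effect is unchanged after adding a random common cause or subsampling the data, and that the effect of a placebo (permuted) treatment is zero.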

Therefore, the causal estimate of 0.18 (an average increase of about 18% of the normalized energy-level range from daily vitamin intake) is robust to these refutations and likely reliable, assuming model assumptions hold.

🔍 Optional Refinement

• Refutations don’t prove that the estimate is “true,” but they increase confidence that it’s not an artifact of bias or model instability.

• It’s still crucial that your confounders are correctly specified and there’s no hidden confounding.