In [132]:
!pip install dowhy -q 

In [133]:
import numpy as np
import pandas as pd 
from dowhy import CausalModel
import dowhy.datasets

In [134]:
# Fix NumPy's global RNG so the synthetic data, and therefore every number
# quoted in the write-up, is identical on each Restart & Run All.
# NOTE(review): assumes linear_dataset draws from np.random's global state — confirm.
np.random.seed(42)

# Generate the synthetic dataset; `data` is a dict whose "df" entry is the
# DataFrame used by the rest of the notebook.
data = dowhy.datasets.linear_dataset(
    beta=10,
    num_common_causes=5,
    num_instruments=2,
    num_effect_modifiers=1,
    num_samples=5000,
    treatment_is_binary=True,
    stddev_treatment_noise=10,
    num_discrete_common_causes=1,
)
df = data["df"]

In [135]:
# Preview the first rows of the generated frame (still using DoWhy's generic
# column names such as X0, Z0, W0, v0 and y).
df.head()

Unnamed: 0,X0,Z0,Z1,W0,W1,W2,W3,W4,v0,y
0,0.802715,1.0,0.63977,1.769283,0.67407,-0.33257,1.737682,1,True,21.003234
1,-1.049829,1.0,0.324844,1.271551,-0.159837,-0.043812,1.200456,3,True,18.659245
2,1.114656,1.0,0.939806,1.317983,1.349325,-3.113106,1.324723,1,True,20.67878
3,-1.789981,0.0,0.496011,-0.989227,0.032433,1.103229,0.277274,1,True,9.833704
4,-3.420701,0.0,0.859505,1.213444,-0.338042,-0.235136,0.42251,0,True,11.829053


In [136]:
# Translate DoWhy's generic column names into the vitamin/energy story used in
# this notebook. Roles follow how the columns are passed to CausalModel.
readable_names = dict(
    # treatment and outcome
    v0='takes_vitamins_daily',
    y='energy_level',
    # common causes (confounders)
    W0='workout_freq',
    W1='income',
    W2='fruit_consumption',
    W3='sleep_hours',
    W4='extra_confounder',
    # instruments
    Z0='has_vitamins_home',
    Z1='has_vitamin_prescription',
    # effect modifier
    X0='age_group',
)
df = df.rename(columns=readable_names)

In [137]:
# Min-max scale the outcome: the smallest observed energy_level maps to 0 and
# the largest to 1, so effects on this column are fractions of the observed range.
energy = df['energy_level']
energy_min, energy_max = energy.min(), energy.max()
df['energy_level_norm'] = (energy - energy_min) / (energy_max - energy_min)

In [138]:
# CausalModel is already imported in the notebook's import cell, so it is not
# re-imported here.
#
# Declare the causal structure by role instead of supplying an explicit graph:
# one binary treatment, the normalized outcome, five common causes, two
# instruments and one effect modifier.
model = CausalModel(
    data=df,
    treatment='takes_vitamins_daily',
    outcome='energy_level_norm',
    common_causes=[
        'workout_freq',
        'income',
        'fruit_consumption',
        'sleep_hours',
        'extra_confounder',
    ],
    instruments=[
        'has_vitamins_home',
        'has_vitamin_prescription',
    ],
    effect_modifiers=['age_group'],
)

In [139]:
# Inspect the frame after renaming; it now also carries energy_level_norm.
# NOTE(review): consider df.head() or df.sample(5) instead of displaying the whole frame.
df

Unnamed: 0,age_group,has_vitamins_home,has_vitamin_prescription,workout_freq,income,fruit_consumption,sleep_hours,extra_confounder,takes_vitamins_daily,energy_level,energy_level_norm
0,0.802715,1.0,0.639770,1.769283,0.674070,-0.332570,1.737682,1,True,21.003234,0.852185
1,-1.049829,1.0,0.324844,1.271551,-0.159837,-0.043812,1.200456,3,True,18.659245,0.805385
2,1.114656,1.0,0.939806,1.317983,1.349325,-3.113106,1.324723,1,True,20.678780,0.845707
3,-1.789981,0.0,0.496011,-0.989227,0.032433,1.103229,0.277274,1,True,9.833704,0.629176
4,-3.420701,0.0,0.859505,1.213444,-0.338042,-0.235136,0.422510,0,True,11.829053,0.669015
...,...,...,...,...,...,...,...,...,...,...,...
4995,0.245879,1.0,0.449950,-1.856968,0.379914,-1.239307,0.025397,0,True,5.513993,0.542929
4996,0.886262,1.0,0.333640,-0.050924,0.650056,-1.358493,-1.256007,0,False,1.274409,0.458282
4997,0.252013,1.0,0.156331,-0.317469,-1.068730,-1.056044,2.176843,1,True,6.072371,0.554078
4998,-0.411429,1.0,0.092972,-1.454388,-1.462539,-0.047081,-0.503066,3,True,3.320306,0.499130


In [140]:
# Ask the model which estimands identify the treatment effect and print them
# (the output lists a backdoor and an iv estimand).
# NOTE(review): proceed_when_unidentifiable=True presumably suppresses the
# interactive prompt about unobserved confounding — confirm against the DoWhy docs.
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
print(identified_estimand)

Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
           d                                                                   ↪
───────────────────────(E[energy_level_norm|workout_freq,sleep_hours,fruit_con ↪
d[takes_vitamins_daily]                                                        ↪

↪                                   
↪ sumption,income,extra_confounder])
↪                                   
Estimand assumption 1, Unconfoundedness: If U→{takes_vitamins_daily} and U→energy_level_norm then P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder,U) = P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder)

### Estimand : 2
Estimand name: iv
Estimand expression:
 ⎡                                                                             ↪
 ⎢                      d                                         ⎛           

In [141]:
# Estimate the effect for the identified estimand using propensity-score
# stratification on the backdoor set; the printed report shows "Target units: ate".
causal_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_stratification",
)
print(causal_estimate)

*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
           d                                                                   ↪
───────────────────────(E[energy_level_norm|workout_freq,sleep_hours,fruit_con ↪
d[takes_vitamins_daily]                                                        ↪

↪                                   
↪ sumption,income,extra_confounder])
↪                                   
Estimand assumption 1, Unconfoundedness: If U→{takes_vitamins_daily} and U→energy_level_norm then P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder,U) = P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder)

## Realized estimand
b: energy_level_norm~takes_vitamins_daily+workout_freq+sleep_hours+fruit_consumption+income+extra_confounder
Target units: ate

## Estimate
Mean v

📈 Result

• Causal Estimate: 0.1858

• This means that taking vitamins daily increases normalized energy level by ~0.186 units on average, holding the confounders constant.

• Outcome scale: Since you normalized energy_level (e.g., between 0 and 1), the effect is now interpreted on that scale.

📌 Interpretation

energy_level_norm was scaled from 0 to 1:

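Taking vitamins daily is associated with an average increase of 0.186 in normalized energy level — that is, about 18.6% of the observed min-to-max range of energy_level, not an 18.6% relative increase — controlling for confounders.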

🧪 Assumptions

The estimate relies on the backdoor criterion, which assumes:

• All confounders that affect both the treatment (takes_vitamins_daily) and the outcome (energy_level_norm, the min-max-scaled energy_level) are observed and included (like workout frequency, sleep hours, etc.).

• There are no unobserved confounders (i.e., the assumption of unconfoundedness holds).

⚠️ Important Caveats

• This does not mean that everyone who takes vitamins will feel exactly 0.186 units more energized. It’s an average effect across the population.

• If some confounders are unobserved or measured poorly, the estimate could be biased.


✅ Conclusion

This is a statistically and practically interpretable causal estimate. If your model assumptions hold (especially unconfoundedness), then you can interpret this as a causal effect, not just correlation.

In [142]:
# Average effect on the control group (ATC), selected with target_units="atc".
# The result was previously stored under the misleading name `causal_estimate_att`
# (ATT is the effect on the *treated*), so it is now named for what it holds.
causal_estimate_atc = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_stratification",
    target_units="atc",
)
# Backward-compatible alias for any cell that still refers to the old name.
causal_estimate_att = causal_estimate_atc

print(causal_estimate_atc)
print(f"Causal Estimate is {causal_estimate_atc.value}")

*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
           d                                                                   ↪
───────────────────────(E[energy_level_norm|workout_freq,sleep_hours,fruit_con ↪
d[takes_vitamins_daily]                                                        ↪

↪                                   
↪ sumption,income,extra_confounder])
↪                                   
Estimand assumption 1, Unconfoundedness: If U→{takes_vitamins_daily} and U→energy_level_norm then P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder,U) = P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder)

## Realized estimand
b: energy_level_norm~takes_vitamins_daily+workout_freq+sleep_hours+fruit_consumption+income+extra_confounder
Target units: atc

## Estimate
Mean v

In [143]:
# Re-run identification with the same arguments as the earlier identify_effect
# cell, rebinding identified_estimand for the estimation and refutation cells
# that follow.
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)

In [144]:
# Propensity-score-stratification estimate that the refutation cells below take
# as their reference value.
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_stratification",
)
print(estimate)
print(f"Causal Estimate is {estimate.value}")

*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
           d                                                                   ↪
───────────────────────(E[energy_level_norm|workout_freq,sleep_hours,fruit_con ↪
d[takes_vitamins_daily]                                                        ↪

↪                                   
↪ sumption,income,extra_confounder])
↪                                   
Estimand assumption 1, Unconfoundedness: If U→{takes_vitamins_daily} and U→energy_level_norm then P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder,U) = P(energy_level_norm|takes_vitamins_daily,workout_freq,sleep_hours,fruit_consumption,income,extra_confounder)

## Realized estimand
b: energy_level_norm~takes_vitamins_daily+workout_freq+sleep_hours+fruit_consumption+income+extra_confounder
Target units: ate

## Estimate
Mean v

In [145]:
# Refutation 1: add a randomly generated common cause and re-estimate. A sound
# estimate should barely move.
# random_seed is fixed (same value as the seeded data_subset_refuter cell) so
# the reported effect and p-value are reproducible across runs.
res_random = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="random_common_cause",
    show_progress_bar=True,
    random_seed=1,
)
print(res_random)

Refuting Estimates: 100%|[32m██████████[0m| 100/100 [00:09<00:00, 11.07it/s]

Refute: Add a random common cause
Estimated effect:0.20129513887070638
New effect:0.2012951388707064
p value:1.0






In [146]:
# Refutation 2: replace the treatment with a permuted (placebo) version. If the
# original estimate is causal, the placebo effect should be close to zero.
# random_seed is fixed (same value as the seeded data_subset_refuter cell) so
# the permutations, and hence the reported p-value, are reproducible.
res_placebo = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="placebo_treatment_refuter",
    show_progress_bar=True,
    placebo_type="permute",
    random_seed=1,
)
print(res_placebo)

Refuting Estimates: 100%|[32m██████████[0m| 100/100 [00:09<00:00, 11.07it/s]

Refute: Use a Placebo Treatment
Estimated effect:0.20129513887070638
New effect:-0.0009530361284196393
p value:0.88






In [147]:
# Refutation 3: re-estimate on random 90% subsets of the data. A stable estimate
# should stay close to the original.
# random_seed is fixed (same value as the parallel variant of this refuter) so
# the sampled subsets, and hence the reported p-value, are reproducible.
res_subset = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="data_subset_refuter",
    show_progress_bar=True,
    subset_fraction=0.9,
    random_seed=1,
)
print(res_subset)

Refuting Estimates: 100%|[32m██████████[0m| 100/100 [00:08<00:00, 11.35it/s]

Refute: Use a subset of data
Estimated effect:0.20129513887070638
New effect:0.20226608486128775
p value:0.6200000000000001






In [148]:
# Same 90%-subset refutation, seeded and run on all available cores
# (n_jobs=-1); verbose=10 makes joblib log its progress alongside the bar.
# Rebinds res_subset, replacing the result of the previous cell.
res_subset = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="data_subset_refuter",
    show_progress_bar=True,
    subset_fraction=0.9,
    random_seed=1,
    n_jobs=-1,
    verbose=10,
)
print(res_subset)

Refuting Estimates:   0%|[32m          [0m| 0/100 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
Refuting Estimates:  12%|[32m█▏        [0m| 12/100 [00:00<00:00, 118.63it/s][Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.7s
Refuting Estimates:  24%|[32m██▍       [0m| 24/100 [00:03<00:13,  5.54it/s] [Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    3.7s
Refuting Estimates:  36%|[32m███▌      [0m| 36/100 [00:03<00:06,  9.42it/s][Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.0s
Refuting Estimates:  48%|[32m████▊     [0m| 48/100 [00:04<00:03, 14.49it/s][Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    4.2s
Refuting Estimates:  60%|[32m██████    [0m| 60/100 [00:04<00:01, 20.73it/s][Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    4.3s
Refuting Estimates:  72%|[32m███████▏  [0m| 72/100 [00:04<00:00, 28.58it/s][Parallel(n_jobs=-1

Refute: Use a subset of data
Estimated effect:0.20129513887070638
New effect:0.2022884323438183
p value:0.52



[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.8s finished


📌 What It Means:

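• For the random-common-cause and data-subset refuters, a high p-value (>0.05) means the re-estimated effect is not significantly different from the original estimate (0.2013 vs. 0.2013 and ≈0.2023 above). For the placebo-treatment refuter the logic is reversed: the effect is expected to vanish, and the high p-value means the placebo effect (≈ −0.001) is not significantly different from zero.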

• This indicates that your estimate is stable and not easily altered by small changes in data or assumptions.

✅ You Can Say:

Since the p-values of all refutations are above 0.05, we fail to reject the null hypotheses that (a) the estimated effect is unchanged when a random common cause is added or a data subset is used, and (b) the effect of a placebo treatment is zero.

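Therefore, the refuted causal estimate of 0.201 (the "Estimated effect" in the refutation outputs above, i.e. daily vitamin intake shifts normalized energy level by about 20% of its observed range) is robust to refutations and likely reliable, assuming model assumptions hold. Note that this differs from the 0.186 quoted earlier in the notebook; fix the random seed and re-run so that a single figure is reported throughout.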

🔍 Optional Refinement

You could also mention:

• Refutations don’t prove that the estimate is “true,” but they increase confidence that it’s not an artifact of bias or model instability.

• It’s still crucial that your confounders are correctly specified and there’s no hidden confounding.