# Deconfounding using Do Why

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import graphviz
import dowhy
from IPython.display import Image, display

In [3]:
# Create your dag
dot = graphviz.Digraph() 

# Create nodes
dot.node('cluster', 'cluster')  
dot.node('streaming_pre', 'streaming_pre')
dot.node('is_treatment', 'is_treatment')
dot.node('streaming_post', 'streaming_post')

# Align the nodes
dot.edge('cluster', 'streaming_post')
dot.edge('cluster', 'is_treatment')
dot.edge('streaming_pre', 'streaming_post')
dot.edge('streaming_pre', 'is_treatment')
dot.edge('is_treatment', 'streaming_post')

# Print your nodes
print(dot.source)

# Save your dag
dot.format = 'png'
dot.render(directory='doctest-output', view = True).replace('\\', '/')

digraph {
	cluster [label=cluster]
	streaming_pre [label=streaming_pre]
	is_treatment [label=is_treatment]
	streaming_post [label=streaming_post]
	cluster -> streaming_post
	cluster -> is_treatment
	streaming_pre -> streaming_post
	streaming_pre -> is_treatment
	is_treatment -> streaming_post
}



'doctest-output/Digraph.gv.png'

## Create data

In [5]:
# Generate Treatment Data
tr_cluster_n = [7000, 13000, 25000, 16000, 1000, 1000]
treatment = pd.DataFrame( 
  [(np.random.normal(14), np.random.normal(17)) for x in range(tr_cluster_n[0])] +
  [(np.random.normal(17), np.random.normal(19)) for x in range(tr_cluster_n[1])] +
  [(np.random.normal(19), np.random.normal(21)) for x in range(tr_cluster_n[2])] +
  [(np.random.normal(16), np.random.normal(18)) for x in range(tr_cluster_n[3])] +
  [(np.random.normal(11), np.random.normal(14)) for x in range(tr_cluster_n[4])] +
  [(np.random.normal(12), np.random.normal(15)) for x in range(tr_cluster_n[5])])

treatment['cluster'] = np.concatenate([np.repeat(i, x) for i,x in enumerate(tr_cluster_n)])
treatment.columns = ['streaming_pre', 'streaming_post', 'cluster']

treatment['is_treatment'] = 1

# Generate Control group Data
ctr_cluster_n = [690, 2130, 7800, 7700, 1200, 100]
control = pd.DataFrame(
  [(np.random.normal(12), np.random.normal(13)) for x in range(ctr_cluster_n[0])] +
 [(np.random.normal(16), np.random.normal(16)) for x in range(ctr_cluster_n[1])] +
 [(np.random.normal(20), np.random.normal(20)) for x in range(ctr_cluster_n[2])] +
 [(np.random.normal(14), np.random.normal(15)) for x in range(ctr_cluster_n[3])] +
 [(np.random.normal(10), np.random.normal(9)) for x in range(ctr_cluster_n[4])] +
 [(np.random.normal(12), np.random.normal(12)) for x in range(ctr_cluster_n[5])])

control['cluster'] = np.concatenate([np.repeat(i, x) for i,x in enumerate(ctr_cluster_n)])
control.columns = ['streaming_pre', 'streaming_post', 'cluster']
control['is_treatment'] = 0

# Treatment and Control in single dataframe
df = pd.concat([treatment, control])
df.head()

Unnamed: 0,streaming_pre,streaming_post,cluster,is_treatment
0,13.714523,16.122251,0,1
1,14.36003,17.433372,0,1
2,16.103542,16.850052,0,1
3,14.266474,16.310738,0,1
4,14.011724,15.348387,0,1


In [7]:
print(df.groupby('is_treatment')['streaming_post'].mean())
print(df.groupby('is_treatment')['streaming_pre'].mean())

is_treatment
0    16.643007
1    19.172541
Name: streaming_post, dtype: float64
is_treatment
0    16.274409
1    17.028754
Name: streaming_pre, dtype: float64


In [8]:
# Initializing causal model
model = dowhy.CausalModel(data=df,
                     graph=dot.source.replace("\t", ' ').replace("\n", ' '),
                     treatment="is_treatment",
                     outcome="streaming_post")

- The dag helps to identify the estimand which is what we need to control for
- dowhy will allow us to use any tehcnique to estimate the causal effect - here we choose propensity score stratification using average treatment effect

In [9]:
# Estimate
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)

In [10]:
# Estimate Causal Effect with propensity score stratifications
estimate = model.estimate_effect(identified_estimand, 
                                 method_name="backdoor.propensity_score_stratification",
                                target_units="att")
print(f"Estimated average treatment effect on the treated {estimate.value}")

propensity_score_stratification


  y = column_or_1d(y, warn=True)


Estimated average treatment effect on the treated 1.8494783632434122


1.84 