In [50]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from dowhy import CausalModel
from IPython.display import Image, display

DoWhy is a Python library that supports causal thinking and analysis through four steps:
1. Model a causal inference problem using assumptions that we create.
2. Identify an expression for the causal effect under these assumptions (“causal estimand”).
3. Estimate the expression using statistical methods such as matching or instrumental variables.
4. Verify the validity of the estimate using a variety of robustness checks.

Put more simply, DoWhy performs causal analysis as a pipeline: create the causal model -> identify the effect -> estimate the effect -> validate (refute) the estimate.

In [51]:
class Causal_Model:
    """Run the DoWhy model -> identify -> estimate -> refute pipeline for one
    treatment/outcome pair of a DataFrame.

    Every column other than the treatment and the outcome is handed to DoWhy
    as a common cause. That includes any column the treatment was derived
    from, so drop such columns from ``df`` beforehand if they should not be
    adjusted for.
    """

    def __init__(self, df: pd.DataFrame, treatment: str, outcome: str) -> None:
        """Store the data and the names of the treatment and outcome columns.

        Args:
            df: Table holding the treatment, the outcome and the covariates.
            treatment: Name of the treatment column in ``df``.
            outcome: Name of the outcome column in ``df``.
        """
        self.df = df
        self.treatment = treatment
        self.outcome = outcome

    def dowhy_model(self):
        """Build the causal model, then print its estimand, estimate and refutations.

        The outcome column is label-encoded on a copy of ``self.df``, so the
        DataFrame passed to the constructor is left unchanged. The effect is
        estimated with ``backdoor.propensity_score_weighting`` and checked
        with three refuters (random common cause, data subset, placebo
        treatment); each result is printed.

        Returns:
            The identified estimand returned by ``CausalModel.identify_effect()``.

        Raises:
            KeyError: If the treatment or outcome column is not in ``self.df``.
        """
        # Validate up front: previously a missing treatment column surfaced as
        # an UnboundLocalError on the covariate list instead of a clear message.
        missing = [
            name for name in (self.treatment, self.outcome)
            if name not in self.df.columns
        ]
        if missing:
            raise KeyError(f"column(s) not found in DataFrame: {missing}")

        # Encode on a copy so the caller's DataFrame is not silently rewritten.
        frame = self.df.copy()
        frame[self.outcome] = LabelEncoder().fit_transform(frame[self.outcome])

        # All remaining columns are treated as common causes (confounders).
        common_causes = [
            col for col in frame.columns
            if col not in (self.treatment, self.outcome)
        ]

        model = CausalModel(
            data=frame,
            treatment=self.treatment,
            outcome=self.outcome,
            common_causes=common_causes,
        )
        model.view_model()

        estimands = model.identify_effect()
        print(estimands)

        estimate = model.estimate_effect(
            estimands, method_name="backdoor.propensity_score_weighting"
        )
        print(estimate)

        # Same refuters, in the same order, as the original three explicit calls.
        for refuter in (
            "random_common_cause",
            "data_subset_refuter",
            "placebo_treatment_refuter",
        ):
            print(model.refute_estimate(estimands, estimate, refuter))

        return estimands
    
    



In [52]:
# Load the feature CSV; the path is relative to the notebook's working directory.
data = pd.read_csv(
    '../data/Univariate_feature_selection_best_5_features.csv'
)


In [53]:
# Replace the diagnosis labels with integer codes (the column is overwritten),
# then show the resulting frame.
labelencoder = LabelEncoder()
labelencoder.fit(data['diagnosis'])
data['diagnosis'] = labelencoder.transform(data['diagnosis'])
data

Unnamed: 0,diagnosis,area_mean,area_se,texture_mean,concavity_worst,concavity_mean
0,1,1001.0,153.40,10.38,0.7119,0.30010
1,1,1326.0,74.08,17.77,0.2416,0.08690
2,1,1203.0,94.03,21.25,0.4504,0.19740
3,1,386.1,27.23,20.38,0.6869,0.24140
4,1,1297.0,94.44,14.34,0.4000,0.19800
...,...,...,...,...,...,...
564,1,1479.0,158.70,22.39,0.4107,0.24390
565,1,1261.0,99.04,28.25,0.3215,0.14400
566,1,858.1,48.55,28.08,0.3403,0.09251
567,1,1265.0,86.22,29.33,0.9387,0.35140


# Hypothesis  
Benign cells (no cancer) tend to have less concavity than malignant cells (cancer).
Does high concavity cause a malignant diagnosis?

In [54]:
# Cut-off above which a sample counts as "high concavity".
# NOTE(review): the origin of 0.04558 is not shown in this notebook —
# presumably a summary statistic of concavity_mean; confirm and document it.
CONCAVITY_THRESHOLD = 0.04558

# Binary treatment flag. A vectorised comparison produces the same boolean
# column as the element-wise `.apply(lambda ...)` without a Python-level loop.
data['High_concavity'] = data['concavity_mean'] > CONCAVITY_THRESHOLD
data

Unnamed: 0,diagnosis,area_mean,area_se,texture_mean,concavity_worst,concavity_mean,High_concavity
0,1,1001.0,153.40,10.38,0.7119,0.30010,True
1,1,1326.0,74.08,17.77,0.2416,0.08690,True
2,1,1203.0,94.03,21.25,0.4504,0.19740,True
3,1,386.1,27.23,20.38,0.6869,0.24140,True
4,1,1297.0,94.44,14.34,0.4000,0.19800,True
...,...,...,...,...,...,...,...
564,1,1479.0,158.70,22.39,0.4107,0.24390,True
565,1,1261.0,99.04,28.25,0.3215,0.14400,True
566,1,858.1,48.55,28.08,0.3403,0.09251,True
567,1,1265.0,86.22,29.33,0.9387,0.35140,True


In [55]:
# Set up the analysis: treatment = High_concavity flag, outcome = diagnosis.
c = Causal_Model(df=data, treatment='High_concavity', outcome='diagnosis')

In [56]:

# Run the DoWhy pipeline: prints the identified estimand, the estimate and the
# three refutation results; the returned estimand is shown as the cell output.
c.dowhy_model()

Estimand type: nonparametric-ate

### Estimand : 1
Estimand name: backdoor
Estimand expression:
        d                                                                     
─────────────────(Expectation(diagnosis|area_mean,concavity_mean,concavity_wor
d[High_concavity]                                                             

                         
st,area_se,texture_mean))
                         
Estimand assumption 1, Unconfoundedness: If U→{High_concavity} and U→diagnosis then P(diagnosis|High_concavity,area_mean,concavity_mean,concavity_worst,area_se,texture_mean,U) = P(diagnosis|High_concavity,area_mean,concavity_mean,concavity_worst,area_se,texture_mean)

### Estimand : 2
Estimand name: iv
No such variable found!

### Estimand : 3
Estimand name: frontdoor
No such variable found!

*** Causal Estimate ***

## Identified estimand
Estimand type: nonparametric-ate

### Estimand : 1
Estimand name: backdoor
Estimand expression:
        d                                       

<dowhy.causal_identifier.IdentifiedEstimand at 0x25a53d38be0>