# DoubleML

## EDA

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import optuna

import warnings
# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this blanket filter also hides library warnings that may matter
# (deprecations, learner-validation messages, pandas chained-assignment
# warnings) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the survey extract from the working directory. low_memory=False makes
# pandas parse the file in a single pass rather than in chunks.
mics = pd.read_csv(filepath_or_buffer='mics.csv', low_memory=False)

In [3]:
# Preview the first five rows of the raw frame.
mics.head(n=5)

Unnamed: 0,HH1,HH2,HINT,HH3,HH4,HH5D,HH5M,HH5Y,HH6,HH7,...,NoRiskHome_01_2,RiskHome_0_12,RiskSource_0_12,water_treatment3,Any_U5,Region,windex_ur,windex5_categ,wq27_decile,SomeRiskHome
0,1,5,12.0,12,11,2,6. JUNE,2017,2. Rural,1. EAST,...,1,1,1,0,1,1,2,Poor,7,1
1,1,14,15.0,15,11,3,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,0,1,1,2,Poor,1,1
2,1,22,15.0,15,11,4,6. JUNE,2017,2. Rural,1. EAST,...,0,1,1,0,1,1,2,Middle,8,1
3,2,3,12.0,12,11,5,6. JUNE,2017,2. Rural,1. EAST,...,0,1,1,0,1,1,2,Middle,8,1
4,2,11,12.0,12,11,5,6. JUNE,2017,2. Rural,1. EAST,...,1,1,1,0,0,1,1,Poor,8,1


In [4]:
# Columns kept for the analysis: the covariates plus the treatment-related
# columns (water_treatment, WS1_g) and the two outcomes (VeryHighRiskHome,
# SomeRiskHome) used in the DoubleML sections below.
required_cols = [
    'windex_ur', 'windex5', 'helevel', 'country_cat', 'urban', 'WS1_g',
    'wq27_decile', 'WQ15_g', 'RiskSource', 'water_treatment',
    'VeryHighRiskHome', 'SomeRiskHome',
]
# .copy() makes the subset an independent frame, so the column assignments in
# the following cells modify it directly instead of a slice of the raw data
# (which triggers pandas' SettingWithCopyWarning).
mics = mics[required_cols].copy()

In [5]:
# Dtypes and non-null counts of the retained columns.
mics.loc[:, required_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54340 entries, 0 to 54339
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   windex_ur         54340 non-null  int64 
 1   windex5           54340 non-null  object
 2   helevel           54340 non-null  object
 3   country_cat       54340 non-null  object
 4   urban             54340 non-null  object
 5   WS1_g             54340 non-null  object
 6   wq27_decile       54340 non-null  int64 
 7   WQ15_g            54340 non-null  object
 8   RiskSource        54340 non-null  object
 9   water_treatment   54340 non-null  int64 
 10  VeryHighRiskHome  54340 non-null  int64 
 11  SomeRiskHome      54340 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 5.0+ MB


In [6]:
# Print the frequency table of every retained column, each followed by a
# 40-character ruler line and a blank line.
divider = '=' * 40
for column_name in required_cols:
    frequencies = mics[column_name].value_counts()
    print(frequencies)
    print(divider, '\n')

windex_ur
2    29337
1    25003
Name: count, dtype: int64

windex5
Poorest    14039
Poor       11411
Middle     10555
Rich        9773
Richest     8562
Name: count, dtype: int64

helevel
Secondary or higher    23698
Primary                17225
No education           13417
Name: count, dtype: int64

country_cat
Bangladesh                  6045
Honduras                    3670
Benin                       3632
Madagascar                  3239
Ghana                       3130
Malawi                      3090
Lao                         2984
Viet Nam                    2856
DR Congo                    2706
Dominican Republic          2513
Chad                        2110
Zimbabwe                    1991
Guinea Bissau               1812
Gambia                      1744
Sierra Leone                1741
Mongolia                    1631
Trinidad and Tobago         1583
Lesotho                     1329
Guyana                      1324
Eswatini                    1156
Fiji                       

In [7]:
# Integer codes for the string-valued columns. Series.map leaves NaN for any
# value that is not a key of the mapping, so re-running this cell after the
# columns are already numeric turns them into NaN.
helevel = {'No education': 0, 'Primary': 1, 'Secondary or higher': 2}
urban = {'Rural': 0, 'Urban': 1}
RiskSource = {'No risk': 0, 'Moderate to high risk': 1, 'Very high risk': 2}

# Apply each mapping to the column of the same name.
for column_name, codes in (
    ('helevel', helevel),
    ('urban', urban),
    ('RiskSource', RiskSource),
):
    mics[column_name] = mics[column_name].map(codes)

In [8]:
# Distinct wealth-quintile labels, in order of first appearance.
pd.unique(mics['windex5'])

array(['Poor', 'Middle', 'Poorest', 'Rich', 'Richest'], dtype=object)

In [9]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Explicit category order for WQ15_g; with drop='first' the first entry
# ('Treat: Nothing') is the baseline that gets no dummy column.
wq15_categories = [[
    'Treat: Nothing', 'Treat: Strain/Settle', 'Treat: Chlorine/Aquatabs/PUR',
    'Treat: Boil', 'Treat: Other',
]]

# Wealth quintiles from lowest to highest; the ordinal encoder assigns 0..4.
windex5_cat = [['Poorest', 'Poor', 'Middle', 'Rich', 'Richest']]

# Column groups handled by each encoder.
cat_default = ['country_cat', 'WS1_g', 'water_treatment']
cat_wq15 = ['WQ15_g']
ord_windex5 = ['windex5']

# One encoder per column group; every other column is passed through as-is.
wq15_encoder = OneHotEncoder(
    categories=wq15_categories,
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)
windex5_encoder = OrdinalEncoder(categories=windex5_cat)
default_encoder = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)

ct = ColumnTransformer(
    transformers=[
        ('wq15', wq15_encoder, cat_wq15),
        ('windex5', windex5_encoder, ord_windex5),
        ('other_cat', default_encoder, cat_default),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Replaces `mics` with the encoded frame, so this cell cannot be re-run
# without reloading the data first.
mics = ct.fit_transform(mics)


In [10]:
# Preview the first five rows of the encoded frame.
mics.head(n=5)

Unnamed: 0,WQ15_g_Treat: Strain/Settle,WQ15_g_Treat: Chlorine/Aquatabs/PUR,WQ15_g_Treat: Boil,WQ15_g_Treat: Other,windex5,country_cat_Benin,country_cat_Central African Republic,country_cat_Chad,country_cat_DR Congo,country_cat_Dominican Republic,...,WS1_g_Tube/Well/Borehole,WS1_g_Unprotected well/spring,water_treatment_1,windex_ur,helevel,urban,wq27_decile,RiskSource,VeryHighRiskHome,SomeRiskHome
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2,0,0,7,1,0,1
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2,0,0,1,0,0,1
2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2,0,0,8,2,1,1
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,2,0,0,8,2,1,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1,0,0,8,1,0,1


# Binary Treatment

## Y = VeryHighRiskHome

In [11]:
# Column roles for the binary-treatment models: two alternative outcomes, one
# treatment dummy, and every remaining column as a covariate.
y1 = ['VeryHighRiskHome']
y2 = ['SomeRiskHome']
d_cols = ['water_treatment_1']
exclude = [*y1, *y2, *d_cols]
# NOTE(review): the covariates include the WQ15_g_* dummies; if they encode
# the kind of water treatment applied they may predict water_treatment_1
# almost perfectly — confirm they belong in the adjustment set.
x_cols = mics.columns[~mics.columns.isin(exclude)].tolist()

In [None]:
from doubleml import DoubleMLData

# DoubleML data container: outcome VeryHighRiskHome, binary treatment.
data1_roles = dict(y_col='VeryHighRiskHome', d_cols=d_cols, x_cols=x_cols)
dml_data1 = DoubleMLData(data=mics, **data1_roles)



KeyboardInterrupt



In [None]:
from xgboost import XGBClassifier, XGBRegressor

# Both nuisance learners start from the same XGBoost configuration; the
# tuning cell below searches over some of these settings.
binary_xgb_kwargs = dict(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

# NOTE(review): ml_l models the outcome given the covariates; confirm the
# installed DoubleML version accepts a classifier here for a binary outcome.
ml_l_xgb = XGBClassifier(**binary_xgb_kwargs)

# ml_m models the binary treatment given the covariates.
ml_m_xgb = XGBClassifier(**binary_xgb_kwargs)

In [None]:
from doubleml import DoubleMLPLR

# Seed NumPy's global generator before building the model, so the random
# cross-fitting sample split is the same on every Restart & Run All.
np.random.seed(42)

# Partially linear regression model: outcome VeryHighRiskHome, binary treatment.
dml_plr_tree = DoubleMLPLR(dml_data1,
                           ml_l=ml_l_xgb,
                           ml_m=ml_m_xgb)

In [None]:
def _xgb_search_space(trial):
    """Sample one XGBoost configuration from an Optuna trial.

    Draws the number of trees (50-200 in steps of 50), the maximum depth
    (3-10) and the minimum child weight (1-20).
    """
    return dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200, step=50),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 20),
    )


def ml_l_params(trial):
    """Search space for the ``ml_l`` learner."""
    return _xgb_search_space(trial)


def ml_m_params(trial):
    """Search space for the ``ml_m`` learner."""
    return _xgb_search_space(trial)


# One sampling function per nuisance learner.
param_space = dict(ml_l=ml_l_params, ml_m=ml_m_params)

optuna_settings = dict(
    n_trials=100,
    show_progress_bar=True,
    verbosity=optuna.logging.WARNING,  # keep Optuna's per-trial logs quiet
)

dml_plr_tree.tune_ml_models(
    ml_param_space=param_space,
    optuna_settings=optuna_settings,
)
 

In [None]:
# Fit the model (fit() returns the model itself), then show the summary table.
dml_plr_tree.fit().summary

In [None]:
# Group average treatment effects by education level: turn the integer codes
# back into labels and pass them as a one-column group frame.
education_labels = {0: 'No education', 1: 'Primary', 2: 'Secondary or higher'}
groups = mics['helevel'].map(education_labels).to_frame(name='Education level')

gate_he = dml_plr_tree.gate(groups=groups)
gate_he.summary


In [None]:
# Group average treatment effects for rural versus urban households.
area_labels = {0: 'Rural', 1: 'Urban'}
groups = mics['urban'].map(area_labels).to_frame(name='Area')

gate_urban = dml_plr_tree.gate(groups=groups)
gate_urban.summary

In [None]:
# Group average treatment effects by wealth quintile. windex5 holds the
# ordinal codes assigned earlier (0 = Poorest ... 4 = Richest).
wealth_labels = {0: 'Poorest', 1: 'Poor', 2: 'Middle', 3: 'Rich', 4: 'Richest'}
groups = mics['windex5'].map(wealth_labels).to_frame(name='Wealth Index')

gate_windex = dml_plr_tree.gate(groups=groups)
gate_windex.summary

In [None]:
# Group average treatment effects for every education x area x wealth
# combination. The column is called 'Edu_Area' but its labels also carry the
# wealth quintile.
education_labels = {0: 'No education', 1: 'Primary', 2: 'Secondary or higher'}
area_labels = {0: 'Rural', 1: 'Urban'}
wealth_labels = {0: 'Poorest', 1: 'Poor', 2: 'Middle', 3: 'Rich', 4: 'Richest'}

education = mics['helevel'].map(education_labels)
area = mics['urban'].map(area_labels)
wealth = mics['windex5'].map(wealth_labels)

groups = (education + ' | ' + area + ' | ' + wealth).to_frame(name='Edu_Area')

gate_both = dml_plr_tree.gate(groups=groups)
gate_both.summary


## Y = SomeRiskHome

In [None]:
from doubleml import DoubleMLData

# DoubleML data container: outcome SomeRiskHome, binary treatment.
data2_roles = dict(y_col='SomeRiskHome', d_cols=d_cols, x_cols=x_cols)
dml_data2 = DoubleMLData(data=mics, **data2_roles)


In [None]:
from xgboost import XGBClassifier, XGBRegressor

# Fresh, untuned learners for the SomeRiskHome model; both share one
# XGBoost configuration.
binary_xgb_kwargs = dict(
    use_label_encoder=False,
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    n_estimators=34,
)

# NOTE(review): ml_l models the outcome given the covariates; confirm the
# installed DoubleML version accepts a classifier here for a binary outcome.
ml_l_xgb = XGBClassifier(**binary_xgb_kwargs)

# ml_m models the binary treatment given the covariates.
ml_m_xgb = XGBClassifier(**binary_xgb_kwargs)

In [None]:
from doubleml import DoubleMLPLR

# Seed NumPy's global generator before building the model, so the random
# cross-fitting sample split is the same on every Restart & Run All.
np.random.seed(42)

# Partially linear regression model: outcome SomeRiskHome, binary treatment.
# Rebinding `dml_plr_tree` discards the VeryHighRiskHome model fitted above.
dml_plr_tree = DoubleMLPLR(dml_data2,
                           ml_l=ml_l_xgb,
                           ml_m=ml_m_xgb)

In [None]:
def _xgb_search_space(trial):
    """Sample one XGBoost configuration from an Optuna trial.

    Draws the number of trees (50-200 in steps of 50), the maximum depth
    (3-10) and the minimum child weight (1-20).
    """
    return dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200, step=50),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 20),
    )


def ml_l_params(trial):
    """Search space for the ``ml_l`` learner."""
    return _xgb_search_space(trial)


def ml_m_params(trial):
    """Search space for the ``ml_m`` learner."""
    return _xgb_search_space(trial)


# One sampling function per nuisance learner.
param_space = dict(ml_l=ml_l_params, ml_m=ml_m_params)

optuna_settings = dict(
    n_trials=100,
    show_progress_bar=True,
    verbosity=optuna.logging.WARNING,  # keep Optuna's per-trial logs quiet
)

dml_plr_tree.tune_ml_models(
    ml_param_space=param_space,
    optuna_settings=optuna_settings,
)
 

In [None]:
# Fit the model (fit() returns the model itself), then show the summary table.
dml_plr_tree.fit().summary

In [None]:
# Group average treatment effects by education level: turn the integer codes
# back into labels and pass them as a one-column group frame.
education_labels = {0: 'No education', 1: 'Primary', 2: 'Secondary or higher'}
groups = mics['helevel'].map(education_labels).to_frame(name='Education level')

gate_he = dml_plr_tree.gate(groups=groups)
gate_he.summary


In [None]:
# Group average treatment effects for rural versus urban households.
area_labels = {0: 'Rural', 1: 'Urban'}
groups = mics['urban'].map(area_labels).to_frame(name='Area')

gate_urban = dml_plr_tree.gate(groups=groups)
gate_urban.summary

In [None]:
# Group average treatment effects by wealth quintile. windex5 holds the
# ordinal codes assigned earlier (0 = Poorest ... 4 = Richest).
wealth_labels = {0: 'Poorest', 1: 'Poor', 2: 'Middle', 3: 'Rich', 4: 'Richest'}
groups = mics['windex5'].map(wealth_labels).to_frame(name='Wealth Index')

gate_windex = dml_plr_tree.gate(groups=groups)
gate_windex.summary

In [None]:
# Group average treatment effects for every education x area x wealth
# combination. The column is called 'Edu_Area' but its labels also carry the
# wealth quintile.
education_labels = {0: 'No education', 1: 'Primary', 2: 'Secondary or higher'}
area_labels = {0: 'Rural', 1: 'Urban'}
wealth_labels = {0: 'Poorest', 1: 'Poor', 2: 'Middle', 3: 'Rich', 4: 'Richest'}

education = mics['helevel'].map(education_labels)
area = mics['urban'].map(area_labels)
wealth = mics['windex5'].map(wealth_labels)

groups = (education + ' | ' + area + ' | ' + wealth).to_frame(name='Edu_Area')

gate_both = dml_plr_tree.gate(groups=groups)
gate_both.summary


# Multinomial Treatment

## Y = VeryHighRiskHome

In [None]:
# Column roles for the multi-treatment models: every WS1_g_* dummy is a
# treatment column; the outcomes and the binary water_treatment_1 dummy are
# kept out of the covariates.
y1 = ['VeryHighRiskHome']
y2 = ['SomeRiskHome']
d_cols = mics.columns[mics.columns.str.startswith('WS1_g_')].tolist()
exclude = [*y1, *y2, *d_cols, 'water_treatment_1']
x_cols = mics.columns[~mics.columns.isin(exclude)].tolist()

In [None]:
from doubleml import DoubleMLData

# DoubleML data container: outcome VeryHighRiskHome, WS1_g_* dummies as treatments.
data3_roles = dict(y_col='VeryHighRiskHome', d_cols=d_cols, x_cols=x_cols)
dml_data3 = DoubleMLData(data=mics, **data3_roles)


In [None]:
from xgboost import XGBClassifier, XGBRegressor

# Outcome learner: binary classifier for VeryHighRiskHome.
ml_l_xgb = XGBClassifier(use_label_encoder=False,
                         objective="binary:logistic",
                         eval_metric="logloss",
                         eta=0.1,
                         n_estimators=34,
                         n_jobs=-1)

# Treatment learner. DoubleML fits ml_m separately for each column in d_cols,
# and each WS1_g_* column is a 0/1 dummy, so the learner only ever sees a
# binary target. A 5-class softmax (multi:softprob, num_class=5) would spread
# probability over classes that never occur, so use a binary objective, as in
# the binary-treatment section.
ml_m_xgb = XGBClassifier(use_label_encoder=False,
                         objective="binary:logistic",
                         eval_metric="logloss",
                         eta=0.1,
                         n_estimators=34,
                         n_jobs=-1)

In [None]:
from doubleml import DoubleMLPLR

# Seed NumPy's global generator before building the model, so the random
# cross-fitting sample split is the same on every Restart & Run All.
np.random.seed(42)

# Partially linear regression model: outcome VeryHighRiskHome, one treatment
# column per WS1_g_* dummy.
dml_plr_tree_multi = DoubleMLPLR(dml_data3,
                                 ml_l=ml_l_xgb,
                                 ml_m=ml_m_xgb)


In [None]:
# Optuna tuning for the multi-treatment model is disabled (commented out):
# the model in the next cell is fit with the fixed XGBoost settings.
# def ml_l_params(trial):
#      return {
#          'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=50),
#          'max_depth': trial.suggest_int('max_depth', 3, 10)
#      }
 

# def ml_m_params(trial):
#      return {
#          'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=50),
#          'max_depth': trial.suggest_int('max_depth', 3, 10)
#      }
 

# param_space = {
#      'ml_l': ml_l_params,
#      'ml_m': ml_m_params
#  }
 

# optuna_settings = {
#     'n_trials': 100,
#     'show_progress_bar': True,
#     'verbosity': optuna.logging.WARNING,  # Suppress Optuna logs
#  }
 

# dml_plr_tree_multi.tune_ml_models(ml_param_space=param_space,
#                              optuna_settings=optuna_settings,
#  )
 

In [None]:
# Fit the model (fit() returns the model itself), then show the summary table
# with one row per treatment column.
dml_plr_tree_multi.fit().summary

In [None]:
# Group average treatment effects by education level.
# NOTE(review): DoubleMLPLR.gate may only be implemented for a single
# treatment column; with several WS1_g_* dummies in d_cols this call could
# raise NotImplementedError — verify against the installed DoubleML version.
education_labels = {0: 'No education', 1: 'Primary', 2: 'Secondary or higher'}
groups = mics['helevel'].map(education_labels).to_frame(name='Education level')

gate_he = dml_plr_tree_multi.gate(groups=groups)
gate_he.summary


In [None]:
# Group average treatment effects for rural versus urban households.
# NOTE(review): DoubleMLPLR.gate may only support a single treatment column —
# verify that it works with the multi-column d_cols used here.
area_labels = {0: 'Rural', 1: 'Urban'}
groups = mics['urban'].map(area_labels).to_frame(name='Area')

gate_urban = dml_plr_tree_multi.gate(groups=groups)
gate_urban.summary

In [None]:
# Group average treatment effects by wealth quintile. windex5 holds the
# ordinal codes assigned earlier (0 = Poorest ... 4 = Richest).
# NOTE(review): DoubleMLPLR.gate may only support a single treatment column —
# verify that it works with the multi-column d_cols used here.
wealth_labels = {0: 'Poorest', 1: 'Poor', 2: 'Middle', 3: 'Rich', 4: 'Richest'}
groups = mics['windex5'].map(wealth_labels).to_frame(name='Wealth Index')

gate_windex = dml_plr_tree_multi.gate(groups=groups)
gate_windex.summary

In [None]:
# Group average treatment effects for every education x area x wealth
# combination. The column is called 'Edu_Area' but its labels also carry the
# wealth quintile.
# NOTE(review): DoubleMLPLR.gate may only support a single treatment column —
# verify that it works with the multi-column d_cols used here.
education_labels = {0: 'No education', 1: 'Primary', 2: 'Secondary or higher'}
area_labels = {0: 'Rural', 1: 'Urban'}
wealth_labels = {0: 'Poorest', 1: 'Poor', 2: 'Middle', 3: 'Rich', 4: 'Richest'}

education = mics['helevel'].map(education_labels)
area = mics['urban'].map(area_labels)
wealth = mics['windex5'].map(wealth_labels)

groups = (education + ' | ' + area + ' | ' + wealth).to_frame(name='Edu_Area')

gate_both = dml_plr_tree_multi.gate(groups=groups)
gate_both.summary


## Y = SomeRiskHome

In [None]:
from doubleml import DoubleMLData

# DoubleML data container: outcome SomeRiskHome, WS1_g_* dummies as treatments.
data4_roles = dict(y_col='SomeRiskHome', d_cols=d_cols, x_cols=x_cols)
dml_data4 = DoubleMLData(data=mics, **data4_roles)


In [None]:
from xgboost import XGBClassifier, XGBRegressor

# Outcome learner: binary classifier for SomeRiskHome.
ml_l_xgb = XGBClassifier(use_label_encoder=False,
                         objective="binary:logistic",
                         eval_metric="logloss",
                         eta=0.1,
                         n_estimators=34,
                         n_jobs=-1)

# Treatment learner. DoubleML fits ml_m separately for each column in d_cols,
# and each WS1_g_* column is a 0/1 dummy, so the learner only ever sees a
# binary target. A 5-class softmax (multi:softprob, num_class=5) would spread
# probability over classes that never occur, so use a binary objective, as in
# the binary-treatment section.
ml_m_xgb = XGBClassifier(use_label_encoder=False,
                         objective="binary:logistic",
                         eval_metric="logloss",
                         eta=0.1,
                         n_estimators=34,
                         n_jobs=-1)

In [None]:
from doubleml import DoubleMLPLR

# Seed NumPy's global generator before building the model, so the random
# cross-fitting sample split is the same on every Restart & Run All.
np.random.seed(42)

# Partially linear regression model: outcome SomeRiskHome, one treatment
# column per WS1_g_* dummy. Rebinding `dml_plr_tree_multi` discards the
# VeryHighRiskHome model fitted above.
dml_plr_tree_multi = DoubleMLPLR(dml_data4,
                                 ml_l=ml_l_xgb,
                                 ml_m=ml_m_xgb)


In [None]:
# Optuna tuning for the multi-treatment model is disabled (commented out):
# the model in the next cell is fit with the fixed XGBoost settings.
# def ml_l_params(trial):
#      return {
#          'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=50),
#          'max_depth': trial.suggest_int('max_depth', 3, 10)
#      }
 

# def ml_m_params(trial):
#      return {
#          'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=50),
#          'max_depth': trial.suggest_int('max_depth', 3, 10)
#      }
 

# param_space = {
#      'ml_l': ml_l_params,
#      'ml_m': ml_m_params
#  }
 

# optuna_settings = {
#     'n_trials': 100,
#     'show_progress_bar': True,
#     'verbosity': optuna.logging.WARNING,  # Suppress Optuna logs
#  }
 

# dml_plr_tree_multi.tune_ml_models(ml_param_space=param_space,
#                              optuna_settings=optuna_settings,
#  )
 

In [None]:
# Fit the model (fit() returns the model itself), then show the summary table
# with one row per treatment column.
dml_plr_tree_multi.fit().summary

In [None]:
# Group average treatment effects by education level.
# NOTE(review): DoubleMLPLR.gate may only be implemented for a single
# treatment column; with several WS1_g_* dummies in d_cols this call could
# raise NotImplementedError — verify against the installed DoubleML version.
education_labels = {0: 'No education', 1: 'Primary', 2: 'Secondary or higher'}
groups = mics['helevel'].map(education_labels).to_frame(name='Education level')

gate_he = dml_plr_tree_multi.gate(groups=groups)
gate_he.summary


In [None]:
# Group average treatment effects for rural versus urban households.
# NOTE(review): DoubleMLPLR.gate may only support a single treatment column —
# verify that it works with the multi-column d_cols used here.
area_labels = {0: 'Rural', 1: 'Urban'}
groups = mics['urban'].map(area_labels).to_frame(name='Area')

gate_urban = dml_plr_tree_multi.gate(groups=groups)
gate_urban.summary

In [None]:
# Group average treatment effects by wealth quintile. windex5 holds the
# ordinal codes assigned earlier (0 = Poorest ... 4 = Richest).
# NOTE(review): DoubleMLPLR.gate may only support a single treatment column —
# verify that it works with the multi-column d_cols used here.
wealth_labels = {0: 'Poorest', 1: 'Poor', 2: 'Middle', 3: 'Rich', 4: 'Richest'}
groups = mics['windex5'].map(wealth_labels).to_frame(name='Wealth Index')

gate_windex = dml_plr_tree_multi.gate(groups=groups)
gate_windex.summary

In [None]:
# Group average treatment effects for every education x area x wealth
# combination. The column is called 'Edu_Area' but its labels also carry the
# wealth quintile.
# NOTE(review): DoubleMLPLR.gate may only support a single treatment column —
# verify that it works with the multi-column d_cols used here.
education_labels = {0: 'No education', 1: 'Primary', 2: 'Secondary or higher'}
area_labels = {0: 'Rural', 1: 'Urban'}
wealth_labels = {0: 'Poorest', 1: 'Poor', 2: 'Middle', 3: 'Rich', 4: 'Richest'}

education = mics['helevel'].map(education_labels)
area = mics['urban'].map(area_labels)
wealth = mics['windex5'].map(wealth_labels)

groups = (education + ' | ' + area + ' | ' + wealth).to_frame(name='Edu_Area')

gate_both = dml_plr_tree_multi.gate(groups=groups)
gate_both.summary
