# MICS Data Preprocessing

This notebook preprocesses the raw MICS data for DoubleML analysis:
- Defines outcome variables (Y), treatment variable (T), and covariates (X)
- Encodes ordinal variables with proper mappings
- One-hot encodes categorical variables
- Exports cleaned data (NaN values preserved for later handling, except `water_carrier_edu`, whose missing code 98 is recoded to a `-1` sentinel)

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import OneHotEncoder

## 1. Load Raw Data

In [2]:
# Read the raw MICS extract into memory.
# low_memory=False asks pandas to parse the file in a single pass instead of
# in chunks, so each column's dtype is inferred from the whole column.
mics = pd.read_csv("mics.csv", low_memory=False)

# Report (rows, columns), then preview the first rows as the cell output.
print("Dataset shape:", mics.shape)
mics.head()

Dataset shape: (56721, 785)


Unnamed: 0,HH1,HH2,HINT,HH3,HH4,HH5D,HH5M,HH5Y,HH6,HH7,...,RiskHome_0_12,RiskSource_0_12,water_treatment3,Any_U5,Region,windex_ur,windex5_categ,helevel_temp,wq27_decile,SomeRiskHome
0,1,5,12.0,12,11,2,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,1,1,2,Poor,,7,1
1,1,14,15.0,15,11,3,6. JUNE,2017,2. Rural,1. EAST,...,1,0,0,1,1,2,Poor,,1,1
2,1,22,15.0,15,11,4,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,1,1,2,Middle,,8,1
3,2,3,12.0,12,11,5,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,1,1,2,Middle,,8,1
4,2,11,12.0,12,11,5,6. JUNE,2017,2. Rural,1. EAST,...,1,1,0,0,1,1,Poor,,8,1


## 2. Define Variable Groups

In [3]:
# ------------------------------------------------------------
# Outcomes (Y) and treatment (T)
# ------------------------------------------------------------
Y_cols = ["SomeRiskHome", "VeryHighRiskHome"]
T_cols = ["water_treatment"]

# ------------------------------------------------------------
# Basic controls: the core covariates of the main analysis
# ------------------------------------------------------------
# Ordinal variables are integer-encoded so that their order is preserved.
X_basic_ordinal = ["windex5", "helevel", "wq27_decile"]

# Categorical variables are one-hot encoded.
X_basic_categorical = ["country_cat", "WS1_g"]

# Binary indicator; "urban" is not a raw column, it is created from HH6.
X_basic_binary = ["urban"]

# ------------------------------------------------------------
# Extended controls: added on top of the basic controls and
# used in robustness checks
# ------------------------------------------------------------
# Binary: household composition
X_extended_household = ["Any_U5", "Girls_less_than15", "Boys_15or_less"]

# Binary: sanitation
X_extended_sanitation = [
    "improved_latrine",
    "Flush",
    "Pit_latrine",
    "Open_defecation",
]

# Binary: season and water sources
X_extended_water = [
    "rainy_season",
    "RainandSurfaceWater",
    "PurchasedWater",
    "Basic_water_service",
    "Limited_water_service",
    "Unimproved_water_service",
    "ImprovedWaterSource",
    "PipedWater",
    "WellandSpringWater",
]

# Ordinal: extended
X_extended_ordinal = ["water_carrier_edu"]

# Every extended binary covariate, in household -> sanitation -> water order.
X_extended_binary = [
    *X_extended_household,
    *X_extended_sanitation,
    *X_extended_water,
]

## 3. Create Urban Indicator and Define Ordinal Mappings

In [4]:
# ============================================================
# Create urban variable from HH6 column
# ============================================================
# HH6 contains values like "1. Urban", "2. Rural".
# Map it to a binary indicator (0 = Rural, 1 = Urban) with a case-insensitive
# substring match on "Urban".
#
# Rows where HH6 itself is missing are kept as <NA> rather than coded 0:
# `na=False` on its own would silently classify them as rural, which goes
# against this notebook's policy of preserving missing values for later
# handling. The nullable Int64 dtype lets 0/1 and <NA> share one column.
hh6_area = mics['HH6']
mics['urban'] = (
    hh6_area.str.contains('Urban', case=False, na=False)
    .astype('Int64')
    .mask(hh6_area.isna())
)

# ============================================================
# Define ordinal mappings
# ============================================================
# Each entry lists a column's labels from lowest to highest; the integer code
# of a label is simply its position in that list (starting at 0).
_ordered_labels = {
    "helevel": ["No education", "Primary", "Secondary or higher"],
    "windex5": ["Poorest", "Poor", "Middle", "Rich", "Richest"],
}

ordinal_mappings = {
    column: {label: rank for rank, label in enumerate(labels)}
    for column, labels in _ordered_labels.items()
}

## 4. Apply Ordinal Mappings

In [5]:
# Apply mappings for ordinal variables, handling NaN.
for col, mapping in ordinal_mappings.items():
    # Only label-valued columns need mapping. Testing for "not numeric" rather
    # than `dtype == 'object'` also catches labels stored in pandas' dedicated
    # string dtype, which the object check would skip and leave unencoded.
    # A column that was already mapped (e.g. the cell is re-run) is numeric
    # and is left untouched.
    if pd.api.types.is_numeric_dtype(mics[col]):
        continue

    labels = mics[col]

    # Series.map turns every label that is absent from the mapping into NA,
    # which is indistinguishable from a genuinely missing value afterwards.
    # Report such labels so a spelling/whitespace mismatch does not silently
    # discard data. print() is used because warnings are globally suppressed
    # in this notebook.
    unmapped_labels = sorted(set(labels.dropna().unique()) - set(mapping), key=str)
    if unmapped_labels:
        print(
            f"WARNING: '{col}' contains labels not covered by its ordinal "
            f"mapping; they become NaN: {unmapped_labels}"
        )

    mics[col] = labels.map(mapping).astype('Int64')  # Int64 allows NaN

# Recode water_carrier_edu, where the value 98 stands for "missing".
# The 98 code is swapped for a -1 sentinel rather than NaN so that no row is
# lost on account of this column; the result is stored as nullable Int64.
# NOTE(review): -1 presumably sits below every real education level, so any
# learner that uses this column numerically will read "missing" as the lowest
# level — confirm this is acceptable for all downstream models.
carrier_edu_recoded = mics['water_carrier_edu'].replace(98, -1)
mics['water_carrier_edu'] = carrier_edu_recoded.astype('Int64')

## 5. One-Hot Encode Categorical Variables

In [6]:
# One-hot encode categorical variables
X_categorical_cols = X_basic_categorical  # country_cat, WS1_g

if X_categorical_cols:
    # drop='first' leaves out the first category of each variable (it becomes
    # the reference level); sparse_output=False returns a dense array.
    onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_cats = onehot_encoder.fit_transform(mics[X_categorical_cols])

    # Reuse mics' own index for the dummy columns. pd.concat(axis=1) aligns on
    # index labels, so a frame with a fresh 0..n-1 index would be misaligned
    # (and pad rows with NaN) whenever mics does not carry the default index,
    # e.g. after rows have been filtered upstream.
    encoded_df = pd.DataFrame(
        encoded_cats,
        columns=onehot_encoder.get_feature_names_out(X_categorical_cols),
        index=mics.index,
    )

    # Swap the original categorical columns for their dummy columns; the
    # dummies are appended after the remaining columns.
    mics = pd.concat([mics.drop(columns=X_categorical_cols), encoded_df], axis=1)

## 6. Select Final Variables and Export

In [7]:
# Get the encoded categorical column names.
# The encoder is only fitted when X_categorical_cols is non-empty, so fall
# back to an empty list in that case instead of hitting a NameError on
# `onehot_encoder`.
encoded_cat_cols = (
    list(onehot_encoder.get_feature_names_out(X_categorical_cols))
    if X_categorical_cols
    else []
)

# ============================================================
# Assemble final covariate lists
# ============================================================

# Basic controls (main analysis): urban, then the ordinal variables
# (windex5, helevel, wq27_decile), then the one-hot dummy columns.
basic_X_cols = [
    *X_basic_binary,
    *X_basic_ordinal,
    *encoded_cat_cols,
]

# Extended controls: the basic covariates plus the household / sanitation /
# water binaries and water_carrier_edu. The dummy columns stay last.
extended_X_cols = [
    *X_basic_binary,
    *X_basic_ordinal,
    *X_extended_binary,
    *X_extended_ordinal,
    *encoded_cat_cols,
]

# Variable used to define subsamples in the analysis
subsample_var = ['RiskSource']

# Columns to export: outcomes, treatment, subsample variable, then the
# extended covariate set (a superset of the basic one, so every option is kept).
relevant_cols = [*Y_cols, *T_cols, *subsample_var, *extended_X_cols]
mics_clean = mics[relevant_cols]

# Show the five columns with the most missing values.
missing_per_column = mics_clean.isnull().sum()
missing_per_column.sort_values(ascending=False).head()

Open_defecation             1209
PipedWater                     3
PurchasedWater                 3
RainandSurfaceWater            3
Unimproved_water_service       3
dtype: int64

In [8]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
mics_clean.to_csv(path_or_buf="mics_clean.csv", index=False)