# DML-IV: Instrumental Variable with Base Station Count

This notebook addresses **endogeneity** using **Double Machine Learning with Instrumental Variables (DMLIV)**. The instrument is **total number of base stations**.

- **Treatment (T)**: Digital literacy (first-level index).
- **Outcome (Y)**: `kakwani_new`.
- **Instrument (Z)**: Base station count.
- **Covariates (X)**: Control variables.

---
## Setup: Imports and constants

In [1]:
import pandas as pd
import os
import numpy as np
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.exceptions import DataConversionWarning
from econml.iv.dml import DMLIV
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
from scipy.stats import norm
from statsmodels.api import OLS, add_constant

warnings.filterwarnings('ignore', category=DataConversionWarning)
warnings.filterwarnings('ignore', message='.*nonzero intercept.*', module='econml')

OUPUT_DIR = "./results"
if not os.path.exists(OUPUT_DIR):
    os.makedirs(OUPUT_DIR)


  from .autonotebook import tqdm as notebook_tqdm


---
## Load data and define variables

Data are read from `data/data.xlsx`.

In [2]:
df = pd.read_excel('data/data.xlsx')

X_cols_name = [
    'Gender', 'Age', 'Health status', 'Education level', 'Growing experience',
    'Marital status', 'Growing area', 'Labourer', 'Production facility', 'Storage facility',
    'Agricultural insurance', 'Loan', 'Social expenditure', 'Clan status', 'Natural disaster',
    'Training', 'Brand label usage', 'Logistics convenience', 'City'
]
D_cols_name = [
    'Digital literacy', 'Digital platform usage', 'Digital information acquisition', 'Digital platform usage'
]
Y_cols_name = ['kakwani_new', 'Household total income']
M_cols_name = ['Online social network', 'Entrepreneurship']
IV_cols_name = ['Base station count']

# Drop rows with missing instrument (required for IV identification)
df = df.dropna(subset=IV_cols_name).copy()
print(f"Sample size after dropping missing IV: {len(df)}")

X_df = df[X_cols_name]
D_df = df[D_cols_name]
Y_df = df[Y_cols_name]
M_df = df[M_cols_name]
IV_df = df[IV_cols_name]

Sample size after dropping missing IV: 1382


---
## Preprocessing: encodings

- **Covariates (X)**: One-hot encode `City`, keep other controls.


In [3]:
X = X_df.copy()
categorical_cols = ['City']
enc = OneHotEncoder(drop='first', sparse_output=False)
encoded_region = enc.fit_transform(X[categorical_cols])
encoded_region_df = pd.DataFrame(encoded_region, columns=enc.get_feature_names_out(categorical_cols))
X = pd.concat([X.drop(columns=categorical_cols).reset_index(drop=True), encoded_region_df], axis=1)

Z = IV_df.copy()
Z_encoded = OneHotEncoder(drop='first', sparse_output=False).fit_transform(Z)
Z_encoded_df = pd.DataFrame(Z_encoded, columns=[f'Base_station_{i}' for i in range(Z_encoded.shape[1])])

Y = Y_df['kakwani_new'].values

USE_T_STANDARDIZE = True
T_std = None
if USE_T_STANDARDIZE:
    T2_raw = D_df['Digital literacy'].values.copy()
    T_mean, T_std = T2_raw.mean(), T2_raw.std()
    if T_std < 1e-10:
        T_std = 1.0
    T2 = ((T2_raw - T_mean) / T_std).reshape(-1, 1)
    print(f"Treatment T standardized: mean = {T_mean:.4f}, std = {T_std:.4f}")
else:
    T2 = D_df['Digital literacy'].values.reshape(-1, 1)

Z_one_hot = Z_encoded_df.values


Treatment T standardized: mean = 0.2674, std = 0.2373


---
## DMLIV model

We fit **DMLIV** and first-stage strength is summarized by F-statistic.

In [4]:
X_train, X_test, T_train, T_test, Y_train, Y_test, Z_train, Z_test = train_test_split(
    X, T2, Y, Z_one_hot, test_size=0.3, random_state=42
)

HYPERPARAMS_DMLIV = {
    "n_estimators": 200,
    "min_samples_split": 10,
    "min_samples_leaf": 15,
    "max_samples": 0.35,
    "random_state": 2,
}

X_train_arr = np.asarray(X_train)
Z_train_arr = np.asarray(Z_train)

model_y_xw  = RandomForestRegressor(**HYPERPARAMS_DMLIV)
model_t_xw  = RandomForestRegressor(**HYPERPARAMS_DMLIV)
model_t_xwz = RandomForestRegressor(**HYPERPARAMS_DMLIV)

est = DMLIV(
    model_y_xw=model_y_xw,
    model_t_xw=model_t_xw,
    model_t_xwz=model_t_xwz,
    model_final=StatsModelsLinearRegression(fit_intercept=True),
    discrete_instrument=False,
    discrete_treatment=False,
    random_state=HYPERPARAMS_DMLIV["random_state"]
)
est.fit(Y=Y_train, T=T_train, Z=Z_train, X=X_train)

<econml.iv.dml._dml.DMLIV at 0x166346e10>

---
## First-stage strength and IV-ATE

We compute F-statistic for the first stage. Then we estimate IV-ATE.

In [5]:
T_hat = model_t_xwz.fit(np.hstack([X_train_arr, Z_train_arr]), T_train).predict(
    np.hstack([X_train_arr, Z_train_arr])
)
r2 = r2_score(T_train, T_hat)
T_model = OLS(T_train, add_constant(np.hstack([X_train_arr, Z_train_arr]))).fit()
f_stat = T_model.fvalue

te_pred = np.asarray(est.effect(X_test)).ravel()
avg_effect = np.mean(te_pred)
n_test = len(te_pred)
stderr = np.std(te_pred, ddof=1) / np.sqrt(n_test) if n_test > 1 else np.nan
if stderr == 0 or np.isnan(stderr):
    stderr = np.nan
    p_value = np.nan
    ci_low = ci_high = avg_effect
else:
    z_value = avg_effect / stderr
    p_value = 2 * (1 - norm.cdf(abs(z_value)))
    ci_low = avg_effect - 1.96 * stderr
    ci_high = avg_effect + 1.96 * stderr

if USE_T_STANDARDIZE and T_std is not None:
    avg_effect_report = avg_effect * T_std
    stderr_report = stderr * T_std if np.isfinite(stderr) else np.nan
    ci_low_report, ci_high_report = ci_low * T_std, ci_high * T_std
else:
    avg_effect_report, stderr_report = avg_effect, stderr
    ci_low_report, ci_high_report = ci_low, ci_high

print( "F =", round(f_stat, 2))
print("IV-ATE (effect per 1 unit Digital literacy on Kakwani):")
print(f"  IV-ATE ≈ {avg_effect_report:.4f}, 95% CI = ({ci_low_report:.4f}, {ci_high_report:.4f}), p ≈ {p_value}")

F = 15.37
IV-ATE (effect per 1 unit Digital literacy on Kakwani):
  IV-ATE ≈ -0.1008, 95% CI = (-0.1597, -0.0419), p ≈ 0.0008011254790094569


---
## Save and display results

In [6]:
results = [{
    "IV_ATE": avg_effect_report,
    "StdErr": stderr_report,
    "p_value": p_value,
    "F_stat": f_stat,
}]
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(OUPUT_DIR, "DMLIV_BaseStation_results.csv"), index=False, encoding="utf-8-sig")
print("Results saved to DMLIV_BaseStation_results.csv")
display(results_df)

Results saved to DMLIV_BaseStation_results.csv


Unnamed: 0,IV_ATE,StdErr,p_value,F_stat
0,-0.10076,0.030056,0.000801,15.36519
