# Heterogeneous Treatment Effects

In [1]:
import pandas as pd
import numpy as np
from econml.dml import CausalForestDML
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from econml.dml import LinearDML
import statsmodels.formula.api as smf

## Simulate Data

In [4]:
np.random.seed(42)
n = 1000

# Features
age = np.random.normal(40, 10, n)
engagement = np.random.uniform(0, 1, n)
X = np.column_stack([age, engagement])

# Treatment assignment (random)
T = np.random.binomial(1, 0.5, n)

# True treatment effect varies by engagement
true_te = 2 + 3 * engagement
Y = true_te * T + age * 0.1 + np.random.normal(0, 1, n)

df = pd.DataFrame({'age': age, 'engagement': engagement, 'T': T, 'Y': Y})
df.head()

Unnamed: 0,age,engagement,T,Y
0,44.967142,0.167483,0,3.890014
1,38.617357,0.104568,0,4.073019
2,46.476885,0.63643,0,5.847767
3,55.230299,0.706476,0,5.031127
4,37.658466,0.031586,1,3.984052


## Causal Forest

In [5]:
est = CausalForestDML(
    model_t=LogisticRegression(),
    model_y=RandomForestRegressor(),
    n_estimators=1000,
    min_samples_leaf=10,
    max_depth=10,
    random_state=42,
    discrete_treatment=True,
)

est.fit(Y, T, X=X)

<econml.dml.causal_forest.CausalForestDML at 0x18a5fe172c0>

In [None]:
feat_imp = est.feature_importances_
print(list(zip(['age', 'engagement'], feat_imp)))

# 

[('age', np.float64(0.0744641328773366)), ('engagement', np.float64(0.9255358671226634))]


- Will always add up to 100%, doesn't even need to be important to the outcome
- Engagement is 93% of importance, Age is 7% of importance
- The treatmetn effect varies more with engagement than age, it's relative

## T-Learner Insights

In [None]:
# Treated
model_treated = RandomForestRegressor().fit(X[T == 1], Y[T == 1])
imp_treated = model_treated.feature_importances_

# Control
model_control = RandomForestRegressor().fit(X[T == 0], Y[T == 0])
imp_control = model_control.feature_importances_

# Compare
print("Feature Importances Under Treatment:")
print(dict(zip(['age', 'engagement'], imp_treated)))

print("Feature Importances Under Control:")
print(dict(zip(['age', 'engagement'], imp_control)))

Feature Importances Under Treatment:
{'age': np.float64(0.5584193851058766), 'engagement': np.float64(0.4415806148941235)}
Feature Importances Under Control:
{'age': np.float64(0.7689423496270436), 'engagement': np.float64(0.23105765037295647)}


- Age is more important than engagement with the control customers
- Engagement is more important than age with the treated customers

In [None]:
# Create interaction term
df['TxE'] = df['T'] * df['engagement']

# Regression to estimate interaction effect
model = LinearRegression().fit(df[['T', 'engagement', 'TxE']], df['Y'])

print(f"TxE coefficient (interaction): {model.coef_[2]:.3f}")


TxE coefficient (interaction): 2.364


- This shows the direction of the effect for different features
- Engagement has a strong positive effect on treatment effect

In [None]:
est = LinearDML(
    model_y=RandomForestRegressor(),
    model_t=LogisticRegression(),
    discrete_treatment=True,
)
est.fit(Y, T, X=X)  # X includes engagement

# Get coefficient summary
est.summary()

0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
X0,-0.007,0.008,-0.872,0.383,-0.024,0.009
X1,1.997,0.31,6.442,0.0,1.389,2.604

0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
cate_intercept,2.586,0.372,6.945,0.0,1.857,3.316


- The coefficient on engagement is 1.997 and the p-value is 0.0, indicating a strong positive relationship between engagement and treatment effect.

#

## Causal DML for Insights

In [2]:
def run_experiment(Y, T, X, features_names):
    model = LinearDML(
        model_y=RandomForestRegressor(),
        model_t=LogisticRegression(),
        discrete_treatment=True,
    )
    model.fit(Y, T, X=X)

    coefs = model.coef_.flatten()
    results = {}
    for name, coef in zip(features_names, coefs):
        results[name] = coef
    return results

In [3]:
def summarize_heterogeneity(results, threshold=0.05):
    insights = []
    for feature, coef in results.items():
        if abs(coef) > threshold:
            direction = "increases" if coef > 0 else "decreases"
            insights.append(f"As {feature} increases, treatment effect {direction}.")
    if not insights:
        return "No targeting indicators, no features important in decision to treat."
    return " ".join(insights)

## Statsmodels for Insights

In [6]:
def get_effect_summary(df, treatment='T', outcome='Y', features=['age', 'engagement'], threshold=0.05):
    # Build formula: Y ~ T + age + engagement + T:age + T:engagement
    interactions = ' + '.join([f"{treatment}:{f}" for f in features])
    formula = f"{outcome} ~ {treatment} + {' + '.join(features)} + {interactions}"
    model = smf.ols(formula, data=df).fit()

    insights = []
    for f in features:
        coef = model.params.get(f'{treatment}:{f}', 0)
        if abs(coef) >= threshold:
            direction = 'increases' if coef > 0 else 'decreases'
            insights.append(f"As {f} increases, treatment effect {direction}.")
    if not insights:
        return "No targeting indicators, no features important in decision to treat."
    return ' '.join(insights)

In [4]:
# Experiment 1
"As age increases, treatment effect increases."

# Experiment 2
"As engagement increases, treatment effect decreases."

# Experiment 3
"No targeting indicators, no features important in decision to treat."

# Experiment 4
"As age increases, treatment effect decreases. As engagement increases, treatment effect increases."


'As age increases, treatment effect decreases. As engagement increases, treatment effect increases.'