## Load packages

In [58]:
!pip install numpy pandas econml scikit-learn doubleml

import numpy as np
import pandas as pd
from doubleml import DoubleMLPLR
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV



## Load Data

In [59]:
import pandas as pd

# Correct Lalonde dataset URL from a reliable GitHub repository (raw CSV data)
url = "https://raw.githubusercontent.com/robjellis/lalonde/master/lalonde_data.csv"

# Read CSV file into a Pandas DataFrame
df = pd.read_csv(url)

# Display first few rows
print(df.head())

# Show available column names
print(df.columns)

     ID  treat  age  educ  black  hispan  married  nodegree  re74  re75  \
0  NSW1      1   37    11      1       0        1         1   0.0   0.0   
1  NSW2      1   22     9      0       1        0         1   0.0   0.0   
2  NSW3      1   30    12      1       0        0         0   0.0   0.0   
3  NSW4      1   27    11      1       0        0         1   0.0   0.0   
4  NSW5      1   33     8      1       0        0         1   0.0   0.0   

         re78  
0   9930.0460  
1   3595.8940  
2  24909.4500  
3   7506.1460  
4    289.7899  
Index(['ID', 'treat', 'age', 'educ', 'black', 'hispan', 'married', 'nodegree',
       're74', 're75', 're78'],
      dtype='object')


In [60]:
# Define outcome (Y), treatment (A), and covariates (X)
Y = df['re78'].values
A = df['treat'].values
X = df.drop(columns=['treat', 're78', 'ID']).values  # Covariates

## Double Machine Learning

### Choose ML model

In [61]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

learner = RandomForestRegressor(n_estimators = 500, max_features = 'sqrt', max_depth= 5)
outcome_m = clone(learner)
treat_m = clone(learner)
outcome_m = clone(learner)
treat_m = clone(learner)

In [62]:
from doubleml import DoubleMLPLR
from doubleml import DoubleMLData
import numpy as np

np.random.seed(3141)
dml_data = DoubleMLData.from_arrays(X, Y, A)
dml_m = DoubleMLPLR(dml_data, outcome_m, treat_m)

dml_m .fit();

print(dml_m)


------------------ Data summary      ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
Instrument variable(s): None
No. Observations: 614

------------------ Score & algorithm ------------------
Score function: partialling out

------------------ Machine learner   ------------------
Learner ml_l: RandomForestRegressor(max_depth=5, max_features='sqrt', n_estimators=500)
Learner ml_m: RandomForestRegressor(max_depth=5, max_features='sqrt', n_estimators=500)
Out-of-sample Performance:
Regression:
Learner ml_l RMSE: [[6878.9998315]]
Learner ml_m RMSE: [[0.33357665]]

------------------ Resampling        ------------------
No. folds: 5
No. repeated sample splits: 1

------------------ Fit summary       ------------------
         coef     std err         t     P>|t|       2.5 %       97.5 %
d  990.872539  778.702127  1.272467  0.203207 -535.355584  2517.100662


## Meta learners

In [54]:
from econml.metalearners import SLearner, TLearner, XLearner
# --- S-Learner ---
s_learner = SLearner(overall_model=RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42))
s_learner.fit(Y = Y, T = A, X= X)
cate_s = s_learner.effect(X)
print("S-Learner CATE (First 5):", cate_s[:5])

# --- T-Learner ---
t_learner = TLearner(models=RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42))
t_learner.fit(Y = Y, T = A, X= X)  # Corrected order
cate_t = t_learner.effect(X)
print("T-Learner CATE (First 5):", cate_t[:5])

# --- X-Learner ---
x_learner = XLearner(models=RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42))
x_learner.fit(Y = Y, T = A, X= X)  # Corrected order
cate_x = x_learner.effect(X)
print("X-Learner CATE (First 5):", cate_x[:5])




S-Learner CATE (First 5): [670.71754899 140.39480343 471.97778408 206.19948059 326.96483731]
T-Learner CATE (First 5): [4732.08646408  199.22156996 3979.21160404 2408.87200004 -533.88062383]
X-Learner CATE (First 5): [3889.45443558  836.09376826 3845.40167179 3146.94260228 2734.43873389]


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [57]:
# --- DR-Learner ---
from econml.dr import DRLearner
n = X.shape[0]  # Number of observations

outcome_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
pseudo_treatment_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
propensity_model = RandomForestClassifier(n_estimators=100, max_depth=6,
                                                  min_samples_leaf=int(n/100))

DR_learner = DRLearner(model_regression=outcome_model, model_propensity=propensity_model,
                       model_final=pseudo_treatment_model, cv=5)
# Train DR_learner
DR_learner.fit(Y = Y, T = A, X= X)
# Estimate treatment effects on test data
DR_te = DR_learner.effect(X)
print("DR-Learner CATE (First 5):", DR_te[:5])

DR-Learner CATE (First 5): [2591.26488645 1248.68121539 2332.85303044 2008.05655409 2163.0696579 ]
