<a href="https://colab.research.google.com/github/hsalva2/XEMA/blob/main/Experimentos_Sesion_14b1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Double Machine Learning & Generalized Random Forests

In [None]:
!pip install econml

In [None]:
# Load libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from econml.dml import LinearDML, SparseLinearDML, CausalForestDML, NonParamDML
from econml.grf import CausalForest

In [None]:
# Read the experiment data: `datos` is the sample the models learn from, `nuevos` is the new data to score.
datos, nuevos = (
    pd.read_csv(url)
    for url in (
        'https://raw.githubusercontent.com/carlosquintanillaa/Datasets/refs/heads/main/df1.csv',
        'https://raw.githubusercontent.com/carlosquintanillaa/Datasets/refs/heads/main/df2.csv',
    )
)

In [None]:
# Variable roles for the data the models are learned from:
# y is the 'Y' column, T is the 'T' column, and X is every remaining column except 'id'.
y, T = datos['Y'], datos['T']
# One-hot encode the covariates, dropping the first level of each encoded column.
X = pd.get_dummies(datos.drop(columns=['id', 'Y', 'T']), drop_first=True)

In [None]:
# Covariates for the new data (only the 'id' column is removed).
# Encode WITHOUT drop_first and then align to the training columns. Encoding the two frames
# independently with drop_first=True can drop a different baseline level, or yield
# missing/reordered columns, whenever a category is absent from `nuevos`; the fitted models
# would then silently receive a feature matrix that does not match `X`.
X2 = pd.get_dummies(nuevos.drop(columns=['id']))
# Keep exactly the training columns, in training order; indicator columns unseen here become 0.
X2 = X2.reindex(columns=X.columns, fill_value=0)

In [None]:
# First-stage (nuisance) models for g(X,W) and m(X,W), plus the final-stage model:
#   model_g -> passed as model_y to the DML estimators below
#   model_m -> passed as model_t; strategy='prior' predicts the empirical class prior,
#              i.e. the same treatment probability for every row
#   model_f -> passed as model_final to NonParamDML
model_g = LinearRegression()
model_m = DummyClassifier(strategy='prior')
# Seeded (same seed as the CausalForest cell) so repeated runs give the same fitted model.
model_f = GradientBoostingRegressor(random_state=1234)

In [None]:
# Model 01: Linear DML
# random_state fixes the cross-fitting sample splits so the estimates are reproducible across runs.
est1 = LinearDML(model_y=model_g, model_t=model_m, discrete_treatment=True, random_state=1234)
est1.fit(y, T, X=X)
# Estimated treatment effect for each row of the new data.
efecto1 = est1.effect(X2)
# Decision rule (corrected criterion): 1 when the estimated effect exceeds 0.75, otherwise 0.
decision1 = np.where(efecto1 > 0.75, 1, 0)

In [None]:
# Show the summary report of the fitted Linear DML estimator (est1).
est1.summary()

In [None]:
# Model 02: Sparse Linear DML
# random_state fixes the cross-fitting sample splits so the estimates are reproducible across runs.
est2 = SparseLinearDML(model_y=model_g, model_t=model_m, discrete_treatment=True, random_state=1234)
est2.fit(y, T, X=X)
# Estimated treatment effect for each row of the new data.
efecto2 = est2.effect(X2)
# Decision rule (corrected criterion): 1 when the estimated effect exceeds 0.75, otherwise 0.
decision2 = np.where(efecto2 > 0.75, 1, 0)

In [None]:
# Show the summary report of the fitted Sparse Linear DML estimator (est2).
est2.summary()

In [None]:
# Model 03: CausalForestDML
# random_state makes the run reproducible, matching the seeded CausalForest used for Model 05.
est3 = CausalForestDML(model_y=model_g, model_t=model_m, discrete_treatment=True, random_state=1234)
est3.fit(y, T, X=X)
# Estimated treatment effect for each row of the new data.
efecto3 = est3.effect(X2)
# Decision rule (corrected criterion): 1 when the estimated effect exceeds 0.75, otherwise 0.
decision3 = np.where(efecto3 > 0.75, 1, 0)

In [None]:
# CausalForestDML has a very useful output: the importance of each variable in the final CATE function.
feature_importances3 = est3.feature_importances_
feature_importances3

In [None]:
# Make it more interpretable: put the importances in a DataFrame.
# Label with the columns of X, the matrix est3 was fitted on, so that the names are guaranteed
# to line up with the importances (X2 is only the scoring data and may be encoded differently).
feature_names = X.columns.tolist()
feature_importances3 = est3.feature_importances_

# Convert to a DataFrame with one row per feature.
feature_importances3_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances3
})

feature_importances3_df

In [None]:
# Now plot it: order the features from most to least important and draw a horizontal bar chart.
feature_importances3_df = feature_importances3_df.sort_values(by='Importance', ascending=False)
sns.barplot(data=feature_importances3_df, x='Importance', y='Feature')

In [None]:
# Model 04: NonParamDML (GradientBoostingRegressor as the final-stage model)
# random_state fixes the cross-fitting sample splits so the estimates are reproducible across runs.
est4 = NonParamDML(model_final=model_f, model_y=model_g, model_t=model_m,
                   discrete_treatment=True, random_state=1234)
est4.fit(y, T, X=X)
# Estimated treatment effect for each row of the new data.
efecto4 = est4.effect(X2)
# Decision rule (corrected criterion): 1 when the estimated effect exceeds 0.75, otherwise 0.
decision4 = np.where(efecto4 > 0.75, 1, 0)

In [None]:
# Model 05: Generalized Random Forest
est5 = CausalForest(random_state=1234)
# Note the argument order: this estimator is fitted as (X, T, y), unlike the DML estimators' (y, T, X=X).
est5.fit(X, T, y)
# Flatten the predictions to a 1-D array so they fit as a DataFrame column like the other effects.
efecto5 = est5.predict(X2).ravel()
# Fix: base the decision on this model's own effects (efecto5); it previously reused efecto4,
# which made decision5 an exact copy of decision4.
decision5 = np.where(efecto5 > 0.75, 1, 0)

In [None]:
# The CausalForest estimator (est5) also exposes feature importances for its estimated CATE function.
feature_importances5 = est5.feature_importances_
feature_importances5

In [None]:
# Make it more interpretable: put the importances in a DataFrame.
# Label with the columns of X, the matrix est5 was fitted on, so that the names are guaranteed
# to line up with the importances (X2 is only the scoring data and may be encoded differently).
feature_names = X.columns.tolist()
feature_importances5 = est5.feature_importances_

# Convert to a DataFrame with one row per feature.
feature_importances5_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances5
})

feature_importances5_df

In [None]:
# Now plot it: order the features from most to least important and draw a horizontal bar chart.
feature_importances5_df = feature_importances5_df.sort_values(by='Importance', ascending=False)
sns.barplot(data=feature_importances5_df, x='Importance', y='Feature')

In [None]:
# Collect the estimated effects and the resulting decisions of the five models, one column per model.
efectos = pd.DataFrame(dict(
    efecto1=efecto1,
    efecto2=efecto2,
    efecto3=efecto3,
    efecto4=efecto4,
    efecto5=efecto5,
))
decisiones = pd.DataFrame(dict(
    decision1=decision1,
    decision2=decision2,
    decision3=decision3,
    decision4=decision4,
    decision5=decision5,
))

In [None]:
# What are the heterogeneous effects according to the different models?
# Try efecto1, efecto2, efecto3, efecto4 and efecto5. Which one best approximates the true CATE function = -2.5 + 5*Abs(X4)?
# The plot puts the X4 column of the new data on the x-axis and the chosen model's estimated effect on the y-axis.
sns.relplot(x=nuevos['X4'],y=efectos['efecto1'])