In [2]:
#install.packages(c("readr", "dplyr", "hdm", "glmnet", "randomForest", "sandwich", "nnet"))

In [3]:
library(readr)       
library(dplyr)       
library(hdm)        
library(glmnet)      
library(randomForest) 
library(sandwich)   
library(nnet)      
set.seed(123)  

In [4]:
nombres <- c(
    "abdt", "tg", "inuidur1", "inuidur2", "female", "black", "hispanic", 
    "othrace", "dep", "q1", "q2", "q3", "q4", "q5", "q6", "recall", 
    "agelt35", "agegt54", "durable", "nondurable", "lusd", "husd", "muld"
)

In [5]:
url <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat"
df <- read_delim(url, delim = " ", col_names = nombres, skip = 1, trim_ws = TRUE)

[1mRows: [22m[34m13913[39m [1mColumns: [22m[34m23[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m " "
[32mdbl[39m (23): abdt, tg, inuidur1, inuidur2, female, black, hispanic, othrace, de...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [18]:
output_dir <- "C:/Users/user2/Documents/GitHub/DML_NN/r/output"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}
output_path <- file.path(output_dir, "comparison_table.csv")

# limpieza

In [6]:
df_cleaned <- df %>%
  filter(tg == 0 | tg == 4) %>%
  mutate(
    T4 = as.integer(tg == 4),
    
    y = log(inuidur1),
    
    dep_1 = as.integer(dep == 1),
    dep_2 = as.integer(dep == 2)
  ) %>%
  filter(is.finite(y))

In [7]:
feature_list <- c(
    "female", "black", "othrace",
    "dep_1", "dep_2",
    "q2", "q3", "q4", "q5", "q6",
    "recall", "agelt35", "agegt54",
    "durable", "nondurable", "lusd", "husd"
)

In [8]:
x <- as.matrix(df_cleaned[, feature_list])
y <- df_cleaned$y
d <- df_cleaned$T4

In [9]:
# DML CON CROSS FITTNG
DML.for.PLM <- function(x, d, y, dreg, yreg, nfold=10, d_pred_type="response", y_pred_type="response") {
  nobs <- nrow(x)
  foldid <- rep.int(1:nfold, times = ceiling(nobs/nfold))[sample.int(nobs)]
  I <- split(1:nobs, foldid)
  ytil <- dtil <- rep(NA, nobs)
  
  cat("Fold: ")
  for(b in 1:length(I)){
    dfit <- dreg(x[-I[[b]],], d[-I[[b]]])
    yfit <- yreg(x[-I[[b]],], y[-I[[b]]])
        
    # Manejar la predicción de D (tratamiento)
    if (d_pred_type == "prob") {
        dhat <- predict(dfit, x[I[[b]],], type="prob")[, 2]
    } else if (d_pred_type == "raw" && inherits(dfit, "nnet")) {
        pred_raw <- predict(dfit, x[I[[b]],], type="raw")
        dhat <- if (ncol(pred_raw) == 2) pred_raw[, 2] else pred_raw[, 1]
    } else {
        dhat <- predict(dfit, x[I[[b]],], type=d_pred_type)
    }
    
    yhat <- predict(yfit, x[I[[b]],], type=y_pred_type)
    
    # Calcular residuales
    dtil[I[[b]]] <- (d[I[[b]]] - dhat)
    ytil[I[[b]]] <- (y[I[[b]]] - yhat)
    cat(b, " ")
  }
  
  # Regresión final de residuales
  rfit <- lm(ytil ~ dtil)
  coef.est <- coef(rfit)[2]
  se <- sqrt(vcovHC(rfit)[2,2]) # Error estándar robusto
  
  cat(sprintf("\nCoef (SE) = %g (%g)\n", coef.est, se))
  return( list(coef.est=coef.est , se=se, dtil=dtil, ytil=ytil) )
}

In [10]:
#NAIVE
DML.naive <- function(x, d, y, dreg, yreg, d_pred_type="response", y_pred_type="response") {
  
  #  Entrenar y predecir Y (en muestra)
  yfit <- yreg(x, y)
  yhat <- predict(yfit, x, type=y_pred_type)
  
  # Entrenar y predecir D (en muestra)
  dfit <- dreg(x, d)
  
  if (d_pred_type == "prob") {
      dhat <- predict(dfit, x, type="prob")[, 2]
  } else if (d_pred_type == "raw" && inherits(dfit, "nnet")) {
      pred_raw <- predict(dfit, x, type="raw")
      dhat <- if (ncol(pred_raw) == 2) pred_raw[, 2] else pred_raw[, 1]
  } else {
      dhat <- predict(dfit, x, type=d_pred_type)
  }

  #  Calcular residuales
  dtil <- (d - dhat)
  ytil <- (y - yhat)
  
  #  Regresión final
  rfit <- lm(ytil ~ dtil)
  coef.est <- coef(rfit)[2]
  se <- sqrt(vcovHC(rfit)[2,2])
  
  cat(sprintf("\nCoef (SE) = %g (%g)\n", coef.est, se))
  return( list(coef.est=coef.est , se=se, dtil=dtil, ytil=ytil) )
}

## EJECUTAR CROSS FITTING

In [12]:
# OLS/Logit
yreg_ols <- function(x, y) glmnet(x, y, lambda = 0)
dreg_ols <- function(x, d) glmnet(x, d, family = "binomial", lambda = 0)

# Lasso
yreg_lasso <- function(x, y) rlasso(x, y, post=FALSE) # De hdm
dreg_lasso <- function(x, d) cv.glmnet(x, d, family = "binomial", alpha = 1)

# Random Forest
yreg_rf <- function(x, y) randomForest(x, y) #
dreg_rf <- function(x, d) randomForest(x, as.factor(d))

# Neural Net 
yreg_nn <- function(x, y) nnet(x, y, size=20, linout=TRUE, trace=FALSE, MaxNWts=2000)
dreg_nn <- function(x, d) nnet(x, class.ind(as.factor(d)), size=20, trace=FALSE, MaxNWts=2000, softmax=TRUE)
cat("\n DML con OLS/Logit \n")
res_ols <- DML.for.PLM(x, d, y, dreg_ols, yreg_ols, nfold=10, 
                       d_pred_type="response", y_pred_type="response")

cat("\n DML con Lasso \n")
res_lasso <- DML.for.PLM(x, d, y, dreg_lasso, yreg_lasso, nfold=10, 
                         d_pred_type="response", y_pred_type="response")

cat("\n DML con Random Forest \n")
res_rf <- DML.for.PLM(x, d, y, dreg_rf, yreg_rf, nfold=10, 
                      d_pred_type="prob", y_pred_type="response")

cat("\n DML con Neural Net \n")
res_nn <- DML.for.PLM(x, d, y, dreg_nn, yreg_nn, nfold=10, 
                      d_pred_type="raw", y_pred_type="raw")

# Crear Tabla de Resultados 
table_dml <- data.frame(
  Estimate = c(res_ols$coef.est, res_lasso$coef.est, res_rf$coef.est, res_nn$coef.est),
  `Std. Error` = c(res_ols$se, res_lasso$se, res_rf$se, res_nn$se),
  `RMSE Y` = c(sqrt(mean(res_ols$ytil^2)), sqrt(mean(res_lasso$ytil^2)), sqrt(mean(res_rf$ytil^2)), sqrt(mean(res_nn$ytil^2))),
  `RMSE D` = c(sqrt(mean(res_ols$dtil^2)), sqrt(mean(res_lasso$dtil^2)), sqrt(mean(res_rf$dtil^2)), sqrt(mean(res_nn$dtil^2))),
  row.names = c("OLS/Logit", "Lasso", "Random Forest", "NN (nnet)"),
  check.names = FALSE
)

cat("\n Resultados DML (Cross-Fit) \n")
print(table_dml, digits=4)


 DML con OLS/Logit 


Fold: 1  2  3  4  5  6  7  8  9  10  
Coef (SE) = -0.0733942 (0.0351569)

 DML con Lasso 
Fold: 1  2  3  4  5  6  7  8  9  10  
Coef (SE) = -0.0810184 (0.0353601)

 DML con Random Forest 
Fold: 1  2  3  4  5  6  7  8  9  10  
Coef (SE) = -0.0768315 (0.0343515)

 DML con Neural Net 
Fold: 1  2  3  4  5  6  7  8  9  10  
Coef (SE) = -0.0936142 (0.0347956)

 Resultados DML (Cross-Fit) 
              Estimate Std. Error RMSE Y RMSE D
OLS/Logit     -0.07339    0.03516  1.196 0.4753
Lasso         -0.08102    0.03536  1.199 0.4746
Random Forest -0.07683    0.03435  1.208 0.5402
NN (nnet)     -0.09361    0.03480  1.235 0.5017


## EJECUTAR SIN CROSS FITTING

In [13]:
cat("\n Naive DML con OLS/Logit \n")
res_ols_n <- DML.naive(x, d, y, dreg_ols, yreg_ols, 
                       d_pred_type="response", y_pred_type="response")

cat("\n Naive DML con Lasso \n")
res_lasso_n <- DML.naive(x, d, y, dreg_lasso, yreg_lasso, 
                         d_pred_type="response", y_pred_type="response")

cat("\n Naive DML con Random Forest \n")
res_rf_n <- DML.naive(x, d, y, dreg_rf, yreg_rf, 
                      d_pred_type="prob", y_pred_type="response")

cat("\n Naive DML con Neural Net \n")
res_nn_n <- DML.naive(x, d, y, dreg_nn, yreg_nn, 
                      d_pred_type="raw", y_pred_type="raw")

#  Crear Tabla de Resultados (Parte III) 
table_naive <- data.frame(
  Estimate = c(res_ols_n$coef.est, res_lasso_n$coef.est, res_rf_n$coef.est, res_nn_n$coef.est),
  `Std. Error` = c(res_ols_n$se, res_lasso_n$se, res_rf_n$se, res_nn_n$se),
  `RMSE Y` = c(sqrt(mean(res_ols_n$ytil^2)), sqrt(mean(res_lasso_n$ytil^2)), sqrt(mean(res_rf_n$ytil^2)), sqrt(mean(res_nn_n$ytil^2))),
  `RMSE D` = c(sqrt(mean(res_ols_n$dtil^2)), sqrt(mean(res_lasso_n$dtil^2)), sqrt(mean(res_rf_n$dtil^2)), sqrt(mean(res_nn_n$dtil^2))),
  row.names = c("OLS/Logit", "Lasso", "Random Forest", "NN (nnet)"),
  check.names = FALSE
)

cat("\n Resultados Naive DML (Sin Cross-Fit) \n")
print(table_naive, digits=4)


 Naive DML con OLS/Logit 

Coef (SE) = -0.0726354 (0.0351328)

 Naive DML con Lasso 

Coef (SE) = -0.0799962 (0.0352651)

 Naive DML con Random Forest 

Coef (SE) = -0.0771848 (0.034638)

 Naive DML con Neural Net 

Coef (SE) = -0.0823601 (0.0355448)

 Resultados Naive DML (Sin Cross-Fit) 
              Estimate Std. Error RMSE Y RMSE D
OLS/Logit     -0.07264    0.03513  1.190 0.4734
Lasso         -0.08000    0.03527  1.195 0.4745
Random Forest -0.07718    0.03464  1.116 0.5064
NN (nnet)     -0.08236    0.03554  1.133 0.4477


## COMPARACIÓN 

In [19]:
table_dml$Method <- "DML (Cross-Fit)"
table_naive$Method <- "Naive (No Cross-Fit)"

comparison_table <- rbind(table_dml, table_naive)
comparison_table <- comparison_table[order(rownames(comparison_table)), ] # Ordenar por nombre de modelo

print(comparison_table, digits=4)
write.csv(comparison_table, output_path, row.names = TRUE)

               Estimate Std. Error RMSE Y RMSE D               Method
Lasso          -0.08102    0.03536  1.199 0.4746      DML (Cross-Fit)
Lasso1         -0.08000    0.03527  1.195 0.4745 Naive (No Cross-Fit)
NN (nnet)      -0.09361    0.03480  1.235 0.5017      DML (Cross-Fit)
NN (nnet)1     -0.08236    0.03554  1.133 0.4477 Naive (No Cross-Fit)
OLS/Logit      -0.07339    0.03516  1.196 0.4753      DML (Cross-Fit)
OLS/Logit1     -0.07264    0.03513  1.190 0.4734 Naive (No Cross-Fit)
Random Forest  -0.07683    0.03435  1.208 0.5402      DML (Cross-Fit)
Random Forest1 -0.07718    0.03464  1.116 0.5064 Naive (No Cross-Fit)


### Pregunta 1: What can you say about the RMSE for predicting y and d?

**Respuesta:** Al ver la tabla final, se ve claramente que los valores de `rmse_y` y `rmse_d` son  más bajos en el método "Naive (No Cross-Fit)" que en el método "DML (Cross-Fit)". Esta diferencia es mucho más pronunciada en los modelos flexibles como Random Forest y NN. Esto es por el overfitting!!

### Pregunta 2: Why is it that estimating with one function yields lower RMSE than another?

**Respuesta:** Esto se debe al **sobreajuste (overfitting)**.

* La función **Naive (`dml_naive`)** entrena y evalúa el modelo en el  misma data. Los modelos complejos (especialmente RF y NN) son muy buenos para "memorizar" los datos de entrenamiento, incluido el noise. Esto da como resultado un RMSE artificialmente bajo y excesivamente optimista
* La función **DML (`dml`)** usa **cross-fitting (ajuste cruzado)**. Entrena el modelo en una parte de los datos(pliegues) y lo evalúa en una parte que no se ha visto anteriormente. Este RMSE de cross fitting es una medida válida y realista del poder predictivo del modelo en datos nuevos.

### Pregunta 3: What problem would we have if we chose to estimate without cross-fitting?

**Respuesta:** El problema principal es el **sesgo por sobreajuste (overfitting bias)**.

La teoría de DML (Double Machine Learning) requiere que los residuos (`resy` y `resD`) se generen de una manera que sea "ortogonal" (estadísticamente independiente) del proceso de estimación. El **cross-fitting** es el mecanismo que nos permite lograr esto.

En resumen, **sin cross-fitting, sacrificamos la validez estadística y la inferencia causal correcta por un falso sentido de precisión (de un RMSE más bajo) que proviene del overfitting, pero que de probarlo con data no "aprendida" resultara en estimaciones erroneas por overfitting**