In [18]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import warnings
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegressionCV, LassoCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier

In [14]:
# Location of the raw data file that is later handed to pd.read_csv.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot find the data on another machine unless this line is edited.
file_path = r'C:\Users\user2\Documents\GitHub\DML_NN\input\penn_jae.dat'

## I. Cleaning and set-up


In [11]:
# Column labels applied to the data file when it is read (passed as `names=`).
# The order must match the column order of the file.
nombres = [
    'abdt', 'tg',
    'inuidur1', 'inuidur2',
    'female', 'black', 'hispanic', 'othrace',
    'dep',
    'q1', 'q2', 'q3', 'q4', 'q5', 'q6',
    'recall', 'agelt35', 'agegt54',
    'durable', 'nondurable',
    'lusd', 'husd', 'muld',
]

In [16]:
# Read the whitespace-delimited data file. header=0 consumes the file's own
# header row and `names=` replaces it with the labels in `nombres`.
# sep=r'\s+' is the documented replacement for delim_whitespace=True, which is
# deprecated in recent pandas and emits a FutureWarning.
df = pd.read_csv(file_path, sep=r'\s+', names=nombres, header=0)
df

  df = pd.read_csv(file_path, delim_whitespace=True, names=nombres,header=0)


Unnamed: 0,abdt,tg,inuidur1,inuidur2,female,black,hispanic,othrace,dep,q1,...,q5,q6,recall,agelt35,agegt54,durable,nondurable,lusd,husd,muld
0,10824,0,18,18,0,0,0,0,2,0,...,1,0,0,0,0,0,0,0,1,0
1,10635,2,7,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,10551,5,18,6,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3,10824,0,1,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,10747,0,27,27,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13908,10831,5,27,27,0,0,0,0,0,0,...,1,0,1,0,1,1,0,0,1,0
13909,10677,2,4,4,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
13910,10817,4,4,4,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
13911,10691,0,27,27,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0


In [None]:
    # I. Mantener solo observaciones donde 'tg' es 0 y 4
    # ---
    df_cleaned = df[df['tg'].isin([0, 4])].copy()
    print(f"Datos después de filtrar por 'tg' (0 y 4): {df_cleaned.shape}")

    # ---
    # II. Definir la variable de tratamiento 'T4' (d)
    # (1 si tg == 4, 0 si tg == 0)
    # ---
    df_cleaned['T4'] = (df_cleaned['tg'] == 4).astype(int)

    # ---
    # III. Definir la variable de resultado 'y' (logaritmo natural de 'inuidur1')
    # ---
    # Nota: Si 'inuidur1' puede ser 0 o negativo, esto creará -inf o NaN.
    # Una alternativa común es np.log1p(df_cleaned['inuidur1']), que es log(1 + inuidur1).
    # Por ahora, seguimos la instrucción exacta de log(inuidur1).
    df_cleaned['y'] = np.log(df_cleaned['inuidur1'])
    
# Manejar posibles valores infinitos si inuidur1 era 0
df_cleaned.replace([np.inf, -np.inf], np.nan, inplace=True)
if df_cleaned['y'].isnull().any():
    print("Advertencia: Se encontraron valores 0 o negativos en 'inuidur1', resultando en NaN.")
        

# IV. Crear variables dummy para 'dep'
# Esto crea columnas como dep_0, dep_1, dep_2
dep_dummies = pd.get_dummies(df_cleaned['dep'], prefix='dep', dtype=int)
    
# Unir los dummies al DataFrame principal
    df_cleaned = pd.concat([df_cleaned, dep_dummies], axis=1)


feature_list = [
        'female', 'black', 'othrace',
        'dep_1', 'dep_2',  # dep_0 se omite como categoría de referencia
        'q2', 'q3', 'q4', 'q5', 'q6', # q1 se omite como categoría de referencia
        'recall', 'agelt35', 'agegt54',
        'durable', 'nondurable', 'lusd', 'husd']
    
x = df_cleaned[feature_list]
y = df_cleaned['y']
d = df_cleaned['T4']

Datos después de filtrar por 'tg' (0 y 4): (5099, 23)


In [None]:
# Display the covariate frame `x` built in the cleaning cell.
x

## II. Debiased ML

In [21]:
def dml(X, D, y, modely, modeld, *, nfolds, classifier=False):
    """
    Cross-fitted residual-on-residual estimate of the effect of D on y.

    Out-of-fold predictions of y and D given X are subtracted from the observed
    values, and the outcome residuals are regressed by OLS (with an intercept)
    on the treatment residuals.

    Parameters
    ----------
    X : covariates passed to both learners.
    D : treatment variable.
    y : outcome variable.
    modely : scikit-learn estimator used to predict y from X.
    modeld : scikit-learn estimator used to predict D from X.
    nfolds : number of KFold splits used for cross-fitting (keyword-only).
    classifier : if True, `modeld` is treated as a classifier and the second
        column of `predict_proba` is used as the prediction of D.

    Returns
    -------
    tuple
        (point, stderr, yhat, Dhat, resy, resD, epsilon): the coefficient on
        the treatment residual, its OLS standard error, the out-of-fold
        predictions of y and D, the two residual vectors, and the residuals of
        the final OLS regression.
    """
    # 1. K-Fold splitter for cross-fitting; the fixed seed makes the folds
    #    identical across calls and across the y and D predictions below.
    cv = KFold(n_splits=nfolds, shuffle=True, random_state=123)

    # 2. Out-of-fold predictions of y.
    yhat = cross_val_predict(modely, X, y, cv=cv, n_jobs=-1)

    # 3. Out-of-fold predictions of D.
    if classifier:
        # Use the predicted probability (second predict_proba column), not the label.
        Dhat = cross_val_predict(modeld, X, D, cv=cv, method='predict_proba', n_jobs=-1)[:, 1]
    else:
        Dhat = cross_val_predict(modeld, X, D, cv=cv, n_jobs=-1)

    # 4. Residuals: observed minus out-of-fold prediction.
    resy = y - yhat
    resD = D - Dhat

    # 5. Final regression: resy = intercept + alpha * resD + error.
    dml_data = pd.DataFrame({'resy': resy, 'resD': resD})
    ols_mod = smf.ols(formula='resy ~ 1 + resD', data=dml_data).fit()

    # Select the slope by label: params/bse are label-indexed Series, and
    # integer keys on them (params[1]) are deprecated positional access.
    point = ols_mod.params['resD']
    stderr = ols_mod.bse['resD']
    epsilon = ols_mod.resid

    return point, stderr, yhat, Dhat, resy, resD, epsilon

In [22]:
def summary(point, stderr, yhat, Dhat, resy, resD, epsilon, X, D, y, *, name):
    """
    Build a one-row summary table for a DML result.

    Only `point`, `stderr`, `resy`, `resD` and `name` are used; the other
    positional arguments are accepted so the tuple returned by `dml` can be
    unpacked straight into this function.

    Returns a DataFrame indexed by `name` with columns 'estimate', 'stderr',
    'rmse y' and 'rmse D'.
    """
    def _rmse(residuals):
        # Root mean squared value of a residual vector.
        return np.sqrt(np.mean(residuals**2))

    row = {
        'estimate': point,        # point estimate
        'stderr': stderr,         # standard error
        'rmse y': _rmse(resy),    # RMSE of the outcome model
        'rmse D': _rmse(resD),    # RMSE of the treatment model
    }
    return pd.DataFrame(row, index=[name])

In [26]:

# --- Linear baseline: OLS for 'y', cross-validated logistic regression for 'd'
modely_ols = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)
modeld_ols = make_pipeline(
    StandardScaler(),
    LogisticRegressionCV(cv=5, random_state=123),
)

result_OLS = dml(x, d, y, modely_ols, modeld_ols, nfolds=10, classifier=True)
table_OLS = summary(*result_OLS, x, d, y, name='OLS/Logit')


# --- Lasso: LassoCV for 'y', L1-penalised logistic regression for 'd'
modely_lasso = make_pipeline(
    StandardScaler(),
    LassoCV(cv=5, random_state=123),
)
modeld_lasso = make_pipeline(
    StandardScaler(),
    LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear', random_state=123),
)

result_Lasso = dml(x, d, y, modely_lasso, modeld_lasso, nfolds=10, classifier=True)
table_Lasso = summary(*result_Lasso, x, d, y, name='Lasso')

# --- Random forest for both nuisance functions
modely_rf = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(n_estimators=100, min_samples_leaf=5, random_state=123),
)
modeld_rf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=100, min_samples_leaf=5, random_state=123),
)

result_RF = dml(x, d, y, modely_rf, modeld_rf, nfolds=10, classifier=True)
table_RF = summary(*result_RF, x, d, y, name='Random Forest')

# --- Neural network (MLP with two hidden layers) for both nuisance functions
modely_nn = make_pipeline(
    StandardScaler(),
    MLPRegressor(hidden_layer_sizes=(50, 20), max_iter=500, random_state=123, early_stopping=True),
)
modeld_nn = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(50, 20), max_iter=500, random_state=123, early_stopping=True),
)

result_NN = dml(x, d, y, modely_nn, modeld_nn, nfolds=10, classifier=True)
table_NN = summary(*result_NN, x, d, y, name='NN (MLP)')
table_NN

  point = ols_mod.params[1]
  stderr = ols_mod.bse[1]
  point = ols_mod.params[1]
  stderr = ols_mod.bse[1]
  point = ols_mod.params[1]
  stderr = ols_mod.bse[1]
  point = ols_mod.params[1]
  stderr = ols_mod.bse[1]


In [None]:
# Stack the one-row summaries of every learner and order them by the RMSE of
# the outcome model, breaking ties with the RMSE of the treatment model.
learner_tables = [table_OLS, table_Lasso, table_RF, table_NN]
table = pd.concat(learner_tables)
table_sorted = table.sort_values(['rmse y', 'rmse D'])
print(table_sorted)

               estimate    stderr    rmse y    rmse D
Lasso         -0.078471  0.034807  1.195067  0.487205
OLS/Logit     -0.072275  0.035244  1.195097  0.474773
NN (MLP)      -0.059167  0.035424  1.209569  0.478098
Random Forest -0.074383  0.035691  1.217272  0.477508


## III. No cross-fitting
