In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LogisticRegressionCV, LinearRegression,\
    ElasticNet, ElasticNetCV, MultiTaskElasticNet, MultiTaskElasticNetCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import mliv.dgps_mediated as dgps
from mliv.rkhs import ApproxRKHSIVCV, RKHSIVCV
from sklearn.pipeline import Pipeline

from pathlib import Path
import numpy as np
#from mliv.neuralnet.deepiv_fit import deep_iv_fit
from mliv.tsls import tsls, regtsls
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



In [None]:
from dml_longterm import DML_longterm
from dml_npiv import DML_npiv

In [None]:
# Read the data file; header=0 takes the column names from the first row.
df = pd.read_csv('../data/data_star.csv', header=0)

# Rows are kept only where the prior z-score is observed. The mask is built
# from that single column, so missing values in the other columns (if any)
# are NOT filtered out here.
Xnan = df[["z_score_prior"]].values
has_prior = ~np.isnan(Xnan)

# Extract every model input as an (n_kept, 1) column array.
# X and V are both the prior z-score; G, D, S (z_score3) and Y (z_score8)
# are taken from the columns of the same name / listed here.
# NOTE(review): presumably G is the sample indicator and D the treatment --
# confirm against the data dictionary.
X, G, D, S, Y, V = (
    df[[column]].values[has_prior].reshape(-1, 1)
    for column in ("z_score_prior", "G", "D", "z_score3", "z_score8", "z_score_prior")
)

# Column of ones with one entry per retained row.
ones = np.ones((X.shape[0], 1))
 

In [None]:


def _rkhs_bridge(n_components):
    """Return an ApproxRKHSIVCV learner; only the Nystrom component count varies."""
    return ApproxRKHSIVCV(kernel_approx='nystrom', n_components=n_components,
                          kernel='rbf', gamma=.1, delta_scale='auto',
                          delta_exp=.4, alpha_scales=np.geomspace(1, 10000, 10), cv=10)


# Estimator configured for the 'latent_unconfounded' long-term model.
dml_2sls = DML_longterm(Y=Y, D=D, S=S, G=G, X1=X,
                        estimator='MR',
                        longterm_model='latent_unconfounded',
                        model1=_rkhs_bridge(1000),
                        model2=_rkhs_bridge(200),
                        n_folds=10, n_rep=1,
                        prop_score=LogisticRegression(max_iter=1000))

# Fit the nuisance models on the full sample (train == test, no cross-fitting
# in this cell) by calling the estimator's internal helpers directly.
delta_d1, delta_d0, nu_1, nu_0 = dml_2sls._nnpivfit_outcome_latent(Y=Y, D=D, S=S, X=X, G=G)

(pr_d1_g0_x, pr_g1_d1_sx, pr_g1_d0_sx, pr_g1_x, _) = dml_2sls._propensity_score_latent(
    S_train=S, X_train=X, D_train=D, G_train=G,
    S_test=S, X_test=X)

# Evaluation grid: 100 points between the 5th and 95th percentile of X,
# paired with S held at its sample mean. (Not used further in this cell.)
X_test = np.linspace(np.percentile(X, 5), np.percentile(X, 95), 100).reshape(-1, 1)
S_test = S.mean()*np.ones(X_test.shape[0]).reshape(-1, 1)

# Fitted values of the four nuisance functions at the observed data.
SX = np.column_stack((S, X))
delta_d1_hat = delta_d1.predict(SX)
delta_d0_hat = delta_d0.predict(SX)

nu_1_hat = nu_1.predict(X)
nu_0_hat = nu_0.predict(X)

# Weights applied to the residuals Y - delta_d(S, X).
alfa_1_hat = (G * D * (1-pr_g1_d1_sx)) / (pr_g1_d1_sx * pr_d1_g0_x * (1-pr_g1_x))
alfa_0_hat = (G * (1-D) * (1-pr_g1_d0_sx)) / (pr_g1_d0_sx * (1-pr_d1_g0_x) * (1-pr_g1_x))

# IPW weights applied to the residuals of the second outcome bridge,
# delta_d(S, X) - nu_d(X).
eta_1_hat = ((1-G) * D ) / (pr_d1_g0_x * (1-pr_g1_x))
eta_0_hat = ((1-G) * (1-D) ) / ((1-pr_d1_g0_x) * (1-pr_g1_x))

# Per-observation score for each treatment arm, and their difference.
y1_hat = nu_1_hat + alfa_1_hat * (Y - delta_d1_hat) + eta_1_hat * (delta_d1_hat - nu_1_hat)
y0_hat = nu_0_hat + alfa_0_hat * (Y - delta_d0_hat) + eta_0_hat * (delta_d0_hat - nu_0_hat)
psi_hat = y1_hat - y0_hat

# Scores against the covariate, followed by their sample average.
plt.scatter(X,psi_hat)

print(psi_hat.mean())



In [None]:

# Linear regression (with intercept) of the per-observation scores on X.
hlo = LinearRegression()
hlo.fit(X, psi_hat)

# Slope first, intercept second, one per line.
print(hlo.coef_, hlo.intercept_, sep="\n")

In [None]:
import statsmodels.api as sm


# W.hat: estimated propensity P(D = 1 | X) from a logistic regression.
# This must be the fitted probability, not the predicted class label:
# `predict` returns hard 0/1 labels, which would make the treatment residual
# D - D_hat take values only in {-1, 0, 1} instead of being centred on the
# propensity score.
# Assumes D is binary and coded 0/1, so column 1 of predict_proba is the
# probability of the treated class -- TODO confirm against logit.classes_.
logit = LogisticRegression().fit(X, D.flatten())
D_hat = logit.predict_proba(X)[:, 1]

# Y.hat: linear-regression estimate of E(Y | X).
linreg = LinearRegression().fit(X, Y.flatten())
Y_hat = linreg.predict(X)



In [None]:

# Residualised treatment and the average score, shared by both regressors.
d_resid = D.flatten() - D_hat
psi_bar = np.mean(psi_hat)

# Response: outcome minus its fitted value from the regression on X.
target = Y.flatten() - Y_hat

# Regressor 1 scales the treatment residual by the mean score;
# regressor 2 scales it by each observation's deviation from that mean.
mean_forest_prediction = d_resid * psi_bar
differential_forest_prediction = d_resid * (psi_hat.flatten() - psi_bar)

# Least squares through the origin (no constant); the cell displays the two
# fitted coefficients.
calibration_design = np.column_stack([mean_forest_prediction, differential_forest_prediction])
LinearRegression(fit_intercept=False).fit(calibration_design, target).coef_


In [None]:


# Collect the calibration-regression response and its two regressors in one
# table, keyed by the column names used in the OLS cell.
treat_resid = D.flatten() - D_hat
psi_mean = np.mean(psi_hat)

calibration_columns = {
    'target': Y.flatten() - Y_hat,
    'mean.forest.prediction': treat_resid * psi_mean,
    'differential.forest.prediction': treat_resid * (psi_hat.flatten() - psi_mean),
}
TC = pd.DataFrame(calibration_columns)


In [None]:

# OLS of the target on the two calibration regressors. No constant is added
# to the design matrix, so the fit has no intercept.
yy = TC.loc[:, 'target']
XX = TC.loc[:, ['mean.forest.prediction', 'differential.forest.prediction']]
model = sm.OLS(endog=yy, exog=XX).fit()

# Full regression table (coefficients, standard errors, fit statistics).
print(model.summary())

In [None]:
# Percentile levels 5, 10, ..., 95.
percentiles = 5 * np.arange(1, 20)
# Vector of points v at which we want to centre the local estimate theta:
# the corresponding percentiles of V.
v_values = np.percentile(V, q=percentiles)


In [None]:

# Localised estimator under the 'surrogacy' long-term model: V (= X) is the
# localisation variable and v_values are the points the estimates are
# centred at, using a Gaussian kernel with a Silverman bandwidth.
dml_2sls = DML_longterm(Y, D, S, G, X1=None, V=X, v_values = v_values,
                        estimator='MR',
                        loc_kernel='gau',
                        bw_loc='silverman',
                        longterm_model='surrogacy',
                        model1 = ApproxRKHSIVCV(kernel_approx='nystrom', n_components=200,
                                   kernel='rbf', gamma=.1, delta_scale='auto',
                                   delta_exp=.4, alpha_scales=np.geomspace(1, 10000, 10), cv=10),
                        model2 = ApproxRKHSIVCV(kernel_approx='nystrom', n_components=200,
                                   kernel='rbf', gamma=.1, delta_scale='auto',
                                   delta_exp=.4, alpha_scales=np.geomspace(1, 10000, 10), cv=10),
                n_folds=10, n_rep=1, CHIM = True, prop_score=LogisticRegression(max_iter=2000), opts = {'lin_degree': 3})

theta, vart, ci = dml_2sls.dml()

# Flatten theta before combining it with the 1-D interval bounds so that a
# column-shaped theta cannot silently broadcast into a matrix of differences.
# assumes theta has one entry per element of v_values and ci is
# (len(v_values), 2) with columns (lower, upper) -- TODO confirm.
theta_flat = np.ravel(theta)
lower_ci = ci[:, 0]
upper_ci = ci[:, 1]
yerr = [theta_flat - lower_ci, upper_ci - theta_flat]

# Plot each local estimate at the value of V it is centred on, with its
# confidence interval drawn as error bars (previously yerr was computed but
# never plotted, and the x-axis was just the array index).
plt.figure(figsize=(7, 3))
plt.errorbar(v_values, theta_flat, yerr=yerr, marker='o', markersize=3, capsize=3)
plt.axhline(np.mean(theta_flat), linewidth=1)  # average of the local estimates
plt.axhline(y=0, color='black', linestyle='--', linewidth=1)  # zero-effect reference
plt.xlabel('z_score_prior (5th-95th percentile grid)')
plt.ylabel('theta')