In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.tree import DecisionTreeRegressor

from xgboost import XGBRegressor

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from doc.mse_estimator import ErrorComparer
from doc.data_generation import gen_rbf_X, gen_matern_X, create_clus_split, gen_cov_mat
from doc.plotting_utils import gen_model_barplots
from spe.estimators import kfoldcv, kmeanscv, better_test_est_split, cp_general_train_test, by_spatial
from spe.relaxed_lasso import RelaxedLasso

In [2]:
# Seed NumPy's legacy global RNG so the np.random.* draws in the cells below are repeatable.
np.random.seed(1)

In [3]:
## number of realizations to run
niter = 100

## data generation parameters
n = 100          # observations; later cells place them on an int(sqrt(n)) x int(sqrt(n)) grid
p = 200          # number of columns in each design matrix
s = 5            # number of entries of beta that receive a random draw
delta = 0.75     # weight on the kernel covariance when it is mixed with the identity
snr = 0.4        # forwarded to err_cmp.compare as `snr`
tr_frac = 0.25   # only referenced by the commented-out create_clus_split call

noise_kernel = 'matern'
noise_length_scale = 0.25
noise_nu = 1.5

X_kernel = 'matern'
X_length_scale = 5.0
X_nu = 2.5

## ErrorComparer parameters
alpha = 0.05
nboot = 100
k = 5            # only referenced by the commented-out kfoldcv / kmeanscv entries

# Exactly one model list is active; the alternatives are kept for quick switching.
# models = [LassoCV(alphas=[.1, .25, .5, 1.])]
models = [Lasso(alpha=0.1)]
# models = [RelaxedLasso(lambd=.31)]
# models = [XGBRegressor(n_estimators=25, max_depth=3)]
# models = [DecisionTreeRegressor(max_depth=3)]
# models = [LinearRegression(fit_intercept=False)]

# Each estimator is written next to the keyword arguments it runs with; the two
# parallel lists handed to err_cmp.compare in later cells are derived from these pairs.
est_specs = [
    (better_test_est_split, {'alpha': None, 'full_refit': False}),
    (better_test_est_split, {'alpha': 1., 'full_refit': False}),
    (cp_general_train_test, {'alpha': alpha, 'use_trace_corr': False, 'nboot': nboot}),
    (cp_general_train_test, {'alpha': 1., 'use_trace_corr': False, 'nboot': nboot}),
    (by_spatial, {'alpha': .05, 'nboot': nboot}),
    (by_spatial, {'alpha': 1., 'nboot': nboot}),
    (by_spatial, {'alpha': 5., 'nboot': nboot}),
    # (kfoldcv, {'k': k}),
    # (kmeanscv, {'k': k}),
]
ests = [est for est, _ in est_specs]
est_kwargs = [kwargs for _, kwargs in est_specs]

## plot parameters
title = "General Model Comparisons"
# est_names = ["GenCp", "BY .05", "BY 5."]
# NOTE(review): 5 names for 7 estimators — presumably the two better_test_est_split
# entries are baselines that gen_model_barplots does not label; confirm.
est_names = ["GenCp .05", "GenCp 1.", "BY .05", "BY 1.", "BY 5."]
model_names = ["Lasso"]

In [4]:
# Single comparer instance; every experiment cell below calls its compare() method.
err_cmp = ErrorComparer()

In [5]:
# Build a regular nx-by-ny lattice on [0, 10] x [0, 10]; the flattened lattice
# points are the locations used for the covariance matrices in the next cells.
nx = ny = int(np.sqrt(n))
# Later cells combine covariances over these points with np.eye(n) and
# np.random.randn(n, p), so the lattice must contain exactly n points.
# int() truncates silently when n is not a perfect square; fail loudly instead.
assert nx * ny == n, f"n={n} must be a perfect square to form an nx-by-ny grid"
xs = np.linspace(0, 10, nx)
ys = np.linspace(0, 10, ny)
c_x, c_y = np.meshgrid(xs, ys)
c_x = c_x.flatten()
c_y = c_y.flatten()
# One (x, y) row per lattice point.
coord = np.stack([c_x, c_y]).T

In [6]:
# Choose the kernel for the noise covariance once; any value of noise_kernel
# other than 'rbf' / 'matern' means "no spatial structure" (identity).
if noise_kernel == 'rbf':
    noise_kern = RBF(length_scale=noise_length_scale)
elif noise_kernel == 'matern':
    noise_kern = Matern(length_scale=noise_length_scale, nu=noise_nu)
else:
    noise_kern = None

Sigma_t = np.eye(n) if noise_kern is None else gen_cov_mat(c_x, c_y, noise_kern)

# Cov_st is the delta-scaled kernel part; Sigma_t adds (1 - delta) on the diagonal.
Cov_st = delta * Sigma_t
Sigma_t = Cov_st + (1 - delta) * np.eye(n)

# Cholesky factor of the mixed covariance (identity when no kernel is used).
Chol_t = np.eye(n) if noise_kern is None else np.linalg.cholesky(Sigma_t)

In [7]:
# Kernel for the covariance of each column of X (None -> identity covariance).
if X_kernel == 'rbf':
    X_kern = RBF(length_scale=X_length_scale)
elif X_kernel == 'matern':
    X_kern = Matern(length_scale=X_length_scale, nu=X_nu)
else:
    X_kern = None

if X_kern is None:
    Sigma_X = np.eye(n)
    Chol_X = np.eye(n)
else:
    Sigma_X = gen_cov_mat(c_x, c_y, X_kern)
    Chol_X = np.linalg.cholesky(Sigma_X)

# The three designs are drawn in a fixed order (X, X_spikes, X_iso) so that the
# seeded global RNG yields the same matrices on every run.
X = Chol_X @ np.random.randn(n, p)

# Second design, produced by the project's gen_*_X helpers for the chosen kernel.
if X_kernel == 'rbf':
    X_spikes = gen_rbf_X(c_x, c_y, p)
elif X_kernel == 'matern':
    X_spikes = gen_matern_X(c_x, c_y, p, length_scale=X_length_scale, nu=X_nu)
else:
    X_spikes = np.random.randn(n, p)

# Third design: i.i.d. standard normal entries.
X_iso = np.random.randn(n, p)

In [8]:
# Same number of lattice points as before, but spread over [0, 30] x [0, 30], so
# neighbouring points are farther apart for the same X_length_scale.
# NOTE(review): this rebinds nx, ny, xs, ys, c_x, c_y and coord. Every later cell
# that passes `coord` therefore sees this 0-30 grid, while Sigma_t, X and X_spikes
# were built from the earlier 0-10 grid — confirm that this is intended.
nx = ny = int(np.sqrt(n))
xs = np.linspace(0, 30, nx)
ys = np.linspace(0, 30, ny)
c_x, c_y = (axis_grid.flatten() for axis_grid in np.meshgrid(xs, ys))
coord = np.stack([c_x, c_y]).T

# Kernel for the column covariance on the wider grid (None -> identity).
if X_kernel == 'rbf':
    X_less_kern = RBF(length_scale=X_length_scale)
elif X_kernel == 'matern':
    X_less_kern = Matern(length_scale=X_length_scale, nu=X_nu)
else:
    X_less_kern = None

if X_less_kern is None:
    Sigma_X_less = np.eye(n)
    Chol_X_less = np.eye(n)
else:
    Sigma_X_less = gen_cov_mat(c_x, c_y, X_less_kern)
    Chol_X_less = np.linalg.cholesky(Sigma_X_less)

X_less = Chol_X_less @ np.random.randn(n, p)

In [9]:
# Sparse coefficient vector: all zeros except at s positions chosen without replacement.
beta = np.zeros(p)
idx = np.random.choice(p,size=s,replace=False)
# The selected coefficients are drawn uniformly between -1 and 1.
beta[idx] = np.random.uniform(-1,1,size=s)
# Alternative kept for reference: magnitudes between 1 and 3 with a random sign.
# beta[idx] = np.random.uniform(1,3,size=s) * np.random.choice([-1,1],size=s,replace=True)

In [10]:
# Alternative kept for reference: a split built by create_clus_split using tr_frac.
# tr_idx = create_clus_split(
#             int(np.sqrt(n)), int(np.sqrt(n)), tr_frac
#         )
# Boolean mask of length n with every entry True; passed as tr_idx to each compare call below.
tr_idx = np.ones(n, dtype=bool)

# Simulate $Y, Y^* \mid X \overset{iid}{\sim} \mathcal{N}(X\beta, I\sigma^2)$

## $X_{\cdot,i}$ generated independently from uniform spikes at locations, then interpolated using the covariance matrix

In [11]:
# Settings for the X_spikes experiment. No Cholesky factors or cross-covariance
# are supplied for Y / Y* (all three are None).
spike_run = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X_spikes,
    beta=beta,
    coord=coord,
    Chol_y=None,
    Chol_ystar=None,
    Cov_y_ystar=None,
    tr_idx=tr_idx,
    fair=False,
    # friedman_mu=True,
    est_sigma=False,
    # risk=True,
)

# One result per entry of `models`, in the same order.
spike_model_errs = []
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **spike_run)
    spike_model_errs.append(errs)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:37<00:00,  2.68it/s]


In [12]:
# Plot the X_spikes results through gen_model_barplots; the literal "Spike" is
# used as the title instead of the shared `title` variable.
spike_fig = gen_model_barplots(
    spike_model_errs, model_names, est_names, "Spike",  # title,
    has_elev_err=True, err_bars=True,
)
spike_fig.show()

## $X_{i,\cdot} \sim \mathcal{N}(0, I\sigma^2)$

In [13]:
# Settings for the X_iso experiment. No Cholesky factors or cross-covariance
# are supplied for Y / Y* (all three are None).
iid_run = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X_iso,
    beta=beta,
    coord=coord,
    Chol_y=None,
    Chol_ystar=None,
    Cov_y_ystar=None,
    tr_idx=tr_idx,
    fair=False,
    # friedman_mu=True,
    est_sigma=False,
    # risk=True,
)

# One result per entry of `models`, in the same order.
iid_model_errs = []
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **iid_run)
    iid_model_errs.append(errs)

100%|██████████| 100/100 [01:06<00:00,  1.51it/s]


In [14]:
# Plot the X_iso results through gen_model_barplots; the literal "IID " is used
# as the title instead of the shared `title` variable.
iid_fig = gen_model_barplots(
    iid_model_errs, model_names, est_names, "IID ",  # + title,
    has_elev_err=True, err_bars=True,
)
iid_fig.show()

## $X_{\cdot,i} \sim \mathcal{N}(0, \Sigma), i=1,\dots,p$

In [15]:
# Settings for the experiment on X (columns drawn with covariance Sigma_X).
# No Cholesky factors or cross-covariance are supplied for Y / Y* (all None).
cov_run = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X,
    beta=beta,
    coord=coord,
    Chol_y=None,
    Chol_ystar=None,
    Cov_y_ystar=None,
    tr_idx=tr_idx,
    fair=False,
    # friedman_mu=True,
    est_sigma=False,
    # risk=True,
)

# One result per entry of `models`, in the same order.
# NOTE: the name model_errs is rebound by several later cells.
model_errs = []
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **cov_run)
    model_errs.append(errs)

100%|██████████| 100/100 [00:52<00:00,  1.90it/s]


In [16]:
# Plot whatever model_errs currently holds through gen_model_barplots; the
# literal "Cov" is used as the title instead of the shared `title` variable.
fig = gen_model_barplots(
    model_errs, model_names, est_names, "Cov",  # title,
    has_elev_err=True, err_bars=True,
)
fig.show()

## $X_{\cdot,i} \sim \mathcal{N}(0, \Sigma'), i=1,\dots,p$, where $\Sigma'$ has weaker correlation than $\Sigma$

In [17]:
# Settings for the X_less experiment (design drawn on the wider 0-30 grid).
# No Cholesky factors or cross-covariance are supplied for Y / Y* (all None).
less_run = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X_less,
    beta=beta,
    coord=coord,
    Chol_y=None,
    Chol_ystar=None,
    Cov_y_ystar=None,
    tr_idx=tr_idx,
    fair=False,
    # friedman_mu=True,
    est_sigma=False,
    # risk=True,
)

# One result per entry of `models`, in the same order.
less_model_errs = []
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **less_run)
    less_model_errs.append(errs)

100%|██████████| 100/100 [01:25<00:00,  1.17it/s]


In [18]:
# Plot the X_less results through gen_model_barplots; the literal "Less Cov " is
# used as the title instead of the shared `title` variable.
less_fig = gen_model_barplots(
    less_model_errs, model_names, est_names, "Less Cov ",  # + title,
    has_elev_err=True, err_bars=True,
)
less_fig.show()

## Look at empirical correlations

In [19]:
# Correlation matrix between the columns of X, with each row sorted ascending
# (np.sort works along the last axis); the trailing 1 in every row is the diagonal.
np.sort(np.corrcoef(X, rowvar=False))

array([[-0.77010928, -0.75433222, -0.73899265, ...,  0.83403481,
         0.85095821,  1.        ],
       [-0.72975336, -0.69967192, -0.68208492, ...,  0.78415484,
         0.79415019,  1.        ],
       [-0.89801686, -0.89557058, -0.82507915, ...,  0.84227179,
         0.90680111,  1.        ],
       ...,
       [-0.83141649, -0.82522123, -0.80054162, ...,  0.81607414,
         0.83044536,  1.        ],
       [-0.78796966, -0.75548096, -0.74417926, ...,  0.82682352,
         0.85669818,  1.        ],
       [-0.93294142, -0.90140388, -0.85653863, ...,  0.84504564,
         0.90722398,  1.        ]])

In [20]:
# Correlation matrix between the columns of X_less, with each row sorted ascending
# (np.sort works along the last axis); the trailing 1 in every row is the diagonal.
np.sort(np.corrcoef(X_less, rowvar=False))

array([[-0.55327021, -0.51715642, -0.50312813, ...,  0.52527156,
         0.53866103,  1.        ],
       [-0.49202592, -0.46468651, -0.44484744, ...,  0.3533939 ,
         0.47169813,  1.        ],
       [-0.52938166, -0.46403529, -0.44385411, ...,  0.55211203,
         0.55814064,  1.        ],
       ...,
       [-0.64566786, -0.50844897, -0.48336455, ...,  0.49837969,
         0.57023191,  1.        ],
       [-0.40115873, -0.39225632, -0.3790312 , ...,  0.49883071,
         0.50644021,  1.        ],
       [-0.5186391 , -0.51260602, -0.44103756, ...,  0.35242415,
         0.37300464,  1.        ]])

In [21]:
# Correlation matrix between the columns of X_spikes, with each row sorted ascending
# (np.sort works along the last axis); the trailing 1 in every row is the diagonal.
np.sort(np.corrcoef(X_spikes, rowvar=False))

array([[-0.97307875, -0.96669001, -0.96391574, ...,  0.95567654,
         0.97202192,  1.        ],
       [-0.98892977, -0.97012977, -0.96903044, ...,  0.97677125,
         0.98064846,  1.        ],
       [-0.97915245, -0.95678736, -0.95181149, ...,  0.97373062,
         0.9857968 ,  1.        ],
       ...,
       [-0.98964907, -0.9821244 , -0.95983825, ...,  0.97424183,
         0.98064846,  1.        ],
       [-0.97178881, -0.94760635, -0.93615089, ...,  0.97114264,
         0.97426349,  1.        ],
       [-0.97390979, -0.96995774, -0.96925727, ...,  0.9731727 ,
         0.98221063,  1.        ]])

In [22]:
# Correlation matrix between the columns of X_iso, with each row sorted ascending
# (np.sort works along the last axis); the trailing 1 in every row is the diagonal.
np.sort(np.corrcoef(X_iso, rowvar=False))

array([[-0.29586118, -0.25923707, -0.25108579, ...,  0.2688457 ,
         0.43682054,  1.        ],
       [-0.29586118, -0.25622887, -0.24649508, ...,  0.23790455,
         0.30983998,  1.        ],
       [-0.26149606, -0.24972442, -0.2470874 , ...,  0.25364662,
         0.31157507,  1.        ],
       ...,
       [-0.30655264, -0.30193738, -0.26879203, ...,  0.21330896,
         0.29343069,  1.        ],
       [-0.33768987, -0.26510483, -0.26352156, ...,  0.22999334,
         0.23280447,  1.        ],
       [-0.26735428, -0.24337202, -0.20769944, ...,  0.22779793,
         0.24611601,  1.        ]])

# Simulate $\begin{pmatrix} Y \\ Y^* \end{pmatrix} \sim \mathcal{N}\left(\begin{pmatrix} \mu \\ \mu \end{pmatrix}, \begin{pmatrix}\Sigma_Y & \mathrm 0 \\ \mathrm 0 & \Sigma_{Y}  \end{pmatrix}\right)$

In [None]:
# Settings for the experiment on X where Y and Y* both use Chol_t as their
# Cholesky factor and no cross-covariance is supplied (Cov_y_ystar=None).
indep_noise_run = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X,
    beta=beta,
    coord=coord,
    Chol_y=Chol_t,
    Chol_ystar=Chol_t,
    Cov_y_ystar=None,
    tr_idx=tr_idx,
    fair=False,
    # friedman_mu=True,
    est_sigma=False,
    risk=True,
)

# NOTE: rebinds model_errs, discarding the results an earlier cell stored under that name.
model_errs = []
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **indep_noise_run)
    model_errs.append(errs)

100%|██████████| 100/100 [04:23<00:00,  2.64s/it]


In [None]:
# Plot whatever model_errs currently holds through gen_model_barplots, titled with
# the shared `title`. err_bars is not passed here, unlike the earlier plotting cells.
fig = gen_model_barplots(
    model_errs, model_names, est_names, title,
    has_elev_err=True,
)
fig.show()

# Simulate $\begin{pmatrix} Y \\ Y^* \end{pmatrix} \sim \mathcal{N}\left(\begin{pmatrix} \mu \\ \mu \end{pmatrix}, \begin{pmatrix}\Sigma_Y & \Sigma_{Y, Y^*} \\ \Sigma_{Y^*, Y} & \Sigma_{Y}  \end{pmatrix}\right)$

In [None]:
# Settings for the experiment on X where Y and Y* both use Chol_t and their
# cross-covariance is Cov_st.
# NOTE(review): risk=False here, whereas the other cells that pass Chol_t use
# risk=True — confirm which is intended.
corr_noise_run = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X,
    beta=beta,
    coord=coord,
    Chol_y=Chol_t,
    Chol_ystar=Chol_t,
    Cov_y_ystar=Cov_st,
    tr_idx=tr_idx,
    fair=False,
    # friedman_mu=True,
    est_sigma=False,
    risk=False,
)

# NOTE: rebinds model_errs, discarding the results an earlier cell stored under that name.
model_errs = []
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **corr_noise_run)
    model_errs.append(errs)

100%|██████████| 100/100 [04:22<00:00,  2.63s/it]


In [None]:
# Plot whatever model_errs currently holds through gen_model_barplots, titled with
# the shared `title`. err_bars is not passed here, unlike the earlier plotting cells.
fig = gen_model_barplots(
    model_errs, model_names, est_names, title,
    has_elev_err=True,
)
fig.show()

In [None]:
# NOTE(review): this cell repeats the plotting call of the cell before it verbatim
# and re-renders the current model_errs; it looks like a leftover duplicate.
fig = gen_model_barplots(
    model_errs, model_names, est_names, title,
    has_elev_err=True,
)
fig.show()

In [None]:
# NOTE(review): this cell repeats the plotting call of the two cells before it
# verbatim and re-renders the current model_errs; it looks like a leftover duplicate.
fig = gen_model_barplots(
    model_errs, model_names, est_names, title,
    has_elev_err=True,
)
fig.show()

# $X$ now from IID Gaussians instead of spatial

In [None]:
# Settings for the experiment on X_iso where Y and Y* both use Chol_t and their
# cross-covariance is Cov_st.
iso_corr_noise_run = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X_iso,
    beta=beta,
    coord=coord,
    Chol_y=Chol_t,
    Chol_ystar=Chol_t,
    Cov_y_ystar=Cov_st,
    tr_idx=tr_idx,
    fair=False,
    # friedman_mu=True,
    est_sigma=False,
    risk=True,
)

# NOTE: rebinds model_errs, discarding the results an earlier cell stored under that name.
model_errs = []
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **iso_corr_noise_run)
    model_errs.append(errs)

100%|██████████| 100/100 [00:25<00:00,  3.97it/s]


In [None]:
# Plot whatever model_errs currently holds through gen_model_barplots, titled with
# the shared `title`. err_bars is not passed here, unlike the earlier plotting cells.
fig = gen_model_barplots(
    model_errs, model_names, est_names, title,
    has_elev_err=True,
)
fig.show()