In [1]:
import numpy as np
import pandas as pd
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from doc.mse_estimator import ErrorComparer
from doc.data_generation import gen_rbf_X, gen_matern_X, create_clus_split, gen_cov_mat
from spe.tree import Tree
from spe.relaxed_lasso import RelaxedLasso
from spe.forest import ParametricBaggingRegressor
from spe.estimators import better_test_est_split, cp_bagged_train_test, simple_train_test_split

In [2]:
# Seed NumPy's legacy global RNG so the np.random.* calls made in this notebook
# (e.g. np.random.randn / np.random.shuffle below) are repeatable across runs.
np.random.seed(1)

In [3]:
## ---------------------------------------------------------------------------
## Experiment configuration: everything a reader might want to tune lives here.
## ---------------------------------------------------------------------------

## number of simulated realizations per model
niter = 100

## data generation parameters
n = 10**2        # number of locations; the grid cell takes int(sqrt(n)) per side
p = 5            # number of features, forwarded to the X generators and to compare()
s = 5            # forwarded to compare() as `s`
delta = 0.75     # weight on the kernel part of the noise covariance; (1 - delta) goes on the identity
snr = 0.4        # forwarded to compare() as `snr`
tr_frac = 0.5    # forwarded to create_clus_split() for the spatial split

## noise covariance kernel ('rbf', 'matern', anything else -> identity)
noise_kernel = 'matern'
noise_length_scale = 1.0
noise_nu = 0.5

## feature (X) kernel ('rbf', 'matern', anything else -> i.i.d. standard normal)
X_kernel = 'matern'
X_length_scale = 5.0
X_nu = 2.5

## ErrorComparer parameters
k = 5            # only referenced by estimator kwargs that are currently disabled
max_depth = 3
models = [
    RelaxedLasso(lambd=0.05),
    Tree(max_depth=max_depth, max_features='sqrt'),
]
model_names = ["Relaxed Lasso", "Random Forest"]

# Each estimator is paired with its keyword arguments so the two lists handed to
# ErrorComparer.compare stay positionally aligned.
_est_specs = [
    (better_test_est_split, {'alpha': None, 'full_refit': False, 'bagg': True}),
    (cp_bagged_train_test, {'use_trace_corr': False, 'full_refit': False}),
    (simple_train_test_split, {}),
]
ests = [est for est, _ in _est_specs]
est_kwargs = [kwargs for _, kwargs in _est_specs]

## plot parameters
title = "Simulated Model Comparisons"
# NOTE(review): `ests` holds three estimators but only two labels are given here.
# The plotting cells label errs[1] and errs[2] with these names, so presumably the
# third estimator is never plotted and the "TrTs" label lands on the second one.
# Confirm the labels line up with the intended estimators.
est_names = ["GenCp", "TrTs"]

## output/save parameters (only used by the disabled save scaffolding)
idx = -1

savedir = '~'

In [4]:
# Disabled save/export scaffolding (parameter CSV and output file paths), kept for reference.
# NOTE(review): re-enabling it as-is would fail: `os` is not imported in the import
# cell and `n_estimators` is not defined in the visible configuration cell. Also,
# `savedir + 'params.csv'` concatenates without a path separator.
# if not os.path.exists(os.path.expanduser(savedir)):
#     os.makedirs(os.path.expanduser(savedir))

# params = pd.DataFrame({'niter': niter,
#                        'n': n, 
#                        'p': p, 
#                        's': s,
#                        'snr': snr,
#                        'n_estimators': n_estimators,
#                        'max_depth': max_depth,
#                        'delta': delta,
#                        'nk': noise_kernel, 
#                        'nls': noise_length_scale, 
#                        'nn': noise_nu, 
#                        'xk': X_kernel,
#                        'xls': X_length_scale,
#                        'xn': X_nu}, index=[idx])
# params.to_csv(os.path.expanduser(savedir + 'params.csv'))
# dffp = os.path.expanduser(savedir + "err_df.csv")
# barfp = os.path.expanduser(savedir + 'barchart.jpeg')

In [5]:
# Single ErrorComparer instance; its `compare` method is called once per model
# in each experiment cell below.
err_cmp = ErrorComparer()

In [6]:
# Build a regular nx-by-ny grid of 2-D locations on [0, 10] x [0, 10].
nx = ny = int(np.sqrt(n))

# Fail fast when n is not a perfect square: int() truncates, so the grid would
# hold fewer than n points while later cells still build n-sized objects
# (np.eye(n), np.random.randn(n, p)), giving inconsistent shapes downstream.
assert nx * ny == n, f"n={n} must be a perfect square to lay out an nx-by-ny grid"

xs = np.linspace(0, 10, nx)
ys = np.linspace(0, 10, ny)

# meshgrid returns 2-D coordinate arrays; flatten them to one entry per location.
c_x, c_y = np.meshgrid(xs, ys)
c_x = c_x.flatten()
c_y = c_y.flatten()

# One row per location: column 0 is the x coordinate, column 1 the y coordinate.
coord = np.stack([c_x, c_y]).T

In [7]:
# Choose the spatial kernel for the noise; any other value means "no structure".
if noise_kernel == 'rbf':
    noise_kern = RBF(length_scale=noise_length_scale)
elif noise_kernel == 'matern':
    noise_kern = Matern(length_scale=noise_length_scale, nu=noise_nu)
else:
    noise_kern = None

structured_noise = noise_kern is not None

# Base covariance: kernel matrix over the grid coordinates, or the identity.
base_cov = gen_cov_mat(c_x, c_y, noise_kern) if structured_noise else np.eye(n)

# Cov_st is the delta-weighted structured part; it is passed to compare() as
# Cov_y_ystar in the "shared structured noise" experiments.
Cov_st = delta * base_cov

# Full noise covariance: structured part plus a (1 - delta) multiple of the identity.
Sigma_t = Cov_st + (1 - delta) * np.eye(n)

# Cholesky factor handed to compare(); in the unstructured case Sigma_t is the
# identity, so the identity is used directly instead of factorising.
Chol_t = np.linalg.cholesky(Sigma_t) if structured_noise else np.eye(n)

In [8]:
# Design matrix X: generated over the grid coordinates for the 'matern' and 'rbf'
# settings, otherwise drawn as i.i.d. standard normal with shape (n, p).
if X_kernel == 'matern':
    X = gen_matern_X(c_x, c_y, p, length_scale=X_length_scale, nu=X_nu)
elif X_kernel == 'rbf':
    X = gen_rbf_X(c_x, c_y, p)
else:
    X = np.random.randn(n, p)

# Disabled: explicit sparse coefficient vector (would be passed as `beta=` to compare()).
# beta = np.zeros(p)
# idx = np.random.choice(p,size=s,replace=False)
# beta[idx] = np.random.uniform(-1,1,size=s)
# beta[idx] = np.random.uniform(1,3,size=s) * np.random.choice([-1,1],size=s,replace=True)

# Spatial Train/Test Split (clustered; `tr_frac = 0.5`)

In [9]:
# Training-set selection for the spatial experiments: a clustered split over the
# int(sqrt(n))-per-side grid, with tr_frac forwarded as the third argument.
grid_side = int(np.sqrt(n))
tr_idx = create_clus_split(grid_side, grid_side, tr_frac)
# Alternative (disabled): mark every location as training.
# tr_idx = np.ones(n, dtype=bool)

# Simulate $Y, Y^* \overset{iid}{\sim} \mathcal{N}(\mu, \Sigma_Y)$

In [10]:
# Reset first so a failure part-way through cannot leave results from an
# earlier experiment in `model_errs`.
model_errs = []

# Arguments shared by every model in this experiment.
# Cov_y_ystar=None: no cross-covariance between Y and Y* is supplied, matching
# the "iid" setting named in the heading above.
# Arguments tried previously and currently not passed: beta, X_kernel / X_ls /
# X_nu, tr_frac, est_sigma, est_sigma_model.
compare_kwargs = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X,
    coord=coord,
    Chol_y=Chol_t,
    Chol_ystar=Chol_t,
    Cov_y_ystar=None,
    tr_idx=tr_idx,
    fair=False,
    friedman_mu=True,
)

# One compare() run per model; results are kept in model order.
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **compare_kwargs)
    model_errs.append(errs)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:32<00:00,  3.10it/s]
100%|██████████| 100/100 [00:15<00:00,  6.26it/s]


In [11]:
# Bar chart, one subplot per model, of each risk estimate's mean divided by the
# mean of the reference error in errs[0]. The red line at 1 marks a perfect match.
fig = make_subplots(
    rows=1, cols=len(models),
    subplot_titles=model_names)

for col, errs in enumerate(model_errs, start=1):
    # errs[0] is used as the reference (test) error; errs[1:] are the estimates.
    test_risk = errs[0].mean()

    # Label the first len(est_names) estimates.
    # NOTE(review): `ests` has three entries but `est_names` has two, so the last
    # estimator is not plotted. Confirm the labels match the intended estimators.
    df = pd.DataFrame({name: errs[j + 1] for j, name in enumerate(est_names)})
    rel_mse = df.mean() / test_risk

    fig.add_trace(go.Bar(
        x=df.columns,
        y=rel_mse,
        marker_color=px.colors.qualitative.Plotly,
        text=np.around(rel_mse, 3),
        textposition='outside',
    ), row=1, col=col)
    fig.add_hline(y=1., line_color='red', row=1, col=col)

    fig.update_xaxes(title_text="Method", row=1, col=col)
    fig.update_yaxes(title_text="Relative MSE", row=1, col=col)

fig.update_layout(title="Bagged Models: Spatial Train/Test Split, No Shared Noise", showlegend=False)
fig.show()

# Simulate $\begin{pmatrix} Y \\ Y^* \end{pmatrix} \sim \mathcal{N}\left(\begin{pmatrix} \mu \\ \mu \end{pmatrix}, \begin{pmatrix}\Sigma_Y & \Sigma_{Y, Y^*} \\ \Sigma_{Y^*, Y} & \Sigma_{Y}  \end{pmatrix}\right)$

In [12]:
# Reset first so a failure part-way through cannot leave results from an
# earlier experiment in `model_errs`.
model_errs = []

# Arguments shared by every model in this experiment.
# Cov_y_ystar=Cov_st: the delta-weighted kernel covariance computed earlier is
# supplied as the cross-covariance between Y and Y*.
# Arguments tried previously and currently not passed: beta, X_kernel / X_ls /
# X_nu, tr_frac, est_sigma, est_sigma_model.
compare_kwargs = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X,
    coord=coord,
    Chol_y=Chol_t,
    Chol_ystar=Chol_t,
    Cov_y_ystar=Cov_st,
    tr_idx=tr_idx,
    fair=False,
    friedman_mu=True,
)

# One compare() run per model; results are kept in model order.
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **compare_kwargs)
    model_errs.append(errs)

100%|██████████| 100/100 [00:35<00:00,  2.78it/s]
100%|██████████| 100/100 [00:17<00:00,  5.83it/s]


In [13]:
# Bar chart, one subplot per model, of each risk estimate's mean divided by the
# mean of the reference error in errs[0]. The red line at 1 marks a perfect match.
fig = make_subplots(
    rows=1, cols=len(models),
    subplot_titles=model_names)

for col, errs in enumerate(model_errs, start=1):
    # errs[0] is used as the reference (test) error; errs[1:] are the estimates.
    test_risk = errs[0].mean()

    # Label the first len(est_names) estimates.
    # NOTE(review): `ests` has three entries but `est_names` has two, so the last
    # estimator is not plotted. Confirm the labels match the intended estimators.
    df = pd.DataFrame({name: errs[j + 1] for j, name in enumerate(est_names)})
    rel_mse = df.mean() / test_risk

    fig.add_trace(go.Bar(
        x=df.columns,
        y=rel_mse,
        marker_color=px.colors.qualitative.Plotly,
        text=np.around(rel_mse, 3),
        textposition='outside',
    ), row=1, col=col)
    fig.add_hline(y=1., line_color='red', row=1, col=col)

    fig.update_xaxes(title_text="Method", row=1, col=col)
    fig.update_yaxes(title_text="Relative MSE", row=1, col=col)

fig.update_layout(title="Bagged Models: Spatial Train/Test Split, Shared Structured Noise", showlegend=False)
fig.show()

# Random 80/20 Split

In [14]:
# Random split: a boolean mask with the first int(.8*n) entries True, then
# shuffled in place so the training locations are spread uniformly at random.
n_train = int(.8*n)
tr_idx = np.zeros(n, dtype=bool)
tr_idx[:n_train] = True
np.random.shuffle(tr_idx)

# Simulate $Y, Y^* \overset{iid}{\sim} \mathcal{N}(\mu, \Sigma_Y)$

In [15]:
# NOTE(review): the saved output of this cell is an IndexError ("arrays used as
# indices must be of integer (or boolean) type"). Re-run from a fresh kernel to
# confirm the random-split mask is accepted by compare().

# Reset first so a failure part-way through cannot leave results from an
# earlier experiment in `model_errs`.
model_errs = []

# Arguments shared by every model in this experiment.
# Cov_y_ystar=None: no cross-covariance between Y and Y* is supplied, matching
# the "iid" setting named in the heading above.
# Arguments tried previously and currently not passed: beta, X_kernel / X_ls /
# X_nu, tr_frac, est_sigma, est_sigma_model.
compare_kwargs = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X,
    coord=coord,
    Chol_y=Chol_t,
    Chol_ystar=Chol_t,
    Cov_y_ystar=None,
    tr_idx=tr_idx,
    fair=False,
    friedman_mu=True,
)

# One compare() run per model; results are kept in model order.
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **compare_kwargs)
    model_errs.append(errs)

  0%|          | 0/100 [00:00<?, ?it/s]


IndexError: arrays used as indices must be of integer (or boolean) type

In [None]:
# Bar chart, one subplot per model, of each risk estimate's mean divided by the
# mean of the reference error in errs[0]. The red line at 1 marks a perfect match.
fig = make_subplots(
    rows=1, cols=len(models),
    subplot_titles=model_names)

for col, errs in enumerate(model_errs, start=1):
    # errs[0] is used as the reference (test) error; errs[1:] are the estimates.
    test_risk = errs[0].mean()

    # Label the first len(est_names) estimates.
    # NOTE(review): `ests` has three entries but `est_names` has two, so the last
    # estimator is not plotted. Confirm the labels match the intended estimators.
    df = pd.DataFrame({name: errs[j + 1] for j, name in enumerate(est_names)})
    rel_mse = df.mean() / test_risk

    fig.add_trace(go.Bar(
        x=df.columns,
        y=rel_mse,
        marker_color=px.colors.qualitative.Plotly,
        text=np.around(rel_mse, 3),
        textposition='outside',
    ), row=1, col=col)
    fig.add_hline(y=1., line_color='red', row=1, col=col)

    fig.update_xaxes(title_text="Method", row=1, col=col)
    fig.update_yaxes(title_text="Relative MSE", row=1, col=col)

fig.update_layout(title="Bagged Models: Train/Test Split, No Shared Noise", showlegend=False)
fig.show()

# Simulate $\begin{pmatrix} Y \\ Y^* \end{pmatrix} \sim \mathcal{N}\left(\begin{pmatrix} \mu \\ \mu \end{pmatrix}, \begin{pmatrix}\Sigma_Y & \Sigma_{Y, Y^*} \\ \Sigma_{Y^*, Y} & \Sigma_{Y}  \end{pmatrix}\right)$

In [None]:
# Reset first so a failure part-way through cannot leave results from an
# earlier experiment in `model_errs`.
model_errs = []

# Arguments shared by every model in this experiment.
# Cov_y_ystar=Cov_st: the delta-weighted kernel covariance computed earlier is
# supplied as the cross-covariance between Y and Y*.
# Arguments tried previously and currently not passed: beta, X_kernel / X_ls /
# X_nu, tr_frac, est_sigma, est_sigma_model.
compare_kwargs = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X,
    coord=coord,
    Chol_y=Chol_t,
    Chol_ystar=Chol_t,
    Cov_y_ystar=Cov_st,
    tr_idx=tr_idx,
    fair=False,
    friedman_mu=True,
)

# One compare() run per model; results are kept in model order.
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **compare_kwargs)
    model_errs.append(errs)

100%|██████████| 100/100 [00:33<00:00,  3.00it/s]
100%|██████████| 100/100 [00:14<00:00,  7.11it/s]


In [None]:
# Bar chart, one subplot per model, of each risk estimate's mean divided by the
# mean of the reference error in errs[0]. The red line at 1 marks a perfect match.
fig = make_subplots(
    rows=1, cols=len(models),
    subplot_titles=model_names)

for col, errs in enumerate(model_errs, start=1):
    # errs[0] is used as the reference (test) error; errs[1:] are the estimates.
    test_risk = errs[0].mean()

    # Label the first len(est_names) estimates.
    # NOTE(review): `ests` has three entries but `est_names` has two, so the last
    # estimator is not plotted. Confirm the labels match the intended estimators.
    df = pd.DataFrame({name: errs[j + 1] for j, name in enumerate(est_names)})
    rel_mse = df.mean() / test_risk

    fig.add_trace(go.Bar(
        x=df.columns,
        y=rel_mse,
        marker_color=px.colors.qualitative.Plotly,
        text=np.around(rel_mse, 3),
        textposition='outside',
    ), row=1, col=col)
    fig.add_hline(y=1., line_color='red', row=1, col=col)

    fig.update_xaxes(title_text="Method", row=1, col=col)
    fig.update_yaxes(title_text="Relative MSE", row=1, col=col)

fig.update_layout(title="Bagged Models: Train/Test Split, Shared Structured Noise", showlegend=False)
fig.show()