In [1]:
import numpy as np
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.ensemble import RandomForestRegressor

import plotly.express as px

from doc.mse_estimator import ErrorComparer
from doc.data_generation import gen_rbf_X, gen_matern_X, gen_cov_mat
from spe.forest import BlurredForest
from spe.tree import Tree
from spe.estimators import better_test_est_split, cp_bagged_train_test

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Seed NumPy's legacy global RNG; the np.random.* calls in later cells draw
# from this global state.
SEED = 1
np.random.seed(SEED)

In [3]:
## number of realizations to run
niter = 100

## data generation parameters
gsize = 20      # upper end of the coordinate range on each axis
n = 20 ** 2     # number of observations; the grid cell takes sqrt(n) per side
p = 30          # number of columns in the design matrix / entries in beta
s = 30          # number of entries of beta that receive a random value
delta = 0.75    # weight on the kernel covariance; (1 - delta) goes to the identity
snr = 0.4       # forwarded to ErrorComparer.compare as `snr`
tr_frac = 0.5   # NOTE(review): not referenced by any other visible cell

# Kernel used to build the noise covariance matrix.
noise_kernel = 'matern'
noise_length_scale = 1.0
noise_nu = 0.5

# Kernel used to generate the design matrix X.
X_kernel = 'matern'
X_length_scale = 5.0
X_nu = 2.5

## ErrorComparer parameters
max_depth = 3

# Constructor settings shared by all three learners.
_learner_kwargs = dict(max_depth=max_depth, max_features='sqrt')

# Each inner list is handed to a single err_cmp.compare call together with
# `ests` and `est_kwargs`.
# NOTE(review): the three lists look positionally paired (learner i with
# estimator i and kwargs i) -- confirm against ErrorComparer.compare.
models = [
    [
        RandomForestRegressor(**_learner_kwargs),
        BlurredForest(**_learner_kwargs),
        Tree(**_learner_kwargs),
    ],
]

ests = [better_test_est_split, better_test_est_split, cp_bagged_train_test]

est_kwargs = [
    dict(alpha=None, full_refit=False),
    dict(alpha=None),
    dict(use_trace_corr=False),
]

## plot parameters
# Labels handed to gen_model_barplots in the plotting cell.
model_names = [
    "Blurred Forest",
]
est_names = [
    "NPRF",
    "PRF",
    "GenCpPRF",
]

In [4]:
# Comparer object whose `compare` method runs the simulation loop below.
err_cmp = ErrorComparer()

In [5]:
# Lay out a regular nx-by-ny lattice of coordinates on [0, gsize] x [0, gsize].
nx = ny = int(np.sqrt(n))

# int() truncates, so a non-square n would silently produce fewer than n grid
# points and disagree with the n-sized arrays built from `n` in later cells.
if nx * ny != n:
    raise ValueError(
        f"n must be a perfect square to form a square grid, got n={n}"
    )

xs = np.linspace(0, gsize, nx)
ys = np.linspace(0, gsize, ny)

# Flattened x / y coordinate of every lattice point (length nx * ny each).
c_x, c_y = np.meshgrid(xs, ys)
c_x = c_x.flatten()
c_y = c_y.flatten()

# One row per lattice point, columns are (x, y).
coord = np.stack([c_x, c_y]).T

In [6]:
# Only the two kernel options produce a non-trivial covariance; anything else
# falls back to the identity.
_uses_kernel = noise_kernel in ('rbf', 'matern')

if noise_kernel == 'rbf':
    _noise_kern = RBF(length_scale=noise_length_scale)
elif noise_kernel == 'matern':
    _noise_kern = Matern(length_scale=noise_length_scale, nu=noise_nu)

if _uses_kernel:
    _base_cov = gen_cov_mat(c_x, c_y, _noise_kern)
else:
    _base_cov = np.eye(n)

# Kernel part scaled by delta, kept under its own name.
Cov_st = delta * _base_cov
# Final covariance: delta * kernel part + (1 - delta) * identity.
Sigma_t = delta * _base_cov + (1 - delta) * np.eye(n)

# Lower-triangular factor passed to the comparer as Chol_y; in the fallback
# case Sigma_t is the identity, so the factor is written down directly.
Chol_t = np.linalg.cholesky(Sigma_t) if _uses_kernel else np.eye(n)

In [7]:
# Pick the design-matrix generator by kernel name; unknown names fall back to
# an i.i.d. standard-normal n-by-p matrix.
_x_generators = {
    'rbf': lambda: gen_rbf_X(c_x, c_y, p),
    'matern': lambda: gen_matern_X(
        c_x, c_y, p, length_scale=X_length_scale, nu=X_nu
    ),
}
_make_X = _x_generators.get(X_kernel, lambda: np.random.randn(n, p))
X = _make_X()

# Coefficient vector: s distinct positions out of p get a Uniform(-1, 1) draw,
# the rest stay at zero.
beta = np.zeros(p)
idx = np.random.choice(p, size=s, replace=False)
beta[idx] = np.random.uniform(-1, 1, size=s)

In [8]:
# Boolean mask of length n with every entry True.
# NOTE(review): tr_frac from the parameter cell is not applied here -- confirm
# that flagging all points is intended.
tr_idx = np.full(n, True, dtype=bool)

In [9]:
# Keyword arguments that are identical for every model group.
_compare_settings = dict(
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr,
    X=X,
    beta=beta,
    coord=coord,
    Chol_y=Chol_t,
    Chol_ystar=None,
    Cov_y_ystar=None,
    tr_idx=tr_idx,
    fair=False,
)

# One compare() result per entry of `models`, in the same order.
model_errs = []
for model in models:
    errs = err_cmp.compare(model, ests, est_kwargs, **_compare_settings)
    model_errs.append(errs)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [01:16<00:00,  1.30it/s]


In [10]:
# Re-import the plotting helpers so edits made to doc/plotting_utils.py after
# the kernel started are picked up without a restart.
from importlib import reload
import doc.plotting_utils
reload(doc.plotting_utils)
# Import the name only after the reload so it is bound to the reloaded module's function.
from doc.plotting_utils import gen_model_barplots

In [14]:
# Figure title and bar colour, pulled out so they are easy to tweak.
_plot_title = "Bagged Models: Spatial Train/Test Split, SSN"
# NOTE(review): D3[0] is a single colour string, not a list of colours --
# confirm gen_model_barplots accepts a scalar for color_discrete_sequence.
_bar_color = px.colors.qualitative.D3[0]

fig = gen_model_barplots(
    model_errs,
    model_names,
    est_names,
    title=_plot_title,
    has_test_risk=False,
    err_bars=True,
    color_discrete_sequence=_bar_color,
)
fig.show()