In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.cluster import KMeans
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.ensemble import RandomForestRegressor

import plotly.express as px

import plotly.graph_objects as go

from scipy.linalg import toeplitz, block_diag
from scipy.spatial.distance import squareform

import skgstat as skg

from spe.mse_estimator import ErrorComparer
from spe.data_generation import gen_rbf_X, gen_matern_X, create_clus_split, gen_cov_mat
from spe.forest import BlurredForest
from spe.estimators import kfoldcv, kmeanscv, better_test_est_split, cp_rf_train_test, cp_general_train_test, bag_kfoldcv, bag_kmeanscv

import os

In [2]:
# --- Experiment size ---------------------------------------------------
niter = 50                   # forwarded as `niter` to ErrorComparer.compare
n, p, s = 30 ** 2, 30, 30    # grid points (nx * ny), columns of X, indices drawn for beta

# Weight on the kernel covariance when it is blended with the identity.
delta = 1.0

# --- Forest hyperparameters --------------------------------------------
n_estimators, max_depth = 100, 6

# Forwarded as `snr` to ErrorComparer.compare (presumably a signal-to-noise
# ratio -- confirm against the spe package).
snr = 0.4

# --- Kernel used to build the noise covariance -------------------------
noise_kernel = "matern"
noise_length_scale = 1.0
noise_nu = 0.5

# --- Kernel used to generate X -----------------------------------------
X_kernel = "matern"
X_length_scale = 5.0
X_nu = 2.5

# Row label of the one-row params DataFrame.
idx = -1

# Output directory; passed through os.path.expanduser before use.
savedir = "~"

In [3]:
# noise_kernel = kernel
# X_kernel = kernel

# noise_length_scale = length_scale
# X_length_scale = length_scale

# noise_nu = nu
# X_nu = nu

In [4]:
# Resolve the output directory once. Joining with os.path.join (rather than
# string concatenation) keeps the file names inside the directory even when
# `savedir` has no trailing separator, e.g. '~' -> '<home>/params.csv'
# instead of the literal '~params.csv' in the working directory.
out_dir = os.path.expanduser(savedir)
os.makedirs(out_dir, exist_ok=True)

# One-row record of the configuration used for this run.
params = pd.DataFrame({'niter': niter,
                       'n': n, 
                       'p': p, 
                       's': s,
                       'snr': snr,
                       'n_estimators': n_estimators,
                       'max_depth': max_depth,
                       'delta': delta,
                       'nk': noise_kernel, 
                       'nls': noise_length_scale, 
                       'nn': noise_nu, 
                       'xk': X_kernel,
                       'xls': X_length_scale,
                       'xn': X_nu}, index=[idx])
params.to_csv(os.path.join(out_dir, 'params.csv'))

# Destination for the per-estimator error table.
dffp = os.path.join(out_dir, "err_df.csv")
# barfp = os.path.join(out_dir, 'barchart.jpeg')

In [5]:
# Comparer instance whose compare() method drives both experiment runs below.
err_cmp = ErrorComparer()

In [6]:
# Regular nx-by-ny lattice on [0, 10] x [0, 10]; n is expected to be a
# perfect square so that nx * ny == n.
nx = ny = int(np.sqrt(n))
xs, ys = (np.linspace(0, 10, npts) for npts in (nx, ny))

# Flatten the mesh into one x vector and one y vector, then pair them up
# as an array with one (x, y) row per grid point.
grid_x, grid_y = np.meshgrid(xs, ys)
c_x, c_y = grid_x.flatten(), grid_y.flatten()
coord = np.vstack((c_x, c_y)).T

In [7]:
# Choose the sklearn kernel for the noise covariance; None means no kernel
# was requested and the identity is used instead.
if noise_kernel == 'matern':
    noise_kern = Matern(length_scale=noise_length_scale, nu=noise_nu)
elif noise_kernel == 'rbf':
    noise_kern = RBF(length_scale=noise_length_scale)
else:
    noise_kern = None

base_cov = np.eye(n) if noise_kern is None else gen_cov_mat(c_x, c_y, noise_kern)

# Blend the kernel covariance with the identity using weight delta.
Sigma_t = delta*base_cov + (1-delta)*np.eye(n)

# Without a kernel the factor is set to the identity directly; otherwise
# take the Cholesky factor of the blended matrix.
Chol_t = np.eye(n) if noise_kern is None else np.linalg.cholesky(Sigma_t)

In [8]:
# Design matrix: built by the project helpers for the 'rbf' / 'matern'
# settings, i.i.d. standard normal otherwise.
if X_kernel == 'rbf':
    X = gen_rbf_X(c_x, c_y, p)
elif X_kernel == 'matern':
    X = gen_matern_X(c_x, c_y, p, length_scale=X_length_scale, nu=X_nu)
else:
    X = np.random.randn(n,p)

# Coefficient vector with exactly s nonzero entries drawn from U(-1, 1).
# replace=False is required: sampling with replacement repeats positions, so
# fewer than s coefficients would be set (requires s <= p).
# The positions live in their own name so the config value `idx` (the row
# label used for the params frame) is not overwritten by an array.
beta = np.zeros(p)
support_idx = np.random.choice(p, size=s, replace=False)
beta[support_idx] = np.random.uniform(-1,1,size=s)

## First comparison: test error vs. Blurred Forest estimate using the oracle noise covariance

In [13]:
# Model under evaluation.
models = BlurredForest(
    bootstrap_type='blur',
    n_estimators=n_estimators,
    max_depth=max_depth,
)

# Estimators handed to the comparer, in the order their results come back.
ests = [better_test_est_split, cp_rf_train_test]

# Every estimator gets its own kwargs dict with the same settings: the
# Cholesky factor Chol_t built above is supplied as `chol_eps`.
est_kwargs = [dict(full_refit=True, chol_eps=Chol_t) for _est in ests]

In [14]:
# Run both estimators; results are returned in the order of `ests`, i.e.
# (better_test_est_split, cp_rf_train_test).
# `s` now forwards the configured value (the one recorded in params.csv)
# instead of `p`, so the call stays consistent if the two are set differently.
(test_err,
 bforc_err) = err_cmp.compare(models,
                     ests,
                     est_kwargs,
                     niter=niter,
                     n=n,
                     p=p,
                     s=s,
                     snr=snr, 
                     X=X,
                     beta=beta,
                     coord=coord,
                     Chol_t=Chol_t,
                     Chol_s=None,
                     tr_idx=None,
                     fair=False,
                     )

0
0.5833333333333334
10
20
30
40


## Second comparison: Blurred Forest estimate with the noise covariance estimated from the data

In [12]:
# c_x.shape, c_y.shape, coord.shape

In [9]:
# ## estimate residuals with vanilla fit
# # rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
# # rf.fit(X, y)
# # resids = y - rf.predict()

# vals = Chol_t @ np.random.randn(n)

In [10]:
# V = skg.Variogram(coord, vals, model='matern')
# # V.plot()

In [15]:
# V.parameters

In [11]:
# # K0 = vals.var()
# K0 = V.parameters[1]
# K0

0.7283070275905974

In [17]:
# (np.diag(Sigma_t) == 1).mean()

In [12]:
# from scipy.spatial import distance_matrix

In [14]:
# distance_matrix(coord, coord)

array([[ 0.        ,  0.34482759,  0.68965517, ..., 13.66318121,
        13.90044439, 14.14213562],
       [ 0.34482759,  0.        ,  0.34482759, ..., 13.43058075,
        13.66318121, 13.90044439],
       [ 0.68965517,  0.34482759,  0.        , ..., 13.20288946,
        13.43058075, 13.66318121],
       ...,
       [13.66318121, 13.43058075, 13.20288946, ...,  0.        ,
         0.34482759,  0.68965517],
       [13.90044439, 13.66318121, 13.43058075, ...,  0.34482759,
         0.        ,  0.34482759],
       [14.14213562, 13.90044439, 13.66318121, ...,  0.68965517,
         0.34482759,  0.        ]])

In [16]:
# V.distance

array([0.34482759, 0.68965517, 1.03448276, ..., 0.34482759, 0.68965517,
       0.34482759])

In [18]:
## from variogram to covariance matrix
## \gamma(h) = K(0) - K(h)
# fitted_vm = V.fitted_model
# # semivar = squareform(fitted_vm(V.distance))
# semivar = fitted_vm(distance_matrix(coord, coord).flatten()).reshape((n,n))

In [19]:
# np.all(semivar==squareform(fitted_vm(V.distance)))

True

In [19]:
# est_Sigma_t = K0*np.ones((n,n)) - semivar
# # cov_mat = 1 - semivar

In [20]:
# est_Sigma_t

In [21]:
# Sigma_t

In [22]:
# est_Chol_t = np.linalg.cholesky(est_Sigma_t)

In [23]:
# def est_Sigma(X_est, y_est, c_x_est, c_y_est):
#     ## compute empirical variogram
    
#     ## fit variogram model to empirics
    
#     ## make covariance matrix
    
#     ## return model, cov_mat
#     pass

In [11]:
# from importlib import reload
# import spe
# reload(spe)

# from spe.mse_estimator import ErrorComparer
# from spe.data_generation import gen_rbf_X, gen_matern_X, create_clus_split, gen_cov_mat
# from spe.forest import BlurredForest
# from spe.estimators import kfoldcv, kmeanscv, better_test_est_split, cp_rf_train_test, cp_general_train_test, bag_kfoldcv, bag_kmeanscv


In [12]:
# err_cmp = ErrorComparer()

In [11]:
# Same model configuration as the first run.
models = BlurredForest(
    bootstrap_type='blur',
    n_estimators=n_estimators,
    max_depth=max_depth,
)

# Only the Cp-style estimator is run here, and no Cholesky factor is
# supplied (`chol_eps` is None).
ests = [cp_rf_train_test]
est_kwargs = [dict(full_refit=True, chol_eps=None)]

In [12]:
# Same experiment, but with est_sigma=True and a plain random forest passed
# as est_sigma_model. The result is kept wrapped here and unwrapped in a
# later cell.
# `s` forwards the configured value (the one recorded in params.csv) instead
# of `p`, so the call stays consistent if the two are set differently.
bfest_err = err_cmp.compare(
    models,
    ests,
    est_kwargs,
    niter=niter,
    n=n,
    p=p,
    s=s,
    snr=snr, 
    X=X,
    beta=beta,
    coord=coord,
    Chol_t=Chol_t,
    Chol_s=None,
    tr_idx=None,
    fair=False,
    est_sigma=True,
    est_sigma_model=RandomForestRegressor(
        n_estimators=n_estimators, 
        max_depth=max_depth
    )
)

0
0.5833333333333334
new
new
new
new
new
new
new
new
new
new
10
new
new
new
new
new
new
new
new
new
new
20
new
new
new
new
new
new
new
new
new
new
30
new
new
new
new
new
new
new
new
new
new
40
new
new
new
new
new
new
new
new
new
new


In [37]:
# %debug  # post-mortem debugger left over from development; disabled so that
#         # "Restart & Run All" neither blocks on an ipdb prompt nor errors out.

> [0;32m/Users/kevinfry/opt/anaconda3/envs/spe/lib/python3.10/site-packages/skgstat/Variogram.py[0m(481)[0;36mset_values[0;34m()[0m
[0;32m    479 [0;31m        [0;31m# check dimensions[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    480 [0;31m        [0;32mif[0m [0;32mnot[0m [0mlen[0m[0;34m([0m[0mvalues[0m[0;34m)[0m [0;34m==[0m [0mlen[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mcoordinates[0m[0;34m)[0m[0;34m:[0m  [0;31m# pragma: no cover[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 481 [0;31m            raise ValueError('The length of the values array has to match' +
[0m[0;32m    482 [0;31m                             'the length of coordinates')
[0m[0;32m    483 [0;31m[0;34m[0m[0m
[0m
ipdb> u
> [0;32m/Users/kevinfry/opt/anaconda3/envs/spe/lib/python3.10/site-packages/skgstat/Variogram.py[0m(295)[0;36m__init__[0;34m()[0m
[0;32m    293 [0;31m        [0mself[0m[0;34m.[0m[0m_values[0m [0;34m=[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m


In [18]:
# compare() was called with a single estimator, so its result is wrapped one
# level deep; peel that level off. The dimensionality guard makes the cell
# idempotent: re-running it no longer indexes into the already-unwrapped
# errors. Assumes the unwrapped errors are 1-D, as required by the DataFrame
# columns built from them later.
if np.ndim(bfest_err) > 1:
    bfest_err = bfest_err[0]

In [19]:
# Average each error array down to a single risk value.
risk_test, risk_bforc, risk_bfest = (
    errs.mean() for errs in (test_err, bforc_err, bfest_err)
)


In [20]:
# Table of raw errors, one column per quantity. 'Test_Err' holds the
# test-error vector itself; previously the scalar mean `risk_test` was
# broadcast down the column, unlike the two estimator columns.
save_df = pd.DataFrame({
    'Test_Err': test_err.T,
    'BF_ORC': bforc_err.T,
    'BF_EST': bfest_err.T,
})
# save_df.to_csv(dffp)

In [21]:
# Estimator errors only: oracle-covariance run vs. estimated-covariance run.
df = pd.DataFrame(dict(BF_ORC=bforc_err.T, BF_EST=bfest_err.T))

In [22]:
# Mean of each estimator's errors relative to the test risk.
df.div(risk_test).mean()

BF_ORC    0.989657
BF_EST    1.060672
dtype: float64

In [23]:
# Column-wise sample variance (pandas default ddof=1) of the errors in df.
df.var()

BF_ORC    0.003355
BF_EST    0.019491
dtype: float64

In [25]:
# Bar chart of each estimator's mean error relative to the test risk.
# The error bars are computed from the same normalised frame as the bar
# heights; previously the quartile offsets came from the unnormalised `df`,
# so bars and whiskers were on different scales.
rel_err = df / risk_test
rel_mean = rel_err.mean()

fig = go.Figure()
fig.add_trace(go.Bar(
    x=df.columns, 
    y=rel_mean,
    marker_color=px.colors.qualitative.Plotly,
    text=np.around(rel_mean, 3),
    textposition='outside',
    error_y=dict(
        type='data',
        color='black',
        symmetric=False,
        # Asymmetric whiskers: distance from the mean to the upper / lower quartile.
        array=rel_err.quantile(.75) - rel_mean,
        arrayminus=rel_mean - rel_err.quantile(.25))
))
# fig.add_hline(y=1., line_color='red')
fig.update_layout(
    title="Blurred Forest: Oracle vs Estimated Covariance",
    xaxis_title="Method",
    # The plotted quantity is a ratio to the test risk, not a raw MSE.
    yaxis_title="Relative MSE (estimate / test risk)",
)
# barfp = os.path.expanduser(savedir + fig.layout.title['text'] + ".jpeg")
# fig.write_image(os.path.expanduser(barfp))
fig.show()