# 9. Simulation

**Objective:**

Test and compare the performance of several feature-selection methods for Gaussian process regression on sparse datasets.

**Models:**

1. Standard optimization

2. ARD kernel optimization

3. Lasso feature selection before optimization
    - $\lambda$ selected via cross-validation
4. Lasso feature selection before ARD kernel optimization
    - $\lambda$ selected via cross-validation
5. L1-penalized optimization
    - $\lambda$ selected via cross-validation
    - Includes added thresholding


**Metrics:**

Three metrics will be used to compare the performance of the 5 models. 

- Estimation error of coefficients $$\varepsilon_{\beta} = \|\beta - \hat{\beta}\|_2$$
- Prediction error $$RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}$$
- Computation time:
    - Total wall-clock runtime for model fitting/training and prediction, measured in seconds.

### Testing on one dataset

In [13]:
# import libraries

import time

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gpflow as gpf
import tensorflow as tf

from sklearn.linear_model import LinearRegression, LassoLarsCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Dataset identifier. The dataset folder and both files inside it share this
# prefix, so it is defined once and the paths are derived from it.
dataset_name = 'N100_AP10_noise0.1_seed0'
dataset_dir = f'Simulation Datasets/{dataset_name}'

# CSV holding the feature columns together with the response column `y`.
data_path = f'{dataset_dir}/{dataset_name}_data.csv'
# JSON side-car file for the same dataset (contents are not used in this section).
meta_path = f'{dataset_dir}/{dataset_name}_meta.json'

### Define a class


In [6]:
### Testing LassoLarsCV
# Quick check of cross-validated LARS lasso on the unscaled simulation data.

dat = pd.read_csv(data_path)

# Separate the response column `y` from the remaining feature columns.
y = dat.loc[:, 'y']
X = dat.drop(columns=['y'])

# Fit with 5-fold cross-validation.
las = LassoLarsCV(cv=5)
las.fit(X, y)

In [7]:
# Display the coefficients of the lasso model fitted above.
las.coef_

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.13742855e-02, -2.05141347e-02, -2.73948651e+00,
        0.00000000e+00, -4.66379933e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        2.32982589e-03,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  3.08660999e-02,
        0.00000000e+00,  0.00000000e+00])

In [None]:
class GPFeatureSelect:
    """Gaussian-process regression with optional feature selection.

    Every variant is a GPflow ``GPR`` model with a squared-exponential kernel,
    a Gaussian likelihood and a linear mean function whose weights ``A`` act
    as the coefficient estimate.  ``model_type`` selects the variant:

    * ``'std'``       - single shared lengthscale, all features.
    * ``'ard'``       - one lengthscale per feature (ARD), all features.
    * ``'lasso_std'`` - standardise, select features with ``LassoLarsCV``,
      then fit the ``'std'`` model on the selected columns.
    * ``'lasso_ard'`` - same selection step followed by the ``'ard'`` model.
    * ``'l1_gp'``     - ARD model trained on the GPR loss plus an L1 penalty
      on the mean-function weights, followed by hard thresholding.

    Parameters
    ----------
    model_type : str
        One of the five names above.
    cv : int
        Number of folds handed to ``LassoLarsCV``.
    l1_lambda : float
        Weight of the L1 penalty for ``'l1_gp'``.  It is a fixed value here;
        cross-validated selection of lambda is not implemented in this class.
    threshold : float
        Coefficients with absolute value at or below this are treated as
        zero (lasso selection mask and ``'l1_gp'`` thresholding).

    Attributes set by ``fit``
    -------------------------
    gp_model : the trained ``gpflow.models.GPR``.
    selected_features : column indices kept by the lasso (lasso variants only,
        otherwise ``None``).
    beta_hat : 1-D coefficient vector with one entry per input feature.
    runtime : seconds spent inside ``fit``.
    """

    _LASSO_TYPES = ('lasso_std', 'lasso_ard')
    _ARD_TYPES = ('ard', 'lasso_ard', 'l1_gp')
    _MODEL_TYPES = ('std', 'ard', 'lasso_std', 'lasso_ard', 'l1_gp')

    def __init__(self, model_type='std', cv=5, l1_lambda=0.2, threshold=1e-6):
        self.model_type = model_type
        self.cv = cv
        self.l1_lambda = l1_lambda
        self.threshold = threshold

        self.scaler = None
        self.scaledX = None
        self.selected_features = None
        self.lasso_model = None
        self.beta_hat = None
        self.runtime = None
        self.gp_model = None
        self.opt = gpf.optimizers.Scipy()

    @staticmethod
    def _as_matrix(X):
        """Return ``X`` (array or DataFrame) as a float64 NumPy array."""
        return np.asarray(X, dtype=np.float64)

    @staticmethod
    def _as_column(y):
        """Return ``y`` (array or Series) as a float64 column of shape (n, 1)."""
        return np.asarray(y, dtype=np.float64).reshape(-1, 1)

    @staticmethod
    def _l1_penalized_loss(model, lambda_val):
        """Return the model's training loss plus ``lambda_val * ||A||_1``."""
        penalty = lambda_val * tf.reduce_sum(tf.abs(model.mean_function.A))
        return model.training_loss() + penalty

    def scale_data(self, X):
        """Fit a fresh ``StandardScaler`` on ``X`` and return the scaled data.

        The scaler and the scaled matrix are kept on the instance so that
        ``predict`` can apply the same transformation to new data.
        """
        self.scaler = StandardScaler()
        self.scaledX = self.scaler.fit_transform(X)
        return self.scaledX

    def cv_lasso_lars(self, X, y):
        """Select features with cross-validated LARS lasso.

        Stores the fitted lasso in ``lasso_model``, its coefficients in
        ``beta_hat`` and the kept column indices in ``selected_features``.

        Returns
        -------
        numpy.ndarray
            ``X`` restricted to the selected columns.

        Raises
        ------
        ValueError
            If the instance is not one of the lasso model types.
        """
        if self.model_type not in self._LASSO_TYPES:
            raise ValueError("cv_lasso_lars called on non-lasso model")

        X = self._as_matrix(X)
        las = LassoLarsCV(cv=self.cv)
        las.fit(X, np.ravel(y))

        mask = np.abs(las.coef_) > self.threshold
        self.selected_features = np.where(mask)[0]
        self.beta_hat = las.coef_
        self.lasso_model = las
        return X[:, self.selected_features]

    def _build_gpr(self, X, y, ard):
        """Create an untrained GPR model on ``(X, y)`` and store it.

        ``ard=True`` gives the kernel one lengthscale per column of ``X``;
        otherwise a single lengthscale is shared by all columns.
        """
        X = self._as_matrix(X)
        n_features = X.shape[1]

        lengthscales = np.ones(n_features) if ard else 1.0
        kernel = gpf.kernels.SquaredExponential(lengthscales=lengthscales)
        # Linear mean starts at zero; its weights A become the coefficient
        # estimate once the model has been optimised.
        mean_function = gpf.mean_functions.Linear(
            A=np.zeros((n_features, 1)), b=np.zeros(1)
        )
        self.gp_model = gpf.models.GPR(
            data=(X, self._as_column(y)),
            kernel=kernel,
            likelihood=gpf.likelihoods.Gaussian(),
            mean_function=mean_function,
        )

    def init_gp_mod(self, X, y):
        """Initialise ``gp_model`` with a single shared kernel lengthscale."""
        self._build_gpr(X, y, ard=False)

    def init_ard_gp_mod(self, X, y):
        """Initialise ``gp_model`` with one kernel lengthscale per feature."""
        self._build_gpr(X, y, ard=True)

    def fit(self, X, y):
        """Build and optimise the model selected by ``model_type``.

        For the lasso variants ``X`` is standardised with a scaler fitted on
        this training data before selection; the other variants use ``X``
        as given.

        Returns
        -------
        GPFeatureSelect
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``model_type`` is not a known model name.
        """
        if self.model_type not in self._MODEL_TYPES:
            raise ValueError("Unknown model type")

        start = time.perf_counter()
        X = self._as_matrix(X)
        uses_lasso = self.model_type in self._LASSO_TYPES

        # Reset so a refit never reuses a stale selection in predict().
        self.selected_features = None
        X_train = self.cv_lasso_lars(self.scale_data(X), y) if uses_lasso else X

        if self.model_type in self._ARD_TYPES:
            self.init_ard_gp_mod(X_train, y)
        else:
            self.init_gp_mod(X_train, y)
        model = self.gp_model

        if self.model_type == 'l1_gp':
            def loss():
                return self._l1_penalized_loss(model, self.l1_lambda)
        else:
            loss = model.training_loss
        self.opt.minimize(loss, model.trainable_variables)

        coef = model.mean_function.A.numpy().ravel()
        if self.model_type == 'l1_gp':
            # Hard-threshold near-zero weights and write them back so that
            # predictions use exactly the reported sparse coefficients.
            coef = np.where(np.abs(coef) > self.threshold, coef, 0.0)
            model.mean_function.A.assign(coef.reshape(-1, 1))
        if uses_lasso:
            # The GP saw standardised, selected columns.  Dividing by the
            # scaler's per-feature scale expresses the weights per unit of
            # the original features; unselected features get 0.
            full = np.zeros(X.shape[1])
            idx = self.selected_features
            full[idx] = coef / self.scaler.scale_[idx]
            coef = full

        self.beta_hat = coef
        self.runtime = time.perf_counter() - start
        return self

    def predict(self, X):
        """Return the GP predictive mean for ``X`` as a 1-D array.

        For lasso variants the training scaler is applied (not refitted) and
        the selected columns are taken before the GP is evaluated.

        Raises
        ------
        RuntimeError
            If no model has been built yet.
        """
        model = getattr(self, 'gp_model', None)
        if model is None:
            raise RuntimeError("predict called before fit")

        X = self._as_matrix(X)
        if self.selected_features is not None:
            # Scale first: the scaler was fitted on the full feature set.
            X = self.scaler.transform(X)[:, self.selected_features]
        mean, _ = model.predict_f(X)
        return mean.numpy().ravel()

    def get_metrics(self, X, y_true, beta_true=None):
        """Evaluate the fitted model on ``(X, y_true)``.

        Returns
        -------
        dict
            ``'rmse'``: root-mean-squared prediction error.
            ``'beta_error'``: Euclidean norm of ``beta_true - beta_hat``, or
            ``None`` when ``beta_true`` is not supplied.
            ``'runtime'``: fit time plus the time of this prediction, in
            seconds (``None`` if ``fit`` was never timed).
        """
        start = time.perf_counter()
        y_pred = self.predict(X)
        predict_time = time.perf_counter() - start

        # Flatten so (n,) and (n, 1) inputs cannot broadcast to (n, n).
        y_true = np.asarray(y_true, dtype=np.float64).ravel()
        rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

        beta_error = None
        if beta_true is not None and self.beta_hat is not None:
            beta_true = np.asarray(beta_true, dtype=np.float64).ravel()
            beta_error = float(np.linalg.norm(beta_true - self.beta_hat))

        runtime = None if self.runtime is None else self.runtime + predict_time
        return {
            'rmse': rmse,
            'beta_error': beta_error,
            'runtime': runtime
        }
         