# 9. Dataset Generation

This simulation evaluates the models on 27 unique datasets, one for each combination of sparsity (active proportion), sample size, and noise level (3 × 3 × 3). 5-fold cross-validation will be used to stabilize results and assess generalization.

Fixed characteristics:
- Dimensionality: 30 features
- Lengthscale Values: Random and positive, drawn independently for each GP input feature (log-normal)

Variable characteristics:
- Active Proportion: [0.10, 0.20, 0.30]
- Sample Size: [100, 500, 1000]
- i.i.d. Gaussian noise (standard deviation): [0.1, 0.5, 1.0]



In [3]:
import pandas as pd
import numpy as np
import itertools
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

In [1]:
def sim_sparse_lin_mean(n=100, active_prop=0.10, noise=0.1, seed=22):
    """Simulate a sparse linear signal plus correlated GP noise and i.i.d. noise.

    The response is ``y = X @ beta + g + eta`` where ``X`` is an ``n x 30``
    standard-normal design, ``beta`` is zero except on a random subset of
    columns, ``g`` is a single draw from a zero-mean Gaussian whose covariance
    is a squared-exponential kernel (one lengthscale per input) over up to
    three of the active columns, and ``eta`` is independent Gaussian noise.

    Parameters
    ----------
    n : int
        Number of rows (observations) to generate.
    active_prop : float
        Fraction of the 30 columns that receive a coefficient; the count is
        ``floor(30 * active_prop)``.
    noise : float
        Standard deviation of the i.i.d. Gaussian noise term.
    seed : int
        Value passed to ``np.random.seed``. This reseeds NumPy's global
        random state, so it also affects later ``np.random`` calls.

    Returns
    -------
    X : ndarray, shape (n, 30)
        The simulated design matrix.
    y : ndarray, shape (n,)
        Linear mean + GP draw + i.i.d. noise.
    beta : ndarray, shape (30,)
        Coefficient vector; entries outside ``active_indices`` are zero.
    lengthscales : ndarray
        Log-normal lengthscales, one per entry of ``gp_features``.
    active_indices : ndarray
        Column indices that were assigned a Uniform(-5, 5) coefficient.
    gp_features : ndarray
        Subset of ``active_indices`` (at most 3) used as kernel inputs.
    """
    num_features = 30
    kernel_scale = 0.5  # marginal standard deviation of the GP component

    # NOTE: the order of the np.random calls below determines the output for
    # a given seed; do not reorder them.
    np.random.seed(seed)
    design = np.random.standard_normal((n, num_features))

    # Random draws defining the sparse coefficients and the GP inputs.
    n_active = int(np.floor(num_features * active_prop))
    active_indices = np.random.choice(num_features, size=n_active, replace=False)
    coef_values = np.random.uniform(low=-5, high=5, size=n_active)
    gp_features = np.random.choice(
        active_indices, size=min(3, n_active), replace=False
    )
    lengthscales = np.random.lognormal(mean=0, sigma=0.5, size=len(gp_features))

    # Sparse linear mean.
    beta = np.zeros(num_features)
    beta[active_indices] = coef_values
    linear_part = design @ beta

    # Squared-exponential covariance on the lengthscale-scaled GP inputs,
    # using ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b for all row pairs.
    scaled = design[:, gp_features] / lengthscales
    row_sq = np.sum(scaled**2, axis=1).reshape(-1, 1)
    gram = scaled @ scaled.T
    sq_dists = row_sq + row_sq.T - 2 * gram
    cov = kernel_scale**2 * np.exp(-0.5 * sq_dists)

    # Correlated GP draw, then independent observation noise.
    gp_part = np.random.multivariate_normal(mean=np.zeros(n), cov=cov)
    iid_part = np.random.normal(0, noise, size=n)

    y = linear_part + gp_part + iid_part

    return design, y, beta, lengthscales, active_indices, gp_features

In [None]:
# Simulation design: the levels of each factor varied across datasets.
param_grid = {
    'n': [100, 500, 1000],             # sample sizes
    'active_prop': [0.10, 0.20, 0.30],  # fraction of active features
    'noise': [0.1, 0.5, 1.0],          # i.i.d. noise levels
}

# Full factorial design (3 x 3 x 3 = 27 settings) as (n, active_prop, noise)
# tuples; n varies slowest and noise fastest.
_grid_axes = ('n', 'active_prop', 'noise')
param_combinations = list(
    itertools.product(*(param_grid[axis] for axis in _grid_axes))
)

In [None]:
# Folder name for the simulated datasets (presumably where they are written;
# the code that uses it is outside this view -- confirm against later cells).
output_dir = "Simulation Datasets"