# 10. Running Simulation on Synthetic Datasets

In [55]:
from joblib import Parallel, delayed
from gp_feature_select import GPFeatureSelect
import numpy as np
import pandas as pd
import json
import os

from sklearn.model_selection import train_test_split

# Smoke test: fit GPFeatureSelect on a single synthetic replicate.

# Parse the replicate once, then split it into the feature matrix and the
# response column 'y'.
train_df = pd.read_csv('Simulation Datasets/N11000_AP10_noise0.1_seed0/Size500/Rep1.csv')
Xtrain = train_df.drop(columns='y')
ytrain = train_df['y']

mod = GPFeatureSelect(model_type='lasso_ard')
mod.fit(Xtrain, ytrain)


### Actual Simulation

In [56]:
def run_one_synthetic(unique_name, size, num_rep, model_type, seed):
    """Fit one model on one synthetic replicate and save its metrics as JSON.

    All paths are relative to the current working directory, under
    ``Simulation Datasets/{unique_name}/``. If the results file for this
    (size, replicate, model) combination already exists, the function returns
    immediately, before loading any data or fitting, so that re-running a
    sweep does not repeat work that is already saved.

    Parameters
    ----------
    unique_name : str
        Name of the dataset folder; also the prefix of its ``_meta.json`` and
        ``_test.csv`` files.
    size : int
        Training-set size; selects the ``Size{size}`` sub-folder.
    num_rep : int
        Replicate number; selects ``Rep{num_rep}.csv``.
    model_type : str
        Forwarded to ``GPFeatureSelect(model_type=...)`` and embedded in the
        results filename.
    seed : int
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    None
        Results are written to
        ``Size{size}/Rep{num_rep}_{model_type}results.json``.
    """
    base_dir = f'Simulation Datasets/{unique_name}'
    results_path = f'{base_dir}/Size{size}/Rep{num_rep}_{model_type}results.json'

    # Check for an existing results file *before* doing any work: fitting can
    # be very slow, and its output would be discarded anyway.
    if os.path.exists(results_path):
        print(f"Results file {results_path} already exists, skipping save.")
        return

    # Read each CSV once and split off the response column 'y'.
    train_df = pd.read_csv(f'{base_dir}/Size{size}/Rep{num_rep}.csv')
    X = train_df.drop(columns='y')
    y = train_df['y']

    test_df = pd.read_csv(f'{base_dir}/{unique_name}_test.csv')
    Xtest = test_df.drop(columns='y')
    ytest = test_df['y']

    # The metadata file stores the true coefficients under the 'beta' key.
    with open(f'{base_dir}/{unique_name}_meta.json', 'r') as f:
        meta = json.load(f)
    beta_true = np.array(meta['beta'])

    mod = GPFeatureSelect(model_type=model_type)
    mod.fit(X, y)
    results = mod.get_metrics(X, y, Xtest, ytest, beta_true=beta_true)

    # json cannot encode NumPy arrays or NumPy scalar types such as np.int64,
    # so convert them to plain Python objects. Only existing keys are
    # reassigned, which is safe while iterating over items().
    for key, value in results.items():
        if isinstance(value, np.ndarray):
            results[key] = value.tolist()
        elif isinstance(value, np.generic):
            results[key] = value.item()

    with open(results_path, 'w') as f:
        json.dump(results, f, indent=4)

def extract_seed(unique_name):
    """Return the integer seed encoded in the last '_'-separated token.

    For example, ``'N11000_AP20_noise1.0_seed5'`` yields ``5``. The literal
    text ``'seed'`` is stripped from the final token before conversion, so
    ``int`` raises ``ValueError`` if what remains is not an integer.
    """
    # rpartition leaves everything after the final underscore in the tail
    # (or the whole string when there is no underscore at all).
    _, _, tail = unique_name.rpartition('_')
    digits = tail.replace('seed', '')
    return int(digits)



In [61]:
# Experiment grid: every model type is run at every training size, for
# replicates 1..num_reps.
model_types = ['std', 'ard', 'lasso_std', 'lasso_ard', 'l1_gp']
sizes = [500, 1000]  # can also use 50 or 2000
num_reps = 5

# Every sub-directory of the root folder is treated as one synthetic dataset.
root_dir = 'Simulation Datasets'
unique_names = [
    entry
    for entry in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, entry))
]

# Sanity check: the seed should be recoverable from a dataset folder name.
print(f'Extracting Seed from {unique_names[0]}: {extract_seed(unique_names[0])}')

Extracting Seed from N11000_AP20_noise1.0_seed5: 5


In [60]:
# One job per (dataset, training size, replicate, model type) combination.
# The generator is consumed lazily by joblib as workers become free.
jobs = (
    delayed(run_one_synthetic)(name, n_train, rep, kind, extract_seed(name))
    for name in unique_names
    for n_train in sizes
    for rep in range(1, num_reps + 1)
    for kind in model_types
)

# n_jobs=-1 uses all available cores; verbose=10 prints progress lines.
run_all = Parallel(n_jobs=-1, verbose=10)
sim_synthetic = run_all(jobs)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed: 42.9min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed: 46.7min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed: 59.1min
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed: 89.5min
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed: 173.0min
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed: 251.3min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed: 429.3min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 446.8min
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 463.5min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed: 472.2min
[Parallel(n_jobs=-1)]: Done 201 tasks      | elap