# 10. Running Simulation on Synthetic Datasets

In [1]:
from joblib import Parallel, delayed
from gp_feature_select import GPFeatureSelect
import numpy as np
import pandas as pd
import json
import os

from sklearn.model_selection import train_test_split

# Smoke test: fit GPFeatureSelect on one small synthetic replicate.

# Parse the replicate once and split it into features and target, rather than
# reading the same CSV from disk twice.
train_df = pd.read_csv('Simulation Datasets/N11000_AP10_noise0.1_seed0/Size500/Rep1.csv')
Xtrain = train_df.drop(columns='y')
ytrain = train_df['y']

mod = GPFeatureSelect(model_type = 'lasso_ard')
mod.fit(Xtrain, ytrain)


### Actual Simulation

In [12]:
def _json_default(obj):
    """Convert NumPy arrays and NumPy scalars to plain Python values for ``json``."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


def run_one_synthetic(unique_name, size, num_rep, model_type, seed):
    """Fit one model on one synthetic replicate and save its metrics as JSON.

    All paths are built relative to the current working directory under
    ``Simulation Datasets/{unique_name}/``. If the results file for this
    (size, replicate, model) combination already exists, nothing is computed.

    Parameters
    ----------
    unique_name : str
        Name of the dataset directory inside ``Simulation Datasets``.
    size : int
        Training-set size; selects the ``Size{size}`` sub-directory.
    num_rep : int
        Replicate number; selects ``Rep{num_rep}.csv``.
    model_type : str
        Passed to ``GPFeatureSelect(model_type=...)`` and embedded in the
        results file name.
    seed : int
        Only reported in the progress message; it is not passed to the model.

    Returns
    -------
    None
        The metrics are written to disk rather than returned.
    """
    dataset_dir = f'Simulation Datasets/{unique_name}'
    meta_path = f'{dataset_dir}/{unique_name}_meta.json'
    test_path = f'{dataset_dir}/{unique_name}_test.csv'
    data_path = f'{dataset_dir}/Size{size}/Rep{num_rep}.csv'
    results_path = f'{dataset_dir}/Size{size}/Rep{num_rep}_{model_type}results.json'

    if os.path.exists(results_path):
        print(f"Results file {results_path} already exists, skipping computation.")
        return

    print(f"Processing {unique_name}, Size: {size}, Rep: {num_rep}, Model: {model_type}, Seed: {seed}")

    # Parse each CSV once and split it into features / target.
    train_df = pd.read_csv(data_path)
    X = train_df.drop(columns='y')
    y = train_df['y']

    test_df = pd.read_csv(test_path)
    Xtest = test_df.drop(columns='y')
    ytest = test_df['y']

    # Metadata carries the true coefficient vector under the 'beta' key.
    with open(meta_path, 'r') as f:
        meta = json.load(f)
    beta_true = np.array(meta['beta'])

    mod = GPFeatureSelect(model_type=model_type)
    print('Fitting model')
    mod.fit(X, y)
    print('Getting metrics')
    results = mod.get_metrics(X, y, Xtest, ytest, beta_true=beta_true)

    print('Saving results')
    # Serialize before touching the disk: if a value cannot be encoded, no
    # results file is created, so the "already exists" guard above cannot
    # mistake a failed run for a finished one.
    payload = json.dumps(results, indent=4, default=_json_default)

    # Write to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated results file behind.
    tmp_path = f'{results_path}.tmp'
    with open(tmp_path, 'w') as f:
        f.write(payload)
    os.replace(tmp_path, results_path)

def extract_seed(unique_name):
    """Return the integer seed encoded in the last ``_``-separated token.

    The text ``'seed'`` is stripped from that token and the remainder is
    parsed with ``int``; e.g. ``'N11000_AP20_noise1.0_seed5'`` gives ``5``.
    """
    # rpartition yields the whole string as the tail when there is no '_'.
    _, _, last_token = unique_name.rpartition('_')
    digits = last_token.replace('seed', '')
    return int(digits)



In [13]:
root_dir = 'Simulation Datasets'

# Every sub-directory of the root is treated as one synthetic dataset.
unique_names = list(
    filter(lambda name: os.path.isdir(os.path.join(root_dir, name)), os.listdir(root_dir))
)

# Sanity check of the seed parser on the first dataset found.
print(f'Extracting Seed from {unique_names[0]}: {extract_seed(unique_names[0])}')

# Models to run in this pass.
model_types = ['l1_gp'] #['std', 'ard', 'lasso_std', 'lasso_ard', 'l1_gp']

# Training-set sizes and number of replicates per size.
sizes = [1000] # can also do 50 or 2000
num_reps = 1

Extracting Seed from N11000_AP20_noise1.0_seed5: 5


In [14]:
# One task per (dataset, size, replicate, model) combination. The generator is
# lazy, so tasks are only built as joblib pulls them.
synthetic_tasks = (
    delayed(run_one_synthetic)(unique_name, size, num_rep, model_type, extract_seed(unique_name))
    for unique_name in unique_names
    for size in sizes
    for num_rep in range(1, num_reps + 1)
    for model_type in model_types
)

# n_jobs=-1 lets joblib use every available core.
sim_synthetic = Parallel(n_jobs=-1, verbose=10)(synthetic_tasks)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


Results file Simulation Datasets/N11000_AP10_noise0.5_seed1/Size1000/Rep1_l1_gpresults.json already exists, skipping computation.
Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 1, Model: l1_gp, Seed: 7
Results file Simulation Datasets/N11000_AP10_noise0.1_seed0/Size1000/Rep1_l1_gpresults.json already exists, skipping computation.
Results file Simulation Datasets/N11000_AP20_noise0.1_seed3/Size1000/Rep1_l1_gpresults.json already exists, skipping computation.


[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:  1.2min remaining:  4.2min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:  1.2min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  1.2min remaining:  1.5min


Fitting model
Results file Simulation Datasets/N11000_AP20_noise0.5_seed4/Size1000/Rep1_l1_gpresults.json already exists, skipping computation.
Results file Simulation Datasets/N11000_AP10_noise1.0_seed2/Size1000/Rep1_l1_gpresults.json already exists, skipping computation.
Results file Simulation Datasets/N11000_AP20_noise1.0_seed5/Size1000/Rep1_l1_gpresults.json already exists, skipping computation.


[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:  1.2min remaining:   57.2s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  1.2min remaining:   35.8s
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:  1.2min remaining:   20.5s


Results file Simulation Datasets/N11000_AP50_noise1.0_seed8/Size1000/Rep1_l1_gpresults.json already exists, skipping computation.
Processing N11000_AP50_noise0.1_seed6, Size: 1000, Rep: 1, Model: l1_gp, Seed: 6
Fitting model
Getting metrics
Saving results
Getting metrics
Saving results


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 203.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 203.9min finished
