# 10. Running Simulation on Synthetic Datasets

In [25]:
from joblib import Parallel, delayed
from gp_feature_select import GPFeatureSelect
import numpy as np
import pandas as pd
import json
import os

from sklearn.model_selection import train_test_split

In [31]:
def _json_default(obj):
    """Fallback encoder for ``json.dumps``: turn NumPy arrays/scalars into builtins."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # NumPy scalar types (np.int64, np.float32, np.bool_, ...) -> Python scalar.
        return obj.item()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


def run_one_synthetic(unique_name, model_type, seed, root_dir='Simulation Datasets'):
    """Fit one GPFeatureSelect variant on one synthetic dataset and save its metrics.

    Parameters
    ----------
    unique_name : str
        Name of the dataset folder inside ``root_dir``; also the prefix of the
        ``<unique_name>_data.csv`` and ``<unique_name>_meta.json`` files in it.
    model_type : str
        Forwarded to ``GPFeatureSelect(model_type=...)``.
    seed : int
        ``random_state`` for the 80/20 train/test split.
    root_dir : str, optional
        Folder that contains the dataset folders. Defaults to
        ``'Simulation Datasets'``, the location previously hard-coded here.

    Returns
    -------
    None
        The metrics are written to
        ``<root_dir>/<unique_name>/<unique_name>_<model_type>results.json``.
        NOTE(review): there is no separator between the model type and
        ``results`` in that filename; it is kept as-is so anything that already
        reads these files keeps working.
    """
    dataset_dir = os.path.join(root_dir, unique_name)
    data_path = os.path.join(dataset_dir, f'{unique_name}_data.csv')
    meta_path = os.path.join(dataset_dir, f'{unique_name}_meta.json')

    # The CSV holds the response in column 'y'; every other column is a feature.
    data = pd.read_csv(data_path)
    X = data.drop(columns='y')
    y = data['y']
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=seed)

    # The metadata file carries the true coefficients under the 'beta' key.
    with open(meta_path, 'r') as f:
        meta = json.load(f)
    beta_true = np.array(meta['beta'])

    mod = GPFeatureSelect(model_type=model_type)
    mod.fit(Xtrain, ytrain)
    results = mod.get_metrics(Xtrain, ytrain, Xtest, ytest, beta_true=beta_true)

    # Serialize before opening the output file: if some value cannot be
    # encoded, the error surfaces here and no truncated JSON file is left behind.
    # The default hook also covers NumPy values nested inside lists/dicts.
    payload = json.dumps(results, indent=4, default=_json_default)

    # Store the results next to the dataset they were computed from.
    results_path = os.path.join(dataset_dir, f'{unique_name}_{model_type}results.json')
    with open(results_path, 'w') as f:
        f.write(payload)

def extract_seed(unique_name):
    """Return the integer seed encoded in the last ``_``-separated token of a dataset name.

    Example: ``'N100_AP50_noise1.0_seed44'`` -> ``44``. Every occurrence of the
    text ``'seed'`` is stripped from that final token before it is parsed, so
    ``int`` raises ``ValueError`` when what remains is not an integer literal.
    """
    *_, last_token = unique_name.split('_')
    digits = last_token.replace('seed', '')
    return int(digits)



In [32]:
# Folder that holds one sub-directory per synthetic dataset.
root_dir = 'Simulation Datasets'

# os.listdir returns entries in arbitrary order; sort them so that positional
# indexing/slicing of `unique_names` selects the same datasets on every run
# and on every machine.
unique_names = sorted(
    d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))
)

# Sanity check that the seed can be parsed from a dataset folder name.
print(f'Extracting Seed from {unique_names[0]}: {extract_seed(unique_names[0])}')

# Values passed as `model_type` when fitting each dataset.
model_types = ['std', 'ard', 'lasso_std', 'lasso_ard', 'l1_gp']


Extracting Seed from N100_AP50_noise1.0_seed44: 44


In [33]:
# Build one task per (dataset, model type) pair, seeding each task's split with
# the seed parsed from the dataset's folder name.
# NOTE(review): the [1:2] slice limits the run to at most one dataset —
# confirm whether that restriction is intentional.
synthetic_jobs = [
    delayed(run_one_synthetic)(dataset_name, kind, extract_seed(dataset_name))
    for dataset_name in unique_names[1:2]
    for kind in model_types
]

# n_jobs=-1 uses all available workers; verbose=10 prints progress messages.
sim_synthetic = Parallel(n_jobs=-1, verbose=10)(synthetic_jobs)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.1s finished
