# 10. Running Simulation on Synthetic Datasets

In [20]:
from joblib import Parallel, delayed
from gp_feature_select import GPFeatureSelect
import numpy as np
import pandas as pd
import json
import os

from sklearn.model_selection import train_test_split

In [21]:
def run_one_synthetic(unique_name, model_type, seed, root_dir='Simulation Datasets'):
    """Fit one GPFeatureSelect model on one synthetic dataset and persist its metrics.

    Reads ``<root_dir>/<unique_name>/<unique_name>_data.csv`` (which must contain a
    ``y`` column; every other column is used as a feature) and the matching
    ``<unique_name>_meta.json`` (which must contain a ``beta`` entry), fits the model
    on an 80/20 train/test split, and writes the metrics next to the data as
    ``<unique_name>_<model_type>results.json``.

    Parameters
    ----------
    unique_name : str
        Name of the dataset folder; also the prefix of the files inside it.
    model_type : str
        Forwarded to ``GPFeatureSelect(model_type=...)``.
    seed : int
        ``random_state`` for ``train_test_split``, so the split is reproducible.
    root_dir : str, optional
        Folder that contains the dataset folders. Defaults to the location
        that was previously hard-coded.

    Returns
    -------
    The object returned by ``GPFeatureSelect.get_metrics`` (the same object that is
    written to disk), so the caller of ``Parallel(...)`` receives the metrics
    instead of a list of ``None``.
    NOTE(review): presumably a JSON-serialisable dict, since it is passed to
    ``json.dump`` -- confirm against ``gp_feature_select``.
    """
    dataset_dir = f'{root_dir}/{unique_name}'
    data_path = f'{dataset_dir}/{unique_name}_data.csv'
    meta_path = f'{dataset_dir}/{unique_name}_meta.json'

    data = pd.read_csv(data_path)
    X = data.drop(columns='y')
    y = data['y']
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=seed)

    # The meta file carries the true coefficients used to generate the data.
    with open(meta_path, 'r') as f:
        meta = json.load(f)
    beta_true = np.array(meta['beta'])

    mod = GPFeatureSelect(model_type=model_type)
    mod.fit(Xtrain, ytrain)
    results = mod.get_metrics(Xtrain, ytrain, Xtest, ytest, beta_true=beta_true)

    # Store the results in the dataset's own folder. The file name has no
    # separator between the model type and "results"; it is kept as-is so that
    # result files written by earlier runs are still found by their readers.
    results_path = f'{dataset_dir}/{unique_name}_{model_type}results.json'
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=4)

    return results


def extract_seed(unique_name):
    """Return the integer seed encoded in a dataset name.

    The seed is taken from the text after the last underscore, with every
    occurrence of the literal ``seed`` removed, e.g.
    ``'N100_AP50_noise1.0_seed44'`` -> ``44``.

    Raises
    ------
    ValueError
        If the remaining text is not a valid integer literal.
    """
    _, _, last_token = unique_name.rpartition('_')
    digits = last_token.replace('seed', '')
    return int(digits)



In [22]:
# Each sub-directory of root_dir is treated as one synthetic dataset; the
# directory name doubles as the dataset's unique name.
root_dir = 'Simulation Datasets'

# os.listdir returns entries in arbitrary order, so sort the names to make
# unique_names[0] and any index-based selection reproducible across runs.
unique_names = sorted(
    d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))
)

# Quick check that the seed can be parsed from a dataset name.
print(f'Extracting Seed from {unique_names[0]}: {extract_seed(unique_names[0])}')

# model_type values that are passed on to GPFeatureSelect(model_type=...).
model_types = ['std', 'ard', 'lasso_std', 'lasso_ard', 'l1_gp']


Extracting Seed from N100_AP50_noise1.0_seed44: 44


In [23]:
# Run every model type on every dataset, one (dataset, model_type) pair per
# joblib task, using all available cores.
# The previously saved code sliced unique_names[1:10] (9 datasets -> 45 tasks,
# and it skipped the first dataset), while the recorded output of this cell
# reports 675 tasks, i.e. the full set of datasets. Iterate over all of them so
# the code reproduces its own output. For a quick trial, slice unique_names
# here explicitly (e.g. unique_names[:2]); note the full run took ~6.4 hours.
sim_synthetic = Parallel(n_jobs=-1, verbose=10)(
    delayed(run_one_synthetic)(unique_name, model_type, extract_seed(unique_name))
    for unique_name in unique_names
    for model_type in model_types)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed: 15.8min


No features selected after thresholding


  precision = tp / (tp + fp)


No features selected after thresholding


  precision = tp / (tp + fp)
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed: 27.6min
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed: 35.5min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed: 45.2min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 51.4min
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 56.0min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed: 58.3min
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed: 76.4min
[Parallel(n_jobs=-1)]: Done 222 tasks      | elapsed: 96.1min
[Parallel(n_jobs=-1)]: Done 245 tasks      | elapsed: 101.4min
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed: 118.9min
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed: 133.6min
[Parallel(n_jobs=-1)]: Done 318 tasks      | elapsed: 149.8min
[Parallel(n_jobs=-1)]: Done 345 tasks      | elapsed: 169.0min
[Parallel(n_jobs=-1)]: Done 372 tasks      | elapsed: 180.9min


No features selected after thresholding


  precision = tp / (tp + fp)
[Parallel(n_jobs=-1)]: Done 401 tasks      | elapsed: 201.1min
[Parallel(n_jobs=-1)]: Done 430 tasks      | elapsed: 211.9min


No features selected after thresholding


  precision = tp / (tp + fp)
[Parallel(n_jobs=-1)]: Done 461 tasks      | elapsed: 220.9min
[Parallel(n_jobs=-1)]: Done 492 tasks      | elapsed: 232.1min
[Parallel(n_jobs=-1)]: Done 525 tasks      | elapsed: 260.6min
[Parallel(n_jobs=-1)]: Done 558 tasks      | elapsed: 294.0min
[Parallel(n_jobs=-1)]: Done 593 tasks      | elapsed: 315.1min
[Parallel(n_jobs=-1)]: Done 628 tasks      | elapsed: 347.7min
[Parallel(n_jobs=-1)]: Done 675 out of 675 | elapsed: 382.1min finished
