# 10. Running Simulation on Synthetic Datasets

In [1]:
from joblib import Parallel, delayed
from gp_feature_select import GPFeatureSelect
import numpy as np
import pandas as pd
import json
import os

from sklearn.model_selection import train_test_split

# Smoke test: fit GPFeatureSelect on a single small replicate before launching
# the full simulation.

# All paths are relative to the notebook's working directory so the cell runs
# on any checkout, not just one user's machine.
dataset_name = 'N11000_AP10_noise0.1_seed0'
dataset_dir = f'Simulation Datasets/{dataset_name}'

# Parse the replicate once, then split it into features and response.
rep_df = pd.read_csv(f'{dataset_dir}/Size50/Rep1.csv')
X = rep_df.drop(columns='y')
y = rep_df['y']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

mod = GPFeatureSelect(model_type = 'std')
mod.fit(Xtrain, ytrain)

# The metadata JSON stores the coefficients under the 'beta' key; they are
# handed to get_metrics as `beta_true`.
meta_path = f'{dataset_dir}/{dataset_name}_meta.json'
with open(meta_path, 'r') as f:
    meta = json.load(f)
beta_true = np.array(meta['beta'])

# Left as the last expression so the notebook displays the returned metrics.
mod.get_metrics(Xtrain, ytrain, Xtest, ytest, beta_true)

### Actual Simulation

In [8]:
def run_one_synthetic(unique_name, size, num_rep, model_type, seed):
    """Fit one GPFeatureSelect model on one synthetic replicate and save its metrics.

    Parameters
    ----------
    unique_name : str
        Name of the dataset directory under ``Simulation Datasets``; also the
        prefix of that dataset's ``_meta.json`` and ``_test.csv`` files.
    size : int
        Selects the ``Size{size}`` sub-directory of the dataset.
    num_rep : int
        Selects the replicate file ``Rep{num_rep}.csv``.
    model_type : str
        Forwarded to ``GPFeatureSelect(model_type=...)`` and embedded in the
        name of the results file.
    seed : int
        Only included in the progress message; it is NOT passed to the model.

    Returns
    -------
    None
        The metrics are written to
        ``Simulation Datasets/{unique_name}/Size{size}/Rep{num_rep}_{model_type}results.json``,
        overwriting any existing file.
    """
    dataset_dir = f'Simulation Datasets/{unique_name}'
    meta_path = f'{dataset_dir}/{unique_name}_meta.json'
    test_path = f'{dataset_dir}/{unique_name}_test.csv'
    data_path = f'{dataset_dir}/Size{size}/Rep{num_rep}.csv'
    results_path = f'{dataset_dir}/Size{size}/Rep{num_rep}_{model_type}results.json'

    # Resuming is deliberately disabled: every call recomputes and overwrites.
    # Uncomment to skip replicates that already have a results file.
    #if os.path.exists(results_path):
    #    return

    print(f"Processing {unique_name}, Size: {size}, Rep: {num_rep}, Model: {model_type}, Seed: {seed}")

    # Parse each CSV once, then split off the response column 'y'.
    train_df = pd.read_csv(data_path)
    X = train_df.drop(columns='y')
    y = train_df['y']

    test_df = pd.read_csv(test_path)
    Xtest = test_df.drop(columns='y')
    ytest = test_df['y']

    # The metadata JSON stores the coefficients under the 'beta' key.
    with open(meta_path, 'r') as f:
        meta = json.load(f)
    beta_true = np.array(meta['beta'])

    mod = GPFeatureSelect(model_type=model_type)
    mod.fit(X, y)
    results = mod.get_metrics(X, y, Xtest, ytest, beta_true=beta_true)

    # json cannot encode NumPy arrays or most NumPy scalars (np.int64,
    # np.float32, np.bool_, ...); convert both to built-in Python types so the
    # dump cannot fail after the expensive fit has already finished.
    serializable = {}
    for key, value in results.items():
        if isinstance(value, np.ndarray):
            value = value.tolist()
        elif isinstance(value, np.generic):
            value = value.item()
        serializable[key] = value

    with open(results_path, 'w') as f:
        json.dump(serializable, f, indent=4)

def extract_seed(unique_name):
    """Return the integer seed encoded in the last '_'-separated token of a dataset name.

    Example: 'N11000_AP20_noise1.0_seed5' -> 5. Every occurrence of the text
    'seed' is stripped from that last token before it is parsed, so a token
    that is not 'seed<digits>' makes ``int`` raise ``ValueError``.
    """
    last_token = unique_name.rsplit('_', 1)[-1]
    digits = last_token.replace('seed', '')
    return int(digits)



In [9]:
root_dir = 'Simulation Datasets'

# Every sub-directory of the dataset root is treated as one simulation dataset.
unique_names = []
for entry in os.listdir(root_dir):
    if os.path.isdir(os.path.join(root_dir, entry)):
        unique_names.append(entry)

# Sanity check of the seed parser on the first dataset name found.
first_name = unique_names[0]
print(f'Extracting Seed from {first_name}: {extract_seed(first_name)}')

# Models to fit; full list: ['l1_gp', 'std', 'ard', 'lasso_std', 'lasso_ard', 'l1_gp']
model_types = ['std', 'ard']

# Dataset sizes to run; full list: [50, 100, 500, 1000]
sizes = [1000]

# Replicates per (dataset, size); the files are numbered Rep1 ... Rep{num_reps}.
num_reps = 5

Extracting Seed from N11000_AP20_noise1.0_seed5: 5


In [7]:
# One task per (dataset, size, replicate, model) combination; the seed is
# parsed from the dataset name. The generator is consumed lazily by joblib.
tasks = (
    delayed(run_one_synthetic)(name, n_obs, rep, kind, extract_seed(name))
    for name in unique_names
    for n_obs in sizes
    for rep in range(1, num_reps + 1)
    for kind in model_types
)

# n_jobs=-1 uses all available cores; run_one_synthetic returns None, so
# sim_synthetic is a list of None and the real output is the JSON files.
sim_synthetic = Parallel(n_jobs=-1, verbose=10)(tasks)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


Processing N11000_AP20_noise1.0_seed5, Size: 1000, Rep: 4, Model: ard, Seed: 5
Processing N11000_AP20_noise1.0_seed5, Size: 1000, Rep: 1, Model: ard, Seed: 5
Processing N11000_AP20_noise1.0_seed5, Size: 1000, Rep: 3, Model: std, Seed: 5
Processing N11000_AP20_noise1.0_seed5, Size: 1000, Rep: 5, Model: ard, Seed: 5
Processing N11000_AP20_noise1.0_seed5, Size: 1000, Rep: 2, Model: ard, Seed: 5
Processing N11000_AP20_noise1.0_seed5, Size: 1000, Rep: 5, Model: std, Seed: 5
Processing N11000_AP20_noise1.0_seed5, Size: 1000, Rep: 3, Model: ard, Seed: 5
Processing N11000_AP20_noise1.0_seed5, Size: 1000, Rep: 4, Model: std, Seed: 5
Processing N11000_AP20_noise1.0_seed5, Size: 1000, Rep: 2, Model: std, Seed: 5
Processing N11000_AP20_noise1.0_seed5, Size: 1000, Rep: 1, Model: std, Seed: 5
Processing N11000_AP10_noise0.5_seed1, Size: 1000, Rep: 1, Model: std, Seed: 1
Processing N11000_AP10_noise0.5_seed1, Size: 1000, Rep: 1, Model: ard, Seed: 1
Processing N11000_AP10_noise0.5_seed1, Size: 1000, R

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   31.2s


Processing N11000_AP10_noise0.5_seed1, Size: 1000, Rep: 3, Model: std, Seed: 1
Processing N11000_AP10_noise0.5_seed1, Size: 1000, Rep: 3, Model: ard, Seed: 1
Processing N11000_AP10_noise0.5_seed1, Size: 1000, Rep: 4, Model: std, Seed: 1
Processing N11000_AP10_noise0.5_seed1, Size: 1000, Rep: 4, Model: ard, Seed: 1
Processing N11000_AP10_noise0.5_seed1, Size: 1000, Rep: 5, Model: std, Seed: 1
Processing N11000_AP10_noise0.5_seed1, Size: 1000, Rep: 5, Model: ard, Seed: 1
Processing N11000_AP20_noise0.1_seed3, Size: 1000, Rep: 1, Model: std, Seed: 3


[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   57.1s


Processing N11000_AP20_noise0.1_seed3, Size: 1000, Rep: 1, Model: ard, Seed: 3
Processing N11000_AP20_noise0.1_seed3, Size: 1000, Rep: 2, Model: std, Seed: 3
Processing N11000_AP20_noise0.1_seed3, Size: 1000, Rep: 2, Model: ard, Seed: 3
Processing N11000_AP20_noise0.1_seed3, Size: 1000, Rep: 3, Model: std, Seed: 3
Processing N11000_AP20_noise0.1_seed3, Size: 1000, Rep: 3, Model: ard, Seed: 3
Processing N11000_AP20_noise0.1_seed3, Size: 1000, Rep: 4, Model: std, Seed: 3
Processing N11000_AP20_noise0.1_seed3, Size: 1000, Rep: 4, Model: ard, Seed: 3
Processing N11000_AP20_noise0.1_seed3, Size: 1000, Rep: 5, Model: std, Seed: 3
Processing N11000_AP20_noise0.1_seed3, Size: 1000, Rep: 5, Model: ard, Seed: 3


[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  1.6min


Processing N11000_AP10_noise0.1_seed0, Size: 1000, Rep: 1, Model: std, Seed: 0
Processing N11000_AP10_noise0.1_seed0, Size: 1000, Rep: 1, Model: ard, Seed: 0
Processing N11000_AP10_noise0.1_seed0, Size: 1000, Rep: 2, Model: std, Seed: 0
Processing N11000_AP10_noise0.1_seed0, Size: 1000, Rep: 2, Model: ard, Seed: 0
Processing N11000_AP10_noise0.1_seed0, Size: 1000, Rep: 3, Model: std, Seed: 0
Processing N11000_AP10_noise0.1_seed0, Size: 1000, Rep: 3, Model: ard, Seed: 0
Processing N11000_AP10_noise0.1_seed0, Size: 1000, Rep: 4, Model: std, Seed: 0
Processing N11000_AP10_noise0.1_seed0, Size: 1000, Rep: 4, Model: ard, Seed: 0
Processing N11000_AP10_noise0.1_seed0, Size: 1000, Rep: 5, Model: std, Seed: 0


[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  2.2min


Processing N11000_AP10_noise0.1_seed0, Size: 1000, Rep: 5, Model: ard, Seed: 0
Processing N11000_AP20_noise0.5_seed4, Size: 1000, Rep: 1, Model: std, Seed: 4
Processing N11000_AP20_noise0.5_seed4, Size: 1000, Rep: 1, Model: ard, Seed: 4
Processing N11000_AP20_noise0.5_seed4, Size: 1000, Rep: 2, Model: std, Seed: 4
Processing N11000_AP20_noise0.5_seed4, Size: 1000, Rep: 2, Model: ard, Seed: 4
Processing N11000_AP20_noise0.5_seed4, Size: 1000, Rep: 3, Model: std, Seed: 4
Processing N11000_AP20_noise0.5_seed4, Size: 1000, Rep: 3, Model: ard, Seed: 4
Processing N11000_AP20_noise0.5_seed4, Size: 1000, Rep: 4, Model: std, Seed: 4
Processing N11000_AP20_noise0.5_seed4, Size: 1000, Rep: 4, Model: ard, Seed: 4
Processing N11000_AP20_noise0.5_seed4, Size: 1000, Rep: 5, Model: std, Seed: 4
Processing N11000_AP20_noise0.5_seed4, Size: 1000, Rep: 5, Model: ard, Seed: 4


[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  2.8min


Processing N11000_AP50_noise1.0_seed8, Size: 1000, Rep: 1, Model: std, Seed: 8
Processing N11000_AP50_noise1.0_seed8, Size: 1000, Rep: 1, Model: ard, Seed: 8
Processing N11000_AP50_noise1.0_seed8, Size: 1000, Rep: 2, Model: std, Seed: 8
Processing N11000_AP50_noise1.0_seed8, Size: 1000, Rep: 2, Model: ard, Seed: 8
Processing N11000_AP50_noise1.0_seed8, Size: 1000, Rep: 3, Model: std, Seed: 8
Processing N11000_AP50_noise1.0_seed8, Size: 1000, Rep: 3, Model: ard, Seed: 8
Processing N11000_AP50_noise1.0_seed8, Size: 1000, Rep: 4, Model: std, Seed: 8
Processing N11000_AP50_noise1.0_seed8, Size: 1000, Rep: 4, Model: ard, Seed: 8
Processing N11000_AP50_noise1.0_seed8, Size: 1000, Rep: 5, Model: std, Seed: 8
Processing N11000_AP50_noise1.0_seed8, Size: 1000, Rep: 5, Model: ard, Seed: 8
Processing N11000_AP10_noise1.0_seed2, Size: 1000, Rep: 1, Model: std, Seed: 2


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  3.6min


Processing N11000_AP10_noise1.0_seed2, Size: 1000, Rep: 1, Model: ard, Seed: 2
Processing N11000_AP10_noise1.0_seed2, Size: 1000, Rep: 2, Model: std, Seed: 2
Processing N11000_AP10_noise1.0_seed2, Size: 1000, Rep: 2, Model: ard, Seed: 2
Processing N11000_AP10_noise1.0_seed2, Size: 1000, Rep: 3, Model: std, Seed: 2
Processing N11000_AP10_noise1.0_seed2, Size: 1000, Rep: 3, Model: ard, Seed: 2
Processing N11000_AP10_noise1.0_seed2, Size: 1000, Rep: 4, Model: std, Seed: 2
Processing N11000_AP10_noise1.0_seed2, Size: 1000, Rep: 4, Model: ard, Seed: 2
Processing N11000_AP10_noise1.0_seed2, Size: 1000, Rep: 5, Model: std, Seed: 2
Processing N11000_AP10_noise1.0_seed2, Size: 1000, Rep: 5, Model: ard, Seed: 2
Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 1, Model: std, Seed: 7




Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 1, Model: ard, Seed: 7
Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 2, Model: ard, Seed: 7


[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  4.6min


Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 3, Model: std, Seed: 7
Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 2, Model: std, Seed: 7
Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 3, Model: ard, Seed: 7
Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 4, Model: std, Seed: 7
Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 4, Model: ard, Seed: 7
Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 5, Model: std, Seed: 7
Processing N11000_AP50_noise0.5_seed7, Size: 1000, Rep: 5, Model: ard, Seed: 7
