This notebook is for generating data

First, we generate training data

In [4]:
from multiprocessing import Pool
from d2c.data_generation.builder import TSBuilder
# Number of worker processes; passed as `processes=` to multiprocessing.Pool at the bottom of this cell.
N_JOBS = 55
def run_process(params):
    """Generate and pickle the time series for a single parameter combination.

    Args:
        params: 4-tuple ``(process, n_variables, max_neighborhood_size, noise_std)``.

    Returns:
        None. On success the builder is pickled under ``./data/`` and a
        "... done" line is printed. A ``ValueError`` raised while constructing,
        building or pickling is caught and reported as "... failed: <error>".
        Only ``ValueError`` is handled here; any other exception propagates.
    """
    process, n_variables, max_neighborhood_size, noise_std = params

    # Collect the builder configuration first so the try block only holds
    # the calls whose ValueError we want to report.
    builder_kwargs = dict(
        observations_per_time_series=250,
        maxlags=5,
        n_variables=n_variables,
        time_series_per_process=40,
        processes_to_use=[process],
        noise_std=noise_std,
        max_neighborhood_size=max_neighborhood_size,
        seed=42,
        max_attempts=200,
        verbose=True,
    )

    try:
        tsbuilder = TSBuilder(**builder_kwargs)
        tsbuilder.build()
        tsbuilder.to_pickle(f'./data/P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl')
        print(f'P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std} done')
    except ValueError as e:
        print(f'P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std} failed: {e}')

if __name__ == '__main__':
    # Full factorial grid: 18 process ids x 3 variable counts x 3 neighborhood
    # sizes x 3 noise levels = 486 combinations, with noise varying fastest.
    parameters = [
        (proc_id, n_vars, neigh_size, sigma)
        for proc_id in [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20]
        for n_vars in [5, 10, 25]
        for neigh_size in [2, 4, 8]
        for sigma in [0.01, 0.005, 0.001]
    ]

    # Fan the combinations out over N_JOBS worker processes; run_process
    # writes its own output files, so the map result is not kept.
    with Pool(processes=N_JOBS) as pool:
        pool.map(run_process, parameters)


P3_N5_Nj2_n0.01 done

P4_N5_Nj2_n0.01 done
P8_N5_Nj2_n0.01 done
P2_N5_Nj2_n0.01 done
P3_N5_Nj8_n0.01 done
P6_N5_Nj2_n0.01 done
P1_N5_Nj2_n0.01 done
P7_N5_Nj2_n0.01 done
P3_N5_Nj4_n0.01 done
P2_N5_Nj4_n0.01 done
P7_N5_Nj8_n0.01 done
P4_N5_Nj4_n0.01 done
P2_N5_Nj8_n0.01 doneP4_N5_Nj8_n0.01 done

P1_N5_Nj4_n0.01 done
P6_N5_Nj4_n0.01 done
P7_N5_Nj4_n0.01 done
P6_N5_Nj8_n0.01 doneP1_N5_Nj8_n0.01 done

P3_N5_Nj2_n0.005 done
P4_N5_Nj2_n0.005 done
P3_N10_Nj2_n0.01 done
P2_N5_Nj2_n0.005 done
P2_N10_Nj2_n0.01 done
P8_N5_Nj2_n0.005 done
P1_N10_Nj2_n0.01 done
P4_N10_Nj2_n0.01 done
P6_N5_Nj2_n0.005 done
P3_N10_Nj4_n0.01 done
P6_N10_Nj2_n0.01 done
P7_N10_Nj2_n0.01 done
P3_N5_Nj8_n0.005 done
P1_N5_Nj2_n0.005 done
P3_N5_Nj4_n0.005 done
P7_N5_Nj2_n0.005 done
P4_N5_Nj4_n0.005 done
P4_N10_Nj4_n0.01 done
P2_N5_Nj4_n0.005 done
P4_N5_Nj8_n0.005 done
P7_N10_Nj4_n0.01 done
P2_N10_Nj4_n0.01 done
P1_N10_Nj4_n0.01 done
P7_N5_Nj4_n0.005 done
P1_N5_Nj4_n0.005 done
P6_N10_Nj4_n0.01 done
P2_N5_Nj8_n0.005 done
P6_N5_

Let's check whether any parameter combinations are missing

In [7]:
import os 
# Walk the same parameter grid used for generation and collect every expected
# pickle path that is not present on disk.
missing = []
for process in [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20]:
    for n_variables in [5, 10, 25]:
        for max_neighborhood_size in [2, 4, 8]:
            for noise_std in [0.01, 0.005, 0.001]:
                filename = f'./data/P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl'
                if os.path.exists(filename):
                    # Already generated: nothing to record.
                    continue
                missing.append(filename)


In [8]:
# Display the expected dataset paths that were not found on disk.
missing

['./data/P6_N25_Nj2_n0.001.pkl',
 './data/P6_N25_Nj4_n0.001.pkl',
 './data/P6_N25_Nj8_n0.001.pkl',
 './data/P8_N25_Nj4_n0.001.pkl',
 './data/P8_N25_Nj8_n0.001.pkl',
 './data/P11_N25_Nj2_n0.001.pkl',
 './data/P11_N25_Nj4_n0.001.pkl',
 './data/P11_N25_Nj8_n0.001.pkl',
 './data/P13_N25_Nj2_n0.001.pkl',
 './data/P13_N25_Nj4_n0.001.pkl',
 './data/P13_N25_Nj8_n0.001.pkl',
 './data/P14_N25_Nj2_n0.001.pkl',
 './data/P14_N25_Nj4_n0.001.pkl',
 './data/P14_N25_Nj8_n0.001.pkl',
 './data/P16_N25_Nj2_n0.001.pkl',
 './data/P16_N25_Nj4_n0.001.pkl',
 './data/P16_N25_Nj8_n0.001.pkl',
 './data/P20_N25_Nj2_n0.001.pkl',
 './data/P20_N25_Nj4_n0.001.pkl',
 './data/P20_N25_Nj8_n0.001.pkl']

Now we focus on recreating the missing ones by changing the seed and increasing max_attempts

In [10]:
from multiprocessing import Pool
from d2c.data_generation.builder import TSBuilder
# Number of worker processes; passed as `processes=` to multiprocessing.Pool at the bottom of this cell.
N_JOBS = 55
def run_process(params):
    """Retry generation for one parameter combination with a new seed.

    Args:
        params: 4-tuple ``(process, n_variables, max_neighborhood_size, noise_std)``.

    Returns:
        None. On success the builder is pickled under ``./data/`` and a
        "... done" line is printed. A ``ValueError`` raised while constructing,
        building or pickling is caught and reported as "... failed: <error>".
        Only ``ValueError`` is handled here; any other exception propagates.
    """
    process, n_variables, max_neighborhood_size, noise_std = params

    # Compared with the first pass (seed=42, max_attempts=200), this retry
    # uses a different seed and allows more attempts.
    builder_kwargs = dict(
        observations_per_time_series=250,
        maxlags=5,
        n_variables=n_variables,
        time_series_per_process=40,
        processes_to_use=[process],
        noise_std=noise_std,
        max_neighborhood_size=max_neighborhood_size,
        seed=24,
        max_attempts=400,
        verbose=True,
    )

    try:
        tsbuilder = TSBuilder(**builder_kwargs)
        tsbuilder.build()
        tsbuilder.to_pickle(f'./data/P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl')
        print(f'P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std} done')
    except ValueError as e:
        print(f'P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std} failed: {e}')

def parse_dataset_filename(path):
    """Recover the generation parameters encoded in a dataset pickle path.

    The expected file name is ``P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl``,
    optionally preceded by a directory part separated with ``/``.

    Args:
        path: Path or bare file name following the pattern above.

    Returns:
        Tuple ``(process, n_variables, max_neighborhood_size, noise_std)`` as
        ``(int, int, int, float)``.

    Raises:
        ValueError: If the name does not have exactly four ``_``-separated
            fields or a field is not numeric.
    """
    stem = path.split('/')[-1]
    if stem.endswith('.pkl'):
        stem = stem[:-len('.pkl')]
    # Split once instead of re-splitting the path for every field.
    process_tag, n_tag, nj_tag, noise_tag = stem.split('_')
    # Strip the 'P', 'N', 'Nj' and 'n' prefixes before converting.
    return int(process_tag[1:]), int(n_tag[1:]), int(nj_tag[2:]), float(noise_tag[1:])


if __name__ == '__main__':
    # One parameter tuple per dataset file that is still missing.
    parameters = [parse_dataset_filename(missing_file) for missing_file in missing]

    # Skip the pool entirely when there is nothing to regenerate, and never
    # start more workers than there are tasks.
    if parameters:
        with Pool(processes=min(N_JOBS, len(parameters))) as pool:
            pool.map(run_process, parameters)

P6_N25_Nj2_n0.001 failed: Failed to generate valid TS for model 6, TS index 1 after 400 attempts. Try again with a different seed.

P16_N25_Nj2_n0.001 failed: Failed to generate valid TS for model 16, TS index 10 after 400 attempts. Try again with a different seed.
P6_N25_Nj8_n0.001 failed: Failed to generate valid TS for model 6, TS index 8 after 400 attempts. Try again with a different seed.
P14_N25_Nj8_n0.001 failed: Failed to generate valid TS for model 14, TS index 17 after 400 attempts. Try again with a different seed.
P16_N25_Nj8_n0.001 failed: Failed to generate valid TS for model 16, TS index 24 after 400 attempts. Try again with a different seed.
P20_N25_Nj2_n0.001 done
P14_N25_Nj2_n0.001 done
P13_N25_Nj8_n0.001 failed: Failed to generate valid TS for model 13, TS index 18 after 400 attempts. Try again with a different seed.
P16_N25_Nj4_n0.001 done
P20_N25_Nj4_n0.001 done
P11_N25_Nj2_n0.001 done
P20_N25_Nj8_n0.001 failed: Failed to generate valid TS for model 20, TS index 33 

Let's check what is still missing

In [11]:
import os 
# Re-scan the full parameter grid and collect every expected pickle path
# that is not present on disk.
missing = []
for process in [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20]:
    for n_variables in [5, 10, 25]:
        for max_neighborhood_size in [2, 4, 8]:
            for noise_std in [0.01, 0.005, 0.001]:
                filename = f'./data/P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl'
                if os.path.exists(filename):
                    # Already generated: nothing to record.
                    continue
                missing.append(filename)


In [12]:
# Display the expected dataset paths that are still not found on disk.
missing

['./data/P6_N25_Nj2_n0.001.pkl',
 './data/P6_N25_Nj4_n0.001.pkl',
 './data/P6_N25_Nj8_n0.001.pkl',
 './data/P13_N25_Nj4_n0.001.pkl',
 './data/P13_N25_Nj8_n0.001.pkl',
 './data/P14_N25_Nj8_n0.001.pkl',
 './data/P16_N25_Nj2_n0.001.pkl',
 './data/P16_N25_Nj8_n0.001.pkl',
 './data/P20_N25_Nj8_n0.001.pkl']

We try one last time with an even higher max_attempts and a different seed.

In [13]:
from multiprocessing import Pool
from d2c.data_generation.builder import TSBuilder
# One worker per file that is still missing. Clamp to at least 1 because
# Pool(processes=0) raises ValueError when `missing` is empty.
N_JOBS = max(1, len(missing))
def run_process(params):
    """Final retry for one parameter combination with another seed.

    Args:
        params: 4-tuple ``(process, n_variables, max_neighborhood_size, noise_std)``.

    Returns:
        None. On success the builder is pickled under ``./data/`` and a
        "... done" line is printed. A ``ValueError`` raised while constructing,
        building or pickling is caught and reported as "... failed: <error>".
        Only ``ValueError`` is handled here; any other exception propagates.
    """
    process, n_variables, max_neighborhood_size, noise_std = params

    # Compared with the earlier passes (seed=42/200 attempts, seed=24/400
    # attempts), this one uses seed=0 and the largest attempt budget.
    builder_kwargs = dict(
        observations_per_time_series=250,
        maxlags=5,
        n_variables=n_variables,
        time_series_per_process=40,
        processes_to_use=[process],
        noise_std=noise_std,
        max_neighborhood_size=max_neighborhood_size,
        seed=0,
        max_attempts=1000,
        verbose=True,
    )

    try:
        tsbuilder = TSBuilder(**builder_kwargs)
        tsbuilder.build()
        tsbuilder.to_pickle(f'./data/P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl')
        print(f'P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std} done')
    except ValueError as e:
        print(f'P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std} failed: {e}')

def parse_dataset_filename(path):
    """Recover the generation parameters encoded in a dataset pickle path.

    The expected file name is ``P{process}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl``,
    optionally preceded by a directory part separated with ``/``.

    Args:
        path: Path or bare file name following the pattern above.

    Returns:
        Tuple ``(process, n_variables, max_neighborhood_size, noise_std)`` as
        ``(int, int, int, float)``.

    Raises:
        ValueError: If the name does not have exactly four ``_``-separated
            fields or a field is not numeric.
    """
    stem = path.split('/')[-1]
    if stem.endswith('.pkl'):
        stem = stem[:-len('.pkl')]
    # Split once instead of re-splitting the path for every field.
    process_tag, n_tag, nj_tag, noise_tag = stem.split('_')
    # Strip the 'P', 'N', 'Nj' and 'n' prefixes before converting.
    return int(process_tag[1:]), int(n_tag[1:]), int(nj_tag[2:]), float(noise_tag[1:])


if __name__ == '__main__':
    # One parameter tuple per dataset file that is still missing.
    parameters = [parse_dataset_filename(missing_file) for missing_file in missing]

    # N_JOBS is derived from len(missing) in this cell, so an empty list would
    # hand Pool a worker count of 0, which it rejects with ValueError. Skip the
    # pool when there is nothing left to regenerate.
    if parameters:
        with Pool(processes=min(N_JOBS, len(parameters))) as pool:
            pool.map(run_process, parameters)

P16_N25_Nj2_n0.001 done

P20_N25_Nj8_n0.001 done
P14_N25_Nj8_n0.001 done
P16_N25_Nj8_n0.001 done
P6_N25_Nj2_n0.001 done
P13_N25_Nj4_n0.001 done
P6_N25_Nj4_n0.001 done
P13_N25_Nj8_n0.001 done
P6_N25_Nj8_n0.001 done


All time series have been generated correctly

In [14]:
# Count only the generated dataset pickles. A bare directory listing also
# counts any other entry in ./data, which could hide a missing dataset.
# The parameter grid has 18 * 3 * 3 * 3 = 486 combinations.
len([entry for entry in os.listdir('./data') if entry.endswith('.pkl')])

486