In [1]:
from datetime import datetime
import traceback
import numpy as np
from pathlib import Path
from glob import glob
import multiprocessing as mp

from zipfile import BadZipfile

from analysis_utils.generate_unconditional_pairwise_hypothesis_test_results import get_unconditional_pairwise_results

In [2]:
# Run configuration shared by the cells below.
simulation_basename = "CECNBDM"  # prefix used when building input/result filenames
number_simulations = 500  # how many simulation files are processed

Check that the simulation output files already exist and that their names match the expected naming scheme.

In [3]:
def prettify(integer):
    """Return `integer` as a string, zero-padded to the digit count of `number_simulations`."""
    return str(integer).zfill(len(str(number_simulations)))


# Only the first `number_simulations` paths (in sorted order) are compared below.
filenames = sorted(glob('npzfiles/*.npz'))[0:number_simulations]

simulation_numbers = list(range(1, number_simulations + 1))
expected_filenames = ['npzfiles/{}_results.{}.npz'.format(simulation_basename, prettify(simulation_number))
                      for simulation_number in simulation_numbers]

try:
    # The message expression is only evaluated when the comparison fails; it
    # names the offending files so the log entry is actionable.
    assert filenames == expected_filenames, (
        'missing files: {}; unexpected files: {}'.format(
            sorted(set(expected_filenames) - set(filenames)),
            sorted(set(filenames) - set(expected_filenames))))
except AssertionError as error_message:
    timestamp_filename = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S.log')
    error_log = Path('error_logs') / timestamp_filename
    # Create the log directory on demand so a missing directory cannot mask
    # the real error with a FileNotFoundError from `open`.
    error_log.parent.mkdir(parents=True, exist_ok=True)
    with open(error_log, 'a') as file_pointer:
        file_pointer.write('The expected filenames did not match the actual filenames.')
        file_pointer.write(' Some files may be missing, or something else may be wrong\n')
        file_pointer.write(str(error_message) + '\n')
        file_pointer.write(traceback.format_exc())
    # Re-raise the same AssertionError so the notebook run stops here.
    raise

Make sure every file can be opened and is not corrupted.

In [4]:
for filename in filenames:
    try:
        # np.load is quick, even if accessing an 'attribute' of the npzfile can be slow.
        # The `with` block closes the archive again so no file handle is left open.
        with np.load(filename):
            pass
    except (FileNotFoundError, BadZipfile, EOFError, ValueError) as error_message:
        # EOFError (empty file) and ValueError (content that is neither a zip
        # archive nor an .npy file) are logged too before being re-raised.
        timestamp_filename = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S.log')
        error_log = Path('error_logs') / timestamp_filename
        # Create the log directory on demand so logging cannot fail on its own.
        error_log.parent.mkdir(parents=True, exist_ok=True)
        with open(error_log, 'a') as file_pointer:
            file_pointer.write('Problem with file `{}`.\n'.format(filename))
            file_pointer.write(str(error_message) + '\n')
            file_pointer.write(traceback.format_exc())
        # Re-raise the original exception so the notebook run stops here.
        raise

Generate the results. This step is slow.

In [5]:
# One argument list per input file; each inner list holds the three values
# that are later unpacked as positional arguments by `starmap`.
parallel_input = []
for filename in filenames:
    parallel_input.append([filename, simulation_basename, number_simulations])

In [6]:
%%time
with mp.get_context("spawn").Pool(48) as pool:
    pool.starmap(get_unconditional_pairwise_results, parallel_input, chunksize=1)
    pool.close()
    pool.join()

CPU times: user 8.14 s, sys: 23.2 s, total: 31.3 s
Wall time: 15h 6min 13s


Combine results into one file

In [7]:
# Sorted paths of the results files whose names start with the simulation basename.
results_pattern = 'unconditional_pairwise_results/{}_*.npz'.format(simulation_basename)
results_filenames = sorted(glob(results_pattern))

Make sure each per-simulation results file was written correctly and contains the expected arrays.

In [8]:
# Arrays that every per-simulation results file is expected to contain.
expected_keys = {'divergences', 'pvals', 'difference_vectors'}

for results_filename in results_filenames:
    try:
        # The `with` block closes the archive again so no file handle is left open.
        with np.load(results_filename) as npzfile:
            # Compare as sets: the order of the stored arrays does not matter.
            assert set(npzfile.files) == expected_keys, (
                'expected arrays {}, found {}'.format(sorted(expected_keys), sorted(npzfile.files)))
    except (FileNotFoundError, BadZipfile, AssertionError) as error_message:
        # A failed key check is now logged as well. Only unreadable archives are
        # deleted, exactly as before; a readable file with unexpected arrays is
        # left on disk for inspection.
        if not isinstance(error_message, AssertionError):
            results_file = Path(results_filename)
            results_file.unlink(missing_ok=True) # delete corrupted file if it exists
        timestamp_filename = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S.log')
        error_log = Path('error_logs') / timestamp_filename
        # Create the log directory on demand so logging cannot fail on its own.
        error_log.parent.mkdir(parents=True, exist_ok=True)
        with open(error_log, 'a') as file_pointer:
            file_pointer.write('Problem with file `{}`.\n'.format(results_filename))
            file_pointer.write(str(error_message) + '\n')
            file_pointer.write(traceback.format_exc())
        # Re-raise the original exception so the notebook run stops here.
        raise

Read the array shapes from the first results file and preallocate the combined arrays.

In [9]:
# The combined arrays get one slot per simulation along the trailing axis and
# are filled one results file per slot. Fail loudly on a count mismatch:
# fewer files would leave all-zero slots, more files would overflow the axis.
if len(results_filenames) != number_simulations:
    raise ValueError(
        'Expected {} results files but found {}.'.format(number_simulations, len(results_filenames)))

# Read the per-simulation array shapes from the first results file; the `with`
# block closes the archive once the shapes have been read.
with np.load(results_filenames[0]) as test_npzfile:
    divergence_shape = test_npzfile['divergences'].shape
    pvals_shape = test_npzfile['pvals'].shape
    difference_vectors_shape = test_npzfile['difference_vectors'].shape

# Preallocate: per-simulation shape plus a trailing axis of length `number_simulations`.
divergences = np.zeros((*divergence_shape, number_simulations))
pvals = np.zeros((*pvals_shape, number_simulations))
difference_vectors = np.zeros((*difference_vectors_shape, number_simulations))

Load every results file, store its arrays in the corresponding slice of the combined arrays, and save the combined arrays to a single file.

In [10]:
for counter, results_filename in enumerate(results_filenames):
    # Close each archive as soon as its arrays have been copied, so file
    # handles do not stay open while the loop runs.
    with np.load(results_filename) as npzfile:
        # Each file fills slice `counter` along the trailing axis.
        divergences[..., counter] = npzfile['divergences']
        pvals[..., counter] = npzfile['pvals']
        difference_vectors[..., counter] = npzfile['difference_vectors']

all_results_filename = 'unconditional_pairwise_results/all_results.npz'
all_results_file = Path(all_results_filename)
# Recreate the output directory if it is missing, so the save cannot fail
# while the combined arrays exist only in memory.
all_results_file.parent.mkdir(parents=True, exist_ok=True)
np.savez_compressed(all_results_filename, divergences=divergences,
                    pvals=pvals, difference_vectors=difference_vectors)