In [1]:
import numpy as np
import multiprocessing as mp

import traceback

from datetime import datetime
from glob import glob
from simulations.global_chi2_cpm import get_differences_from_expected

In [2]:
# Base abundances spanning three decades; each one is scaled by 1x, 2x and 5x
# and every scaled value is repeated 10 times.
base_relative_abundances = [1e-4, 1e-3, 1e-2]

relative_abundances = [
    scaled_abundance
    for scaled_abundance in (base * multiplier
                             for base in base_relative_abundances
                             for multiplier in (1, 2, 5))
    for _ in range(10)
]

# A final entry takes up whatever abundance is left over from 1.
relative_abundances.append(1 - sum(relative_abundances))
frequencies = np.asarray(relative_abundances)

The standard pickler can't serialize a lambda that captures these variables, so it can't be handed to `multiprocessing` directly; a plain module-level function that reads a global variable can be... ugh, ew. To anyone who ever reads this: I am being lazy here, so please don't copy this pattern -- making too many assumptions about state (e.g. by relying on global variables being transferred to worker processes) is dangerous and usually a very bad, no-good idea, and this is probably not an exception to that rule of thumb either.

In [3]:
def abuse_globals(filename):
    """Compute differences from expected droplet counts for one simulation file.

    This is a plain module-level function (so it can be passed to
    ``multiprocessing.Pool.map``) that reads the module-level ``frequencies``
    array instead of taking it as an argument.

    Parameters
    ----------
    filename : str
        Path to an ``.npz`` archive containing a ``'droplets'`` array.

    Returns
    -------
    The value returned by
    ``get_differences_from_expected(batch, frequencies, rate=2)``.

    Raises
    ------
    ValueError
        If the second axis of the ``'droplets'`` array does not match the
        length of ``frequencies`` either as stored or after transposing.
    """
    # NpzFile keeps the underlying archive open until it is closed, so use it
    # as a context manager instead of leaking one handle per processed file.
    with np.load(filename) as npzfile:
        batch = npzfile['droplets']

    number_of_frequencies = frequencies.shape[0]
    # Explicit checks instead of assert/except AssertionError: asserts are
    # stripped under `python -O`, which would silently skip the transpose.
    if batch.shape[1] != number_of_frequencies:
        batch = batch.T
        if batch.shape[1] != number_of_frequencies:
            raise ValueError(
                'Droplet array in `{}` has shape {} after transposing; '
                'expected {} entries along the second axis.'.format(
                    filename, batch.shape, number_of_frequencies))

    return get_differences_from_expected(batch, frequencies, rate=2)

In [4]:
filenames = sorted(glob('simulation_results/npzfiles/*.npz'))

# Make sure every archive can at least be opened before starting the long
# parallel run. Files that fail are logged AND removed from `filenames`;
# otherwise the same file would make the pool run fail later on.
invalid_filenames = []
for filename in filenames:
    try:
        # np.load is quick, even if accessing an 'attribute' of the npzfile
        # can be slow; the context manager closes the file handle again.
        with np.load(filename):
            pass
    except Exception as error_message:
        invalid_filenames.append(filename)
        timestamp_filename = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S.log')
        with open('error_logs/{}'.format(timestamp_filename), 'a') as file_pointer:
            # Newlines keep the three parts of each log entry apart.
            file_pointer.write('Problem with file `{}`.\n'.format(filename))
            file_pointer.write('{}\n'.format(error_message))
            file_pointer.write(traceback.format_exc())

if invalid_filenames:
    filenames = [filename for filename in filenames
                 if filename not in invalid_filenames]
    print('Skipping {} unreadable file(s); see error_logs/ for details.'.format(
        len(invalid_filenames)))

We are going to open/load the results from each of the simulations, compute the observed and expected numbers of droplets belonging to picky groups (treatments and controls) and the differences of the observed from the expected.

In [5]:
%%time
try:
    pool = mp.Pool(40)
    results = pool.map(abuse_globals, filenames, chunksize=1)
    pool.close()
    pool.join()
except Exception as error_message:
    timestamp_filename = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S.log')
    with open('error_logs/{}'.format(timestamp_filename), 'a') as file_pointer:
        file_pointer.write(str(error_message))
        file_pointer.write(traceback.format_exc())

CPU times: user 2.41 s, sys: 6.9 s, total: 9.3 s
Wall time: 3h 30min 45s


Then we save the results to disk.

In [6]:
# Each entry of `results` is unpacked into three parts; regroup them so that
# every part becomes one array across all files.
empty_droplet_diffs, picky_group_diffs, multi_strain_droplet_diffs = (
    np.asarray(per_file_values) for per_file_values in zip(*results)
)

# Store all three arrays together in one compressed archive.
arrays_to_save = {
    'empty_droplet_diffs': empty_droplet_diffs,
    'picky_group_diffs': picky_group_diffs,
    'multi_strain_droplet_diffs': multi_strain_droplet_diffs,
}
np.savez_compressed('picky_observed_differences_from_expected.npz',
                    **arrays_to_save)