In [None]:
import os
# Cap the native math-library thread pools at 4 threads each. Without a cap,
# every notebook tried to use e.g. 71 threads, which made it impossible to run
# many notebooks side by side with full multiprocessing.
# NOTE(review): these variables are presumably only honoured when set before
# numpy is first imported, so keep this cell ahead of the numpy import cell.
os.environ.update({
    "OMP_NUM_THREADS": "4",
    "OPENBLAS_NUM_THREADS": "4",
})

In [None]:
import numpy as np
import time
import datetime
from pathlib import Path
from zipfile import BadZipfile

import multiprocessing as mp

In [None]:
from simulations.distributions import CPDM

In [None]:
# Run parameters for this notebook.
# NOTE(review): presumably these are overridden per run by papermill (it is
# mentioned further down) — confirm this cell carries the "parameters" tag.

# Forwarded unchanged to CPDM(...).
# NOTE(review): presumably the Poisson rate of the compound model — confirm.
rate = 2
# Entropy used to build the root np.random.SeedSequence; must be identical
# across all simulations so that each one gets its own spawned child seed.
entropy = 42
# Forwarded unchanged to CPDM(...).
# NOTE(review): presumably the Dirichlet concentration — confirm.
concentration = 1.0
# Number of CPDM draws generated by a single simulation.
number_samples = 1000
# Total number of simulations; determines how many child seeds are spawned
# and the zero-padding width used in output file names.
number_simulations = 500
# 1-based index of the simulation this notebook run performs; selects the
# child seed and is embedded in the results/log file names.
simulation_number = 2

In [None]:
# simulation_number is 1-based. Both bounds are checked: a value of 0 or a
# negative value would otherwise index the spawned list from the end and
# silently reuse the seed belonging to a different simulation.
assert 1 <= simulation_number <= number_simulations, (
    'simulation_number must be between 1 and number_simulations (inclusive)')

# Spawn one child SeedSequence per simulation from the shared entropy and pick
# the child that belongs to this simulation.
seed_sequence = np.random.SeedSequence(entropy)
seed = seed_sequence.spawn(number_simulations)[simulation_number-1]

In [None]:
# One base abundance per decade; each base is scaled by 1, 2 and 5, and every
# scaled value is repeated for ten entries (3 * 3 * 10 = 90 entries).
base_relative_abundances = [1e-4, 1e-3, 1e-2]

relative_abundances = []
for base_level in base_relative_abundances:
    for multiplier in (1, 2, 5):
        relative_abundances.extend([base_level * multiplier] * 10)

# A final entry absorbs the remaining mass so that the entries total 1
# (up to floating-point rounding).
relative_abundances.append(1 - sum(relative_abundances))
frequencies = np.array(relative_abundances)

Yes, I am not vectorizing as much as I could here (I had an excuse for the CTPMHg, but not now), but this code is easier to write, easier to understand, and easier to check for correctness.

If I were to create production code that I knew/expected people would use downstream (as well as receive any credit or compensation for effort spent improving the code), I would look into better algorithms for vectorizing this.

## CPDM - Compound Poisson Dirichlet Multinomial

In [None]:
def CPDM_simulation(concentration, frequencies, rate, 
                      seed, number_samples):
    """Draw ``number_samples`` CPDM samples and stack them into one 2-D array.

    A single generator is created from ``seed`` and shared by every draw, so
    the output is reproducible for a given seed.

    Parameters
    ----------
    concentration, frequencies, rate
        Forwarded unchanged to ``CPDM``.
    seed
        Anything accepted by ``np.random.default_rng``.
    number_samples : int
        Number of times ``CPDM`` is called.

    Returns
    -------
    numpy.ndarray
        Array with one row per draw; each draw is flattened into its row.
        NOTE(review): in the notebook's terms this is presumably
        (number_droplets, number_strains) — confirm against ``CPDM``.
    """
    generator = np.random.default_rng(seed)

    rows = []
    for _ in range(number_samples):
        draw = CPDM(concentration, frequencies, rate, generator)
        # turn each draw into a single row so the draws stack along axis 0
        rows.append(draw.reshape((1, -1)))

    return np.concatenate(rows, axis=0)

In [None]:
def prettify(integer):
    """Return ``integer`` as a zero-padded string.

    The padding width is the number of digits in the notebook-level
    ``number_simulations``, so file names built from the result line up and
    sort in numeric order.
    """
    width = len(str(number_simulations))
    return str(integer).zfill(width)

In [None]:
results_filename = 'npzfiles/CPDM_results.{}.npz'.format(prettify(simulation_number))
results_file = Path(results_filename)
runtime_log_file = Path('notebook_logs/runtime.{}.log'.format(prettify(simulation_number)))

# The simulation may already have run successfully on a previous attempt; if
# a readable archive exists there is nothing left to do.
try:
    # Opening the archive is enough to detect a broken zip structure. The
    # context manager makes sure the file handle is closed again.
    with np.load(results_filename):
        pass
except (BadZipfile, EOFError, FileNotFoundError):
    # BadZipfile: archive is truncated or corrupted.
    # EOFError: file exists but is empty (e.g. the job died right after
    #           the file was created).
    # FileNotFoundError: the simulation has not produced a file yet.
    results_file.unlink(missing_ok=True) # delete corrupted file if it exists

    # Create the output directories up front, so that a missing directory
    # cannot throw away the results of an expensive simulation afterwards.
    results_file.parent.mkdir(parents=True, exist_ok=True)
    runtime_log_file.parent.mkdir(parents=True, exist_ok=True)

    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments during a long run.
    start_time = time.perf_counter()
    results = CPDM_simulation(concentration=concentration,
                    rate=rate, seed=seed, 
                    number_samples=number_samples,
                    frequencies=frequencies)
    runtime = time.perf_counter() - start_time
    
    with open(runtime_log_file, 'a') as file_pointer:
        # https://stackoverflow.com/a/775095/10634604
        runtime_string = str(datetime.timedelta(seconds=runtime))
        file_pointer.write('\nRuntime was {} in Hours:Minutes:Seconds.\n'.format(runtime_string))

    np.savez_compressed(results_filename, droplets=results)
            
    # Drop the reference to the (potentially large) array once it is on disk.
    # Maybe this will help prevent memory leaks?
    # Honestly not sure what happens when using papermill with multiprocessing.
    del results