In [1]:
import numpy as np
import time
import datetime
from pathlib import Path

In [2]:
from simulations.distributions import CPDM

In [3]:
# Simulation parameters; all four are handed to CPDM_simulation further down.
rate = 2  # forwarded to CPDM as `rate` (presumably the Poisson rate -- TODO confirm)
entropy = 42  # entropy used to build the numpy SeedSequence
concentration = 1.0  # forwarded to CPDM as `concentration`
number_samples = 15_000_000  # number of CPDM draws to generate

In [4]:
# Build the seed once from the configured entropy; it is later passed to
# np.random.default_rng inside CPDM_simulation.
seed = np.random.SeedSequence(entropy=entropy)

In [5]:
# Three base abundance levels; each is scaled by 1, 2 and 5, and every scaled
# value is repeated for 10 entries (3 * 3 * 10 = 90 entries in total).
base_relative_abundances = [1e-4, 1e-3, 1e-2]

relative_abundances = [
    base * multiplier
    for base in base_relative_abundances
    for multiplier in (1, 2, 5)
    for _ in range(10)
]

# One final entry receives whatever is left, so that the entries total 1
# (up to floating-point rounding).
relative_abundances.append(1 - sum(relative_abundances))
frequencies = np.array(relative_abundances)

Yes, I am not vectorizing as much as I could here (I had an excuse for the CTPMHg, but not now); however, this code is easier to write and understand, and its correctness is easier to verify.

If I were to create production code that I knew or expected people would use downstream (and for which I would receive credit or compensation for the effort spent improving it), I would look into better algorithms for vectorizing this.

## CPDM - Compound Poisson Dirichlet-Multinomial (concentration 1)

In [6]:
def CPDM_simulation(concentration, frequencies, rate, seed, number_samples):
    """Draw ``number_samples`` CPDM samples and stack them into one 2-D array.

    A single random generator is created from ``seed`` and passed to every
    ``CPDM`` call.

    Parameters
    ----------
    concentration, frequencies, rate
        Forwarded unchanged to ``CPDM`` on every draw.
    seed
        Passed to ``np.random.default_rng`` to create the generator.
    number_samples : int
        Number of times ``CPDM`` is called.

    Returns
    -------
    numpy.ndarray
        One row per draw; by this notebook's convention the shape is
        (number_droplets, number_strains).
    """
    generator = np.random.default_rng(seed)

    # Flatten each draw into a single row so the rows can be joined along
    # axis 0 afterwards.
    rows = [
        CPDM(concentration, frequencies, rate, generator).reshape((1, -1))
        for _ in range(number_samples)
    ]
    return np.concatenate(rows, axis=0)

In [7]:
results_filename = 'CPDM_results.npz'
results_file = Path(results_filename)

# The results file doubles as the "simulation already ran successfully"
# marker: when it exists, nothing is recomputed (and `results` is NOT defined
# by this cell in that case).
if not results_file.is_file():
    start_time = time.time()
    results = CPDM_simulation(concentration=concentration,
                              rate=rate, seed=seed,
                              number_samples=number_samples,
                              frequencies=frequencies)
    runtime = time.time() - start_time

    # Append rather than overwrite, so runtimes of earlier runs are kept.
    with open('runtime.log', 'a') as file_pointer:
        # https://stackoverflow.com/a/775095/10634604
        runtime_string = str(datetime.timedelta(seconds=runtime))
        file_pointer.write('\nRuntime was {} in Hours:Minutes:Seconds.\n'.format(runtime_string))

    # Write to a sibling temporary file and rename it into place only once the
    # archive is complete. Writing directly to `results_filename` would leave a
    # truncated file behind if the save were interrupted, and the existence
    # check above would then wrongly skip the simulation on the next run.
    # The temporary name keeps the '.npz' ending so numpy does not append one.
    partial_file = results_file.with_name('partial_' + results_file.name)
    np.savez_compressed(str(partial_file), droplets=results)
    partial_file.replace(results_file)