In [1]:
import numpy as np
import multiprocessing as mp

import traceback
import time

In [2]:
from simulations.global_chi2_cpm import get_expected_probs
from pathlib import Path
from datetime import datetime, timedelta

In [3]:
# Base abundance levels; each is scaled by the multipliers 1, 2 and 5 below.
base_relative_abundances = [1e-4, 1e-3, 1e-2]

# Every (base level, multiplier) combination appears 10 times in a row.
relative_abundances = [
    base_level * multiplier
    for base_level in base_relative_abundances
    for multiplier in (1, 2, 5)
    for _ in range(10)
]

# A final entry takes whatever is left over, so the list sums to 1
# (up to floating-point rounding).
relative_abundances.append(1 - sum(relative_abundances))
frequencies = np.asarray(relative_abundances)

In [4]:
# Simulation settings (all plain integers).
number_droplets = 15 * 10**6   # `n` of every multinomial draw
rate = 2                       # forwarded to get_expected_probs as `rate`
monte_carlo_entropy = 42       # entropy of the root SeedSequence
trials_per_chunk = 10**7       # multinomial trials targeted per chunk
number_processes = 40          # pool size, and seeds spawned per chunk
number_chunks = 100            # one results file is written per chunk

In [5]:
# Ceiling division: round the per-process trial count up whenever
# `trials_per_chunk` is not an exact multiple of `number_processes`.
# https://stackoverflow.com/a/23590097/10634604
trials_per_process = trials_per_chunk // number_processes
if trials_per_chunk % number_processes > 0:
    trials_per_process += 1

# One child SeedSequence per chunk, all spawned from the same root entropy.
seed_sequence = np.random.SeedSequence(monte_carlo_entropy)
seeds = seed_sequence.spawn(number_chunks)

Below I do naughty things with global variables: I expect multiprocessing to copy the global variable definitions (`frequencies`, `trials_per_process`, `rate`, `number_droplets`, and the like) into each worker, because I don't like having to use `starmap` if I can avoid it. Note that this only works when worker processes inherit the parent's state (e.g. the `fork` start method).

I am a bad role model/example; do not copy me.

In [6]:
def monte_carlo_sim(seed):
    """Simulate chi-squared statistics from multinomial draws for one seed.

    Reads the module-level globals `frequencies`, `rate`, `number_droplets`
    and `trials_per_process` instead of taking them as arguments.

    Parameters
    ----------
    seed
        Seed handed to `np.random.default_rng`.

    Returns
    -------
    np.ndarray
        One statistic per simulated trial: the sum over axis 1 of
        (observed - expected)**2 / expected.
    """
    expected_probs = get_expected_probs(frequencies, rate=rate)

    # Build one flat probability vector: element 0, then the non-zero entries
    # of element 1 (in flattened order), then element 2.
    middle = expected_probs[1]
    nonzero_middle = middle.ravel()[np.flatnonzero(middle)]
    probs = np.array(
        [expected_probs[0]] + list(nonzero_middle) + [expected_probs[2]]
    )

    expected_counts = number_droplets * probs

    generator = np.random.default_rng(seed)
    draws = generator.multinomial(
        n=number_droplets, pvals=probs, size=trials_per_process
    )

    # Each row of `draws` is one trial; reduce across its categories.
    deviations = draws - expected_counts
    return (deviations ** 2 / expected_counts).sum(axis=1)

`Pool.map` has the courtesy to return a list for us, so its result is easy to use directly as input to `np.concatenate`.

In [7]:
def prettify(integer):
    """Return `integer` as a string, zero-padded to the digit count of `number_chunks`."""
    width = len(str(number_chunks))
    return str(integer).zfill(width)

In [8]:
# Run the simulation chunk by chunk. A chunk whose results file already exists
# is skipped, so the cell can be re-run to resume after a failure.

# Create the output locations up front. Without them, saving fails only after
# a whole chunk has been simulated, and the error handler's own log write would
# then raise as well.
Path('monte_carlo_results/logs').mkdir(parents=True, exist_ok=True)

for chunk in range(number_chunks):
    results_filename = 'monte_carlo_results/simulated_chi2.{}.npz'.format(prettify(chunk))
    results_file = Path(results_filename)

    if results_file.is_file():
        # simulation already ran successfully on previous attempt
        continue

    # Each chunk gets its own child seed, split again into one seed per process.
    subseed = seeds[chunk]
    subseeds = subseed.spawn(number_processes)

    try:
        start_time = time.time()
        # The context manager terminates the workers even if `map` raises, so a
        # failed chunk does not leave `number_processes` processes behind.
        with mp.Pool(number_processes) as pool:
            chi2_stats = pool.map(monte_carlo_sim, subseeds)

        chi2_stats = np.concatenate(chi2_stats)
        runtime = time.time() - start_time

        # Write to a temporary file and rename it afterwards: an interrupted
        # write must not leave a truncated .npz that the `is_file()` check
        # above would mistake for a finished chunk.
        partial_file = Path(results_filename + '.partial')
        with partial_file.open('wb') as file_pointer:
            np.savez_compressed(file_pointer, chi2_stats=chi2_stats)
        partial_file.replace(results_file)

        # Maybe this will help prevent memory leaks?
        del chi2_stats

        with open('monte_carlo_results/logs/runtime.{}.log'.format(prettify(chunk)), 'a') as file_pointer:
            # https://stackoverflow.com/a/775095/10634604
            runtime_string = str(timedelta(seconds=runtime))
            file_pointer.write('Runtime was {} in Hours:Minutes:Seconds.'.format(runtime_string))

    except Exception as error_message:
        # Best effort: record the failure and carry on with the next chunk.
        timestamp_filename = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S.log')
        with open('monte_carlo_results/logs/{}'.format(timestamp_filename), 'a') as file_pointer:
            # Newline keeps the message from running into the traceback header.
            file_pointer.write(str(error_message) + '\n')
            file_pointer.write(traceback.format_exc())