In [1]:
import multiprocessing as mp
import os
import traceback
from datetime import datetime
from glob import glob

import numpy as np

from simulations.global_chi2_cpm import get_chi_squared_statistic

In [2]:
# Three base abundance levels; each one is scaled by 1, 2 and 5, and every
# scaled value is repeated 10 times (3 * 3 * 10 = 90 entries).
base_relative_abundances = [1e-4, 1e-3, 1e-2]

relative_abundances = []
for base_abundance in base_relative_abundances:
    for multiplier in (1, 2, 5):
        relative_abundances.extend([base_abundance * multiplier] * 10)

# A final entry takes whatever is left over, so the whole vector sums to 1.
relative_abundances.append(1 - sum(relative_abundances))
frequencies = np.array(relative_abundances)

A lambda that closes over these variables cannot be pickled, so it cannot be passed to `multiprocessing`; a module-level function that reads them as globals can be, which is what the next cell does... ugh, ew. To anyone who ever reads this: I am being lazy here, so please don't copy this -- making too many assumptions about state (e.g. by relying on global variables being transferred to the worker processes) is dangerous and usually a very bad, no good idea, and this is probably not an exception to that rule of thumb either.

In [3]:
def abuse_globals_to_get_chi2(filename):
    """Run ``get_chi_squared_statistic`` on one saved simulation batch.

    Loads the ``'droplets'`` array from the ``.npz`` archive at ``filename``,
    orients it so that its second axis has one entry per element of the
    module-level ``frequencies`` array, and passes it to
    ``get_chi_squared_statistic`` with ``rate=2``.

    The function deliberately reads the global ``frequencies`` so that it is a
    plain single-argument, module-level callable that can be handed to
    ``multiprocessing.Pool.map``.

    Parameters
    ----------
    filename : str
        Path to an ``.npz`` archive that contains a ``'droplets'`` array.

    Returns
    -------
    Whatever ``get_chi_squared_statistic(batch, frequencies, rate=2)`` returns.

    Raises
    ------
    ValueError
        If the ``'droplets'`` array has fewer than two dimensions, or if its
        second axis does not match ``frequencies.shape[0]`` either as stored
        or after transposing.
    """
    # The NpzFile object keeps the archive open; the context manager closes it
    # deterministically once the array has been read into memory.
    with np.load(filename) as npzfile:
        batch = npzfile['droplets']

    expected_width = frequencies.shape[0]
    if batch.ndim < 2:
        raise ValueError(
            "'droplets' array in `{}` has shape {}; expected at least 2 "
            "dimensions.".format(filename, batch.shape))

    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if batch.shape[1] != expected_width:
        # Accept the transposed layout as well.
        batch = batch.T
        if batch.shape[1] != expected_width:
            raise ValueError(
                "'droplets' array in `{}` has shape {} after transposing; "
                "no orientation has a second axis of length {}.".format(
                    filename, batch.shape, expected_width))

    return get_chi_squared_statistic(batch, frequencies, rate=2)

In [4]:
filenames = sorted(glob('simulation_results/npzfiles/*.npz'))
# Make sure every file can at least be opened as an archive, and log the ones
# that cannot. NOTE: unreadable files are only logged here -- they are NOT
# removed from `filenames`.
for filename in filenames:
    try:
        # np.load is quick, even if accessing an 'attribute' of the npzfile can
        # be slow; the `with` closes the archive handle again straight away.
        with np.load(filename):
            pass
    except Exception as error_message:
        timestamp_filename = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S.log')
        # Create the log directory on demand so that logging cannot itself fail
        # with FileNotFoundError.
        os.makedirs('error_logs', exist_ok=True)
        with open('error_logs/{}'.format(timestamp_filename), 'a') as file_pointer:
            # Newline-separated entries: several failures within the same
            # second are appended to the same log file.
            file_pointer.write('Problem with file `{}`.\n'.format(filename))
            file_pointer.write('{}\n'.format(error_message))
            file_pointer.write(traceback.format_exc())

We are going to load the results from each of the CNBDM simulations and compute the Pearson Categorical Divergence Statistics (usually called Pearson $\chi^2$ statistics, but that is a confusing/misleading name, especially in this context). We also compute the approximate $p$-values obtained by assuming that the sampling distribution of the Pearson Categorical Divergence Statistic under the null multinomial distribution is approximately $\chi^2$-distributed (hint: in this case it's not).

In [5]:
%%time
try:
    pool = mp.Pool(35)
    # fairly clear argument on why chunksize = 1 makes sense for tasks with super small input
    # in this case a filename, and really slow tasks, https://stackoverflow.com/a/56337662/10634604
    results = pool.map(abuse_globals_to_get_chi2, filenames, chunksize=1)
    pool.close()
    pool.join()
except Exception as error_message:
    timestamp_filename = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S.log')
    with open('error_logs/{}'.format(timestamp_filename), 'a') as file_pointer:
        file_pointer.write(str(error_message))
        file_pointer.write(traceback.format_exc())

CPU times: user 2.49 s, sys: 7.34 s, total: 9.83 s
Wall time: 3h 54min 48s


Then we save the results to disk, so that we can load the Pearson Categorical Divergences later and compute their Monte Carlo approximate $p$-values from an empirical sample of 1 billion draws from the sampling distribution of the Pearson Categorical Divergence Statistic under the null multinomial distribution.

In [6]:
# Each entry of `results` is unpacked as a (divergence, p-value) pair; transpose
# the list of pairs into one sequence per quantity.
divergence_column, pvalue_column = zip(*results)
pearson_categorical_divergences = np.asarray(divergence_column)
chi2_approx_pvals = np.asarray(pvalue_column)

# Store both arrays, keyed by name, in a single compressed archive.
arrays_to_save = {
    'pearson_categorical_divergences': pearson_categorical_divergences,
    'chi2_approx_pvals': chi2_approx_pvals,
}
np.savez_compressed('categorical_divergences_and_chi2_approx_pvals.npz',
                    **arrays_to_save)

## Get Monte Carlo p values

In [7]:
from statsmodels.distributions.empirical_distribution import ECDF

Load our simulated empirical distribution of the Pearson categorical divergence statistic under the null multinomial distribution. (So basically a "Monte Carlo approximation to an exact Multinomial test for Pearson's $\chi^2$ statistic", using more standard terminology.)

In [8]:
# Read the simulated null-distribution statistics into memory, then close the
# archive instead of leaving its file handle open for the rest of the session.
with np.load('../monte_carlo_results/complete_chi2_simulation.npz') as mc_npzfile:
    monte_carlo_vals = mc_npzfile['chi2_stats']
# Display the shape as a quick sanity check on the number of draws loaded.
monte_carlo_vals.shape

(1000000000,)

In [9]:
# Build the empirical CDF of the Monte Carlo statistics once, so that it can be
# evaluated at the observed statistics afterwards.
monte_carlo_ecdf = ECDF(monte_carlo_vals)

The $p$-values correspond to the survival function, i.e. $1 - \mathrm{CDF}$: the probability of a value _more_ extreme than the one observed (whereas the CDF gives the probability of a value no more extreme than it).

In [10]:
# Empirical survival function: 1 - ECDF, evaluated at each observed statistic.
# NOTE(review): with statsmodels' default ECDF `side` this is presumably the
# fraction of Monte Carlo draws strictly greater than the observed value, so
# ties are excluded and a p-value of exactly 0 is possible -- confirm this is
# the intended convention.
monte_carlo_pvals = 1. - monte_carlo_ecdf(pearson_categorical_divergences)

For easy/quick reference/use later

In [11]:
# Persist the Monte Carlo p-values in a compressed archive under the key
# 'monte_carlo_pvals'.
np.savez_compressed('monte_carlo_pvals.npz', monte_carlo_pvals=monte_carlo_pvals)