In [1]:
import numpy as np
npzfile = np.load('npzfiles/CPDM_results.001.npz')

In [32]:
# Build the reference vector of relative abundances (presumably one entry per
# strain): three base levels, each scaled by 1, 2 and 5, with ten entries at
# every resulting abundance, i.e. 3 * 3 * 10 = 90 entries.
base_relative_abundances = [1e-4, 1e-3, 1e-2]

relative_abundances = [
    base_abundance * multiplier
    for base_abundance in base_relative_abundances
    for multiplier in (1, 2, 5)
    for _ in range(10)
]

# A final 91st entry takes whatever mass is left so the vector sums to 1.
relative_abundances.append(1 - sum(relative_abundances))
frequencies = np.array(relative_abundances)

In [2]:
droplets = npzfile['droplets']

In [3]:
droplets.shape

(15000000, 91)

In [4]:
# Take the first 10,000 droplets (rows) as a small batch to experiment on.
test_batch = droplets[0:10000,...]
# Row sums: the total count in each droplet (presumably its number of cells).
counts = np.sum(test_batch, axis=1)
# Distinct totals observed in the batch; np.unique returns them sorted ascending.
cell_num_levels = np.unique(counts)
cell_num_levels

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [5]:
# Split the batch into one sub-array per observed total, in the same order as cell_num_levels.
stratified_droplets = [test_batch[counts == cell_num_level, :] for cell_num_level in cell_num_levels]

In [7]:
[stratum.shape for stratum in stratified_droplets]

[(1340, 91),
 (2704, 91),
 (2757, 91),
 (1772, 91),
 (883, 91),
 (372, 91),
 (119, 91),
 (36, 91),
 (12, 91),
 (4, 91),
 (1, 91)]

In [8]:
# Number of droplets outside the first stratum. For this batch the first level
# is 0, so this counts the non-empty droplets.
nonzero_droplets_count = np.sum([stratum.shape[0] for stratum in stratified_droplets[1:]])
nonzero_droplets_count

8660

In [9]:
10000 - 1340

8660

In [20]:
# Share of the non-empty droplets that falls into each stratum after the first.
emp_freq_weights = [stratum.shape[0]/nonzero_droplets_count for stratum in stratified_droplets[1:]]
emp_freq_weights

[0.31224018475750576,
 0.31836027713625864,
 0.2046189376443418,
 0.10196304849884527,
 0.04295612009237875,
 0.01374133949191686,
 0.004157043879907622,
 0.0013856812933025404,
 0.00046189376443418013,
 0.00011547344110854503]

In [21]:
# Column-wise mean count within each stratum after the first.
emp_means_strata = [np.mean(stratum, axis=0) for stratum in stratified_droplets[1:]]
# Divide each stratum's means by its total n to get per-stratum frequencies.
# NOTE: enumerate(..., 1) labels the strata 1, 2, 3, ... by position, which is
# only right when the observed levels are exactly 0, 1, 2, ... with no gaps
# (true for this batch, whose levels are 0..10).
emp_freqs_strata = [emp_means_stratum/n for n, emp_means_stratum in enumerate(emp_means_strata, 1)]

In [23]:
# Pool the per-stratum frequency estimates into one vector, weighting each
# stratum by its share of the non-empty droplets.
emp_freqs = np.sum([emp_freq_weight*emp_freqs_stratum 
        for emp_freq_weight, emp_freqs_stratum 
        in zip(emp_freq_weights, emp_freqs_strata)], axis=0)

In [24]:
def get_plugin_mult_cov(n, emp_freqs):
    """Return the multinomial covariance matrix implied by plug-in frequencies.

    The matrix has ``n * p_i * (1 - p_i)`` on the diagonal and
    ``-n * p_i * p_j`` off the diagonal, where ``p = emp_freqs``.

    Parameters
    ----------
    n : int or NumPy integer
        Number of draws (the total count of the stratum). Unsigned NumPy
        integers are handled correctly.
    emp_freqs : array-like, 1-D
        Plug-in frequency of each category; lists are accepted.

    Returns
    -------
    numpy.ndarray
        Square float array with one row/column per entry of ``emp_freqs``.
    """
    emp_freqs = np.asarray(emp_freqs, dtype=float)
    # Multiply first and negate the float result: negating ``n`` on its own
    # wraps around to a huge positive number when ``n`` is an unsigned integer.
    plugin_mult_cov = -(n * np.outer(emp_freqs, emp_freqs))
    np.fill_diagonal(plugin_mult_cov, n*emp_freqs*(1.-emp_freqs))
    return plugin_mult_cov

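There is no Dirichlet-multinomial over-dispersion for $n=1$ (a single draw has the same covariance under the multinomial and the Dirichlet-multinomial), so skip that level.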

In [36]:
# Number of droplets beyond the first two strata; for this batch (levels 0..10)
# those are the droplets with a total of 2 or more.
multi_cell_droplets_count = np.sum([stratum.shape[0] for stratum in stratified_droplets[2:]])
# Share of those droplets that falls into each such stratum.
emp_overdisp_weights = [stratum.shape[0]/multi_cell_droplets_count for stratum in stratified_droplets[2:]]
emp_overdisp_weights

[0.46289456010745467,
 0.29751511081262594,
 0.14825386165211552,
 0.062458025520483546,
 0.019979852249832102,
 0.006044325050369375,
 0.0020147750167897917,
 0.000671591672263264,
 0.000167897918065816]

In [26]:
# One plug-in multinomial covariance per observed level n >= 2. The levels are
# read from cell_num_levels instead of a hard-coded range(2, 11), which only
# fits a batch whose largest total happens to be 10. .tolist() hands plain
# Python numbers to get_plugin_mult_cov.
plugin_mult_covs = [get_plugin_mult_cov(n, emp_freqs)
                    for n in cell_num_levels.tolist() if n >= 2]

In [29]:
def get_emp_cov(droplets_stratum, frequencies):
    """Return the biased empirical covariance between the columns of a stratum.

    Parameters
    ----------
    droplets_stratum : 2-D numpy.ndarray
        Count matrix with one column per entry of ``frequencies``. The
        transposed layout is also accepted and is flipped back first.
    frequencies : numpy.ndarray
        Only ``frequencies.size`` is used, to decide which axis of
        ``droplets_stratum`` indexes the frequency entries.

    Returns
    -------
    numpy.ndarray
        ``np.cov`` over the frequency entries, normalised by the number of
        observations (``bias=True``).

    Raises
    ------
    AssertionError
        If neither axis of ``droplets_stratum`` has length
        ``frequencies.size``.
    """
    # Explicit checks instead of ``assert``/``except AssertionError``: assert
    # statements are stripped under ``python -O``, which would silently disable
    # both the transpose fallback and the shape validation.
    _, number_frequencies = droplets_stratum.shape
    if frequencies.size != number_frequencies:
        droplets_stratum = droplets_stratum.T  # just a view, so this is cheap
        _, number_frequencies = droplets_stratum.shape
        if frequencies.size != number_frequencies:
            # AssertionError is kept so existing callers see the same exception type.
            raise AssertionError(
                "neither axis of droplets_stratum has length frequencies.size"
            )

    # np.cov treats rows as variables, hence the transpose. rowvar=False is not
    # used because np.cov skips its internal transpose for single-row input.
    return np.cov(droplets_stratum.T, bias=True)

In [33]:
# Empirical covariance of every stratum beyond the first two (totals >= 2 for this batch).
emp_covs = [get_emp_cov(droplets_stratum, frequencies)
           for droplets_stratum in stratified_droplets[2:]]

In [34]:
len(emp_covs)

9

In [39]:
def get_emp_overdisp(emp_cov, plugin_mult_cov):
    """Entry-wise relative excess of the empirical covariance over the plug-in one.

    Computes ``(emp_cov - plugin_mult_cov) / plugin_mult_cov``. Wherever the
    plug-in covariance is exactly zero the divisor is replaced by 1, so those
    entries hold the raw difference instead of triggering a division by zero.
    """
    safe_divisor = np.where(plugin_mult_cov == 0, 1, plugin_mult_cov)
    return (emp_cov - plugin_mult_cov) / safe_divisor

In [40]:
# Relative over-dispersion matrix per stratum: empirical vs. plug-in multinomial
# covariance, paired by position in the two lists.
emp_overdisp_mats = [get_emp_overdisp(emp_cov, plugin_mult_cov)
                    for emp_cov, plugin_mult_cov
                    in zip(emp_covs, plugin_mult_covs)]

In [41]:
emp_overdisp_mats[0]

array([[ 0.5318672 ,  0.        , -1.        , ...,  1.90661464,
         2.2803652 ,  1.93509772],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-1.        ,  0.        , -1.        , ..., -1.        ,
        -1.        , -1.        ],
       ...,
       [ 1.90661464,  0.        , -1.        , ..., -0.05099508,
        -0.20445478,  0.01392072],
       [ 2.2803652 ,  0.        , -1.        , ..., -0.20445478,
         0.06047489, -0.31604295],
       [ 1.93509772,  0.        , -1.        , ...,  0.01392072,
        -0.31604295, -0.04033345]])

In [43]:
# Divide each stratum's over-dispersion by (n - 1) so the strata are comparable;
# presumably because the Dirichlet-multinomial excess grows in proportion to
# n - 1 (it vanishes at n = 1).
# NOTE: enumerate(..., 2) labels the matrices n = 2, 3, ... by position, which
# assumes the observed levels have no gaps.
adjusted_emp_overdisp_mats = [emp_overdisp_mat/(n-1) 
                              for n, emp_overdisp_mat 
                              in enumerate(emp_overdisp_mats, 2)]

In [45]:
# Combine the per-stratum matrices into one, weighting each stratum by its
# share of the droplets in emp_overdisp_weights.
adjusted_emp_overdisp_mat = np.sum([emp_overdisp_weight*adjusted_emp_overdisp_mat
                                   for emp_overdisp_weight, adjusted_emp_overdisp_mat
                                   in zip(emp_overdisp_weights, adjusted_emp_overdisp_mats)], axis=0)
adjusted_emp_overdisp_mat

array([[ 1.91344724e-01,  0.00000000e+00, -6.82078363e-01, ...,
         1.58605902e+00,  2.84944545e-01,  1.63728002e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-6.82078363e-01,  0.00000000e+00, -6.82078363e-01, ...,
        -6.82078363e-01, -6.82078363e-01, -6.82078363e-01],
       ...,
       [ 1.58605902e+00,  0.00000000e+00, -6.82078363e-01, ...,
        -5.29395526e-04,  6.32561922e-02,  2.71811682e-02],
       [ 2.84944545e-01,  0.00000000e+00, -6.82078363e-01, ...,
         6.32561922e-02,  5.81525598e-02, -1.24067331e-01],
       [ 1.63728002e+00,  0.00000000e+00, -6.82078363e-01, ...,
         2.71811682e-02, -1.24067331e-01, -2.10756688e-02]])

In [46]:
np.sum(adjusted_emp_overdisp_mat <= 0)

2442

In [49]:
val = np.mean(adjusted_emp_overdisp_mat)
val

0.08889837439729358

In [50]:
1. / val

11.248799618437612

In [51]:
(1. / val) - 1

10.248799618437612

In [52]:
# Invert val = 1 / (1 + K * zeta), with K = frequencies.size, to get the
# plug-in estimate of zeta.
plugin_zeta = ((1. / val) - 1)/frequencies.size
plugin_zeta

0.11262417163118255

This is quite far from the expected value of $1$, don't you think?

Still, this is arguably OK, since it over-estimates heterogeneity, which is the more conservative thing to do.

In [53]:
# Alternative: average only the strictly positive entries of the matrix, then
# apply the same inversion as for plugin_zeta.
alt_val = np.mean(adjusted_emp_overdisp_mat[adjusted_emp_overdisp_mat > 0])
alt_plugin_zeta = ((1. / alt_val) - 1)/frequencies.size
alt_plugin_zeta

0.0004041411438299987

Whoa, that is way, way worse -- shouldn't it be better?

No, because to make zeta larger (and thus better in this case), its reciprocal -- roughly the average of `adjusted_emp_overdisp_mat` -- needs to be smaller. Keeping only the positive entries just made that average larger.

In [57]:
def get_plugin_zeta(test_batch, frequencies):
    """Plug-in estimate of zeta from the unweighted mean over-dispersion.

    Droplets (rows of ``test_batch``) are grouped by their row sum ``n``.
    Empirical frequencies are pooled over every group with ``n >= 1``. For
    every group with ``n >= 2`` the empirical covariance is compared with the
    plug-in multinomial covariance, the relative excess is divided by
    ``n - 1``, and the groups are averaged with weights proportional to their
    number of droplets. The plain mean ``m`` of the resulting matrix is then
    inverted as ``(1 / m - 1) / frequencies.size``.

    Every group is paired with its actual row sum, so the batch does not need
    to contain empty droplets, nor every intermediate count. Relies on
    ``get_plugin_mult_cov``, ``get_emp_cov`` and ``get_emp_overdisp``.

    Parameters
    ----------
    test_batch : 2-D numpy.ndarray
        Count matrix, one row per droplet, one column per entry of
        ``frequencies``.
    frequencies : numpy.ndarray
        Reference frequency vector; only its ``size`` enters the estimate
        (and the shape check inside ``get_emp_cov``).

    Returns
    -------
    float
        The plug-in zeta estimate.
    """
    counts = np.sum(test_batch, axis=1)
    # .tolist() yields plain Python numbers, so ``n - 1`` and the negation in
    # get_plugin_mult_cov cannot wrap around for unsigned count dtypes.
    cell_num_levels = np.unique(counts).tolist()

    # (n, stratum) pairs for the non-empty droplets; carrying n along avoids
    # relabelling strata by list position.
    occupied_strata = [(n, test_batch[counts == n, :])
                       for n in cell_num_levels if n >= 1]

    nonzero_droplets_count = np.sum([stratum.shape[0]
                                     for _, stratum in occupied_strata])
    # Pooled frequencies: per-stratum mean / n, weighted by stratum share.
    emp_freqs = np.sum([(stratum.shape[0]/nonzero_droplets_count)
                        * (np.mean(stratum, axis=0)/n)
                        for n, stratum in occupied_strata], axis=0)

    # n = 1 carries no over-dispersion information, so only n >= 2 is used.
    multi_cell_strata = [(n, stratum)
                         for n, stratum in occupied_strata if n >= 2]
    multi_cell_droplets_count = np.sum([stratum.shape[0]
                                        for _, stratum in multi_cell_strata])

    weighted_overdisp_mats = []
    for n, stratum in multi_cell_strata:
        emp_overdisp_weight = stratum.shape[0]/multi_cell_droplets_count
        emp_overdisp_mat = get_emp_overdisp(get_emp_cov(stratum, frequencies),
                                            get_plugin_mult_cov(n, emp_freqs))
        weighted_overdisp_mats.append(emp_overdisp_weight
                                      * (emp_overdisp_mat/(n-1)))
    adjusted_emp_overdisp_mat = np.sum(weighted_overdisp_mats, axis=0)

    # mean is L2 projection onto space of matrices where all entries have same value
    plugin_zeta = ((1. / np.mean(adjusted_emp_overdisp_mat)) - 1)/frequencies.size
    return plugin_zeta

In [58]:
get_plugin_zeta(test_batch, frequencies)

0.11262417163118255

In [59]:
get_plugin_zeta(droplets[0:20000,:], frequencies)

0.18630815460288355

In [60]:
get_plugin_zeta(droplets[0:30000,:], frequencies)

0.11184831887804807

In [61]:
get_plugin_zeta(droplets[0:40000,:], frequencies)

0.14694108244148066

In [62]:
get_plugin_zeta(droplets[0:50000,:], frequencies)

0.14275609291093488

In [63]:
get_plugin_zeta(droplets[0:100000,:], frequencies)

-0.4608528111563568

In [64]:
get_plugin_zeta(droplets[0:200000,:], frequencies)

-0.17786879463194513

In [65]:
get_plugin_zeta(droplets[0:500000,:], frequencies)

-0.2589818246618423

In [66]:
get_plugin_zeta(droplets[0:1000000,:], frequencies)

-0.17805259690774225

In [67]:
get_plugin_zeta(droplets[0:2000000,:], frequencies)

-0.1987654735787599

In [68]:
get_plugin_zeta(droplets[0:5000000,:], frequencies)

-0.569726285551785

In [69]:
get_plugin_zeta(droplets[0:10000000,:], frequencies)

-2.994871085959598

In [70]:
get_plugin_zeta(droplets[0:15000000,:], frequencies)

6.69671911220051

Hate to be the bearer of bad news here, but this is pretty obvious: if this estimator is consistent (which it probably isn't -- seriously, 15 million droplets and it _still_ isn't close), it converges _really_ slowly.

In [72]:
np.sum(np.outer(emp_freqs, emp_freqs))

1.0

OK, maybe the problem here is treating all entries of the empirical over-dispersion matrix as if they were equally reliable, when in fact (especially in this case) most of them might be very unreliable. So let's semi-arbitrarily weight the contribution of each entry to the average by the product of the corresponding empirical frequencies, i.e. use a weighted average. The weights `np.outer(emp_freqs, emp_freqs)` sum to $1$ (checked in the cell above), so the weighted average is just the sum of the weighted entries.

In [73]:
adjusted_emp_overdisp_mat.shape

(91, 91)

In [74]:
np.outer(emp_freqs, emp_freqs).shape

(91, 91)

In [75]:
adjusted_emp_overdisp_mat

array([[ 1.91344724e-01,  0.00000000e+00, -6.82078363e-01, ...,
         1.58605902e+00,  2.84944545e-01,  1.63728002e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-6.82078363e-01,  0.00000000e+00, -6.82078363e-01, ...,
        -6.82078363e-01, -6.82078363e-01, -6.82078363e-01],
       ...,
       [ 1.58605902e+00,  0.00000000e+00, -6.82078363e-01, ...,
        -5.29395526e-04,  6.32561922e-02,  2.71811682e-02],
       [ 2.84944545e-01,  0.00000000e+00, -6.82078363e-01, ...,
         6.32561922e-02,  5.81525598e-02, -1.24067331e-01],
       [ 1.63728002e+00,  0.00000000e+00, -6.82078363e-01, ...,
         2.71811682e-02, -1.24067331e-01, -2.10756688e-02]])

In [76]:
adjusted_emp_overdisp_mat * np.outer(emp_freqs, emp_freqs)

array([[ 2.68057794e-09,  0.00000000e+00, -9.32228454e-09, ...,
         9.44026332e-06,  1.67418036e-06,  2.17596328e-05],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-9.32228454e-09,  0.00000000e+00, -9.09491174e-09, ...,
        -3.96072940e-06, -3.90977985e-06, -8.84380195e-06],
       ...,
       [ 9.44026332e-06,  0.00000000e+00, -3.96072940e-06, ...,
        -1.33874496e-06,  1.57905683e-04,  1.53479204e-04],
       [ 1.67418036e-06,  0.00000000e+00, -3.90977985e-06, ...,
         1.57905683e-04,  1.43298183e-04, -6.91537902e-04],
       [ 2.17596328e-05,  0.00000000e+00, -8.84380195e-06, ...,
         1.53479204e-04, -6.91537902e-04, -2.65721454e-04]])

In [77]:
np.sum(adjusted_emp_overdisp_mat * np.outer(emp_freqs, emp_freqs))

0.009593672628945674

In [78]:
new_val = np.sum(adjusted_emp_overdisp_mat * np.outer(emp_freqs, emp_freqs))
1. / new_val

104.23536831795124

In [79]:
((1. / new_val) - 1) / frequencies.size

1.134454596900563

HAHA LOL WUT? that actually worked? SERIOUSLY?????

this has to be a fluke or something...

In [80]:
def new_get_plugin_zeta(test_batch, frequencies):
    """Plug-in estimate of zeta from a frequency-weighted mean over-dispersion.

    Droplets (rows of ``test_batch``) are grouped by their row sum ``n``.
    Empirical frequencies are pooled over every group with ``n >= 1``. For
    every group with ``n >= 2`` the empirical covariance is compared with the
    plug-in multinomial covariance, the relative excess is divided by
    ``n - 1``, and the groups are averaged with weights proportional to their
    number of droplets. Entry ``(i, j)`` of the resulting matrix is weighted
    by ``emp_freqs[i] * emp_freqs[j]``, and the weighted sum ``s`` is
    inverted as ``(1 / s - 1) / frequencies.size``.

    Every group is paired with its actual row sum, so the batch does not need
    to contain empty droplets, nor every intermediate count. Relies on
    ``get_plugin_mult_cov``, ``get_emp_cov`` and ``get_emp_overdisp``.

    Parameters
    ----------
    test_batch : 2-D numpy.ndarray
        Count matrix, one row per droplet, one column per entry of
        ``frequencies``.
    frequencies : numpy.ndarray
        Reference frequency vector; only its ``size`` enters the estimate
        (and the shape check inside ``get_emp_cov``).

    Returns
    -------
    float
        The plug-in zeta estimate.
    """
    counts = np.sum(test_batch, axis=1)
    # .tolist() yields plain Python numbers, so ``n - 1`` and the negation in
    # get_plugin_mult_cov cannot wrap around for unsigned count dtypes.
    cell_num_levels = np.unique(counts).tolist()

    # (n, stratum) pairs for the non-empty droplets; carrying n along avoids
    # relabelling strata by list position.
    occupied_strata = [(n, test_batch[counts == n, :])
                       for n in cell_num_levels if n >= 1]

    nonzero_droplets_count = np.sum([stratum.shape[0]
                                     for _, stratum in occupied_strata])
    # Pooled frequencies: per-stratum mean / n, weighted by stratum share.
    # Every row of a stratum sums to n, so this vector sums to 1.
    emp_freqs = np.sum([(stratum.shape[0]/nonzero_droplets_count)
                        * (np.mean(stratum, axis=0)/n)
                        for n, stratum in occupied_strata], axis=0)

    # n = 1 carries no over-dispersion information, so only n >= 2 is used.
    multi_cell_strata = [(n, stratum)
                         for n, stratum in occupied_strata if n >= 2]
    multi_cell_droplets_count = np.sum([stratum.shape[0]
                                        for _, stratum in multi_cell_strata])

    weighted_overdisp_mats = []
    for n, stratum in multi_cell_strata:
        emp_overdisp_weight = stratum.shape[0]/multi_cell_droplets_count
        emp_overdisp_mat = get_emp_overdisp(get_emp_cov(stratum, frequencies),
                                            get_plugin_mult_cov(n, emp_freqs))
        weighted_overdisp_mats.append(emp_overdisp_weight
                                      * (emp_overdisp_mat/(n-1)))
    adjusted_emp_overdisp_mat = np.sum(weighted_overdisp_mats, axis=0)

    # mean is L2 projection onto space of matrices where all entries have same value
    # but use weighted mean since values estimated from more frequent strains/counts
    # are more reliable. honestly don't know how to mathematically justify
    # The weights outer(emp_freqs, emp_freqs) sum to 1, so the weighted mean is
    # simply the sum of the weighted entries.
    emp_freq_weighted_AEOM = adjusted_emp_overdisp_mat * np.outer(emp_freqs, emp_freqs)
    plugin_zeta = ((1. / np.sum(emp_freq_weighted_AEOM) ) - 1)/frequencies.size
    return plugin_zeta

In [81]:
new_get_plugin_zeta(droplets[0:10000,:], frequencies)

1.134454596900563

ok that has to be a fluke though

In [82]:
new_get_plugin_zeta(droplets[0:20000,:], frequencies)

1.0822722165639544

In [83]:
new_get_plugin_zeta(droplets[0:30000,:], frequencies)

1.1400827971528311

In [84]:
new_get_plugin_zeta(droplets[0:40000,:], frequencies)

1.2709309199361232

In [85]:
new_get_plugin_zeta(droplets[0:50000,:], frequencies)

1.1896302567166346

In [86]:
new_get_plugin_zeta(droplets[0:60000,:], frequencies)

1.099037207934122

In [87]:
new_get_plugin_zeta(droplets[0:70000,:], frequencies)

1.0417020636245855

In [88]:
new_get_plugin_zeta(droplets[0:80000,:], frequencies)

1.0664689724818865

In [89]:
new_get_plugin_zeta(droplets[0:90000,:], frequencies)

1.1100359280808836

In [90]:
new_get_plugin_zeta(droplets[0:100000,:], frequencies)

1.0843299756802531

In [91]:
new_get_plugin_zeta(droplets[0:110000,:], frequencies)

1.1025012386263806

In [92]:
new_get_plugin_zeta(droplets[0:120000,:], frequencies)

1.1215346883384962

In [93]:
new_get_plugin_zeta(droplets[0:130000,:], frequencies)

1.1126846654191904

In [94]:
new_get_plugin_zeta(droplets[0:140000,:], frequencies)

1.1000062751658402

In [95]:
new_get_plugin_zeta(droplets[0:150000,:], frequencies)

1.0964840616026892

In [96]:
new_get_plugin_zeta(droplets[0:160000,:], frequencies)

1.1044324316411567

In [97]:
new_get_plugin_zeta(droplets[0:170000,:], frequencies)

1.1245742792441331

In [98]:
new_get_plugin_zeta(droplets[0:180000,:], frequencies)

1.1011614608901856

In [99]:
new_get_plugin_zeta(droplets[0:190000,:], frequencies)

1.0736718448656837

In [100]:
new_get_plugin_zeta(droplets[0:200000,:], frequencies)

1.0661400875725224

In [101]:
new_get_plugin_zeta(droplets[0:210000,:], frequencies)

1.0597069259387144

In [102]:
new_get_plugin_zeta(droplets[0:220000,:], frequencies)

1.05862416830431

In [103]:
new_get_plugin_zeta(droplets[0:230000,:], frequencies)

1.0384137324689229

In [104]:
new_get_plugin_zeta(droplets[0:240000,:], frequencies)

1.0390706721361922

In [105]:
new_get_plugin_zeta(droplets[0:250000,:], frequencies)

1.0606553241739383

In [106]:
new_get_plugin_zeta(droplets[0:260000,:], frequencies)

1.0584646941252471

In [107]:
new_get_plugin_zeta(droplets[0:270000,:], frequencies)

1.0543655159286065

In [108]:
new_get_plugin_zeta(droplets[0:280000,:], frequencies)

1.0516140637626397

In [109]:
new_get_plugin_zeta(droplets[0:290000,:], frequencies)

1.0537099472424285

In [110]:
new_get_plugin_zeta(droplets[0:300000,:], frequencies)

1.0426646899384917

In [111]:
new_get_plugin_zeta(droplets[0:310000,:], frequencies)

1.0523309971845445

In [112]:
new_get_plugin_zeta(droplets[0:320000,:], frequencies)

1.0533969196483695

In [113]:
new_get_plugin_zeta(droplets[0:330000,:], frequencies)

1.0637941010651364

In [114]:
new_get_plugin_zeta(droplets[0:340000,:], frequencies)

1.0678763570197385

In [115]:
new_get_plugin_zeta(droplets[0:350000,:], frequencies)

1.0543655885214693

In [116]:
new_get_plugin_zeta(droplets[0:360000,:], frequencies)

1.061391883228527

In [117]:
new_get_plugin_zeta(droplets[0:370000,:], frequencies)

1.0605208858608948

In [118]:
new_get_plugin_zeta(droplets[0:380000,:], frequencies)

1.059278932538394

In [119]:
new_get_plugin_zeta(droplets[0:390000,:], frequencies)

1.0565545167930046

In [120]:
new_get_plugin_zeta(droplets[0:400000,:], frequencies)

1.0595881998916592

In [121]:
new_get_plugin_zeta(droplets[0:410000,:], frequencies)

1.06921679022446

In [122]:
new_get_plugin_zeta(droplets[0:420000,:], frequencies)

1.0610640932087763

In [123]:
new_get_plugin_zeta(droplets[0:430000,:], frequencies)

1.0548458919658756

In [124]:
new_get_plugin_zeta(droplets[0:440000,:], frequencies)

1.04543974432081

In [125]:
new_get_plugin_zeta(droplets[0:450000,:], frequencies)

1.0486746524273716

In [126]:
new_get_plugin_zeta(droplets[0:460000,:], frequencies)

1.0550265412382527

In [127]:
new_get_plugin_zeta(droplets[0:470000,:], frequencies)

1.058702002729458

In [128]:
new_get_plugin_zeta(droplets[0:480000,:], frequencies)

1.0503337318762345

In [129]:
new_get_plugin_zeta(droplets[0:490000,:], frequencies)

1.0499050091827637

In [130]:
new_get_plugin_zeta(droplets[0:500000,:], frequencies)

1.0447044788701565

In [131]:
new_get_plugin_zeta(droplets[0:600000,:], frequencies)

1.0169295407971528

In [132]:
new_get_plugin_zeta(droplets[0:700000,:], frequencies)

1.000643759370765

In [133]:
new_get_plugin_zeta(droplets[0:800000,:], frequencies)

1.0039641653694344

In [134]:
new_get_plugin_zeta(droplets[0:900000,:], frequencies)

1.0195658492858672

In [135]:
new_get_plugin_zeta(droplets[0:1000000,:], frequencies)

1.0247449104358712

In [136]:
new_get_plugin_zeta(droplets[0:2000000,:], frequencies)

1.0008244990602349

In [137]:
new_get_plugin_zeta(droplets[0:3000000,:], frequencies)

1.0068004695684536

In [138]:
new_get_plugin_zeta(droplets[0:4000000,:], frequencies)

1.0074033987731825

In [139]:
new_get_plugin_zeta(droplets[0:5000000,:], frequencies)

1.0021815908187313

In [140]:
new_get_plugin_zeta(droplets[0:6000000,:], frequencies)

0.9967495552654911

In [141]:
new_get_plugin_zeta(droplets[0:7000000,:], frequencies)

0.9974630496849535

In [142]:
new_get_plugin_zeta(droplets[0:8000000,:], frequencies)

0.9968618267597446

In [143]:
new_get_plugin_zeta(droplets[0:9000000,:], frequencies)

0.9972672131120466

In [144]:
new_get_plugin_zeta(droplets[0:10000000,:], frequencies)

0.9932189106107836

In [145]:
new_get_plugin_zeta(droplets[0:11000000,:], frequencies)

0.9939726543601701

In [146]:
new_get_plugin_zeta(droplets[0:12000000,:], frequencies)

0.9968590105838143

In [147]:
new_get_plugin_zeta(droplets[0:13000000,:], frequencies)

0.9966277654762601

In [148]:
new_get_plugin_zeta(droplets[0:14000000,:], frequencies)

0.9941224205527277

In [149]:
new_get_plugin_zeta(droplets, frequencies)

0.9945878639784164

OK, so wow, this actually _does_ seem to be the genuine article. I don't know how to mathematically _prove_ why it works (assuming it does in fact work), but it sure does _seem_ to work, doesn't it?

Anyway, from here, let's just run this on all 500 simulations, and then also, for each of the 500 simulations, run it on all contiguous subsets of 500,000 droplets (and maybe, going into overkill for the heck of it, also on all contiguous subsets of 10,000 droplets), and then plot those histograms. Hopefully we'll get good results!