In [1]:
from glob import glob

# Every simulation archive under npzfiles/, in lexicographic order.
# (`sorted` already returns a new list, so no intermediate `list(...)` is needed.)
filenames = sorted(glob('npzfiles/*.npz'))

In [2]:
import numpy as np
from simulations.concentrations.MLE import get_DM_score_function

from scipy.optimize import root_scalar, fsolve

In [3]:
import re
from zipfile import BadZipfile

# Three abundance levels, each scaled by 1x, 2x and 5x, with every resulting
# value repeated ten times (3 * 3 * 10 = 90 entries).
base_relative_abundances = [1e-4, 1e-3, 1e-2]
abundance_multipliers = (1, 2, 5)
repeats_per_abundance = 10

relative_abundances = [
    base * multiplier
    for base in base_relative_abundances
    for multiplier in abundance_multipliers
    for _ in range(repeats_per_abundance)
]

# One final entry takes whatever mass is left so that the vector sums to 1.
relative_abundances.append(1 - sum(relative_abundances))
frequencies = np.array(relative_abundances)

# Simulation identifier: the run of digits that sits right before ".npz" in
# the first file name (raises IndexError if the name does not match).
sim_num = re.findall(r'.*\.([0-9]+)\.npz', filenames[0])[0]

# Open the first simulation archive and pull out its 'droplets' array.
npzfile = np.load(filenames[0])
droplets = npzfile['droplets']


In [4]:
np.max(droplets)

942

^ OK, so that explains why the code is messed up/doesn't work well -- the process of trying to compute the weights creates a dictionary where each entry is a huge boolean array, one for each observed strain-specific count. I was thinking that `strain_specific_M` would be at most like 10, but if it's 942, as in this case -- well, obviously it works out poorly.

Like, imagine trying to store $942$ boolean arrays of size 15 million by 91 or whatever in memory -- of course memory usage would blow up (which is what looked like a memory leak)...

In [5]:
# Upper end of the root-search interval; its reciprocal serves as the lower end.
max_guess_value = 10_000
# Number of rows in one "small" batch.
small_val_size = 10_000

number_droplets, number_strains = droplets.shape

# Only complete batches are counted; leftover rows at the end are not used.
small_val_iterations = number_droplets // small_val_size
small_val_results = np.zeros(small_val_iterations, dtype=float)

In [6]:
# Take one small batch (index 18) to experiment with.
batch_index = 18
batch_start = batch_index * small_val_size
batch = droplets[batch_start:batch_start + small_val_size, :]

# Score function for this batch; its root is what the solvers below look for.
score_function = get_DM_score_function(batch, frequencies)

In [7]:
from simulations.concentrations import get_plugin_categorical_concentration
# Plug-in estimate for this batch; used below as the starting point (x0)
# handed to the root finder.
guess_value = get_plugin_categorical_concentration(batch)
guess_value

0.28940260440232574

In [8]:
score_function(0.0001)

97660422.36644673

In [9]:
score_function(10000)

-8.178975656747411e-06

In [10]:
# The score is positive at 1/max_guess_value (= 0.0001) and negative at
# max_guess_value (= 10000), as the two evaluations above show, so this
# interval brackets a sign change and a bracketing solver can be used.
result = root_scalar(f=score_function, x0=guess_value, bracket=(1./max_guess_value,max_guess_value))
result

      converged: True
           flag: 'converged'
 function_calls: 27
     iterations: 26
           root: 0.2597639822868309

In [11]:
result.root

0.2597639822868309

In [12]:
import re

from zipfile import BadZipfile

import numpy as np
from simulations.concentrations.MLE import get_DM_score_function
from simulations.concentrations import get_plugin_categorical_concentration

from scipy.optimize import root_scalar, fsolve

# Keys that a finished results archive must contain (exactly these, no more).
_RESULT_KEYS = ("small_batches", "medium_batches", "whole_sim")
# Where the per-simulation results archive is written / looked up.
_RESULTS_TEMPLATE = 'concentration_ML_estimates/simulation.{}.compositional.npz'


def _has_complete_results(results_path):
    """Return True if `results_path` is a readable npz holding exactly `_RESULT_KEYS`.

    A missing file, a corrupt zip, or an archive with a different set of keys
    all count as "not done yet". An explicit comparison is used instead of
    `assert` so the check keeps working when Python runs with `-O`, and the
    archive handle is closed as soon as the keys have been read.
    """
    try:
        with np.load(results_path) as cached:
            return set(cached.files) == set(_RESULT_KEYS)
    except (BadZipfile, FileNotFoundError):
        return False


def _solve_score_root(score_function, guess_value, max_guess_value):
    """Locate a root of `score_function` and return it as a non-negative float.

    The search interval is (1/max_guess_value, max_guess_value).

    * If the score has opposite signs at the two ends, `root_scalar` is run
      with that bracket.
    * Otherwise scipy refuses a bracketing method, so `fsolve` is started from
      `guess_value`. When `fsolve` reports failure (exit code != 1) AND its
      final iterate is beyond `max_guess_value`, the root is taken to be at
      infinity and `np.inf` is returned; in every other case the final iterate
      is used as-is, converged or not.

    Negative values are nonsensical for this quantity and are clipped to 0.
    """
    # A starting point outside the interval is pulled back to its midpoint.
    if guess_value >= max_guess_value:
        guess_value = max_guess_value / 2.
    lower_bound = 1. / max_guess_value

    same_sign_at_ends = (
        (score_function(lower_bound) > 0) == (score_function(max_guess_value) > 0)
    )
    if same_sign_at_ends:
        solution, _info, exit_code, _message = fsolve(
            func=score_function, x0=guess_value, full_output=True)
        # `fsolve` returns a 1-element array; unwrap it so the caller always
        # receives a plain float rather than an array.
        estimate = float(solution[0])
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.fsolve.html
        if exit_code != 1 and estimate > max_guess_value:
            estimate = np.inf
    else:
        bracketed = root_scalar(f=score_function, x0=guess_value,
                                bracket=(lower_bound, max_guess_value))
        estimate = float(bracketed.root)

    return max(estimate, 0.)


def _estimate_concentration(batch, frequencies, max_guess_value):
    """Build the score function for `batch` and solve it, seeded by the plug-in estimate."""
    score_function = get_DM_score_function(batch, frequencies)
    # Recomputing the plug-in estimate here is slow; reading previously saved
    # values from disk would be cheaper, but this keeps the function self-contained.
    guess_value = get_plugin_categorical_concentration(batch)
    return _solve_score_root(score_function, guess_value, max_guess_value)


def _estimate_in_batches(droplets, frequencies, batch_size, label, max_guess_value):
    """Return one estimate per consecutive, complete block of `batch_size` rows.

    Rows left over after the last complete block are ignored. `label` only
    affects the progress message printed for each block.
    """
    number_batches = droplets.shape[0] // batch_size
    estimates = np.zeros(number_batches)
    for iter_num in range(number_batches):
        print("{} batch #{}".format(label, iter_num + 1))
        batch = droplets[iter_num * batch_size:(iter_num + 1) * batch_size, :]
        estimates[iter_num] = _estimate_concentration(batch, frequencies, max_guess_value)
    return estimates


def get_DM_MLE(filename):
    """Compute and cache concentration estimates for one simulation archive.

    The simulation number is the run of digits right before ".npz" in
    `filename`. If 'concentration_ML_estimates/simulation.<num>.compositional.npz'
    already exists and holds exactly the keys "small_batches",
    "medium_batches" and "whole_sim", nothing is recomputed.

    Otherwise the 'droplets' array is read from `filename` and a root of the
    score function is found for
      * every complete block of 10000 rows   -> "small_batches",
      * every complete block of 500000 rows  -> "medium_batches",
      * the whole array                      -> "whole_sim" (length-1 array),
    and the three arrays are written to the results archive. The output
    directory is created when it does not exist yet.

    Parameters
    ----------
    filename : str
        Path to an npz archive containing a 2-D 'droplets' array.

    Returns
    -------
    int
        Always 0, so that `map`/`starmap` have a value to collect.

    Raises
    ------
    IndexError
        If `filename` does not end in ".<digits>.npz".
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    sim_num = re.findall(r'.*\.([0-9]+)\.npz', filename)[0]
    results_path = _RESULTS_TEMPLATE.format(sim_num)

    if not _has_complete_results(results_path):
        base_relative_abundances = [1e-4, 1e-3, 1e-2]
        relative_abundances = [relative_abundance * number
                               for relative_abundance in base_relative_abundances
                               for number in (1, 2, 5)
                               for _ in range(10)]
        # The last entry absorbs the remaining mass so the vector sums to 1.
        relative_abundances += [1 - sum(relative_abundances)]
        frequencies = np.array(relative_abundances)

        # The array is fully read inside the `with`, so the archive can be
        # closed before the (long) computation starts.
        with np.load(filename) as npzfile:
            droplets = npzfile['droplets']

        max_guess_value = 10000

        print("beginning small batches")
        small_val_results = _estimate_in_batches(
            droplets, frequencies, 10000, "small", max_guess_value)

        print("beginning medium batches")
        med_val_results = _estimate_in_batches(
            droplets, frequencies, 500000, "medium", max_guess_value)

        print("beginning whole batch")
        whole_sim_results = np.array(
            [_estimate_concentration(droplets, frequencies, max_guess_value)])

        results = {"small_batches": small_val_results,
                   "medium_batches": med_val_results,
                   "whole_sim": whole_sim_results}
        # Make sure the target directory exists so the finished computation
        # is not lost to a FileNotFoundError at the very last step.
        os.makedirs(os.path.dirname(results_path), exist_ok=True)
        np.savez_compressed(results_path, **results)

    # give map/starmap something to chew on
    return 0

In [13]:
get_DM_MLE(filenames[0])


0

In [14]:
npzfile = np.load('concentration_ML_estimates/simulation.001.compositional.npz')

In [15]:
npzfile['whole_sim']

array([0.23840612])

In [16]:
npzfile['medium_batches']

array([0.23592181, 0.23631105, 0.23843325, 0.23893931, 0.24396309,
       0.23973328, 0.23492486, 0.23879237, 0.23860313, 0.2418561 ,
       0.24000792, 0.23553467, 0.23662264, 0.237833  , 0.23748663,
       0.23936216, 0.23710567, 0.23920434, 0.23791687, 0.23909062,
       0.23958914, 0.23989035, 0.23860835, 0.2346297 , 0.23635453,
       0.24029144, 0.23928228, 0.23977637, 0.23839106, 0.2383543 ])

In [17]:
npzfile['small_batches']

array([0.21355107, 0.23044241, 0.23757938, ..., 0.20770413, 0.24851112,
       0.21896893])

## Example

In [18]:
score_function = get_DM_score_function(droplets, frequencies)

In [19]:
from scipy.optimize import fsolve

`fsolve` is a more general framework/wrapper, but I seem to get better results using more 'targeted' options (e.g. `root_scalar` with a bracket).

In [20]:
fsolve(score_function, 2, full_output=True)

(array([7.84515051e+13]),
 {'nfev': 121,
  'fjac': array([[-1.]]),
  'r': array([5.9387686e-32]),
  'qtf': array([3.70576914e-22]),
  'fvec': -3.705769144237564e-22},
 5,
 'The iteration is not making good progress, as measured by the \n  improvement from the last ten iterations.')

In [21]:
root_scalar(f=score_function, x0=2, bracket=(0.0001,10000))

      converged: True
           flag: 'converged'
 function_calls: 28
     iterations: 27
           root: 0.23840612199479064

In [22]:
result = root_scalar(f=score_function, x0=2, bracket=(0.0001,10000))

In [23]:
result.root

0.23840612199479064

In [24]:
float(result.root)

0.23840612199479064

In [25]:
float(result.root) == result.root

True

In [26]:
fsolve(score_function, 1, full_output=True)

(array([4.7868244e+13]),
 {'nfev': 121,
  'fjac': array([[-1.]]),
  'r': array([0.]),
  'qtf': array([9.52912066e-22]),
  'fvec': -9.529120656610879e-22},
 5,
 'The iteration is not making good progress, as measured by the \n  improvement from the last ten iterations.')

In [27]:
fsolve(score_function, 0.5, full_output=True)

(array([9.60615871e+13]),
 {'nfev': 124,
  'fjac': array([[-1.]]),
  'r': array([0.]),
  'qtf': array([1.58818678e-22]),
  'fvec': -1.5881867761018131e-22},
 5,
 'The iteration is not making good progress, as measured by the \n  improvement from the last ten iterations.')

In [28]:
fsolve(score_function, 1.5, full_output=True)

(array([1.9400321e+14]),
 {'nfev': 126,
  'fjac': array([[1.]]),
  'r': array([0.]),
  'qtf': array([-2.64697796e-23]),
  'fvec': -2.6469779601696886e-23},
 5,
 'The iteration is not making good progress, as measured by the \n  improvement from the last ten iterations.')

In [29]:
fsolve(score_function, 1.75, full_output=True)

(array([1.04371265e+14]),
 {'nfev': 123,
  'fjac': array([[-1.]]),
  'r': array([-5.67319893e-29]),
  'qtf': array([1.58818678e-22]),
  'fvec': -1.5881867761018131e-22},
 5,
 'The iteration is not making good progress, as measured by the \n  improvement from the last ten iterations.')

In [30]:
root_scalar(f=score_function, x0=1.75, bracket=(0.0001,10000))

      converged: True
           flag: 'converged'
 function_calls: 28
     iterations: 27
           root: 0.23840612199479064

In [31]:
fsolve(score_function, 1.6, full_output=True)

(array([9.29088973e+13]),
 {'nfev': 122,
  'fjac': array([[1.]]),
  'r': array([0.]),
  'qtf': array([-2.64697796e-22]),
  'fvec': -2.6469779601696886e-22},
 5,
 'The iteration is not making good progress, as measured by the \n  improvement from the last ten iterations.')

In [32]:
root_scalar(f=score_function, x0=1.6, bracket=(0.0001,10000))

      converged: True
           flag: 'converged'
 function_calls: 28
     iterations: 27
           root: 0.23840612199479064

In [33]:
fsolve(score_function, 1.55, full_output=True)

(array([5.89054998e+13]),
 {'nfev': 121,
  'fjac': array([[-1.]]),
  'r': array([-7.91777131e-32]),
  'qtf': array([6.3527471e-22]),
  'fvec': -6.352747104407253e-22},
 5,
 'The iteration is not making good progress, as measured by the \n  improvement from the last ten iterations.')

In [34]:
root_scalar(f=score_function, x0=1.55, bracket=(0.0001,10000))

      converged: True
           flag: 'converged'
 function_calls: 28
     iterations: 27
           root: 0.23840612199479064

In [35]:
fsolve(score_function, 1.5, full_output=True)

(array([1.9400321e+14]),
 {'nfev': 126,
  'fjac': array([[1.]]),
  'r': array([0.]),
  'qtf': array([-2.64697796e-23]),
  'fvec': -2.6469779601696886e-23},
 5,
 'The iteration is not making good progress, as measured by the \n  improvement from the last ten iterations.')

#### score function evaluated at some values

In [36]:
score_function(0.5)

-580011.5045608133

In [37]:
score_function(1)

-316397.2286059186

In [38]:
score_function(1.0010625)

-316009.0543329306

In [39]:
score_function(1.003125)

-315257.51304878294

In [40]:
score_function(1.00625)

-314123.7586482689

In [41]:
score_function(1.0125)

-311873.99563478306

In [42]:
score_function(1.025)

-307444.4941312745

In [43]:
score_function(1.05)

-298858.0084988624

In [44]:
score_function(1.1)

-282716.14265051484

In [45]:
score_function(1.2)

-254117.16782980412

In [46]:
score_function(2)

-128182.58794686198

In [47]:
score_function(200)

-53.10248191781284

## Does this really work in general?

In [48]:
get_DM_MLE(filenames[17])

0

In [49]:
npzfile = np.load('concentration_ML_estimates/simulation.018.compositional.npz')

In [50]:
npzfile['whole_sim']

array([0.23898442])

In [51]:
npzfile['medium_batches']

array([0.24057111, 0.24144063, 0.23588789, 0.24084832, 0.23695048,
       0.23572823, 0.23643363, 0.2422181 , 0.23788786, 0.23618437,
       0.24225318, 0.24059861, 0.24241563, 0.23512794, 0.23625867,
       0.24018598, 0.23920259, 0.23759581, 0.23793983, 0.23969552,
       0.2433667 , 0.24258299, 0.2398957 , 0.23620528, 0.24051825,
       0.23606208, 0.23798365, 0.24368035, 0.23657092, 0.23819963])

In [52]:
npzfile['small_batches']

array([0.21706371, 0.2549846 , 0.2149824 , ..., 0.24986367, 0.23579221,
       0.23774658])