In [1]:
from glob import glob

# glob() returns matches in arbitrary order; sorting keeps indices such as
# filenames[0] stable between runs. sorted() already yields a list.
filenames = sorted(glob('npzfiles/*.npz'))

In [2]:
import numpy as np
from simulations.concentrations.MLE import get_DM_score_function

from scipy.optimize import root_scalar, fsolve

In [3]:
import re
from zipfile import BadZipfile

# Three base abundances, one per decade; each is scaled by 1, 2 and 5 below.
base_relative_abundances = [1e-4, 1e-3, 1e-2]

# Every (base, multiplier) pair appears 10 times: 3 * 3 * 10 = 90 entries.
relative_abundances = [relative_abundance * number
                       for relative_abundance
                       in base_relative_abundances
                       for number in (1, 2, 5)
                       for repeat in range(10)]

# A final entry takes whatever mass is left so the list sums to 1.
relative_abundances += [1 - sum(relative_abundances)]
frequencies = np.array(relative_abundances)

# Digits immediately before ".npz" in the first file name.
sim_num = re.findall(r'.*\.([0-9]+)\.npz', filenames[0])[0]

# Open the archive as a context manager so its file handle is closed as soon
# as the array has been read; the in-memory `droplets` array stays usable.
# After the transpose, axis 0 is unpacked as droplets and axis 1 as strains
# (see the later `droplets.shape` unpacking).
with np.load(filenames[0]) as npzfile:
    droplets = npzfile['sample_sizes'].T


In [4]:
np.max(droplets)

7

In [5]:
# Tunable constants: the bound used for the root-search bracket further down,
# and the number of droplets (rows) per small batch.
max_guess_value = 10000
small_val_size = 10000

(number_droplets, number_strains) = droplets.shape

# Floor division counts whole batches only; a trailing partial batch is ignored.
small_val_iterations = number_droplets // small_val_size

# One zero-initialised slot per small batch.
small_val_results = np.zeros(shape=small_val_iterations)

In [6]:
# Take small batch number 18 (0-based): rows [18 * size, 19 * size), all columns.
batch_start = 18 * small_val_size
batch = droplets[batch_start:batch_start + small_val_size, :]

# Score function built from this single batch and the known frequencies.
score_function = get_DM_score_function(batch, frequencies)

In [7]:
from simulations.concentrations import get_plugin_categorical_concentration
# Computed from the same `batch` the score function was built on.
# NOTE(review): judging by the helper's name this is a plug-in estimate of the
# concentration — confirm against its definition.
guess_value = get_plugin_categorical_concentration(batch)
guess_value

4.296417957920404

In [8]:
score_function(0.0001)

105737431.19639316

In [9]:
score_function(10000)

-1.0173600317742171e-07

In [10]:
# Bracketed root search for the zero of the score function.
# When `bracket` is supplied, `root_scalar` uses a bracketing method, and
# bracketing methods do not take a starting point, so no `x0` is passed and
# the method is named explicitly.
# The score must have opposite signs at the two bracket ends; for this batch
# it is positive at 1e-4 and negative at 1e4 (see the two cells above).
# Without a sign change brentq raises ValueError.
result = root_scalar(f=score_function,
                     method='brentq',
                     bracket=(1. / max_guess_value, max_guess_value))
result

      converged: True
           flag: 'converged'
 function_calls: 22
     iterations: 21
           root: 11.917356700752798

In [11]:
result.root

11.917356700752798

In [12]:
from analysis_utils.concentrations_MLE import get_DM_MLE

In [13]:
# Run the packaged estimator on the first simulation file, naming "sample_sizes"
# as the array to use.
# NOTE(review): presumably this is what writes the estimates archive loaded in
# the next cell — confirm against get_DM_MLE.
get_DM_MLE(filenames[0], "sample_sizes")


0

In [14]:
# Rebind `npzfile` (until now the raw simulation archive) to the saved estimates
# for simulation 001. It is kept open because the following cells read
# 'whole_sim', 'medium_batches' and 'small_batches' from it.
npzfile = np.load('concentration_ML_estimates/simulation.001.compositional.npz')

In [15]:
npzfile['whole_sim']

array([516.73282709])

In [16]:
npzfile['medium_batches']

array([           inf,            inf, 7.30162476e+01, 3.39161045e+02,
       8.36430875e+01, 9.95365691e+01,            inf, 4.23788645e+03,
       9.50128027e+01, 8.78253214e+02,            inf,            inf,
       4.85575955e+02,            inf, 1.34697076e+02,            inf,
       6.52615399e+01, 9.62728190e+01, 4.53917698e+01, 7.76616515e+10,
       1.15793190e+03, 8.59041842e+01,            inf,            inf,
                  inf, 1.28350615e+02,            inf, 0.00000000e+00,
       6.89646165e+01, 2.03302457e+02])

In [17]:
npzfile['small_batches']

array([           inf, 1.25681735e+01,            inf, ...,
                  inf, 2.31381190e+02, 2.30570308e+12])

## Example

In [18]:
# Rebuild the score function on the full `droplets` array rather than on a
# single batch; this replaces the batch-level `score_function` defined earlier.
score_function = get_DM_score_function(droplets, frequencies)

In [19]:
from scipy.optimize import fsolve

`fsolve` is a more general framework/wrapper, but we seem to get better results using more 'targeted' options (e.g. `root_scalar`).

In [20]:
# Solve from the starting guess 2; the cells below repeat this with other
# starting values (1, 0.5, 1.5, 1.75) to see whether the root found depends on it.
fsolve(score_function, 2, full_output=True)

(array([516.73282891]),
 {'nfev': 36,
  'fjac': array([[-1.]]),
  'r': array([4.31996076e-06]),
  'qtf': array([-3.27418093e-10]),
  'fvec': 0.0},
 1,
 'The solution converged.')

In [21]:
# NOTE(review): with `bracket` given and no `method`, SciPy selects a bracketing
# solver, which does not use `x0`; only the bracket should matter here —
# confirm against the `root_scalar` documentation.
root_scalar(f=score_function, x0=2, bracket=(0.000001,1000000))

      converged: True
           flag: 'converged'
 function_calls: 20
     iterations: 19
           root: 516.7328278430621

In [22]:
fsolve(score_function, 1, full_output=True)

(array([516.73282705]),
 {'nfev': 39,
  'fjac': array([[-1.]]),
  'r': array([4.32005953e-06]),
  'qtf': array([-4.29281499e-10]),
  'fvec': 0.0},
 1,
 'The solution converged.')

In [23]:
fsolve(score_function, 0.5, full_output=True)

(array([516.73282958]),
 {'nfev': 43,
  'fjac': array([[-1.]]),
  'r': array([3.90306789e-06]),
  'qtf': array([-2.18278728e-11]),
  'fvec': 0.0},
 1,
 'The solution converged.')

In [24]:
fsolve(score_function, 1.5, full_output=True)

(array([516.73282711]),
 {'nfev': 39,
  'fjac': array([[-1.]]),
  'r': array([8.67419388e-06]),
  'qtf': array([7.27595761e-12]),
  'fvec': 0.0},
 1,
 'The solution converged.')

In [25]:
fsolve(score_function, 1.75, full_output=True)

(array([516.73282741]),
 {'nfev': 38,
  'fjac': array([[-1.]]),
  'r': array([5.39308824e-06]),
  'qtf': array([7.27595761e-12]),
  'fvec': -7.275957614183426e-12},
 1,
 'The solution converged.')

In [26]:
# NOTE(review): this gives exactly the same root and function-call count as the
# earlier `x0=2` run, which is consistent with `x0` being ignored whenever
# `bracket` is supplied.
root_scalar(f=score_function, x0=1.75, bracket=(0.000001,1000000))

      converged: True
           flag: 'converged'
 function_calls: 20
     iterations: 19
           root: 516.7328278430621

#### score function evaluated at some values

In [27]:
score_function(0.5)

501713.9220613763

In [28]:
score_function(1)

89800.38233843818

In [29]:
score_function(2)

14911.596816314384

In [30]:
score_function(200)

0.02254055555386003

In [31]:
score_function(2000)

-0.00011345713573973626

In [32]:
score_function(20000)

-1.5040113794384524e-06

## Does this really work in general?

In [56]:
# Same estimator call as for the first file, now on the 18th file (index 17).
get_DM_MLE(filenames[17], "sample_sizes")

0

In [57]:
# Rebind `npzfile` to the saved estimates for simulation 018.
# NOTE(review): presumably the output for filenames[17] — confirm the numbering.
npzfile = np.load('concentration_ML_estimates/simulation.018.compositional.npz')

In [58]:
npzfile['whole_sim']

array([5.49790962e+10])

In [59]:
npzfile['medium_batches']

array([           inf, 1.37297613e+02,            inf, 3.59550503e+02,
       0.00000000e+00,            inf, 1.44956133e+02,            inf,
       8.96158657e+01, 8.57715529e+01, 0.00000000e+00,            inf,
       1.33210143e+12, 6.71730089e+02, 1.14303504e+12, 1.11053861e+02,
       4.25871259e+11, 7.29490928e+01,            inf, 7.01992844e+02,
                  inf,            inf, 1.15167369e+02, 1.27123324e+02,
                  inf, 2.68243936e+11, 1.27234571e+03,            inf,
                  inf, 1.72830468e+02])

In [60]:
npzfile['small_batches']

array([           inf, 8.21552552e+11,            inf, ...,
       1.07830623e+01, 6.67634277e+01, 8.47030922e+00])

Hmm, why is it $0$ for the whole batch? (Because `fsolve` is terrible and gives a negative "root", and the solver that isn't terrible, `root_scalar` with a bracket, requires the function to have opposite signs at the two ends of the bracket.)

In [61]:
# Switch the working data to the 18th simulation file (index 17).
# The archive is opened as a context manager so its file handle is closed as
# soon as the array has been read; `droplets` remains a valid in-memory array.
with np.load(filenames[17]) as npzfile:
    droplets = npzfile['sample_sizes'].T

# Score function over all droplets of this simulation; `frequencies` is reused
# unchanged from the setup cell.
score_function = get_DM_score_function(droplets, frequencies)

#### score function evaluated at some values

In [62]:
score_function(0.5)

502566.6229038462

In [63]:
score_function(1)

90152.2530817017

In [64]:
score_function(2)

15036.54386350885

In [65]:
score_function(200)

0.05804844376689289

In [66]:
score_function(2000)

0.00026503884328121785

In [67]:
score_function(20000)

2.3087261524779024e-06

In [68]:
score_function(200000)

2.2742028704669792e-08

In [69]:
score_function(2000000)

2.2707524749421282e-10

In [70]:
score_function(20000000)

2.270406085358445e-12

In [71]:
score_function(200000000)

2.2676305277968822e-14

In [72]:
score_function(2000000000)

2.2724877535296173e-16

In [73]:
score_function(20000000000)

2.168404344971009e-18

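OK yeah, this MLE really does seem to be basically exactly $\infty$: the score stays positive at every value tried above and shrinks by roughly a factor of $100$ each time the argument grows tenfold, so it only approaches zero from above. So why doesn't `fsolve` give that?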

In [74]:
# Start `fsolve` far out; the next two cells increase the starting guess tenfold
# each time to see where the reported "root" ends up.
fsolve(score_function, x0=200000)

array([8.50405281e+10])

In [75]:
fsolve(score_function, x0=2000000)

array([2.00693122e+11])

In [76]:
fsolve(score_function, x0=20000000)

array([3.89442521e+10])

In [77]:
get_plugin_categorical_concentration(droplets)

92315.60025907237

In [78]:
# Seed `fsolve` with the plug-in estimate computed from the current `droplets`,
# rather than a number pasted from a previous cell's output, so the starting
# guess cannot go stale when the data or the file index changes.
fsolve(score_function,
       x0=get_plugin_categorical_concentration(droplets),
       full_output=True)

(array([5.49790962e+10]),
 {'nfev': 81,
  'fjac': array([[-1.]]),
  'r': array([0.]),
  'qtf': array([-2.71050543e-19]),
  'fvec': 2.710505431213761e-19},
 1,
 'The solution converged.')

In [79]:
# Starting from 5000, `fsolve` reports convergence at a negative value (see the
# output below), even though every positive probe of the score above is positive.
fsolve(score_function, x0=5000, full_output=True)

(array([-9.35724194e+10]),
 {'nfev': 56,
  'fjac': array([[-1.]]),
  'r': array([-1.55515005e-22]),
  'qtf': array([-5.42101086e-20]),
  'fvec': 0.0},
 1,
 'The solution converged.')

^ OK, so the above is what happened when the maximum guess value was set to $10{,}000$. I really wish there were a way to force these root solvers to consider e.g. only non-negative solutions. At least the plugin estimator turned out to be a more useful initial guess.

Anyway, we know that something definitely went wrong whenever $0$ is returned as a value, so for now I guess we can just exclude those results from the analysis (pending a determination of how common such problems are in the first place), rather than rewriting the procedure for deciding whether to use `fsolve` or `root_scalar` and/or for determining `guess_value` and the proposed brackets for `root_scalar`.