In [1]:
from glob import glob

# Sorted list of every simulation archive; `sorted` accepts the glob result directly.
filenames = sorted(glob('npzfiles/*.npz'))


#batch_size = 10000

#batch = droplets[0:batch_size,:]

In [2]:
import numpy as np
from simulations.concentrations.MLE import get_DM_score_function
from simulations.concentrations import get_plugin_categorical_concentration

from scipy.optimize import root_scalar

def get_DM_MLE(filename):
    """Find the root of the DM score function for one simulation archive.

    The score function is built by `get_DM_score_function` from the
    ``'droplets'`` array stored in the ``.npz`` file and a fixed vector of
    91 species frequencies (90 small abundances plus one remainder entry so
    that the vector sums to 1). Its root inside the bracket
    ``(0.0001, 10000)`` is returned as the concentration estimate
    (presumably the Dirichlet-multinomial MLE -- inferred from the names).

    Parameters
    ----------
    filename : str or path-like
        Path to an ``.npz`` archive containing a ``'droplets'`` entry.

    Returns
    -------
    float
        The root located by `scipy.optimize.root_scalar` (a plain scalar,
        not the `RootResults` object).

    Raises
    ------
    ValueError
        Raised by the bracketing solver if the score function does not
        change sign between the two bracket endpoints.

    Warns
    -----
    RuntimeWarning
        If the solver reports that it did not converge; the last iterate is
        still returned.
    """
    import warnings

    base_relative_abundances = [1e-4, 1e-3, 1e-2]

    # Each base abundance is scaled by 1, 2 and 5, and every scaled value is
    # repeated for 10 species: 3 * 3 * 10 = 90 entries.
    relative_abundances = [base * multiplier
                           for base in base_relative_abundances
                           for multiplier in (1, 2, 5)
                           for _ in range(10)]

    # One final species takes whatever mass is left so the vector sums to 1.
    relative_abundances.append(1 - sum(relative_abundances))
    frequencies = np.array(relative_abundances)

    # Close the archive as soon as the array has been read out of it.
    with np.load(filename) as npzfile:
        droplets = npzfile['droplets']

    score_function = get_DM_score_function(droplets, frequencies)

    # With a bracket, root_scalar uses Brent's method, which never looks at
    # an initial guess `x0`. The slow plugin estimate that used to be computed
    # here as a starting value therefore had no effect and is omitted.
    result = root_scalar(f=score_function, method='brentq',
                         bracket=(0.0001, 10000))

    if not result.converged:
        warnings.warn(
            "root_scalar did not converge for {!r}: {}".format(filename,
                                                               result.flag),
            RuntimeWarning,
        )

    return result.root  # want result to be scalar, not `RootResults`

In [3]:
result = get_DM_MLE(filenames[0])
result

1.0023681401920805

In [4]:
from analysis_utils.concentrations_MLE import get_DM_MLE as get_DM_MLE_test

In [5]:
get_DM_MLE_test(filenames[0])

0

In [6]:
npzfile = np.load('concentration_ML_estimates/simulation.001.compositional.npz')

In [7]:
npzfile['whole_sim']

array([1.00236814])

In [8]:
npzfile['medium_batches']

array([1.02003509, 0.98730531, 0.98860759, 1.00630211, 1.01041494,
       0.98960614, 0.98444663, 1.00385073, 0.98008344, 1.00067249,
       1.00599713, 0.99745964, 1.00313882, 1.01786759, 0.97836378,
       1.01731972, 1.03188787, 1.00777903, 0.98569626, 1.00735286,
       0.99193475, 1.02334047, 1.05297176, 1.02796612, 0.96807831,
       1.03114951, 1.01000441, 0.97286933, 0.97971741, 1.00048466])

In [9]:
npzfile['small_batches']

array([0.89583724, 1.0700831 , 1.21351298, ..., 0.89227021, 1.10216439,
       1.21770383])

OK, so this seems to have mostly avoided the earlier problems (estimates on the order of $10^{13}$ or so, caused by `fsolve` numerical instability, the lack of a bracket, the possibly awkward shape of the score function, etc.), so this is probably good?

## Example

In [10]:
# Same frequency vector as in `get_DM_MLE`: every base abundance scaled by
# 1, 2 and 5, each scaled value repeated for 10 species (90 entries) ...
base_relative_abundances = [1e-4, 1e-3, 1e-2]

relative_abundances = [
    base * multiplier
    for base in base_relative_abundances
    for multiplier in (1, 2, 5)
    for _ in range(10)
]

# ... plus one last species holding the remaining mass so the total is 1.
relative_abundances.append(1 - sum(relative_abundances))
frequencies = np.array(relative_abundances)

# Build the score function for the first simulation archive.
npzfile = np.load(filenames[0])
droplets = npzfile['droplets']

score_function = get_DM_score_function(droplets, frequencies)

In [11]:
from scipy.optimize import fsolve

`fsolve` is a more general framework/wrapper, but we seem to get better results using more 'targeted' options such as `root_scalar`.

In [12]:
fsolve(score_function, 2, full_output=True)

(array([5.38201097e+13]),
 {'nfev': 108,
  'fjac': array([[-1.]]),
  'r': array([-1.66107607e-35]),
  'qtf': array([2.11758237e-22]),
  'fvec': 0.0},
 1,
 'The solution converged.')

In [13]:
root_scalar(f=score_function, x0=2, bracket=(0.0001,10000))

      converged: True
           flag: 'converged'
 function_calls: 26
     iterations: 25
           root: 1.0023681401920805

In [14]:
result = root_scalar(f=score_function, x0=2, bracket=(0.0001,10000))

In [15]:
result.root

1.0023681401920805

In [16]:
float(result.root)

1.0023681401920805

In [17]:
float(result.root) == result.root

True

In [18]:
fsolve(score_function, 1, full_output=True)

(array([1.00236814]),
 {'nfev': 6,
  'fjac': array([[-1.]]),
  'r': array([87633.45240267]),
  'qtf': array([-2.71946192e-07]),
  'fvec': 3.725290298461914e-09},
 1,
 'The solution converged.')

In [19]:
fsolve(score_function, 0.5, full_output=True)

(array([1.00236814]),
 {'nfev': 12,
  'fjac': array([[-1.]]),
  'r': array([87634.48509771]),
  'qtf': array([-0.00033464]),
  'fvec': 0.0},
 1,
 'The solution converged.')

In [20]:
fsolve(score_function, 1.5, full_output=True)

(array([1.00236814]),
 {'nfev': 12,
  'fjac': array([[-1.]]),
  'r': array([87634.35139601]),
  'qtf': array([0.0002916]),
  'fvec': 3.725290298461914e-09},
 1,
 'The solution converged.')

In [21]:
fsolve(score_function, 1.75, full_output=True)

(array([5.21585152e+13]),
 {'nfev': 118,
  'fjac': array([[1.]]),
  'r': array([0.]),
  'qtf': array([-1.05879118e-22]),
  'fvec': -1.0587911840678754e-22},
 5,
 'The iteration is not making good progress, as measured by the \n  improvement from the last ten iterations.')

In [22]:
root_scalar(f=score_function, x0=1.75, bracket=(0.0001,10000))

      converged: True
           flag: 'converged'
 function_calls: 26
     iterations: 25
           root: 1.0023681401920805

In [23]:
fsolve(score_function, 1.6, full_output=True)

(array([-19.31114052]),
 {'nfev': 9,
  'fjac': array([[-1.]]),
  'r': array([334.31966175]),
  'qtf': array([-2.72074249e-05]),
  'fvec': 1.1641532182693481e-09},
 1,
 'The solution converged.')

In [24]:
root_scalar(f=score_function, x0=1.6, bracket=(0.0001,10000))

      converged: True
           flag: 'converged'
 function_calls: 26
     iterations: 25
           root: 1.0023681401920805

In [25]:
fsolve(score_function, 1.55, full_output=True)

(array([1.07923004e+13]),
 {'nfev': 124,
  'fjac': array([[-1.]]),
  'r': array([-4.36868911e-29]),
  'qtf': array([2.11758237e-21]),
  'fvec': -2.117582368135751e-21},
 5,
 'The iteration is not making good progress, as measured by the \n  improvement from the last ten iterations.')

In [26]:
root_scalar(f=score_function, x0=1.55, bracket=(0.0001,10000))

      converged: True
           flag: 'converged'
 function_calls: 26
     iterations: 25
           root: 1.0023681401920805

In [27]:
fsolve(score_function, 1.5, full_output=True)

(array([1.00236814]),
 {'nfev': 12,
  'fjac': array([[-1.]]),
  'r': array([87634.35139601]),
  'qtf': array([0.0002916]),
  'fvec': 3.725290298461914e-09},
 1,
 'The solution converged.')

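OK, so for `fsolve` to work it seems it has to be initialized very close to the root? (Of the starting values tried here, $0.5$, $1$ and $1.5$ converge to it, while $1.55$, $1.6$, $1.75$ and $2$ end up at spurious values.) Which may or may not be useless, depending on what my plugin estimators can do to help...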

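Using a better root-finding algorithm via `root_scalar`, together with the bracket, does seem to help. Adding the bracket sort of seems to be "cheating", but on the other hand we know the root has to be greater than $0$, and $10{,}000$ is basically the same as $\infty$ for this problem, so I guess it's fine... (Note that the `root_scalar` outputs above are identical, including the number of function calls, for every `x0` tried, which suggests that the bracketing method does not use the starting value at all.)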

#### score function evaluated at some values

In [28]:
score_function(0.5)

246891.57295345515

In [29]:
score_function(1)

208.77755476534367

In [30]:
score_function(1.0010625)

114.7968273088336

In [31]:
score_function(1.003125)

-66.1993800252676

In [32]:
score_function(1.00625)

-336.8607820123434

In [33]:
score_function(1.0125)

-865.5136660858989

In [34]:
score_function(1.025)

-1874.0072534196079

In [35]:
score_function(1.05)

-3709.712809994817

In [36]:
score_function(1.1)

-6753.978840485215

In [37]:
score_function(1.2)

-10943.925206318498

In [38]:
score_function(2)

-14649.352417433634

In [39]:
score_function(200)

-7.264701052787132

it seems like the score function starts positive (asymptoting towards infinity perhaps as the values go towards 0), hits zero, goes negative, but then increases again, asymptoting towards zero (but never reaching in general?) from below

So the "second zero" "at infinity" of the score would correspond to a "local minimum" of the log likelihood, I think, i.e. we should be justified in disregarding it as a candidate _maximum_ likelihood estimate.

## Does this really work in general?

In [40]:
get_DM_MLE_test(filenames[17])

0

In [41]:
npzfile = np.load('concentration_ML_estimates/simulation.018.compositional.npz')

In [42]:
npzfile['whole_sim']

array([1.00110548])

In [43]:
npzfile['medium_batches']

array([0.99962166, 0.99854617, 1.00385132, 1.01783741, 0.99672208,
       0.95983835, 1.03279495, 0.99124399, 0.98601127, 1.02110405,
       1.0002341 , 1.00035244, 0.99896262, 1.011135  , 1.00671617,
       0.98530922, 1.00510208, 1.0130035 , 0.97343806, 0.97354178,
       1.02753314, 0.971739  , 0.99445856, 1.02418862, 1.01462571,
       1.00462135, 1.01273396, 1.02186666, 0.99337153, 1.00144819])

In [44]:
npzfile['small_batches']

array([1.12011325, 1.04160977, 0.84260323, ..., 1.20838555, 0.96055808,
       1.01696832])

I guess so it seems.

Also I could probably divide the score function by the number of species because it's a redundant factor and maybe leads to increased numerical instability, but I don't think it really matters, and the code is probably slightly easier to understand if it attempts to calculate the exact score, rather than up to a constant factor