### Decoupled Data Parsers

In [None]:
import anndata
import os
import requests

# Download the example dataset once and cache it on disk; later runs reuse the
# cached file instead of going back to the network.
save_path = "data/example_sce.h5ad"
if not os.path.exists(save_path):
    # open() does not create missing parent directories, so make sure the
    # target directory exists before writing.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    response = requests.get("https://go.wisc.edu/69435h", timeout=60)
    # Fail loudly on an HTTP error. Otherwise an error page would be cached
    # under the .h5ad name, and every later run would skip the download and
    # then fail to parse the file.
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)

example_sce = anndata.read_h5ad(save_path)
example_sce

In [None]:
from scdesigner.experimental.data import FormulaLoader

# One formula per parameter name: "mu" uses pseudotime, "alpha" uses "~ 1".
formulas = {"mu": "~ pseudotime", "alpha": "~ 1"}
dl = FormulaLoader(example_sce, formulas, batch_size=1000)

# Take the first batch from the loader, then print the loader's names and the
# batch contents.
batches = iter(dl.loader)
y, x = next(batches)
print(dl.names)
print(y, x)

### Generic Estimators

In [None]:
from scdesigner.experimental.estimators import NegativeBinomialML

# Rebuild the loader with the same formulas as before.
formulas = {"mu": "~ pseudotime", "alpha": "~ 1"}
dl = FormulaLoader(example_sce, formulas, batch_size=1000)

# Configure the estimator and fit it on the loader's batches.
settings = {"lr": 0.01, "max_epochs": 10}
ml = NegativeBinomialML(settings)
parameters = ml.estimate(dl.loader)

In [None]:
from scdesigner.experimental.samplers import NegativeBinomialSampler

# Wrap the fitted parameters in a sampler and call it on the current loader;
# the result is the cell's displayed value.
sampler = NegativeBinomialSampler(parameters)
loader = dl.loader
sampler.sample(loader)

A more realistic case is when the loader carries only covariate information, without the original gene count matrix Y that was used for training.

In [None]:
# Build the loader from example_sce.obs alone, then reuse the existing sampler
# on it.
formulas = {"mu": "~ pseudotime", "alpha": "~ 1"}
dl = FormulaLoader(example_sce.obs, formulas, batch_size=1000)
sampler.sample(dl.loader)

### Negative Controls

Here is a way of defining loaders with different covariates for different subsets of genes.

In [None]:
from scdesigner.experimental.data import CompositeFormulaLoader

# Split the columns into the first 20 and the remainder, copying each slice so
# that it is a standalone object.
n_first = 20
sc1 = example_sce[:, :n_first].copy()
sc2 = example_sce[:, n_first:].copy()

# One formula dict per subset, in the same order as the subsets.
formulas = [
    {"mu": "~ pseudotime", "alpha": "~ 1"},
    {"mu": "~ 1", "alpha": "~ 1"},
]
dl = CompositeFormulaLoader([sc1, sc2], formulas, batch_size=1000)

# Inspect the first batch of each sub-loader in turn.
for idx in range(2):
    y, x = next(iter(dl.loader[idx]))
    print(y.shape)
    print(x)

Now `dl.names` is a list of tuples, each pairing gene names with their regression parameters.

In [None]:
# Print the names attribute of the composite loader built in the previous cell.
print(dl.names)

Now we can just loop over estimators for each subset of genes. We could provide a list of estimators if we want different model families.

In [None]:
from scdesigner.experimental.estimators import CompositeEstimator, NegativeBinomialML

# Pass the estimator class and its settings to the composite wrapper, then fit
# on the composite loader.
settings = {"lr": 0.01, "max_epochs": 10}
ml = CompositeEstimator(NegativeBinomialML, settings)
parameters = ml.estimate(dl.loader)

Sampling similarly loops over loader elements.

In [None]:
from scdesigner.experimental.samplers import CompositeSampler, NegativeBinomialSampler

# Build the composite sampler from the fitted parameters and sample from the
# composite loader.
sampler = CompositeSampler(parameters, NegativeBinomialSampler)
samples = sampler.sample(dl.loader)

# Display the shape of each returned element.
[sample.shape for sample in samples]

We can remove the observed counts and only work with covariates.

In [None]:
# Rebuild the composite loader from the .obs tables only, using the same
# per-subset formulas as before.
formulas = [
    {"mu": "~ pseudotime", "alpha": "~ 1"},
    {"mu": "~ 1", "alpha": "~ 1"},
]
dl = CompositeFormulaLoader([sc1.obs, sc2.obs], formulas, batch_size=1000)

# Sample from the new loader and display the shape of each returned element.
sampler = CompositeSampler(parameters, NegativeBinomialSampler)
samples = sampler.sample(dl.loader)
[sample.shape for sample in samples]

We can also split genes in a dataset that is backed on disk. Note that each subset must be copied into its own file, because the loader needs genuine AnnData objects as input, not just views.

In [None]:
from scdesigner.experimental.data import BackedCompositeFormulaLoader

# Open the dataset in backed read-only mode.
million = anndata.read_h5ad("data/million_cells.h5ad", backed="r")

# Copy each column slice to its own file so the loader receives real objects
# rather than views.
first_genes = million[:, :100].copy("m1.h5ad")
next_genes = million[:, 100:1000].copy("m2.h5ad")

# One formula dict per subset, in the same order as the subsets.
formulas = [
    {"mu": "~ cell_type", "alpha": "~ 1"},
    {"mu": "~ 1", "alpha": "~ 1"},
]
dl = BackedCompositeFormulaLoader([first_genes, next_genes], formulas)

# Print the first batch from each sub-loader.
for idx in range(2):
    print(next(iter(dl.loader[idx])))