In [1]:
import anndata
import os
import requests

# Download the example dataset once and cache it locally; later runs reuse the file.
save_path = "data/example_sce.h5ad"
if not os.path.exists(save_path):
    # open() below fails if the "data/" directory does not exist yet.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get("https://go.wisc.edu/69435h", timeout=60)
    # Fail loudly on HTTP errors rather than caching an error page as the .h5ad
    # file, which the existence check above would then reuse on every later run.
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)

example_sce = anndata.read_h5ad(save_path)
example_sce

AnnData object with n_obs × n_vars = 2087 × 100
    obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score', 'cell_type', 'sizeFactor', 'pseudotime'
    var: 'highly_variable_genes'
    uns: 'X_name', 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca'
    obsm: 'PCA', 'UMAP', 'X_pca', 'X_umap'
    layers: 'counts', 'cpm', 'logcounts', 'spliced', 'unspliced'
    obsp: 'connectivities', 'distances'

The memento model works directly from the matrix of transcript counts, so we don't need to keep track of the cell-level metadata that other models use as predictors. This implementation loads all of the data at once, but because it operates on sparse matrices it is still quite memory-efficient.


In [2]:
from scdesigner.experimental.estimators import MementoEstimator
from scdesigner.experimental.data import SparseMatrixLoader

# Fit the memento estimator on the example data, which is passed in through a
# SparseMatrixLoader created with batch_size=1000. The result `fit` is indexed
# by "mean" and "norm_cov" in the next cell.
# NOTE(review): q=0.01 is presumably memento's capture-efficiency parameter —
# confirm against the MementoEstimator documentation.
memento = MementoEstimator(q=0.01)
sml = SparseMatrixLoader(example_sce, batch_size=1000)
fit = memento.estimate(sml.loader)

In [19]:
# Preview the fit: the first five entries of "mean" and the top-left 5 x 5
# corner of "norm_cov", each rounded to one decimal place.
print(fit["mean"][:5].round(1))
print(fit["norm_cov"][:5, :5].round(1))

[554.5 382.3 503.5 517.2 410.2]
[[ 2.6 -0.1  0.1  0.5  1.1]
 [-0.1  0.4  0.   0.  -0. ]
 [ 0.1  0.   0.4 -0.   0. ]
 [ 0.5  0.  -0.   0.6  0.2]
 [ 1.1 -0.   0.   0.2  1. ]]


In [5]:
from scdesigner.experimental.samplers.memento import MementoSampler

# Build a sampler from the fitted parameters and draw a simulated matrix using
# the same loader that was used for estimation.
# NOTE(review): no random seed is set in the visible cells, so draws presumably
# differ between runs — confirm, and seed if reproducibility matters.
sampler = MementoSampler(fit)
y_sim = sampler.sample(sml.loader)
# Display the shape of the simulated matrix as the cell output.
y_sim.shape

(2087, 100)

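Here's an example of sampling a new dataset that is larger than the one we used for estimation. We first create a loader over a placeholder matrix with the number of rows (cells) we want in the output. Only its row count carries over: the sampled matrix below still has 100 columns, matching the fitted data.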

In [12]:
import numpy as np
import scipy.sparse

# Number of rows (cells) requested for the simulated dataset.
n_cells_sim = 10000

# Placeholder all-zero sparse matrix with the desired number of rows and a
# single column. csr_matrix((M, N)) builds the empty float64 matrix directly,
# without first allocating a dense array of zeros.
x_tmp = scipy.sparse.csr_matrix((n_cells_sim, 1))
dummy_data = anndata.AnnData(X=x_tmp)
dummy_loader = SparseMatrixLoader(dummy_data, batch_size=1000)

We can call our earlier sampler on this new dataset.

In [13]:
# Reuse the already-fitted sampler with the placeholder loader. This rebinds
# y_sim, replacing the matrix drawn in the earlier sampling cell.
y_sim = sampler.sample(dummy_loader.loader)
# Display the shape of the larger simulated matrix as the cell output.
y_sim.shape

(10000, 100)