In [None]:
from scdesigner.margins.marginal import NB
from scdesigner.simulator import scdesigner
import anndata
import numpy as np
import pandas as pd
import time

In [None]:
# Index of the current run. Later cells use it as the NumPy random seed, as the
# positional row looked up in the configurations table, and as the suffix of
# the timing CSV written at the end.
# NOTE(review): the row lookup is positional and 0-based, so config = 1 selects
# the second row of the configurations file — confirm this is intended.
config = 1

First, we read in the settings for the current run: the number of cells and genes to use, along with the replicate identifier.

In [None]:
# Seed NumPy's global RNG with the run index so the random subset drawn later
# is reproducible for a given configuration.
np.random.seed(config)

# Each row of the configurations table describes one run; columns are unpacked
# positionally as (number of cells, number of genes, replicate).
configurations = pd.read_csv("data/scalability_configurations.csv")
n_cell, n_gene, replicate = configurations.iloc[config].values

# The counts are cast to plain Python ints before being used as sample sizes.
n_cell, n_gene = int(n_cell), int(n_gene)

Next, we define a random sample of cells/genes for this run. The subset is saved into a temporary H5AD dataset on disk.

In [None]:

# Open the full dataset in backed mode and read its dimensions.
sce_full = anndata.read_h5ad("data/million_cells.h5ad", backed=True)
total_cell, total_gene = sce_full.shape

# Draw the cell/gene subset for this run without replacement. The draw uses
# NumPy's global RNG, which is seeded from `config` earlier in the notebook.
cell_ix = np.random.choice(total_cell, n_cell, replace=False)
gene_ix = np.random.choice(total_gene, n_gene, replace=False)

# Write the subset to a temporary H5AD file. The file name is keyed by `config`
# (like the timing CSV) so that runs for different configurations sharing a
# working directory do not overwrite each other's subset.
subset_path = f"subset_tmp_{config}.h5ad"
sce_full[cell_ix, gene_ix].copy(filename=subset_path)

# The full dataset is no longer needed; release its file handle explicitly.
sce_full.file.close()

# Re-open the subset in backed mode; `sce` is what the rest of the notebook uses.
sce = anndata.read_h5ad(subset_path, backed=True)
sce

We can now time our simulator.

In [None]:
# Time the simulator fit. perf_counter() is a monotonic, high-resolution clock
# meant for measuring intervals, so the result cannot be distorted by system
# clock adjustments (unlike the wall-clock time.time()).
start = time.perf_counter()
sim = scdesigner(sce, NB("~ cell_type + `CoVID-19 severity`"), multivariate=None, max_epochs=5, lr=1e-2)
# Elapsed time in seconds for constructing/fitting the simulator.
delta = time.perf_counter() - start

This is just a sanity check that the simulator result seems reasonable.

In [None]:
# Sanity check: predict from the first ten rows of the cell metadata and
# display the "mu" entry of the result (presumably the fitted means — confirm).
sim.predict(sce.obs.head(10))["mu"]

We now save the compute time required for this run.

In [None]:
# Record this run's problem size, replicate and elapsed time as a single-row
# table, written to a CSV whose name carries the run index.
pd.DataFrame(
    [
        {
            "n_gene": n_gene,
            "n_cell": n_cell,
            "replicate": replicate,
            "seconds": delta,
        }
    ]
).to_csv(f"scdesigner_timing_{config}.csv")


The two commented-out blocks below were used to save a subset of these data into a format that we can read from R. The default reading functions from the zellkonverter and anndata packages run out of memory for these data.

In [None]:
# Export of the first 500,000 cells x 20,000 genes of the full dataset:
# CSV files are written under data/million_cells and X is saved as a SciPy CSR
# sparse matrix (.npz).
# NOTE(review): left commented out, presumably so that it does not re-run with
# the rest of the notebook — confirm.
#
# from scipy import sparse

# sce = anndata.read_h5ad("data/million_cells.h5ad", backed=True)
# inmem_data = sce[:500000, :20000].to_memory()
# inmem_data.write_csvs("data/million_cells")
# sparse.save_npz("data/million_cells/X.npz", sparse.csr_matrix(inmem_data.X))

In [None]:
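# Convert the saved X.npz to a Matrix Market file (named X.mat) in R.
# The lines below are R code, not Python; run them in an R session.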
#
# library(reticulate)
# library(Matrix)
#scipy_sparse = import("scipy.sparse")
#X = scipy_sparse$load_npz("X.npz")
#writeMM(X, "X.mat")