This block keeps track of experiment parameters.

In [None]:
# Number of genes treated as differentially expressed: the genes with the
# largest absolute cell-type effect are picked below via nlargest(n_de_genes).
n_de_genes = 50
# Number of highly variable genes to keep; passed to
# sc.pp.highly_variable_genes during preprocessing.
n_top_genes = 200

In [None]:
import anndata
import urllib.request

# Download the .h5ad file (urlretrieve stores it in a temporary file when no
# target filename is given) and read it as an AnnData object.
url = "https://drive.google.com/uc?export=download&id=1aZpy_l9xDQm5s0g17ztuizAFNbp5PX4L"
path = urllib.request.urlretrieve(url)[0]
adata = anndata.read_h5ad(path)

First, we preprocess the data following `DE.Rmd` and keep only the B cells and regulatory T cells.

In [None]:
import scanpy as sc

# Preprocess data: select highly variable genes on a log1p-transformed copy,
# so that `adata` itself is not transformed.
log_counts = sc.pp.log1p(adata, copy=True)
sc.pp.highly_variable_genes(log_counts, n_top_genes=n_top_genes)
adata = adata[:, log_counts.var['highly_variable']]

# Extract B cells and regulatory T cells
selected_cells = adata.obs['phenoid'].isin(['b.cells', 'regulatory.t'])
# Indexing an AnnData returns a view; materialize it with .copy() before
# adding a column, otherwise the write to .obs forces an implicit copy and
# emits an ImplicitModificationWarning.
adata = adata[selected_cells].copy()
# Drop category levels that no longer occur after filtering, so `cell_type`
# carries exactly the levels present in the data.
adata.obs['cell_type'] = (
    adata.obs['phenoid'].astype('category').cat.remove_unused_categories()
)

Next, we define the original simulator and the synthetic control (null) simulator.

In [None]:
from scdesigner.simulator import scdesigner
from scdesigner.margins.marginal import NB
from scdesigner.transform import nullify

# Fit the simulator with an NB margin whose formula depends on cell type only.
sim = scdesigner(adata, NB("~ cell_type"))
means = sim.parameters("mu")

# Rank genes by the absolute value of the `cell_type[T.regulatory.t]` column
# of the "mu" parameters; the n_de_genes largest form the DE gene set.
t_effect = means["cell_type[T.regulatory.t]"].abs()
de_genes = set(t_effect.nlargest(n_de_genes).index)

# All remaining genes are handed to nullify together with the "cell_type"
# term to build the control simulator.
# NOTE(review): presumably this removes the cell-type effect for those genes -- confirm.
non_de_genes = list(adata.var_names.difference(de_genes))
null_sim = nullify(sim, "cell_type", non_de_genes)

Now we can run the power analysis using the same method described in `DE.Rmd`. The important part here is that we can sample from the control simulator. For now, we'll just test this with a single method.

In [None]:
# Differential expression methods to evaluate; each name is passed as the
# `method` argument of sc.tl.rank_genes_groups.
test_methods = ["wilcoxon"]
# Maps method name -> per-gene results table for the "regulatory.t" group.
q_values = {}

# Draw one dataset from the control simulator and log-transform a copy of it.
sim_data = sc.pp.log1p(null_sim.sample(), copy=True)

# Extract p-values and adjust for multiple testing, once per method. Looping
# over test_methods (instead of hard-coding "wilcoxon") keeps q_values in sync
# with the list that the downstream FDP/power loop iterates over.
for method in test_methods:
    sc.tl.rank_genes_groups(sim_data, 'cell_type', method=method)
    # Read the results right away: the next call overwrites them in sim_data.
    q_values[method] = sc.get.rank_genes_groups_df(sim_data, "regulatory.t")

The next cell computes the empirical false discovery proportion (FDP) and the power across a range of q-value thresholds.

In [None]:
import numpy as np
import pandas as pd

# q-value thresholds: 0.01, 0.02, ..., 0.10 followed by 0.2, 0.3, 0.4, 0.5.
# linspace fixes the number of points explicitly, and rounding removes float
# noise (e.g. 0.060000000000000005) that would otherwise leak into both the
# `<=` comparison and the index labels of the result tables.
target_fdr = np.round(
    np.concatenate([np.linspace(0.01, 0.10, 10), np.linspace(0.2, 0.5, 4)]),
    2,
)

# One row per threshold, one column per method; float dtype so the tables are
# numeric (NaN until filled) rather than object-typed.
fdp_mat = pd.DataFrame(index=target_fdr, columns=test_methods, dtype=float)
power_mat = pd.DataFrame(index=target_fdr, columns=test_methods, dtype=float)

for test in test_methods:
    curr_p = q_values[test]
    for fdr in target_fdr:
        # Genes called significant at this adjusted p-value threshold.
        discoveries = curr_p.loc[curr_p["pvals_adj"] <= fdr, "names"]
        n_discoveries = len(discoveries)
        # True positives: discoveries that belong to the DE gene set.
        tp = len(de_genes.intersection(discoveries))
        # FDP = false discoveries / all discoveries; defined as 0 when nothing
        # is discovered, to avoid dividing by zero.
        fdp_mat.loc[fdr, test] = (
            (n_discoveries - tp) / n_discoveries if n_discoveries else 0.0
        )
        # Power = fraction of the DE genes that were recovered.
        power_mat.loc[fdr, test] = tp / len(de_genes)