In [None]:
from dataset import base_genes
import numpy as np
from scipy.stats import nbinom

In [None]:
# Simulation settings shared by the cells below.
n_cells = 10_000          # number of simulated cells (rows)
n_genes = 5               # number of base genes passed to base_genes
target_zero_prob = 0.8    # desired overall fraction of zero counts

In [112]:
# Simulate the base genes. The call returns the count matrix together with
# the mean (mu), dispersion (theta) and zero-inflation (pi) values it used.
# NOTE(review): the returned parameter arrays are presumably per-gene vectors
# (a later cell tiles them once per cell) -- confirm against dataset.base_genes.
x, mu_values, theta_values, pi_values = base_genes(
    n_cells=n_cells,
    n_genes=n_genes,
    target_zero_prob=target_zero_prob,
    mu_bot=1.0,
    mu_top=10,
    theta_bot=0.5,
    theta_top=2.0,
    seed=42,
)

Actual overall zero proportion: 0.8007


In [113]:
# Parameters of the extra "combined" gene: one value per cell, obtained as a
# weighted average across the base-gene counts in each row of x.

# Mean: random signed weights over the base genes.
np.random.seed(42)
mean_weights = np.random.uniform(-2, +2, n_genes)
mu = np.average(x, axis=1, weights=mean_weights)  # mean of ZINB
# Signed weights can produce non-positive averages; floor at 0.1 so the mean
# stays strictly positive.
mu = np.clip(mu, 0.1, None)

# Dispersion: an independent set of weights drawn under a different seed.
np.random.seed(11)
mean_dispersion = np.random.uniform(-2, +2, n_genes)
# Fix: weight by mean_dispersion. The previous code reused mean_weights here,
# which made theta identical to mu and left mean_dispersion unused.
theta = np.average(x, axis=1, weights=mean_dispersion)  # dispersion of ZINB
theta = np.clip(theta, 0.1, None)

In [114]:
# Draw zero-inflated negative-binomial counts for the combined gene.
# Neither random call below reseeds, so results depend on the random state
# left by the preceding cell.

# NB "number of successes" parameter, taken as the inverse of theta.
r = 1.0 / theta

# Success probability in scipy's nbinom(n, p) convention; with this choice
# the NB mean n * (1 - p) / p equals mu.
p = r / (r + mu)

# Probability that the plain NB already yields a zero.
p_nb_zero = nbinom.pmf(0, r, p)

# Extra zero-inflation needed so that pi + (1 - pi) * p_nb_zero hits the
# target; clipped to [0, 1] (where the NB alone exceeds the target, pi is 0).
pi = np.clip(
    (target_zero_prob - p_nb_zero) / (1 - p_nb_zero),
    0,
    1,
)

# One NB draw per cell.
nb_samples = nbinom.rvs(r, p, size=n_cells)

# Bernoulli(pi) indicator per cell; flagged cells are forced to zero in place.
zero_mask = np.random.binomial(n=1, p=pi, size=n_cells)
nb_samples[zero_mask.astype(bool)] = 0

# x_comb is the same array object as nb_samples (no copy is made).
x_comb = nb_samples
x_comb.shape

(10000,)

In [116]:
def tile_up(x):
    """Repeat ``x`` n_cells times along a new leading axis via np.tile."""
    return np.tile(x, (n_cells, 1))


# Re-bind each parameter array to its tiled version.
# NOTE: this cell is not idempotent -- running it again tiles the
# already-tiled arrays a second time.
mu_values, theta_values, pi_values = (
    tile_up(mu_values),
    tile_up(theta_values),
    tile_up(pi_values),
)

In [125]:
def stack_up(x, y):
    """Append ``y`` (reshaped to an (n_cells, 1) column) to the right of ``x``."""
    return np.hstack((x, y.reshape((n_cells, 1))))


# Each pair is (existing matrix, per-cell vector for the combined gene).
pairs = [(x, x_comb), (mu_values, mu), (theta_values, theta), (pi_values, pi)]

# NOTE: this cell is not idempotent -- every run appends one more column.
x, mu_values, theta_values, pi_values = [
    stack_up(matrix, column) for matrix, column in pairs
]