This notebook aims to demostrate the Poisson and ZI-Poisson simulators after refactoring. The code and structure of this notebook are mostly borrowed from Langtian's NB_scale_model branch.

In [1]:
import anndata
import os
import requests

save_path = "data/example_sce.h5ad"
if not os.path.exists(save_path):
    response = requests.get("https://go.wisc.edu/69435h")
    with open(save_path, "wb") as f:
        f.write(response.content)

example_sce = anndata.read_h5ad(save_path)
example_sce

AnnData object with n_obs × n_vars = 2087 × 100
    obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score', 'cell_type', 'sizeFactor', 'pseudotime'
    var: 'highly_variable_genes'
    uns: 'X_name', 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca'
    obsm: 'PCA', 'UMAP', 'X_pca', 'X_umap'
    layers: 'counts', 'cpm', 'logcounts', 'spliced', 'unspliced'
    obsp: 'connectivities', 'distances'

# refactored Poisson

In [2]:
from scdesigner.simulators import PoissonCopulaSimulator

Here I generated a 2000 x 20 dataset of Poisson distributions, with Poisson's lambda parameter (beta in the code) generated from a 2000 x 2 matrix (signifying the observed covariate values) and a 2 x 20 matrix (signifying the ground truth parameters of the regression model).

In [3]:
import numpy as np
import pandas as pd
from scipy.stats import poisson

n_sample, n_gene, n_feature1 = 2000, 20, 2
X1 = np.random.normal(size=(n_sample, n_feature1)) # covariates
ground_truth = np.random.normal(size=(n_feature1, n_gene)) # feature x gene
beta = np.exp(X1 @ ground_truth) # cell x gene

# generate samples
Y = poisson(beta).rvs()
obs = pd.DataFrame(X1, columns=[f"dim{j}" for j in range(n_feature1)]) # cell x feature
adata = anndata.AnnData(X=Y, obs=obs)
adata

AnnData object with n_obs × n_vars = 2000 × 20
    obs: 'dim0', 'dim1'

In [4]:
formula = "~ dim0 + dim1 - 1"
poisson = PoissonCopulaSimulator()
poisson.fit(adata, formula)

                                                           

In [5]:
print("Ground Truth beta:")
display(pd.DataFrame(ground_truth))
print("Estimated beta:")
display(poisson.params['coef_mean'])

Ground Truth beta:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-1.457637,0.473093,-2.375938,-0.715722,-0.668807,-0.301558,-0.572261,0.438207,1.091882,0.852627,0.752666,-0.202346,-0.773482,-0.436657,-0.852744,-0.105124,-0.702125,1.663763,-0.281323,0.367465
1,-0.597341,-0.026651,-1.342446,-0.507784,-0.93732,-0.778427,1.307672,0.657107,-0.398077,0.778041,-0.268499,0.613406,0.258946,-0.703515,-0.722973,-0.938651,1.262145,-1.253431,-0.962454,-0.021301


Estimated beta:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
dim0,-1.446899,0.486716,-2.371704,-0.69507,-0.65369,-0.310306,-0.553706,0.436296,1.085554,0.830238,0.762741,-0.182738,-0.759636,-0.465367,-0.858293,-0.069243,-0.727111,1.672704,-0.287866,0.397548
dim1,-0.601235,-0.046075,-1.34929,-0.513371,-0.966517,-0.773119,1.304932,0.634915,-0.424838,0.77965,-0.255339,0.639833,0.269473,-0.730367,-0.715157,-0.956658,1.263556,-1.262061,-0.969022,-0.019131


# Bernoulli


In [6]:
from scipy.stats import bernoulli
from scipy.special import expit

n_sample, n_gene, n_feature1 = 2000, 20, 2
X1 = np.random.normal(size=(n_sample, n_feature1)) # covariates
ground_truth = np.random.normal(size=(n_feature1, n_gene)) # feature x gene
beta = expit(X1 @ ground_truth) # cell x gene


# generate samples
Y = bernoulli(beta).rvs()
obs = pd.DataFrame(X1, columns=[f"dim{j}" for j in range(n_feature1)]) # cell x feature
adata = anndata.AnnData(X=Y, obs=obs)


In [7]:
from scdesigner.simulators import BernoulliCopulaSimulator
formula = "~ dim0 + dim1 - 1"
bsim = BernoulliCopulaSimulator()
bsim.fit(adata, formula)

                                                         

In [8]:
print("Ground Truth:")
display(pd.DataFrame(ground_truth))
print("Estimated:")
display(bsim.params['coef_mean'])

Ground Truth:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.504394,0.926479,-1.565664,-1.265588,-0.075571,-1.032423,-1.109722,0.271857,-0.916036,2.257657,-0.533166,-0.270232,0.297541,-1.507955,0.508505,0.341468,-0.349306,0.273927,0.428283,1.435191
1,2.500261,-1.061501,-1.297347,0.10275,-2.078918,0.649415,0.113081,0.367287,-0.303572,0.255016,0.47224,0.200023,-1.345099,0.172006,1.853965,0.240892,0.137056,1.560038,-0.745616,0.889678


Estimated:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
dim0,0.513092,0.98012,-1.658661,-1.42327,-0.065371,-1.05516,-1.028003,0.368019,-0.919763,2.38074,-0.543697,-0.277377,0.295623,-1.472326,0.483131,0.280151,-0.309569,0.288057,0.402537,1.374623
dim1,2.456957,-1.164703,-1.359284,0.035629,-2.06834,0.628883,0.0824,0.45458,-0.336684,0.262866,0.526108,0.192268,-1.360505,0.156137,1.810719,0.313814,0.1087,1.518964,-0.667818,0.811329


# refactored ZI-Poisson

In [9]:
from scdesigner.simulators import ZeroInflatedPoissonRegressionSimulator

I followed similar steps to generate a zero-inflated poisson distribution, allowing covariates for the zero inflation term.

In [10]:
from scipy.stats import poisson, bernoulli

n_sample, n_gene, n_feature1, n_feature2 = 2000, 20, 2, 3
X1 = np.random.normal(size=(n_sample, n_feature1)) # beta covariates
X2 = np.random.normal(size=(n_sample, n_feature2)) # zero-inflation covariates
gt_beta = np.random.normal(size=(n_feature1, n_gene))
gt_pi = np.random.normal(size=(n_feature2, n_gene))
beta = np.exp(X1 @ gt_beta)
pi = 1 / (1 + np.exp(-(X2 @ gt_pi)))

# generate samples
Y = poisson(beta).rvs() * bernoulli(1 - pi).rvs()
obs1 = pd.DataFrame(X1, columns=[f"beta_dim{j}" for j in range(n_feature1)])
obs2 = pd.DataFrame(X2, columns=[f"pi_dim{j}" for j in range(n_feature2)])
obs = pd.concat([obs1, obs2], axis=1)
adata = anndata.AnnData(X=Y, obs=obs)
adata

AnnData object with n_obs × n_vars = 2000 × 20
    obs: 'beta_dim0', 'beta_dim1', 'pi_dim0', 'pi_dim1', 'pi_dim2'

In [11]:
formula = {"mean": "~ beta_dim0 + beta_dim1 - 1",
           "zero_inflation": "~ pi_dim0 + pi_dim1 + pi_dim2 - 1"}
zip = ZeroInflatedPoissonRegressionSimulator()
zip.fit(adata, formula)

                                                          

In [12]:
print("Ground Truth beta:")
display(pd.DataFrame(gt_beta))
print("Estimated beta:")
display(zip.params['coef_mean'])

Ground Truth beta:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.510918,1.642521,2.115776,0.456013,-0.088574,1.316247,1.167165,0.881811,-1.57934,1.208175,0.358201,-0.893702,-0.671876,0.582032,-2.511639,-0.125473,-0.655063,-0.756941,-0.457193,-1.4794
1,1.05248,-2.127895,0.169288,-0.847558,1.983579,0.022686,-2.123333,0.546821,-1.092577,0.984889,0.617723,1.472595,0.187032,0.819149,0.042754,-0.589654,1.065755,-1.230484,-0.036307,-0.568566


Estimated beta:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
beta_dim0,0.548159,1.621609,2.147494,0.466696,-0.089456,1.338399,1.165035,0.872852,-1.58946,1.202023,0.312271,-0.891085,-0.660091,0.543929,-2.510422,-0.105151,-0.650828,-0.745797,-0.486733,-1.488417
beta_dim1,1.049919,-2.107595,0.16149,-0.850224,1.986335,0.043255,-2.126538,0.512441,-1.081964,0.976703,0.627628,1.468275,0.171117,0.829353,0.04829,-0.575008,1.055092,-1.227441,-0.05184,-0.575623


In [13]:
print("Ground Truth pi:")
display(pd.DataFrame(gt_pi))
print("Estimated pi:")
display(zip.params['coef_mean'])

Ground Truth pi:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.842094,-0.946681,-0.408833,0.043914,1.487723,-0.532516,-0.400892,-1.348202,-0.982995,0.412981,0.032069,0.048103,1.741215,-1.672278,2.317822,1.178275,-0.325793,0.391233,-0.682845,0.319233
1,2.149843,-0.646288,-0.232006,-0.756766,-0.596114,-0.402694,-1.094886,-0.263612,0.054232,-0.40315,-2.299095,-0.154318,-0.047272,1.897542,0.026819,0.935586,-0.4257,1.298174,-0.944543,0.504989
2,0.840365,0.693606,2.16415,-0.584365,1.413783,1.155645,0.049466,1.839261,-0.146558,0.022317,-0.005484,-1.273186,0.761514,0.605398,-0.104679,1.762535,-0.43722,-1.729797,1.902206,1.011109


Estimated pi:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
beta_dim0,0.548159,1.621609,2.147494,0.466696,-0.089456,1.338399,1.165035,0.872852,-1.58946,1.202023,0.312271,-0.891085,-0.660091,0.543929,-2.510422,-0.105151,-0.650828,-0.745797,-0.486733,-1.488417
beta_dim1,1.049919,-2.107595,0.16149,-0.850224,1.986335,0.043255,-2.126538,0.512441,-1.081964,0.976703,0.627628,1.468275,0.171117,0.829353,0.04829,-0.575008,1.055092,-1.227441,-0.05184,-0.575623


Some of the pi's are not well estimated.

# Single cell example

Here I also show an example of fitting ZI-Poisson on real single cell data.

In [14]:
from scdesigner.diagnose.plot import compare_umap

In [15]:
formula = {"mean": "~ bs(pseudotime, degree=5)",
           "zero_inflation": "~ bs(pseudotime, degree=5)"}
p = ZeroInflatedPoissonRegressionSimulator()
p.fit(example_sce, formula)

                                                          

In [16]:
samples = p.sample(example_sce.obs)
compare_umap(example_sce, samples, color="pseudotime")

# Notes

I think we can create a general function for reformatting the formula input (currently now it’s defined separately in the negbin estimator file, but other files can also benefit from it).

It would also be nice if the model.param keys and the keys of dictionary used to calculate the likelihood are consistent and if people can retrieve the model matrix from the model. it will also help the calculation of AIC/BIC.