This notebook aims to demonstrate the Poisson and ZI-Poisson simulators after refactoring. The code and structure of this notebook are mostly borrowed from Langtian's NB_scale_model branch.

In [1]:
import anndata
import os
import requests

# Download the example dataset once and cache it locally; later runs reuse the file.
save_path = "data/example_sce.h5ad"
if not os.path.exists(save_path):
    # open(..., "wb") does not create missing parent directories, so make "data/" first.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get("https://go.wisc.edu/69435h", timeout=60)
    # Fail loudly on HTTP errors so an error page is never cached as the .h5ad file
    # (a bad cached file would make every later run fail at read_h5ad).
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)

example_sce = anndata.read_h5ad(save_path)
example_sce

AnnData object with n_obs × n_vars = 2087 × 100
    obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score', 'cell_type', 'sizeFactor', 'pseudotime'
    var: 'highly_variable_genes'
    uns: 'X_name', 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca'
    obsm: 'PCA', 'UMAP', 'X_pca', 'X_umap'
    layers: 'counts', 'cpm', 'logcounts', 'spliced', 'unspliced'
    obsp: 'connectivities', 'distances'

# refactored Poisson

In [2]:
from scdesigner.simulators import PoissonCopulaSimulator

Here I simulate a 2000 x 20 count matrix whose entries are Poisson draws. The Poisson rate λ for each cell and gene (`beta` in the code) is the elementwise exponential of the product of a 2000 x 2 matrix (the observed covariate values) and a 2 x 20 matrix (the ground-truth coefficients of the regression model).

In [3]:
import numpy as np
import pandas as pd
from scipy.stats import poisson

n_sample, n_gene, n_feature1 = 2000, 20, 2
# Seeded generator: the simulated covariates, coefficients and counts are
# identical on every Restart & Run All.
rng = np.random.default_rng(0)
X1 = rng.normal(size=(n_sample, n_feature1)) # covariates
ground_truth = rng.normal(size=(n_feature1, n_gene)) # feature x gene
# Despite its name, `beta` holds the Poisson mean exp(X1 @ ground_truth), not coefficients.
beta = np.exp(X1 @ ground_truth) # cell x gene

# generate samples
# Pass the same generator to scipy so the draws do not depend on NumPy's global state.
Y = poisson(beta).rvs(random_state=rng)
obs = pd.DataFrame(X1, columns=[f"dim{j}" for j in range(n_feature1)]) # cell x feature
adata = anndata.AnnData(X=Y, obs=obs)
adata



AnnData object with n_obs × n_vars = 2000 × 20
    obs: 'dim0', 'dim1'

In [4]:
# Regress every gene on dim0 and dim1. The "- 1" term removes the intercept,
# matching the simulation, whose linear predictor X1 @ ground_truth has none.
formula = "~ dim0 + dim1 - 1"
# NOTE(review): this rebinds `poisson`, which an earlier cell imported from
# scipy.stats; later cells read `poisson.params`, so a rename has to be done
# in all of them together.
poisson = PoissonCopulaSimulator()
poisson.fit(adata, formula)

  return self.list[idx]
  return self.list[idx]
                                              

In [5]:
# Print each heading followed by its table: the simulated coefficients first,
# then the coefficients recovered by the fitted Poisson simulator.
for heading, table in [
    ("Ground Truth beta:", pd.DataFrame(ground_truth)),
    ("Estimated beta:", poisson.params['coef_beta']),
]:
    print(heading)
    display(table)

Ground Truth beta:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.567189,-2.139439,0.227551,0.495653,0.464119,1.254901,-0.642222,0.091224,0.201224,-0.790267,-0.085941,0.799198,0.002675,0.915683,1.336518,-0.810093,0.325774,-0.428516,0.881208,0.072121
1,0.42731,1.139253,0.649619,-0.792487,-1.989476,0.202778,-0.663714,1.424353,1.226727,0.63277,1.633072,0.543872,0.455047,0.677969,0.946682,-0.798502,-0.326428,-0.04062,-0.249897,-0.27052


Estimated beta:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
dim0,0.568801,-2.142003,0.223254,0.486679,0.457224,1.260305,-0.66914,0.101879,0.205732,-0.777761,-0.085783,0.809493,0.003615,0.918478,1.333978,-0.80497,0.339967,-0.410249,0.874604,0.052718
dim1,0.412215,1.142668,0.657235,-0.794752,-1.990061,0.188855,-0.643146,1.427034,1.217932,0.621117,1.63118,0.533245,0.435854,0.640304,0.958275,-0.776395,-0.312262,-0.043483,-0.272755,-0.291381


# Bernoulli


In [6]:
from scipy.stats import bernoulli
from scipy.special import expit

n_sample, n_gene, n_feature1 = 2000, 20, 2
# Seeded generator: the simulated covariates, coefficients and outcomes are
# identical on every Restart & Run All.
rng = np.random.default_rng(1)
X1 = rng.normal(size=(n_sample, n_feature1)) # covariates
ground_truth = rng.normal(size=(n_feature1, n_gene)) # feature x gene
# Despite its name, `beta` holds the success probability expit(X1 @ ground_truth).
beta = expit(X1 @ ground_truth) # cell x gene

# generate samples
# Pass the same generator to scipy so the draws do not depend on NumPy's global state.
Y = bernoulli(beta).rvs(random_state=rng)
obs = pd.DataFrame(X1, columns=[f"dim{j}" for j in range(n_feature1)]) # cell x feature
adata = anndata.AnnData(X=Y, obs=obs)




In [7]:
from scdesigner.simulators import BernoulliCopulaSimulator
# Regress every gene on dim0 and dim1. The "- 1" term removes the intercept,
# matching the simulation, whose linear predictor X1 @ ground_truth has none.
formula = "~ dim0 + dim1 - 1"
bsim = BernoulliCopulaSimulator()
# Fits on the `adata` built by the Bernoulli simulation cell (it overwrote the Poisson one).
bsim.fit(adata, formula)

  return self.list[idx]
                                                       

In [8]:
# Print each heading followed by its table: the simulated coefficients first,
# then the coefficients recovered by the fitted Bernoulli simulator.
for heading, table in [
    ("Ground Truth:", pd.DataFrame(ground_truth)),
    ("Estimated:", bsim.params['coef_mean']),
]:
    print(heading)
    display(table)

Ground Truth:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.220167,0.846249,-0.128015,1.162624,-0.219886,-0.790434,0.444649,1.366132,0.769091,-0.359039,-0.22495,0.350348,-2.423354,0.408112,1.558888,-0.4434,-0.116224,1.168938,-0.041745,-1.452476
1,0.463638,0.18053,-0.574917,1.631063,0.430283,0.497908,-1.019142,1.763753,-0.433977,-0.608735,-0.370417,0.70049,0.555582,-0.630534,-0.318239,1.073938,0.889436,1.656015,-0.083154,-1.597933


Estimated:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
dim0,0.237173,0.874,-0.089654,1.226347,-0.18526,-0.855331,0.469926,1.309541,0.763371,-0.341741,-0.226882,0.353803,-2.406856,0.317157,1.465979,-0.434034,-0.111993,1.117183,0.035376,-1.31322
dim1,0.487139,0.231188,-0.663253,1.512669,0.432835,0.465811,-1.102633,1.602246,-0.384465,-0.62583,-0.41491,0.639885,0.594112,-0.649678,-0.220109,1.080205,0.912056,1.508803,-0.076651,-1.565464


# refactored ZI-Poisson

In [25]:
from scdesigner.simulators import ZeroInflatedPoissonRegressionSimulator

I followed similar steps to generate zero-inflated Poisson data, with a separate set of covariates for the zero-inflation term.

In [26]:
# `poisson` was rebound to a simulator object in an earlier cell, so the scipy
# distribution has to be imported again before it can be used here.
from scipy.stats import poisson, bernoulli
from scipy.special import expit

n_sample, n_gene, n_feature1, n_feature2 = 2000, 20, 2, 3
# Seeded generator: the simulated covariates, coefficients and counts are
# identical on every Restart & Run All.
rng = np.random.default_rng(2)
X1 = rng.normal(size=(n_sample, n_feature1)) # beta covariates
X2 = rng.normal(size=(n_sample, n_feature2)) # zero-inflation covariates
gt_beta = rng.normal(size=(n_feature1, n_gene))
gt_pi = rng.normal(size=(n_feature2, n_gene))
beta = np.exp(X1 @ gt_beta)  # Poisson mean, cell x gene
# expit is the logistic function 1 / (1 + exp(-x)); using it keeps this cell
# consistent with the Bernoulli simulation and avoids overflow in exp.
pi = expit(X2 @ gt_pi)  # probability of a structural zero, cell x gene

# generate samples
# A Poisson draw is kept with probability 1 - pi and zeroed out otherwise.
Y = poisson(beta).rvs(random_state=rng) * bernoulli(1 - pi).rvs(random_state=rng)
obs1 = pd.DataFrame(X1, columns=[f"beta_dim{j}" for j in range(n_feature1)])
obs2 = pd.DataFrame(X2, columns=[f"pi_dim{j}" for j in range(n_feature2)])
obs = pd.concat([obs1, obs2], axis=1)
adata = anndata.AnnData(X=Y, obs=obs)
adata



AnnData object with n_obs × n_vars = 2000 × 20
    obs: 'beta_dim0', 'beta_dim1', 'pi_dim0', 'pi_dim1', 'pi_dim2'

In [27]:
# One formula per parameter: "beta" uses the beta_dim* columns and "pi" uses
# the pi_dim* columns of adata.obs. "- 1" removes the intercept, which the
# simulation does not include either.
formula = {"beta": "~ beta_dim0 + beta_dim1 - 1",
           "pi": "~ pi_dim0 + pi_dim1 + pi_dim2 - 1"}
# NOTE(review): `zip` shadows the Python built-in for the rest of the session.
# Later cells read `zip.params`, so a rename has to be done in all of them together.
zip = ZeroInflatedPoissonRegressionSimulator()
zip.fit(adata, formula)

                                                           

In [None]:
# Print each heading followed by its table: the simulated beta coefficients
# first, then the ones recovered by the fitted ZI-Poisson simulator.
for heading, table in [
    ("Ground Truth beta:", pd.DataFrame(gt_beta)),
    ("Estimated beta:", zip.params['coef_beta']),
]:
    print(heading)
    display(table)

Ground Truth beta:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.328264,0.31806,0.70731,1.329329,1.266335,-1.30008,-1.317339,1.445663,-0.412655,0.56419,-0.110915,0.972028,0.600758,-1.003597,-2.499753,1.021795,0.34494,-0.87675,0.351305,-0.575208
1,-0.090581,0.454531,0.637592,-1.076162,-0.652733,2.346174,0.15612,-2.333265,0.736831,-0.361889,-0.447244,2.605845,-0.78203,1.544422,1.366575,-1.008445,0.185617,0.073529,0.407367,1.585679


Estimated beta:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
beta_dim0,1.308238,0.340782,0.707193,1.327679,1.248926,-1.32655,-1.320128,1.443708,-0.406052,0.566786,-0.158565,0.991256,0.594288,-0.974494,-2.503642,1.010326,0.378336,-0.861138,0.362471,-0.584551
beta_dim1,-0.077868,0.43607,0.601618,-1.085027,-0.662585,2.370426,0.12136,-2.343656,0.756128,-0.363593,-0.435504,2.603594,-0.792759,1.554054,1.377665,-1.0439,0.147331,0.07593,0.348756,1.579615


In [30]:
# Print each heading followed by its table: the simulated zero-inflation
# coefficients first, then the ones recovered by the fitted ZI-Poisson simulator.
for heading, table in [
    ("Ground Truth pi:", pd.DataFrame(gt_pi)),
    ("Estimated pi:", zip.params['pi']),
]:
    print(heading)
    display(table)

Ground Truth pi:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.035895,0.367468,-2.329087,-1.87635,-0.217138,-0.557391,0.89243,-0.302674,-0.110305,-0.717205,-0.347401,-1.209527,0.592042,-0.640965,-1.484189,0.89457,-0.271283,0.520284,-0.172755,-0.955726
1,-2.658117,0.198055,-1.562535,-0.240703,0.2975,0.893082,1.655644,1.184656,-0.652397,-0.796946,-1.3107,-1.208023,-1.429588,1.629185,-1.476242,0.828804,0.044481,-2.639595,1.776848,1.122439
2,1.326346,-0.555825,-1.617209,-0.033429,-1.17934,0.161506,-1.648336,-1.485871,1.408894,0.859612,-0.371478,-1.471933,-0.38602,-0.277394,0.026814,-0.192952,-1.388178,-0.934867,0.899129,0.4509


Estimated pi:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
pi_dim0,0.036833,0.289361,-2.178336,-1.854016,-0.125733,-0.502963,0.827902,-0.346042,-0.055731,-0.669299,-0.347161,-1.13047,0.605925,-0.554687,-1.662915,0.842305,-0.142841,0.470087,-0.18977,-0.89947
pi_dim1,-2.881585,0.32253,-1.72777,-0.15376,0.32117,0.873324,1.536936,1.265749,-0.603561,-0.848558,-1.356251,-1.248333,-1.310708,1.78673,-1.498806,0.766494,0.166315,-2.338479,1.892447,1.206181
pi_dim2,1.326664,-0.520421,-1.631408,0.065764,-0.984007,0.183893,-1.538696,-1.547245,1.40459,0.879588,-0.445387,-1.518619,-0.328194,-0.218008,0.245575,-0.226454,-1.337345,-0.763791,0.792584,0.432909


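Some of the pi coefficients are not well estimated. In the run shown above, for example, `pi_dim1` for gene 0 is estimated as -2.88 against a true value of -2.66, and for gene 17 as -2.34 against -2.64.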

# Single cell example

Here I also show an example of fitting ZI-Poisson on real single cell data.

In [32]:
from scdesigner.diagnose.plot import compare_umap

In [31]:
# Fit the ZI-Poisson simulator to the real dataset loaded at the top of the notebook.
# Both the "beta" and the "pi" parts use the same function of the `pseudotime` obs column.
# NOTE(review): `bs(..., degree=5)` is presumably a B-spline basis supplied by
# the formula backend; confirm.
formula = {"beta": "~ bs(pseudotime, degree=5)",
           "pi": "~ bs(pseudotime, degree=5)"}
p = ZeroInflatedPoissonRegressionSimulator()
p.fit(example_sce, formula)

                                                           

In [33]:
# Draw simulated data from the fitted model using the real cells' obs table as covariates.
samples = p.sample(example_sce.obs)
# Compare real and simulated data on a UMAP, coloured by pseudotime.
compare_umap(example_sce, samples, color="pseudotime")

  return real_.concatenate(simulated_, join="outer", batch_key=None)


# Notes

I think we can create a general function for reformatting the formula input. It is currently defined separately in the negbin estimator file, but other files could also benefit from it.

It would also be nice if the keys of `model.params` were consistent with the keys of the dictionary used to calculate the likelihood (this notebook, for example, reads `coef_beta`, `coef_mean`, and `pi`), and if people could retrieve the model matrix from the model. This would also help with the calculation of AIC/BIC.