In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as pl
from matplotlib import rcParams
import scanpy as sc
import seaborn as sns
from scipy.sparse import csr_matrix
from scipy.stats import rankdata
import pickle
import statistics as stat

  from pandas.core.index import RangeIndex


Load each lineage

In [2]:
# One pseudotime-annotated .h5ad file per lineage; the dict keeps each
# lineage name next to the file it is read from.
lineage_files = {
    "endo": "../data/endo.dpt.h5ad",
    "hep": "../data/hep.dpt.h5ad",
    "neur": "../data/neur.dpt.h5ad",
}
lineage_names = list(lineage_files)
lineages = [sc.read_h5ad(path) for path in lineage_files.values()]
# Individual handles are still needed by later cells (e.g. endo.obs, endo.X).
endo, hep, neur = lineages

In [30]:
# Sanity check: flatten() unrolls a 2-D array row by row, so a
# [rows x genes] matrix flattens to row 0's genes, then row 1's genes, ...
test = np.arange(1, 7).reshape(2, 3)
test.flatten()

array([1, 2, 3, 4, 5, 6])

In [5]:
def aggregate(sc_sub):
    """
    Sum expression over all cells of a subset.

    input: sc_sub, an object exposing a [cells x genes] matrix as `.X`
           (scipy sparse or dense numpy array), e.g. a scanpy/AnnData object
    output: mat, a dense numpy matrix, [1 x genes], summed over cells
            (summing a scipy sparse matrix along an axis yields a dense
            np.matrix, not a csr matrix)
    """
    # Call .sum on the instance rather than the unbound csr_matrix.sum so that
    # dense .X (and other sparse formats) are handled as well.
    summed = sc_sub.X.sum(axis=0)
    # Dense input sums to a 1-D array; asmatrix restores the [1 x genes] shape
    # and is a no-op for the np.matrix produced by sparse input.
    mat = np.asmatrix(summed)
    return(mat)

In [6]:
def log_normalize(mat):
    """
    Convert raw counts to ln(counts-per-million + 1).

    input: mat, a [1 x genes] matrix of raw counts; either a scipy sparse
           matrix or a dense numpy array/matrix (such as the output of
           aggregate)
    output: logcpm, same kind of container as the input, [1 x genes],
            = ln(cpm + 1). If the counts sum to zero the result is all zeros
            instead of NaN/inf.
    """
    total = mat.sum()
    if total == 0:
        # No counts at all: avoid the 0/0 division and report zero expression.
        cpm = mat * 0.0
    else:
        cpm = mat * 1e6 / total
    # scipy sparse matrices provide an element-wise .log1p() that keeps the
    # result sparse; dense inputs go through numpy.
    if hasattr(cpm, "log1p"):
        logcpm = cpm.log1p()
    else:
        logcpm = np.log1p(cpm)
    return(logcpm)

In [28]:
def scale(mat, center=True):
    """
    Standardize every gene (column) across levels (rows).

    input: mat, a dense [levels x gene] array
           center, boolean; when True each column's mean is subtracted first
    output: scaled_mat, a [levels x gene] array in which each column is divided
            by its standard deviation (columns with zero standard deviation
            are divided by 1, i.e. left as they are after optional centering)
    """
    shifted = mat - np.mean(mat, 0) if center else mat
    spread = np.std(shifted, 0)
    # Constant columns would divide by zero; use a divisor of 1 for them.
    spread[spread == 0] = 1
    return(shifted / spread)

In [32]:
# Spot-check: var_names is indexable by position and returns the name of the
# variable (gene) at that position in the endo lineage.
endo.var_names[1]

'NOC2L'

In [None]:
def get_pseudobulk(sc_obj, stratifier="individual", pt_label="dpt_pseudotime", nbins=5):
    """
    Build scaled log-normalized pseudobulk expression for every combination of
    pseudotime bin and stratifier level.

    Pseudotime is cut into `nbins` equal-width bins on [0, 1]; every bin is
    half-open [lower, upper) except the last one, which also includes 1.

    input: sc_obj, a full h5ad object
           stratifier, the string identifier for the category over which pseudobulk will be aggregated (along with pseudotime)
           pt_label, the string identifier for pseudotime, ie 'dpt_pseudotime'
           nbins, the number of pseudotime bins to compute
    output: Y, a flat array of length N*G*T where N=number of levels of stratifier, G=num genes, T=nbins,
               ordered by bin, then level, then gene; values are ln(cpm + 1), scaled per gene across all (bin, level) rows
            X, an [NGT, 3] matrix (N,G,T as above), where the first column is time (median pseudotime of the
               cells in that bin/level, NaN if there are none), second column is level, third is gene
            leveldict, a dictionary mapping ints in second column of X to unique values in sc_obj.obs[stratifier]
            genedict, a dictionary mapping ints in third column of X to gene names in sc_obj
            cellcounts, an int array of length N*T with the number of cells in each (bin, level) group,
               indexed by bin * N + level
    """
    pseudotime = sc_obj.obs[pt_label]
    strata = sc_obj.obs[stratifier]
    levels = np.unique(strata)
    nL = len(levels)
    nG = sc_obj.X.shape[1]
    # linspace pins the last edge to exactly 1.0 (nbins * (1/nbins) can round below 1).
    bin_edges = np.linspace(0, 1, nbins + 1)
    pseudobulk = np.zeros([nbins*nL, nG])
    X = np.zeros([nbins*nL*nG, 3])
    cellcounts = np.zeros(nbins*nL, dtype=int)
    for t in range(nbins):
        lower, upper = bin_edges[t], bin_edges[t+1]
        if t < nbins-1:
            t_subset = (pseudotime >= lower) & (pseudotime < upper)
        else:
            # close the last bin on the right so cells at the maximum pseudotime are kept
            t_subset = (pseudotime >= lower) & (pseudotime <= upper)
        for l in range(nL):
            # row of this (bin, level) group in pseudobulk / cellcounts, and its rows in X
            row = t*nL + l
            x_rows = slice(row*nG, (row+1)*nG)
            l_subset = strata == levels[l]
            sub = sc_obj[l_subset & t_subset]
            # get cell counts for this subset
            n_cells = sub.shape[0]
            cellcounts[row] = n_cells
            # get metadata, store in X
            X[x_rows, 0] = np.median(sub.obs[pt_label]) if n_cells > 0 else np.nan
            X[x_rows, 1] = l
            X[x_rows, 2] = np.arange(nG)
            if n_cells == 0:
                # no cells to aggregate: the pseudobulk row stays at zero expression
                continue
            # get expression data, store in pseudobulk; aggregate's summed counts
            # are wrapped as csr to match log_normalize's documented input
            sub_sum = csr_matrix(aggregate(sub))
            sub_cpm = log_normalize(sub_sum)
            pseudobulk[row, :] = sub_cpm.toarray().ravel()
    pseudobulk = scale(pseudobulk)
    # row-major flatten matches the (bin, level, gene) ordering of X's rows
    Y = pseudobulk.flatten()
    leveldict = {i:levels[i] for i in range(nL)}
    genedict = {i:sc_obj.var_names[i] for i in range(nG)}
    return(Y, X, leveldict, genedict, cellcounts)

Grab the parameters that hold across all lineages

In [11]:
# Levels are read from the endo lineage and reused for every lineage.
indivs = np.unique(endo.obs['individual'])
batches = np.unique(endo.obs['Batch'])
# Derive the level counts from the data instead of hard-coding them, so the
# loops over range(n_inds) / range(n_batches) always cover every level.
n_inds = len(indivs)
n_batches = len(batches)
# Number of equal-width pseudotime bins.
n_bins = 5
_, nG = endo.X.shape
# Number of pseudobulk samples: one per (pseudotime bin, individual/batch).
nC_ind = n_inds * n_bins
nC_batch = n_batches * n_bins

Aggregate pseudobulk and log normalize for each lineage, for each individual or batch, and for each diffusion pseudotime bin

In [None]:
def _log_norm_fraction(sub, scale_factor=10**4):
    """Per-gene ln(scale_factor * fraction_of_total_counts + 1) for the cells in sub, [1 x genes]."""
    gene_totals = sub.X.sum(axis=0)
    return np.log(scale_factor * (gene_totals / sub.X.sum()) + 1)


for l in range(len(lineages)):
    lin = lineages[l]
    lin_name = lineage_names[l]
    counts = lin.X
    pseudotime = lin.obs['dpt_pseudotime']
    t_ind = np.zeros(nC_ind)
    t_batch = np.zeros(nC_batch)
    ind = []
    batch = []
    numcells_ind = np.zeros(nC_ind)
    numcells_batch = np.zeros(nC_batch)
    Y_ind = np.zeros([nC_ind, nG])
    Y_batch = np.zeros([nC_batch, nG])
    bin_w = 1/n_bins
    # linspace pins the last edge to exactly 1.0
    bin_edges = np.linspace(0, 1, n_bins + 1)
    for t_bin in range(n_bins):
        lower, upper = bin_edges[t_bin], bin_edges[t_bin + 1]
        if t_bin < n_bins - 1:
            time_subset = (pseudotime >= lower) & (pseudotime < upper)
        else:
            # close the last bin on the right so cells at pseudotime 1 are not dropped
            time_subset = (pseudotime >= lower) & (pseudotime <= upper)
        for i in range(n_inds):
            c = (n_inds * t_bin) + i
            ind_subset = (lin.obs['individual'] == indivs[i])
            lin_c = lin[time_subset & ind_subset]
            numcells_ind[c] = lin_c.shape[0]
            if lin_c.shape[0] > 0:
                # log normalize
                exp_c = _log_norm_fraction(lin_c)
                Y_ind[c,:] = exp_c
                t_ind[c] = np.median(lin_c.obs['dpt_pseudotime'])
            else:
                # empty group: mark as missing explicitly rather than via 0/0 warnings
                Y_ind[c,:] = np.nan
                t_ind[c] = np.nan
            ind.append(indivs[i])
        for b in range(n_batches):
            c = (n_batches * t_bin) + b
            batch_subset = (lin.obs['Batch'] == batches[b])
            lin_c = lin[time_subset & batch_subset]
            numcells_batch[c] = lin_c.shape[0]
            if lin_c.shape[0] > 0:
                # log normalize
                exp_c = _log_norm_fraction(lin_c)
                Y_batch[c,:] = exp_c
                # median pseudotime of this batch/bin subset (was the undefined name endo_c)
                t_batch[c] = np.median(lin_c.obs['dpt_pseudotime'])
            else:
                Y_batch[c,:] = np.nan
                t_batch[c] = np.nan
            batch.append(batches[b])
    ind = np.array(ind)
    batch = np.array(batch)