# 0. Standard imports

In [1]:
%load_ext autoreload
%autoreload 2
#%matplotlib inline`

import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('WebAgg')
import numpy as np
import pandas as pd

In [2]:
cd ..

/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/external


In [3]:
cd ..

/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi


***import ete3 Tree***

In [87]:
from ete3 import Tree

tree_name = "/home/eecs/khalil.ouardini/cas_scvi_topologies/newick_objects/500cells/high_fitness/topology1.nwk"
tree = Tree(tree_name, 1)

for i, n in enumerate(tree.traverse('levelorder')):
    n.add_features(index=i)
    n.name = str(i)
eps = 1e-3
branch_length = {}
for node in tree.traverse('levelorder'):
    if node.is_root():
        branch_length[node.name] = 0.0
    else:
        branch_length[node.name] = node.dist
branch_length['prior_root'] = 1.0

In [149]:
def convert_params_NB(mu, alpha):
    """ 
    Convert mean/dispersion parameterization of a negative binomial to the ones scipy supports

    Parameters
    ----------
    mu : float 
       Mean of NB distribution.
    alpha : float
       Overdispersion parameter used for variance calculation.

    See https://en.wikipedia.org/wiki/Negative_binomial_distribution#Alternative_formulations
    """
    var = mu + alpha * mu ** 2
    p = (var - mu) / var
    r = mu ** 2 / (var - mu)
    return r, 1-p

Test

In [165]:
mean = nb_glm.mu[1][0]
var = mean + alpha * mean**2

r, p = convert_params_NB(mean, alpha)
nbinom.stats(r, p, moments='mv'), mean , var

((array(4.82850859), array(7.1599581)), 4.828508585155636, 7.159958100847804)

In [89]:
# Data
from anndata import AnnData
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from external.dataset.tree import TreeDataset, GeneExpressionDataset
from external.dataset.poisson_glm import Poisson_GLM
from external.dataset.anndataset import AnnDatasetFromAnnData

# Models
from models.vae import VAE
import scanpy as sc
from external.inference.tree_inference import TreeTrainer
from inference.inference import UnsupervisedTrainer
from inference import posterior
from external.models.treevae import TreeVAE

# Utils
from external.utils.data_util import get_leaves, get_internal
from external.utils.metrics import ks_pvalue, accuracy_imputation, correlations, knn_purity, knn_purity_stratified
from external.utils.plots_util import plot_histograms, plot_scatter_mean, plot_ecdf_ks, plot_density
from external.utils.plots_util import plot_losses, plot_elbo, plot_common_ancestor, plot_one_gene, training_dashboard
from external.utils.baselines import avg_weighted_baseline, scvi_baseline, scvi_baseline_z, cascvi_baseline_z, avg_baseline_z, construct_latent

In [90]:
import torch
    
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f94eeb4e9b0>

# 1. Simulations (Negative Binomial GLM)

In [155]:
d = 10
g = 1000
vis = False
leaves_only = False
var = 1.0
negative_binomial = True
# inverse dispersion parameter (inverse dispersion alpha  = 1 / theta controls how much variance we have in the simulations)
alpha = 0.1

nb_glm = Poisson_GLM(tree, g, d, vis, leaves_only, branch_length, alpha)

nb_glm.simulate_latent()

***Generate gene expression count data***

In [156]:
nb_glm.simulate_ge(False)
poisson_X = nb_glm.X
nb_glm.simulate_ge(negative_binomial)

# Quality Control (i.e Gene Filtering)
#nb_glm.gene_qc()

nb_glm.X.shape, poisson_X.shape, nb_glm.W.shape, nb_glm.beta.shape

=== Negative Binomial simulations ===


((1000, 1000), (1000, 1000), (1000, 10), (1000,))

In [238]:
len(genes_to_inspect)

108

In [240]:
import random
import seaborn as sns

n_samples = 40
#idx = random.sample(range(1, 100), n_samples)
idx = random.sample(genes_to_inspect, n_samples)

n_rows = 4
n_cols = int(n_samples / n_rows)

fig, ax = plt.subplots(n_rows, n_cols, figsize=(35, 15))

h = 0
for i in range(n_rows):
    for j in range(n_cols):
        k = idx[h]
        h += 1
        # density plots
        sns.distplot(ax=ax[i][j], a=np.log(1 + poisson_X[:, k]), hist=False,
                     kde=True, kde_kws={'shade': True}, label='Poisson')
        sns.distplot(ax=ax[i][j], a=np.log(1 + nb_glm.X[:, k]), hist=False,
                     kde=True, kde_kws={'shade': True}, label='NB')
        # set title
        ax[i][j].set_title('Gene ' + str(k))

plt.legend()
fig.suptitle("Combined gene density plots | dipsersion = {}".format(alpha))
plt.savefig('tmp.png')



***Binomial thinning***

In [166]:
print("Proportion of dropouts: {}".format(np.mean(nb_glm.X == 0)))
#glm.binomial_thinning(p=0.1)

Proportion of dropouts: 0.439189


In [167]:
print("Proportion of dropouts after Binomial thinning: {}".format(np.mean(nb_glm.X == 0)))

Proportion of dropouts after Binomial thinning: 0.439189


***Get the data and the indexes at the leaves***

In [168]:
# Latent vectors
leaves_z, _, _ = get_leaves(nb_glm.z, nb_glm.mu, tree)

#FIXED training set
leaves_X, leaves_idx, mu = get_leaves(nb_glm.X, nb_glm.mu, tree)

# internal nodes data (for imputation)
internal_X, internal_idx, internal_mu = get_internal(nb_glm.X, nb_glm.mu, tree)

leaves_X.shape, mu.shape, internal_X.shape, internal_mu.shape, leaves_z.shape

((500, 1000), (500, 1000), (500, 1000), (500, 1000), (500, 10))

# 2. Fitting CascVI

In [169]:
import scanpy as sc

# anndata + gene and celle filtering
adata = AnnData(leaves_X)
leaves = [n for n in tree.traverse('levelorder') if n.is_leaf()]
adata.obs_names = [n.name for n in leaves]
#sc.pp.filter_genes(adata, min_counts=3)
#sc.pp.filter_cells(adata, min_counts=0)

***Create a TreeDataset object***

In [170]:
# treeVAE
import copy

tree_bis = copy.deepcopy(tree)
scvi_dataset = AnnDatasetFromAnnData(adata, filtering=False)
scvi_dataset.initialize_cell_attribute('barcodes', adata.obs_names)
cas_dataset = TreeDataset(scvi_dataset, tree=tree_bis, filtering=False)

# No batches beacause of the message passing
use_cuda = True
use_MP = True
ldvae = False

go


***Initialize model***

In [171]:
treevae = TreeVAE(cas_dataset.nb_genes,
              tree = cas_dataset.tree,
              n_latent=nb_glm.latent,
              n_hidden=128,
              n_layers=1,
              reconstruction_loss='nb',
              prior_t = branch_length,
              ldvae = ldvae,
              use_MP=use_MP
             )

In [172]:
import torch

freeze = False
if freeze:
    new_weight = torch.from_numpy(glm.W).float()
    new_bias = torch.from_numpy(glm.beta).float()

    with torch.no_grad():
        treevae.decoder.factor_regressor.fc_layers[0][0].weight = torch.nn.Parameter(new_weight)
        treevae.decoder.factor_regressor.fc_layers[0][0].bias = torch.nn.Parameter(new_bias)
        
    for param in treevae.decoder.factor_regressor.fc_layers[0][0].parameters():
        param.requires_grad = False

In [173]:
#assert(treevae.decoder.factor_regressor.fc_layers[0][0].weight.numpy().all() == glm.W.T.all())
#assert(treevae.decoder.factor_regressor.fc_layers[0][0].bias.numpy().all() == glm.beta.all())

***Are we able to generate the gene expression data by decoding the simulated latent space?***

In [174]:
px_scale, px_rate, raw_px_scale = treevae.decoder(treevae.dispersion,
                                        torch.from_numpy(leaves_z).float(),
                                        torch.from_numpy(np.array([np.log(10000)])).float()
                                       )

from sklearn.metrics import mean_squared_error

if ldvae:
    foo = np.clip(a=np.exp(raw_px_scale.detach().cpu().numpy()),
            a_min=0,
            a_max=1e8
    )
    mse = mean_squared_error(mu, foo)
else:
    mse = mean_squared_error(mu, px_rate.detach().numpy())

print("the distance between the Poisson and the NB means is {}".format(mse))

the distance between the Poisson and the NB means is 106169.31102319187


***Hyperparameters***

In [175]:
n_epochs = 500
lr = 1e-3
lambda_ = 1.0

***trainer***

In [176]:
freq = 100
trainer = TreeTrainer(
    model = treevae,
    gene_dataset = cas_dataset,
    lambda_ = lambda_,
    train_size=1.0,
    test_size=0,
    use_cuda=use_cuda,
    frequency=freq,
    n_epochs_kl_warmup=120
)

train_leaves:  [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], [71], [72], [73], [74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [95], [96], [97], [98], [99], [100], [101], [102], [103], [104], [105], [106], [107], [108], [109], [110], [111], [112], [113], [114], [115], [116], [117], [118], [119], [120], [121], [122], [123], [124], [125], [126], [127], [128], [129], [130], [131], [132], [133], [134], [135], [136], [137], [138], [139], [140], [141], [142], [143], [144], [145], [146], [147], [148], [149], [150], [151], [152], [153], [154], [155], [1

***Start training***

In [177]:
trainer.train(n_epochs=n_epochs,
              lr=lr
              )

odings MP Likelihood: 42.21582896242993
ELBO Loss: 1858.5212699693311
training:  70%|███████   | 350/500 [02:16<01:00,  2.49it/s]Encodings MP Likelihood: 15.766303824747016
ELBO Loss: 1832.3281998294015
training:  70%|███████   | 351/500 [02:17<00:59,  2.51it/s]Encodings MP Likelihood: 14.74318610586404
ELBO Loss: 1830.9789094109358
training:  70%|███████   | 352/500 [02:17<00:58,  2.51it/s]Encodings MP Likelihood: 34.831736745969735
ELBO Loss: 1849.8612157242742
training:  71%|███████   | 353/500 [02:18<00:58,  2.52it/s]Encodings MP Likelihood: 34.42903706674933
ELBO Loss: 1849.713106030454
training:  71%|███████   | 354/500 [02:18<00:58,  2.49it/s]Encodings MP Likelihood: 27.027991445770517
ELBO Loss: 1842.4981227567127
training:  71%|███████   | 355/500 [02:18<00:58,  2.47it/s]Encodings MP Likelihood: 25.37585682777694
ELBO Loss: 1840.839514124826
training:  71%|███████   | 356/500 [02:19<00:57,  2.49it/s]Encodings MP Likelihood: 21.832259792878418
ELBO Loss: 1836.7246739747395
trai

***Loss Functions***

In [178]:
training_dashboard(trainer, treevae.encoder_variance)

### 3. Posterior and MV imputation

In [235]:
from sklearn.metrics import mean_squared_error

full_posterior = trainer.create_posterior(trainer.model, cas_dataset, trainer.clades,
                                indices=np.arange(len(cas_dataset))
                                         )
error = mean_squared_error(full_posterior.get_latent(), leaves_z)
print("the distance is {}".format(error))

the distance is 3.152920012749111


***Missing Value imputation By Posterior Predictive sampling***

In [137]:
empirical_l = np.mean(np.sum(nb_glm.X, axis=1))

# CascVI impitations
imputed = {}
imputed_z = {}
imputed_gt = {}

for n in tree.traverse('levelorder'):
    if not n.is_leaf():
        imputed[n.name], imputed_z[n.name] = full_posterior.imputation_internal(n,
                                                            give_mean=False,
                                                            library_size=empirical_l
                                                           )
        imputed_gt[n.name] = nb_glm.X[n.index]

In [138]:
imputed_X = [x for x in imputed.values()]
imputed_X = np.array(imputed_X).reshape(-1, cas_dataset.X.shape[1])
#plot_histograms(imputed_X, "Histogram of CasscVI imputed gene expression data")

***CascVI Baseline 1 (MP Oracle)***

regenerate data

In [197]:
from scipy.stats import gamma, nbinom

data_mu = []
data_X = []
idx = 0
for n in tree.traverse('levelorder'):
    if n.is_leaf():
        mu_z = np.clip(a=np.exp(nb_glm.W @ leaves_z[idx] + nb_glm.beta),
                        a_min=0,
                        a_max=1e8
                        )
        r, p = convert_params_NB(mu=mu_z, alpha=alpha)
        
        data_mu.append(mu_z)

        sample = np.array([nbinom.rvs(n=r, p=p) for i in range(200)])
        data_X.append(np.mean(sample, axis=0))
        idx += 1
data_X = np.array(data_X)
data_mu = np.array(data_mu)

In [198]:
import scipy.stats as stats

In [199]:
for i in range(len(data_X)):
    print(stats.spearmanr(data_X[i], leaves_X[i]))

-284)
SpearmanrResult(correlation=0.8446119491245421, pvalue=4.780025075565974e-273)
SpearmanrResult(correlation=0.8592567662095804, pvalue=8.583617783566959e-293)
SpearmanrResult(correlation=0.848581129864151, pvalue=3.432762949808551e-278)
SpearmanrResult(correlation=0.8765995318729166, pvalue=2.747e-319)
SpearmanrResult(correlation=0.8805105504177826, pvalue=0.0)
SpearmanrResult(correlation=0.8930101018011612, pvalue=0.0)
SpearmanrResult(correlation=0.876683641191424, pvalue=1.9988e-319)
SpearmanrResult(correlation=0.8673982331915169, pvalue=9.188519504884423e-305)
SpearmanrResult(correlation=0.8481854047229391, pvalue=1.13521500210057e-277)
SpearmanrResult(correlation=0.8463511513270635, pvalue=2.7765526484805353e-275)
SpearmanrResult(correlation=0.856394121424926, pvalue=9.221330820046107e-289)
SpearmanrResult(correlation=0.8912788703505311, pvalue=0.0)
SpearmanrResult(correlation=0.8930749685766158, pvalue=0.0)
SpearmanrResult(correlation=0.8624463320415937, pvalue=2.166705360557

In [202]:
from scipy.stats import gamma

# CascVI impitations
imputed_cascvi_1 = {}
imputed_cascvi_1_z ={}

for n in tree.traverse('levelorder'):
    if not n.is_leaf():
        _, imputed_cascvi_1_z[n.name] = full_posterior.imputation_internal(n,
                                                                    give_mean=False,
                                                                    library_size=empirical_l,
                                                                    known_latent=leaves_z
        )
        
        mu_z = np.clip(a=np.exp(nb_glm.W @ imputed_cascvi_1_z[n.name].cpu().numpy() + nb_glm.beta),
                        a_min=0,
                        a_max=1e8
                        )
        
        if treevae.reconstruction_loss == 'nb':
            r, p = convert_params_NB(mu=mu_z, alpha=alpha)
            sample = nbinom.rvs(n=r, p=p)
        else:
            sample = np.random.poisson(mu_z)
        imputed_cascvi_1[n.name] = np.clip(a=sample,
                                           a_min=0,
                                           a_max=1e8
                                           )


In [204]:
internal_cascvi_X = np.array([x for x in imputed_cascvi_1.values()]).reshape(-1, nb_glm.X.shape[1])

In [236]:
mean = 0
genes_to_inspect2 = []
for i in range(internal_X.shape[1]):
    corr = stats.spearmanr(internal_X[:, i], internal_cascvi_X[:, i])
    if math.isnan(corr[0]):
        continue
    if corr[0] < 0.05:
        genes_to_inspect2.append(i)
    mean += corr[0]
    print('gene {}: corr = {}'.format(i, corr[0]), type(corr[0]))
mean /= internal_X.shape[1]
mean

py.float64'>
gene 668: corr = 0.16504919191649806 <class 'numpy.float64'>
gene 669: corr = 0.1862394990256288 <class 'numpy.float64'>
gene 670: corr = 0.14828725657772526 <class 'numpy.float64'>
gene 671: corr = -0.012145626057038545 <class 'numpy.float64'>
gene 672: corr = 0.23802278440787386 <class 'numpy.float64'>
gene 673: corr = 0.112226375416395 <class 'numpy.float64'>
gene 674: corr = 0.05648985506895834 <class 'numpy.float64'>
gene 675: corr = -0.052579379058836376 <class 'numpy.float64'>
gene 676: corr = 0.6503357616642556 <class 'numpy.float64'>
gene 677: corr = -0.02461601904035586 <class 'numpy.float64'>
gene 678: corr = 0.11947083621809618 <class 'numpy.float64'>
gene 679: corr = 0.2928756647891632 <class 'numpy.float64'>
gene 680: corr = 0.28994853811166516 <class 'numpy.float64'>
gene 681: corr = 0.1843373134796058 <class 'numpy.float64'>
gene 682: corr = 0.1386315979391962 <class 'numpy.float64'>
gene 683: corr = 0.23052081872698005 <class 'numpy.float64'>
gene 684: cor

0.34484965984329646

In [232]:
mean = 0
for i in range(internal_X.shape[1]):
    corr = stats.spearmanr(internal_X[:, i], internal_avg_X[:, i])
    if math.isnan(corr[0]):
        continue
    mean += corr[0]
    print('gene {}: corr = {}'.format(i, corr[0]), type(corr[0]))
mean /= internal_X.shape[1]
mean

e 658: corr = 0.3505221035022903 <class 'numpy.float64'>
gene 659: corr = 0.691084144071372 <class 'numpy.float64'>
gene 660: corr = 0.12619034113242364 <class 'numpy.float64'>
gene 661: corr = -0.0038877590580275296 <class 'numpy.float64'>
gene 663: corr = 0.8915656506027362 <class 'numpy.float64'>
gene 664: corr = -0.019370240575750926 <class 'numpy.float64'>
gene 665: corr = 0.6049517713981974 <class 'numpy.float64'>
gene 666: corr = 0.2063003634957107 <class 'numpy.float64'>
gene 667: corr = 0.20668441464232765 <class 'numpy.float64'>
gene 668: corr = 0.16946652896658537 <class 'numpy.float64'>
gene 669: corr = 0.32108171651050893 <class 'numpy.float64'>
gene 670: corr = 0.1427071911603086 <class 'numpy.float64'>
gene 672: corr = 0.2692698310027577 <class 'numpy.float64'>
gene 673: corr = 0.284923625632443 <class 'numpy.float64'>
gene 674: corr = 0.1275140587654431 <class 'numpy.float64'>
gene 675: corr = 0.048294649492359065 <class 'numpy.float64'>
gene 676: corr = 0.7094092401925

0.41429399710496534

***CascVI Baseline 2 (Reconstruction of Averaged latent space)***

In [233]:
imputed_cascvi_2, imputed_cascvi_2_z = avg_baseline_z(tree=tree,
                                   model=treevae,
                                   posterior=full_posterior,
                                   weighted=False,
                                   n_samples_z=1,
                                   library_size=empirical_l,
                                   gaussian=False,
                                   use_cuda=False,
                                   known_latent=True,
                                   latent=np.array([leaves_z]),
                                   give_cov=False
                                  )

In [234]:
imputed_cascvi_2 = {}

for n in tree.traverse('levelorder'):
    if not n.is_leaf():
        mu_z = np.clip(a=np.exp(nb_glm.W @ imputed_cascvi_2_z[n.name][0] + nb_glm.beta),
                        a_min=0,
                        a_max=1e8
                        )
        
        if treevae.reconstruction_loss == 'nb':
            r, p = convert_params_NB(mu=mu_z, alpha=alpha)
            sample = nbinom.rvs(n=r, p=1-p)
        else:
            sample = np.random.poisson(mu_z)
        imputed_cascvi_2[n.name] = np.clip(a=sample,
                                           a_min=0,
                                           a_max=1e8
                                           )


# 4. Baselines

### Baseline 1: Unweighted Average of gene expression in Clade

The simple idea here is to impute the value of an internal node, with the (un)weighted average of the gene expression values of the leaves, taking the query internal node as the root of the subtree.

In [231]:
weighted = False
imputed_avg = avg_weighted_baseline(tree, weighted, nb_glm.X, rounding=True)

#get internal nodes
avg_X = np.array([x for x in imputed_avg.values()]).reshape(-1, nb_glm.X.shape[1])
internal_avg_X, _, _ = get_internal(avg_X, nb_glm.mu, tree)

### Baseline 2: (Un)weighted Average of decoded latent vectors, with scVI

We use the same averaging of the subtrees leaves in **Baseline 1**, only this time, the gene expression data is recovered with scVI

In [59]:
# anndata
gene_dataset = GeneExpressionDataset()
gene_dataset.populate_from_data(leaves_X)

In [60]:
import torch

n_epochs = 500
use_batches = False

vae = VAE(gene_dataset.nb_genes,
                  n_batch=cas_dataset.n_batches * use_batches,
                  n_hidden=64,
                  n_layers=1,
                  reconstruction_loss='nb',
                  n_latent=nb_glm.latent,
                  ldvae=ldvae
              )

if freeze:
    new_weight = torch.from_numpy(nb_glm.W).float()
    new_bias = torch.from_numpy(nb_glm.beta).float()

    with torch.no_grad():
        vae.decoder.factor_regressor.fc_layers[0][0].weight = torch.nn.Parameter(new_weight)
        vae.decoder.factor_regressor.fc_layers[0][0].bias = torch.nn.Parameter(new_bias)
        
    for param in vae.decoder.factor_regressor.fc_layers[0][0].parameters():
        param.requires_grad = False

In [61]:
px_scale, px_r, px_rate, px_dropout = vae.decoder.forward(vae.dispersion,
                                        torch.from_numpy(leaves_z).float(),
                                        torch.from_numpy(np.array([np.log(10000)])).float(),
                                        None
                                        )

from sklearn.metrics import mean_squared_error



if ldvae:
    foo = np.clip(a=np.exp(px_r.detach().numpy()),
            a_min=0,
            a_max=5000
    )
    mse = mean_squared_error(mu, foo)
else:
    mse = mean_squared_error(mu, px_rate.detach().numpy())

print("the distance between the Poisson and the NB means is {}".format(mse))

the distance between the Poisson and the NB means is 11221.932083575548


In [62]:
trainer_scvi = UnsupervisedTrainer(model=vae,
                              gene_dataset=gene_dataset,
                              train_size=1.0,
                              use_cuda=use_cuda,
                              frequency=10,
                              n_epochs_kl_warmup=2000
                              )

# train scVI
trainer_scvi.train(n_epochs=n_epochs, lr=1e-3) 
                                        
elbo_train_scvi = trainer_scvi.history["elbo_train_set"]
x = np.linspace(0, 100, (len(elbo_train_scvi)))
plt.plot(np.log(elbo_train_scvi), 
         label="train", color='blue',
         linestyle=':',
         linewidth=3
        )
        
plt.xlabel('Epoch')
plt.ylabel("ELBO")
plt.legend()
plt.title("Train history scVI")
plt.show()

training: 100%|██████████| 500/500 [00:06<00:00, 72.53it/s]


In [63]:
scvi_posterior = trainer_scvi.create_posterior(model=vae,
                                               gene_dataset=gene_dataset 
                                                )

error = mean_squared_error(scvi_posterior.get_latent()[0], leaves_z)
print("the distance is {}".format(error))

the distance is 3.889812574646592


***scVI Baseline 2 (Decoded Average Latent space)***

In [64]:
library_size = np.mean(np.sum(nb_glm.X, axis=1))
scvi_latent = np.array([scvi_posterior.get_latent(give_mean=False)[0] for i in range(10)])

imputed_scvi_2, imputed_scvi_2_z = scvi_baseline_z(tree,
                                        posterior=scvi_posterior,
                                        model=vae,
                                        weighted=False,
                                        n_samples_z=1,
                                        library_size=library_size,
                                        use_cuda=False
                                        )


# 5. Likelihood Ratio

In [65]:
cascvi_latent = full_posterior.get_latent()
scvi_latent = scvi_posterior.get_latent()[0]

scvi_latent.shape, cascvi_latent.shape

((100, 5), (100, 5))

In [66]:
treevae.initialize_visit()
treevae.initialize_messages(scvi_latent, cas_dataset.barcodes, scvi_latent.shape[1])
treevae.perform_message_passing((treevae.tree & treevae.root), scvi_latent.shape[1], False)
mp_lik_scvi = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of scVI encodings: ", mp_lik_scvi.item())

Likelihood of scVI encodings:  -14685.11046910444


In [67]:
treevae.initialize_visit()
treevae.initialize_messages(cascvi_latent, cas_dataset.barcodes, cascvi_latent.shape[1])
treevae.perform_message_passing((treevae.tree & treevae.root), cascvi_latent.shape[1], False)
mp_lik_cascvi = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of cascVI encodings: ", mp_lik_cascvi.item())

Likelihood of cascVI encodings:  -69.8083984094612


In [68]:
treevae.initialize_visit()
treevae.initialize_messages(leaves_z, cas_dataset.barcodes, cascvi_latent.shape[1])
treevae.perform_message_passing((treevae.tree & treevae.root), cascvi_latent.shape[1], False)
mp_lik_cascvi = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of observations: ", mp_lik_cascvi.item())

Likelihood of observations:  -203.46344429379195


In [69]:
# Likelihood ratio
lambda_ = (mp_lik_cascvi - mp_lik_scvi)
print("Likelihood Ratio:", lambda_)

Likelihood Ratio: tensor(14481.6470, dtype=torch.float64)


# 6. Evaluation

***CPM Normalization (for sample-sample correlation)***

get imputations into an array

In [70]:
internal_scvi_X_2 = np.array([x for x in imputed_scvi_2.values()]).reshape(-1, nb_glm.X.shape[1])
internal_cascvi_X = np.array([x for x in imputed_cascvi_1.values()]).reshape(-1, nb_glm.X.shape[1])
internal_cascvi_X_2 = np.array([x for x in imputed_cascvi_2.values()]).reshape(-1, nb_glm.X.shape[1])

internal_cascvi_X_2.shape, internal_cascvi_X.shape, internal_scvi_X_2.shape, imputed_X.shape, internal_avg_X.shape, internal_X.shape

((100, 100), (100, 100), (100, 100), (100, 100), (100, 100), (100, 100))

In [71]:
internal_scvi_X_2.shape

(100, 100)

In [72]:
from sklearn.preprocessing import normalize

norm_internal_X = sc.pp.normalize_total(AnnData(internal_X), target_sum=1e4, inplace=False)['X'] 
norm_scvi_X_2 = sc.pp.normalize_total(AnnData(internal_scvi_X_2), target_sum=1e4, inplace=False)['X']
norm_avg_X = sc.pp.normalize_total(AnnData(internal_avg_X), target_sum=1e4, inplace=False)['X']
norm_imputed_X = sc.pp.normalize_total(AnnData(imputed_X), target_sum=1e4, inplace=False)['X']
norm_cascvi_X = sc.pp.normalize_total(AnnData(internal_cascvi_X), target_sum=1e4, inplace=False)['X']
norm_cascvi_X_2 = sc.pp.normalize_total(AnnData(internal_cascvi_X_2), target_sum=1e4, inplace=False)['X']

norm_internal_X.shape

(100, 100)

## I. Sample-Sample Correlations

***1. Sample-Sample correlation (Without Normalization)***

We will use Scipy to compute a nonparametric rank correlation between the imputed and the groundtruth profiles. The correlation is based on the Spearman Correlation Coefficient.

In [73]:
data = {'groundtruth': internal_X.T, 'cascVI': imputed_X.T, 'scVI': internal_scvi_X_2.T,
        'Average': internal_avg_X.T , 'Avg Oracle': internal_cascvi_X_2.T,
        'MP Oracle': internal_cascvi_X.T
        }
        
df1 = correlations(data, 'None', True)
#df1.head(5)
#plt.show()

***2. Sample-Sample correlation (With ScanPy Normalization)***

In [74]:
data = {'groundtruth': norm_internal_X.T, 'cascVI': norm_imputed_X.T, 'scVI': norm_scvi_X_2.T, 
        'Average': norm_avg_X.T , 'Avg Oracle': norm_cascvi_X_2.T,
        'MP Oracle': norm_cascvi_X.T
        }

df2 = correlations(data, 'None', True)
#df2.head(5)
#plt.show()


## II. Gene-Gene Correlations

***2. Gene-Gene correlation (With Normalization)***

In [75]:
data = {'groundtruth': internal_X, 'cascVI': imputed_X, 'scVI': internal_scvi_X_2,
        'Average': internal_avg_X , 'Avg Oracle': internal_cascvi_X_2,
        'MP Oracle': internal_cascvi_X
        }

df3 = correlations(data, 'None', True)
#df3.head(5)
#plt.show()



***2. Gene-Gene correlation (With Normalization)***

In [76]:
data = {'groundtruth': norm_internal_X, 'cascVI': norm_imputed_X, 'scVI': norm_scvi_X_2, 
        'Average': norm_avg_X , 'Avg Oracle': norm_cascvi_X_2,
        'MP Oracle': norm_cascvi_X
        }

df4 = correlations(data, 'None', True)
#df4.head(5)
#plt.show()



***3. Gene-Gene correlation (With Rank Normalization)***

In [77]:
#data = {'groundtruth': norm_internal_X, 'cascVI': norm_imputed_X, 'scVI': norm_scvi_X_2, 
#        'Average': norm_avg_X , 'cascVI + Avg': norm_cascvi_X_2,
#        'MP Oracle': norm_cascvi_X
#        }

data = {'groundtruth': internal_X, 'cascVI': imputed_X, 'scVI': internal_scvi_X_2,
        'Average': internal_avg_X , 'Avg Oracle': internal_cascvi_X_2,
        'MP Oracle': internal_cascvi_X
        }
        
df5 = correlations(data, 'rank', True)
#df5.head(5)
#plt.show()



### III. Table Summary

In [78]:
columns = ["Method", "Spearman CC", "Pearson CC", "Kendall Tau"]
data = [df1, df2, df3, df4, df5]
#data = [df2, df4]

data 
tables = [[] for i in range(len(data))]

#task = ["Sample-Sample (None)", "Sample-Sample (CPM)", "Gene-Gene (None)", 
           #"Gene-Gene(CPM)", "Gene-Gene (Rank)" ]

for (df, t) in zip(data, tables):
    for m in np.unique(df.Method):
        sub_df = np.round(df[df['Method'] == m].mean(), decimals=3)
        t.append([m, sub_df['Spearman CC'], sub_df['Pearson CC'], sub_df['Kendall Tau']])
        
# Create and style Data Frames
df_table1 = pd.DataFrame(tables[0], columns=columns)
df_table2 = pd.DataFrame(tables[1], columns=columns)
df_table3 = pd.DataFrame(tables[2], columns=columns)
df_table4 = pd.DataFrame(tables[3], columns=columns)
df_table5 = pd.DataFrame(tables[4], columns=columns)

In [79]:
print(" >>> Sample-Sample | No Normalization <<<")
df_table1.head(10)

 >>> Sample-Sample | No Normalization <<<


Unnamed: 0,Method,Spearman CC,Pearson CC,Kendall Tau
0,Average,0.553,0.603,0.478
1,Avg Oracle,0.437,0.465,0.375
2,MP Oracle,0.439,0.501,0.376
3,cascVI,0.357,0.401,0.299
4,scVI,0.313,0.32,0.261


In [80]:
print(">>> Sample-Sample | CPM Normalization <<<")
df_table2.head(10)

>>> Sample-Sample | CPM Normalization <<<


Unnamed: 0,Method,Spearman CC,Pearson CC,Kendall Tau
0,Average,0.553,0.603,0.478
1,Avg Oracle,0.437,0.465,0.375
2,MP Oracle,0.439,0.501,0.376
3,cascVI,0.357,0.401,0.299
4,scVI,0.313,0.32,0.261


In [81]:
print(">>> Gene-Gene | No Normalization <<<")
df_table3.head(10)

>>> Gene-Gene | No Normalization <<<


Unnamed: 0,Method,Spearman CC,Pearson CC,Kendall Tau
0,Average,0.194,0.213,0.159
1,Avg Oracle,0.123,0.148,0.101
2,MP Oracle,0.124,0.13,0.102
3,cascVI,0.077,0.084,0.061
4,scVI,0.081,0.099,0.065


In [82]:
print(">>> Gene-Gene | CPM Normalization <<<")
df_table4.head(10)

>>> Gene-Gene | CPM Normalization <<<


Unnamed: 0,Method,Spearman CC,Pearson CC,Kendall Tau
0,Average,0.189,0.222,0.146
1,Avg Oracle,0.113,0.147,0.089
2,MP Oracle,0.115,0.135,0.091
3,cascVI,0.095,0.116,0.072
4,scVI,0.087,0.115,0.066


In [83]:
print(">>> Gene-Gene | Rank Normalization <<<")
df_table5.head(10)

>>> Gene-Gene | Rank Normalization <<<


Unnamed: 0,Method,Spearman CC,Pearson CC,Kendall Tau
0,Average,0.194,0.194,0.159
1,Avg Oracle,0.123,0.123,0.101
2,MP Oracle,0.124,0.124,0.102
3,cascVI,0.077,0.077,0.061
4,scVI,0.081,0.081,0.065


# 8. Latent Space Analysis

***cascVI***

In [209]:
# Merge internal nodes and leaves
#full_cascvi_latent = construct_latent(tree, cascvi_latent, imputed_z)


#print("CascVI latent space")
#plot_common_ancestor(tree,
#                     full_cascvi_latent,
#                     embedding='umap',
#                     give_labels=False
#                             )

***CascVI + avg***

In [210]:
#full_cascvi_latent_2 = construct_latent(tree, cascvi_latent, imputed_cascvi_2_z)

#print("CascVI + averaging latent space")
#plot_common_ancestor(tree,
#                     full_cascvi_latent_2,
#                     embedding='umap',
#                     give_labels=False
#                             )

***scVI***

In [211]:
# Merge internal nodes and leaves
#full_scvi_latent = construct_latent(tree, scvi_latent, imputed_scvi_2_z)

#print("scVI latent space")
#plot_common_ancestor(tree,
 #                full_scvi_latent,
 #                embedding='umap',
 #                give_labels=False
 #                   )

### k-NN purity

***LEAVES only***

In [212]:
#print("Leaves Only")
#data = {'groundtruth': leaves_z, 'scVI': scvi_latent,
#        'cascVI': cascvi_latent
#        }
#scores = knn_purity(max_neighbors=50,
#                    data=data,
#                    plot=True,
#                    save_fig='/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/tmp'
#                    )

*** Internal nodes only***

In [213]:
#print("Internal nodes Only")
#internal_z, internal_idx, internal_mu = get_internal(glm.z, glm.mu, tree)
#internal_scvi_z, _, _ = get_internal(full_scvi_latent, glm.mu, tree)
#internal_cascvi_z, _, _ = get_internal(full_cascvi_latent, glm.mu, tree)
#internal_cascvi_z_2, _, _ = get_internal(full_cascvi_latent_2, glm.mu, tree)

#data = {'groundtruth': internal_z, 'scVI + avg': internal_scvi_z,
#        'cascVI': internal_cascvi_z, 'cascVI + avg': internal_cascvi_z_2
#        }

#scores = knn_purity(max_neighbors=50,
#              data=data,
#              plot=True,
#              save_fig='/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/tmp/'
#              )

***Full tree***

In [214]:
#print("Full tree")
#data = {'groundtruth': glm.z, 'scVI + avg': full_scvi_latent,
#        'cascVI': full_cascvi_latent, 'cascVI + avg': full_cascvi_latent_2
#        }
#scores = knn_purity(max_neighbors=50,
#              data=data,
#              plot=True)

***Stratified k-NN purity***

In [215]:
#data = {'groundtruth': glm.z, 'scVI + avg': full_scvi_latent,
#        'cascVI': full_cascvi_latent, 'cascVI + avg': full_cascvi_latent_2
#        }

#for k in [2, 5, 10, 20, 35, 50]:
#    print("For {} neighbors".format(k))
#    if k == 10:
#        min_depth = 3
#    elif k == 20:
#        min_depth = 4
#    elif k == 35:
#        min_depth = 6
#    elif k == 50:
#        min_depth = 7
#    else:
#        min_depth = 2
#    scores = knn_purity_stratified(n_neighbors=k,
#                                   tree=tree,
#                                   data=data,
#                                   min_depth=min_depth,
#                                   plot=True)