# 0. Standard imports

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import copy
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('WebAgg')
import numpy as np
import pandas as pd

In [1]:
cd ..

/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/external


In [5]:
cd ..

/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi


In [6]:
%reload_ext autoreload
%matplotlib inline

***import ete3 Tree***

In [7]:
from ete3 import Tree

# Newick file with the simulated topology; the second constructor argument of
# the original call is the newick `format` flag, spelled out here by keyword.
tree_name = "/home/eecs/khalil.ouardini/cas_scvi_topologies/newick_objects/100cells/no_fitness/topology4.nwk"
tree = Tree(tree_name, format=1)

leaves = tree.get_leaves()

# Relabel every node with its level-order rank: the rank is stored as an
# `index` feature and, stringified, becomes the node name.
for rank, tree_node in enumerate(tree.traverse('levelorder')):
    tree_node.name = str(rank)
    tree_node.add_features(index=rank)

In [8]:
# Data
from anndata import AnnData
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from external.dataset.tree import TreeDataset, GeneExpressionDataset
from external.dataset.ppca import PPCA
from external.dataset.anndataset import AnnDatasetFromAnnData

# Models
import scanpy as sc
from external.inference.gaussian_inference import GaussianTrainer
from external.inference.gaussian_tree_inference import GaussianTreeTrainer
from external.inference.gaussian_tree_inference import GaussianTreePosterior
from external.inference.gaussian_inference import GaussianPosterior
from external.models.treevae import TreeVAE
from external.models.gaussian_vae import GaussianVAE
from external.models.gaussian_treevae import GaussianTreeVAE

# Utils
from external.utils.data_util import get_leaves, get_internal
from external.utils.metrics import ks_pvalue, accuracy_imputation, correlations, mse, knn_purity, knn_purity_stratified
from external.utils.plots_util import plot_histograms, plot_scatter_mean, plot_ecdf_ks, plot_density, plot_embedding
from external.utils.plots_util import plot_losses, plot_elbo, plot_common_ancestor, plot_one_gene, training_dashboard
from external.utils.baselines import avg_weighted_baseline, scvi_baseline, scvi_baseline_z, cascvi_baseline_z, avg_baseline_z, construct_latent



# 1. Simulations (Gaussian Likelihood model)

We assume that the latent variables $z \in \mathbb{R}^{N \times D}$ are gaussian (correlated). A phylogenetic tree $\tau$ (with $N$ nodes) encodes the covariance $\Sigma$ of $z$. 

$$\mathbf{z}=(z_1, ..., z_N) \sim \mathcal{N}(0, \Sigma)$$

$z$ is partitioned into two groups:

- the leaves $\mathcal{L} = \{1, ..., L\}$
- the internal nodes $\mathcal{I} = \{L + 1, ..., N\}$

***

***We describe the generative model***:

Consider a dataset $X=\{x_n\}_{n=1}^{L}$ (the node indices are also partitioned such that $\{1, ..., N\} = \mathcal{L} \cup \mathcal{I}$) such that $x_n \in \mathbb{R}^{P}$. We aim to represent each $x_n$ by a latent variable $z_n \in \mathbb{R}^{D}$ of lower dimension, i.e. with $D \ll P$.
We only observe data at the leaves. The generative model is defined $\forall n \in \mathcal{L}$.

The set of principal axes $W$ relates the latent variables to the data.

The corresponding data point is generated via a projection:

$$
\forall n \in \mathcal{L}, x_n =  W z_n + e_n
$$

with $W \in \mathbb{R}^{P \times D}$ and $e_n \sim \mathcal{N}(0, \sigma^2 I_P)$. Thus:


$$
\forall n \in \mathcal{L},  x_n | z_n \sim \mathcal{N}(W z_n, \sigma^2 I_P)
$$

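After marginalization over $z_n \sim \mathcal{N}(0, \Sigma_n)$, where $\Sigma_n$ is the marginal covariance of $z_n$:

$$
\forall n \in \mathcal{L}, x_n \sim \mathcal{N}(0, W \Sigma_n W^T + \sigma^2 I_P)
$$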

The posterior $p(z_n|x_n)$ for each $n$ is also ***tractable***, indeed

$\begin{pmatrix} x_n \\ z_n \end{pmatrix} = \begin{pmatrix} W z_n + e_n \\ z_n \end{pmatrix}$ is a gaussian vector (because for $a \in \mathbb{R}$, $b \in \mathbb{R}$, $a(W z_n + e_n) + bz_n$ is still gaussian) such that:

$$
\begin{pmatrix} x_n \\ z_n \end{pmatrix} \sim \mathcal{N}\left(\begin{pmatrix} 0 \\ 0 \end{pmatrix}, \begin{pmatrix} W \Sigma_n W^T + \sigma^2 I_P  & W\Sigma_n \\ (W\Sigma_n)^T & \Sigma_n \end{pmatrix}\right)
$$

where $\Sigma_n$ is the marginal covariance of $z_n$, i.e. the block of $\Sigma$ associated with node $n$.

Therefore we can use the conditioning formula to infer the mean and the covariance of the (gaussian) posterior $p(z_n|x_n)$:

$$
\mu_{z_n|x_n} = \Sigma_{n} W^{T}(W \Sigma_{n} W^{T} + \sigma^{2} I_{P})^{-1} x_{n} \\
\Sigma_{z_n|x_n} = \Sigma_n - \Sigma_{n} W^{T}(W \Sigma_{n} W^{T} + \sigma^{2} I_{P})^{-1} W \Sigma_{n}
$$

***

***Imputation at internal nodes***

Let $j \in \mathcal{I}$, and $X_{\mathcal{L}} = \{x_1, ..., x_L\}$ the set of observations at the leaves.
We want to infer $p(x_j|X_{\mathcal{L}})$. If we consider that the data at the internal nodes is "seen" and that the generative model is also known $\forall n \in \mathcal{I}$, we could easily (and accurately) compute $p(x_j|X_{\mathcal{L}})$ by using the gaussian conditioning formula on the gaussian vector:

$$
\begin{pmatrix} x_j \\ X_{\mathcal{L}} \end{pmatrix}
$$

In the case of unseen data at the internal nodes, one can estimate the posterior predictive density:

1. $$
p(x_j|X_{\mathcal{L}}) = p(x_j|x_1, ..., x_L) = \int p(x_j|z_j)p(z_j|z_1,...,z_L)\prod_{i=1}^{L}p(z_i|x_i)(dz_j,dz_1,...,dz_L)
$$

Therefore:
$$
p(x_j|x_1, ..., x_L) \approx  p(x_j|z_j)p(z_j|z_1,...,z_L)\prod_{i=1}^{L}p(z_i|x_i)
$$

$$
p(x_j|x_1, ..., x_L) \approx  \mathcal{N}(x_j|Wz_j, \sigma^2I_P)  \mathcal{N}(z_j|\mu_{j|\mathcal{L}}, \Sigma_{j|\mathcal{L}}) \prod_{i=1}^{L} \mathcal{N}(z_i|\mu_{z_i|x_i}, \Sigma_{z_i|x_i})
$$

2. $ p(x_j|X_{\mathcal{L}}) = Wp(z_j|X_{\mathcal{L}}) + p(e_j)$

In [9]:
print(tree)


                     /-54
                  /-|
               /-|   \-55
              |  |
              |   \-33
              |
              |   /-34
              |  |
            /-|  |               /-152
           |  |  |            /-|
           |  |  |           |   \-153
           |  |  |         /-|
           |  |  |        |  |   /-154
           |  |  |      /-|   \-|
           |  |  |     |  |      \-155
           |   \-|     |  |
           |     |     |   \-105
           |     |     |
           |     |     |            /-176
           |     |   /-|         /-|
           |     |  |  |      /-|   \-177
           |     |  |  |     |  |
           |     |  |  |   /-|   \-157
           |     |  |  |  |  |
           |     |  |  |  |  |   /-158
           |      \-|   \-|   \-|
           |        |     |      \-159
           |        |     |
           |        |      \-107
           |        |
           |        |   /-82
           |         \-|
          

***Branch Length***

In [10]:
eps = 1e-3

# Map each node name to the length of the branch leading to it; the root has
# no incoming branch, so it is given 0.0.
branch_length = {
    node.name: (0.0 if node.is_root() else node.dist)
    for node in tree.traverse('levelorder')
}
# Extra entry stored under the 'prior_root' key.
branch_length['prior_root'] = 1.0

In [11]:
import torch
    
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f9337a5add0>

In [12]:
# Simulation settings forwarded to the PPCA constructor below:
# `d` is passed as `latent`, `p` as `dim`, `leaves_only` as `only`.
d = 5
p = 100
vis = True
leaves_only = False
# NOTE(review): in the visible cells `var` is only referenced by the
# commented-out positional call below; the live keyword call does not pass it.
var = 1.0
sigma_scale = 2.0

#ppca = PPCA(tree, p, d, vis, leaves_only, var, sigma_scale)
ppca = PPCA(tree=tree, 
            dim=p, 
            latent=d, 
            vis=vis, 
            only=leaves_only,
            branch_length=branch_length, 
            sigma_scale=sigma_scale
            )

# Run the simulator's latent-sampling step (result is kept on the `ppca` object,
# e.g. `ppca.z` is read by later cells).
ppca.simulate_latent()

***Marginalization***

In [13]:
ppca.simulate_normal()
ppca.W.shape

(100, 5)

In [14]:
lik_tree = ppca.likelihood_obs(leaves_only=False)
lik_leaves = ppca.likelihood_obs(leaves_only=True)

print("Log-Likelihood of the tree {}".format(lik_tree))
print("LogLikelihood of the leaves {}".format(lik_leaves))

Log-Likelihood of the tree -37348.33439194971
LogLikelihood of the leaves -18650.1273364016


***Get data***

In [15]:
# Latent vectors
leaves_z, _, _ = get_leaves(ppca.z, ppca.mu, tree)

#FIXED training set
leaves_X, leaves_idx, mu = get_leaves(ppca.X, ppca.mu, tree)

# internal nodes data (for imputation)
internal_X, internal_idx, internal_mu = get_internal(ppca.X, ppca.mu, tree)

# internal nodes z
internal_z, _, _ = get_internal(ppca.z, ppca.mu, tree)

leaves_X.shape, mu.shape, internal_X.shape, internal_mu.shape, leaves_z.shape

((100, 100), (100, 100), (100, 100), (100, 100), (100, 5))

***Posterior Distributions***

***evidence***

In [16]:
evidence_leaves = ppca.get_evidence_leaves_levelorder(X=ppca.X, dim=ppca.dim)
evidence_leaves.shape

(10000,)

***Leaves covariance***

In [17]:
import time

t = time.time()
ppca.compute_leaves_covariance()

print('Data covariance computation + inversion took {}'.format(time.time() - t))

Data covariance computation + inversion took 52.85932683944702


In [18]:
#inverse_covariance_path = os.path.join('/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/external/dataset/#inverse_covariances'
#if os
#tree_name = 

***Posterior mean and covariance***

In [19]:
posterior_mean, posterior_cov = ppca.compute_posterior()

***Posterior predictive density***

In [20]:
#predictive_mean, predictive_cov = ppca.compute_posterior_predictive()

# Preliminary: Baselines

## Baseline 1: Unweighted Average of gene expression in Clade

The simple idea here is to impute the value of an internal node, with the (un)weighted average of the gene expression values of the leaves, taking the query internal node as the root of the subtree.

In [21]:
# Baseline imputation with `weighted=False`, i.e. the unweighted clade average
# described in the text above.
imputed_avg = avg_weighted_baseline(
    tree=tree,
    weighted=False,
    X=ppca.X,
    rounding=False,
)

# Stack the per-node imputations into a 2-D array with as many columns as
# ppca.X, then keep only the internal-node rows.
n_features = ppca.X.shape[1]
avg_X = np.array(list(imputed_avg.values())).reshape(-1, n_features)
internal_avg_X, _, _ = get_internal(avg_X, ppca.mu, tree)

## Baseline 2: (groundtruth) posterior predictive density

In [22]:
#imputed_ppca = {}
#for n in tree.traverse('levelorder'):
#    if not n.is_leaf():
#        samples = np.array([np.random.multivariate_normal(mean=predictive_mean[n.name],
#                                                            cov=predictive_cov[n.name])
#                           for i in range(20)])
#        imputed_ppca[n.name] = np.mean(samples, axis=0)

#internal_ppca_X = np.array([x for x in imputed_ppca.values()]).reshape(-1, ppca.X.shape[1])

## Baseline 3: Approximation through Message Passing (Oracle)


i.e, 

1. sample $z_1, ..., z_L \sim p(z_1, ..., z_L|x_1, ..., x_L)$ (conditioning formula)
2. impute $z_i \sim p(z_i | z_1, ..., z_L)$ (Message Passing)
3. Decode $x_i | z_i \sim \mathcal{N}(W z_i, \sigma^2 I_P)$ (Generative model)

In [23]:
posterior_mean_corr, posterior_cov_corr = ppca.compute_correlated_posterior()

In [24]:
imputed_mp, imputed_z_mp, predictive_mean_z, predictive_cov_z  = ppca.compute_approx_posterior_predictive(iid=False, use_MP=False, sample_size=200)

  cov=posterior_cov).reshape(-1, self.latent) for i in range(sample_size)])


In [25]:
imputed_X = np.array([x for x in imputed_mp.values()]).reshape(-1, ppca.X.shape[1])

## Baseline 4: Approximation through Message Passing + iid posteriors

i.e, 

1. sample from the marginal conditional $z_l \sim p(z_l|x_l), \forall l \in \{1, ..., L\}$ (conditioning formula)
2. impute $z_i \sim p(z_i | z_1, ..., z_L)$ (Message Passing)
3. Decode $x_i | z_i \sim \mathcal{N}(W z_i, \sigma^2 I_P)$ (Generative model)

In [26]:
imputed_mp2, imputed_z_mp2, predictive_mean_z2, predictive_cov_z2  = ppca.compute_approx_posterior_predictive(iid=True, use_MP=False, sample_size=200)

  cov=posterior_cov[k]) for i in range(sample_size)])


In [27]:
imputed_X2 = np.array([x for x in imputed_mp2.values()]).reshape(-1, ppca.X.shape[1])

## Baseline 5: Gaussian VAE decoded averaged latent space

In [28]:
# anndata
gene_dataset = GeneExpressionDataset()
gene_dataset.populate_from_data(leaves_X)

This dataset has some empty cells, this might fail scVI inference.Data should be filtered with `my_dataset.filter_cells_by_count()


In [29]:
n_epochs = 400

vae = GaussianVAE(gene_dataset.nb_genes,
                  n_hidden=64,
                  n_layers=1,
                  n_latent=ppca.latent,
                  sigma_ldvae=None
              )

#new_weight = torch.from_numpy(ppca.W).float()

#with torch.no_grad():
    #vae.decoder.factor_regressor.fc_layers[0][0].weight = torch.nn.Parameter(new_weight)
    
#for param in vae.decoder.factor_regressor.fc_layers[0][0].parameters():
#    param.requires_grad = False
    
#vae.decoder

In [30]:
# Sanity check run before the trainer is created: push the simulated leaf
# latents through the VAE decoder and compare the decoded mean with the leaf
# `mu` returned by get_leaves.
cuda_z = torch.from_numpy(leaves_z).float()
p_m, p_v = vae.decoder.forward(cuda_z)
p_m = p_m.detach().cpu().numpy()

from sklearn.metrics import mean_squared_error

# Stored under its own name: binding the result to `mse` would shadow the
# `mse` metric helper imported from external.utils.metrics, which later
# evaluation cells call as a function.
decoder_mse = mean_squared_error(p_m, mu)
print("the distance is {}".format(decoder_mse))

the distance is 4.0476048556969575


In [31]:
use_cuda = False

trainer = GaussianTrainer(model=vae,
                          gene_dataset=gene_dataset,
                          train_size=1.0,
                          use_cuda=use_cuda,
                          frequency=10,
                          n_epochs_kl_warmup=None,
                         )

In [32]:
# train VAE
trainer.train(n_epochs=n_epochs, lr=1e-2) 

computing elbo
ELBO: 41120.38671875
computing elbo
ELBO: 41573.375
computing elbo
ELBO: 41639.9296875
training:   1%|▏         | 5/400 [00:00<00:08, 47.43it/s]computing elbo
ELBO: 34975.828125
computing elbo
ELBO: 34961.8125
computing elbo
ELBO: 35086.76953125
training:   4%|▍         | 17/400 [00:00<00:04, 86.04it/s]computing elbo
ELBO: 29249.11328125
computing elbo
ELBO: 29383.296875
computing elbo
ELBO: 29238.982421875
computing elbo
ELBO: 26393.66015625
computing elbo
ELBO: 26281.724609375
computing elbo
ELBO: 26265.5546875
training:   8%|▊         | 30/400 [00:00<00:03, 103.40it/s]computing elbo
ELBO: 24546.20703125
computing elbo
ELBO: 24615.16796875
computing elbo
ELBO: 24582.87109375
training:  11%|█         | 44/400 [00:00<00:03, 115.32it/s]computing elbo
ELBO: 22714.923828125
computing elbo
ELBO: 22663.859375
computing elbo
ELBO: 22670.310546875
training:  14%|█▍        | 56/400 [00:00<00:03, 109.99it/s]computing elbo
ELBO: 21642.6640625
computing elbo
ELBO: 21631.953125
comp

In [33]:
elbo_train = trainer.history["elbo_train_set"]
# Spread the recorded ELBO values over the epochs that were trained so the
# x-axis really is in epochs (presumably one value is recorded every
# `frequency` epochs — confirm against the trainer).
epochs = np.linspace(0, n_epochs, len(elbo_train))
plt.plot(epochs, np.log(elbo_train),
         label="train", color='blue',
         linestyle=':',
         linewidth=3
        )

plt.xlabel('Epoch')
# The curve is log-transformed above, so the axis label says so.
plt.ylabel("log(ELBO)")
plt.legend()
plt.title("Train history Gaussian VAE")
plt.show()

In [34]:
from sklearn.metrics import mean_squared_error

posterior =  trainer.create_posterior(model=vae,
                                      gene_dataset=gene_dataset
                                      )
                                      
qz_m, qz_v = posterior.get_latent(give_mean=True, give_cov=True)
mean_squared_error(qz_m, leaves_z)

2.682994701790448

In [35]:
imputed_avg_vae, imputed_avg_z, imputed_avg_cov_z = avg_baseline_z(tree=tree,
                                 model=vae,
                                 posterior=posterior,
                                 weighted=False,
                                 n_samples_z=1,
                                 gaussian=True,
                                 use_cuda=False,
                                 give_cov=True
                                )

internal_vae_X = np.array([x for x in imputed_avg_vae.values()]).reshape(-1, ppca.X.shape[1])
internal_vae_X.shape    

(100, 100)

In [36]:
posterior.elbo()

computing elbo
ELBO: 16042.712890625


160.42712890625

# 3. Our Model: GaussianTreeVAE

In [37]:
import scanpy as sc

adata = AnnData(leaves_X)
adata.obs_names = [n.name for n in tree.traverse('levelorder') if n.is_leaf()]
scvi_dataset = AnnDatasetFromAnnData(adata, filtering=False)
scvi_dataset.initialize_cell_attribute('barcodes', adata.obs_names)

#TreeDataset
cas_dataset = TreeDataset(scvi_dataset, tree=tree, filtering=False)
cas_dataset

This dataset has some empty cells, this might fail scVI inference.Data should be filtered with `my_dataset.filter_cells_by_count()
X contains continuous and/or negative values. Please use raw UMI/read counts with scVI
This dataset has some empty cells, this might fail scVI inference.Data should be filtered with `my_dataset.filter_cells_by_count()
This dataset has some empty cells, this might fail scVI inference.Data should be filtered with `my_dataset.filter_cells_by_count()
This dataset has some empty cells, this might fail scVI inference.Data should be filtered with `my_dataset.filter_cells_by_count()
go


GeneExpressionDataset object with n_cells x nb_genes = 100 x 100
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'barcodes', 'batch_indices', 'labels', 'local_vars', 'local_means'
    cell_categorical_attribute_names: 'labels', 'batch_indices'

In [38]:
# NOTE(review): this rebinds `use_cuda`, which an earlier cell set to False
# for the GaussianVAE trainer; from here on the value in effect is True.
use_cuda = True
use_MP = True

# Tree-structured Gaussian VAE: same latent size as the simulator
# (ppca.latent) and the same n_hidden / n_layers as the GaussianVAE baseline;
# the branch_length dict is passed as `prior_t`.
treevae = GaussianTreeVAE(cas_dataset.nb_genes,
              tree = cas_dataset.tree,
              n_latent=ppca.latent,
              n_hidden=64,
              n_layers=1,
              prior_t = branch_length,
              use_MP=use_MP,
              sigma_ldvae=None
             )

***Freezing the decoder***

In [39]:
#new_weight = torch.from_numpy(ppca.W).float()

#with torch.no_grad():
    #treevae.decoder.factor_regressor.fc_layers[0][0].weight = torch.nn.Parameter(new_weight)
    
#for param in treevae.decoder.factor_regressor.fc_layers[0][0].parameters():
    #param.requires_grad = False
    
#treevae.decoder

In [40]:
#assert(treevae.decoder.factor_regressor.fc_layers[0][0].weight.numpy().all() == ppca.W.T.all())


***Are we able to generate the gene expression data by decoding the simulated latent space?***

In [41]:
#p_m, p_v = treevae.decoder.forward(torch.from_numpy(leaves_z).float())
#p_m = p_m.detach().numpy()
#p_m.shape, mu.shape

In [42]:
#mse = mean_squared_error(p_m, mu)
#print("the distance is {}".format(mse))

***Training***

In [43]:
# Training hyper-parameters for the tree model.
# NOTE(review): `n_epochs` is rebound here (it was 400 for the GaussianVAE
# run), so re-running earlier cells after this one picks up 600.
n_epochs = 600
lr = 1e-2
lambda_ = 1.0
freq = 10

# All data goes to the training split (train_size=1.0, test_size=0);
# `freq` is passed as the trainer's `frequency` argument.
tree_trainer = GaussianTreeTrainer(
        model = treevae,
        gene_dataset = cas_dataset,
        lambda_ = lambda_,
        train_size=1.0,
        test_size=0,
        use_cuda=use_cuda,
        frequency=freq,
        n_epochs_kl_warmup=None
    )

train_leaves:  [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], [71], [72], [73], [74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [95], [96], [97], [98], [99]]
test_leaves:  []
validation leaves:  []


In [44]:
tree_trainer.train(n_epochs=n_epochs,
              lr=lr
              )

computing elbo
training:   2%|▏         | 9/600 [00:01<01:26,  6.80it/s]computing elbo
training:   3%|▎         | 19/600 [00:02<01:25,  6.79it/s]computing elbo
training:   5%|▍         | 29/600 [00:04<01:23,  6.81it/s]computing elbo
training:   6%|▋         | 39/600 [00:05<01:22,  6.80it/s]computing elbo
training:   8%|▊         | 48/600 [00:07<01:21,  6.80it/s]computing elbo
training:  10%|▉         | 59/600 [00:08<01:03,  8.58it/s]computing elbo
training:  12%|█▏        | 69/600 [00:09<01:01,  8.58it/s]computing elbo
training:  13%|█▎        | 79/600 [00:10<00:59,  8.81it/s]computing elbo
training:  15%|█▍        | 89/600 [00:11<00:59,  8.62it/s]computing elbo
training:  16%|█▋        | 99/600 [00:12<00:58,  8.52it/s]computing elbo
training:  18%|█▊        | 109/600 [00:14<01:25,  5.72it/s]computing elbo
training:  20%|█▉        | 118/600 [00:15<00:28, 16.92it/s]computing elbo
training:  21%|██        | 127/600 [00:15<00:20, 23.30it/s]computing elbo
training:  23%|██▎       | 139/600

In [44]:
training_dashboard(tree_trainer, treevae.encoder_variance)

In [45]:
tree_posterior = tree_trainer.create_posterior(model=treevae,
                                              gene_dataset=cas_dataset,
                                               clades=tree_trainer.clades,
                                               indices=np.arange(len(cas_dataset))
                                              )
tree_latent = tree_posterior.get_latent()
tree_latent.shape, internal_z.shape

((100, 5), (100, 5))

***Missing Value Imputation***

In [55]:
ppca.X.shape

(200, 100)

In [56]:
# CascVI imputations: one entry per internal node, keyed by node name.
imputed = {}
imputed_z = {}
imputed_cov_z = {}
imputed_mean_z = {}
imputed_gt = {}
for n in tree.traverse('levelorder'):
    if n.is_leaf():
        continue
    key = n.name
    imputation = tree_posterior.imputation_internal(
        query_node=n,
        pp_averaging=None,
        z_averaging=None,
        give_mean=True,
    )
    # The call returns four values: imputed expression, latent sample,
    # latent mean and latent (co)variance.
    imputed[key], z, imputed_mean_z[key], imputed_cov_z[key] = imputation
    imputed_z[key] = z.cpu().numpy()
    # Simulated value stored for the same node, looked up by its level-order index.
    imputed_gt[key] = ppca.X[n.index]
                                                                                                                                         

In [47]:
internal_treevae_X = [x for x in imputed.values()]
internal_treevae_X = np.array(internal_treevae_X).reshape(-1, cas_dataset.X.shape[1])

In [48]:
tree_posterior.compute_elbo(treevae)

computing elbo


tensor(144.8523, device='cuda:0', grad_fn=<DivBackward0>)

# Evaluation

## Evaluation 1.a.i: MSE/MAE L2/L1 error

In [49]:
from external.utils.metrics import mse

data = {'groundtruth': imputed_X, 'average': internal_avg_X,
        'gaussian VAE': internal_vae_X
        , 'gaussian treeVAE': internal_treevae_X
       }

In [50]:
results = mse(data=data, metric='MSE')
print('L2')
results

L2


Unnamed: 0,average,gaussian VAE,gaussian treeVAE
MSE,0.716502,0.831619,0.542612
std,0.311072,0.386643,0.18359


In [51]:
results = mse(data=data, metric='L1')
print('L1')
results

L1


Unnamed: 0,average,gaussian VAE,gaussian treeVAE
MSE,0.65287,0.707291,0.581885
std,0.161766,0.16669,0.094513


## Evaluation 1.a.ii: Correlations 

In [52]:
data = {'groundtruth': imputed_X, 'average': internal_avg_X,
        'gaussian VAE': internal_vae_X
        , 'gaussian treeVAE': internal_treevae_X
       }

df1 = correlations(data, 'None', True)

In [53]:
# Average each correlation metric per imputation method. The first key of
# `data` ('groundtruth') is the reference and is skipped.
methods = list(data.keys())[1:]
# numeric_only=True keeps the string `Method` column out of the mean; recent
# pandas versions raise instead of silently dropping non-numeric columns.
data_dict = {
    method: list(df1[df1.Method == method].mean(numeric_only=True))
    for method in methods
}
results_corr = pd.DataFrame.from_dict(data_dict, orient='index', columns=['Spearman CC', 'Pearson CC', 'Kendal Tau CC'])

print('gene-gene correlation')
results_corr.head(10)

gene-gene correlation


Unnamed: 0,Spearman CC,Pearson CC,Kendal Tau CC
average,0.757535,0.781487,0.583554
gaussian VAE,0.771515,0.798094,0.598016
gaussian treeVAE,0.778398,0.813205,0.608073


## Evaluation 1.a.iii: MSE/MAE (L1/L2) of variance in latent space

***Compute MCMC estimates of mean and variance parameters of internal_nodes***

In [57]:
# Per-internal-node estimates returned by tree_posterior.mcmc_estimate,
# keyed by node name.
imputed_mcmc_cov = {}
imputed_mcmc_mean = {}

# NOTE(review): mcmc_mean / mcmc_cov are created empty and never filled in
# this cell (the code that populated them is commented out below), yet later
# cells index them by node name. On a fresh top-to-bottom run those lookups
# would fail — confirm whether the ground-truth branch should be restored.
mcmc_mean = {}
mcmc_cov = {}
# Number of samples passed to mcmc_estimate as `n_samples`.
M = 20

for n in tree.traverse('levelorder'):
    if not n.is_leaf():
        # with Approximate posterior q(z|x)
        imputed_mcmc_mean[n.name], imputed_mcmc_cov[n.name] = tree_posterior.mcmc_estimate(query_node=n,
                                                                                            n_samples=M
                                                                                            )
        # with groundtruth posterior p(z|x)
        #mean=posterior_mean_corr
        #cov=posterior_cov_corr

        #mcmc_mean[n.name], mcmc_cov[n.name] = tree_posterior.mcmc_estimate(query_node=n,
                                                            #n_samples=M,
                                                            #known_latent_dist=(mean, cov)
                                                            #)

In [61]:
mean_squared_error(imputed_gt['0'], imputed['0'][0])

4.817789372250834

In [65]:
(tree & '3').get_distance(tree)

0.3903205879273148

In [None]:
# Collect one row per internal node, in the order given by `columns`.
L = []
columns = ['node index', 'imputation', 'MSE', 'variance', 'node depth' ]
for n in tree.traverse('levelorder'):
    if n.is_leaf():
        continue
    # Branch-length distance between this node and the root (`tree`).
    depth = n.get_distance(tree)
    # Named `node_mse` so the imported `mse` metric helper is not shadowed.
    node_mse = mean_squared_error(imputed_gt[n.name], imputed[n.name][0])
    # NOTE(review): the 'imputation' slot is filled with imputed_cov_z, the
    # same value as 'variance', exactly as in the unfinished draft — confirm
    # which quantity was intended there.
    L.append([n.name, imputed_cov_z[n.name], node_mse, imputed_cov_z[n.name], depth])

In [78]:
tree_posterior.generate()

NameError: name 'qz_m' is not defined

In [77]:
imputed_X.shape

(100, 100)

In [75]:
imputed_mcmc_cov

{'0': array([7.28446868e-09, 1.00191857e-08, 6.80379276e-09, 1.03124424e-08,
        1.05196603e-08]),
 '1': array([1.48976283e-08, 1.58957145e-08, 1.35248075e-08, 6.29483367e-09,
        1.10828952e-08]),
 '2': array([1.28683474e-08, 1.02599737e-08, 1.62219869e-08, 1.33261366e-08,
        1.56799913e-08]),
 '3': array([2.45763020e-08, 3.50777338e-08, 4.28389654e-08, 3.19752706e-08,
        3.86172695e-08]),
 '4': array([7.56488528e-09, 1.76438960e-08, 1.25520052e-08, 6.90304682e-09,
        1.10982570e-08]),
 '5': array([3.03478165e-08, 2.13465449e-08, 1.27259256e-08, 1.73431997e-08,
        2.11476795e-08]),
 '6': array([9.80821938e-08, 8.50571008e-08, 1.36539781e-07, 1.06467712e-07,
        7.90648115e-08]),
 '7': array([1.16072321e-08, 2.79698563e-08, 3.48883512e-08, 3.51792573e-08,
        4.36476263e-08]),
 '8': array([1.99283039e-08, 3.47852663e-08, 2.57790294e-08, 3.36956154e-08,
        2.89116786e-08]),
 '9': array([4.76693049e-09, 8.97401771e-09, 1.16134921e-08, 8.41576369e-

In [73]:
imputed_cov_z

{'0': 0.24302152011611297,
 '1': 0.12194597490486248,
 '2': 0.06176404939036358,
 '3': 0.10438567271388283,
 '4': 0.055510039397438614,
 '5': 0.08229011469305726,
 '6': 0.06773109981008792,
 '7': 0.0661713643254809,
 '8': 0.05695970521827682,
 '9': 0.0566618979685331,
 '10': 0.06938401838877038,
 '11': 0.06943987987633687,
 '12': 0.06288744983743538,
 '13': 0.068251875676195,
 '14': 0.05882370213041573,
 '15': 0.06414659854241286,
 '16': 0.08118295065822549,
 '17': 0.05595980091992343,
 '18': 0.049198380718610296,
 '19': 0.059218430486366096,
 '20': 0.014545132470653246,
 '22': 0.044060400947064135,
 '23': 0.048521139819647716,
 '24': 0.034801686200695035,
 '28': 0.026702235794682047,
 '30': 0.06355847297771028,
 '31': 0.005695149757908138,
 '32': 0.0016672399823764671,
 '35': 0.05647177669384572,
 '36': 0.03524907839636786,
 '37': 0.04537359689349309,
 '38': 0.061546257611307165,
 '39': 0.0536550447658896,
 '42': 0.020117583512007303,
 '43': 0.02512782829616021,
 '45': 0.0170864929197

In [72]:
tree_posterior.empirical_qz_v(n_samples=100, norm=True)

[2.7559455888453757e-06,
 1.584179624159727e-06,
 1.7788317937559551e-06,
 2.4794442550020067e-06,
 8.795793985036122e-07,
 4.113604934920293e-07,
 1.0288863679335622e-06,
 1.3124378121461495e-06,
 2.2075195238431288e-06,
 1.8849702812220777e-06,
 1.680444083441702e-06,
 1.6834717203912646e-06,
 6.616613223207755e-07,
 1.8789993108738142e-06,
 2.1022382211715547e-06,
 1.2385286243127669e-06,
 1.2652724572323076e-06,
 1.1575661376722314e-06,
 2.1825953383276195e-06,
 1.231602087143278e-06,
 1.5889615861198792e-06,
 2.02712136350078e-06,
 2.8952047610826823e-06,
 2.300668724570049e-06,
 3.253590147925166e-06,
 1.1546966209830115e-06,
 2.1390120459758177e-06,
 9.547828743487263e-07,
 1.0369467320271638e-06,
 1.8416248738212853e-06,
 7.352091051327267e-07,
 1.7024493018449285e-06,
 1.8019299658278693e-06,
 1.6451066099432706e-06,
 1.3200710579430444e-06,
 1.5193027697562935e-06,
 1.958755022391822e-06,
 1.281442482187764e-06,
 1.2232892901809918e-06,
 1.586819186031363e-06,
 1.566551548142

In [60]:
data_var = {'groundtruth': mcmc_cov, 'gaussian VAE':imputed_avg_cov_z, 'gaussian treeVAE': imputed_mcmc_cov}
data_mean = {'groundtruth': mcmc_mean, 'gaussian VAE':imputed_avg_z, 'gaussian treeVAE': imputed_mcmc_mean}

In [61]:
def error_latent(tree, predictive_z, imputed_avg_z, imputed_z, do_variance=False):
    """Average latent-space MSE of both models over the internal nodes.

    Every argument except ``tree`` is a dict keyed by node name.

    Args:
        tree: object whose ``traverse('levelorder')`` yields nodes exposing
            ``name`` and ``is_leaf()``.
        predictive_z: reference value per internal node.
        imputed_avg_z: VAE-baseline value per internal node. With
            ``do_variance=True`` it is converted through ``.cpu().numpy()``;
            otherwise its first element (``[0]``) is used.
        imputed_z: TreeVAE value per internal node.
        do_variance: selects how the VAE-baseline entry is read (see above).

    Returns:
        ``(mse_treevae, mse_vae)``, each averaged over the internal nodes.

    Raises:
        ZeroDivisionError: if the tree has no internal node.
    """
    treevae_errors = []
    vae_errors = []
    for node in tree.traverse('levelorder'):
        if node.is_leaf():
            continue
        key = node.name
        reference = predictive_z[key]
        if do_variance:
            vae_estimate = imputed_avg_z[key].cpu().numpy()
        else:
            vae_estimate = imputed_avg_z[key][0]
        treevae_errors.append(mean_squared_error(reference, imputed_z[key]))
        vae_errors.append(mean_squared_error(reference, vae_estimate))

    n_internal = len(treevae_errors)
    return sum(treevae_errors) / n_internal, sum(vae_errors) / n_internal

In [62]:
print('MCMC total Variance')
error_latent(tree, mcmc_cov, imputed_avg_cov_z, imputed_mcmc_cov, True)

(0.0004237482295142097, 0.00035844850200526855)

In [64]:
print('MCMC mean')
error_latent(tree, mcmc_mean, imputed_avg_z, imputed_mcmc_mean)

MCMC mean


(0.9143929862687368, 1.85150199980163)

## Evaluation 1.a.iv: Averaged KL divergence (internal nodes)

***TreeVAE***

In [67]:
from torch.distributions import Normal, kl_divergence
from sklearn.preprocessing import normalize

# Average over internal nodes of KL(reference || TreeVAE estimate), summed
# over latent dimensions. Normal() is given the square root of the stored
# variances as its scale.
kl_mean = 0
N = 0
for n in tree.traverse('levelorder'):
    if n.is_leaf():
        continue
    key = n.name

    # Reference distribution built from mcmc_mean / mcmc_cov.
    reference = Normal(
        torch.from_numpy(mcmc_mean[key]),
        torch.sqrt(torch.from_numpy(mcmc_cov[key])),
    )
    # Distribution built from the TreeVAE estimates.
    estimate = Normal(
        torch.from_numpy(imputed_mcmc_mean[key]),
        torch.sqrt(torch.from_numpy(imputed_mcmc_cov[key])),
    )

    kl_mean = kl_mean + kl_divergence(reference, estimate).sum()
    N = N + 1
kl_mean = kl_mean / N
print('Average Kl divergence {}'.format(kl_mean))

Average Kl divergence 129776636.19200827


***VAE***

In [68]:
# Same average KL as for the TreeVAE, but the approximating distribution is
# built from the averaged-latent VAE baseline (imputed_avg_z / imputed_avg_cov_z).
internal_nodes = [node for node in tree.traverse('levelorder') if not node.is_leaf()]

kl_mean = 0
for node in internal_nodes:
    key = node.name
    reference = Normal(
        torch.from_numpy(mcmc_mean[key]),
        torch.sqrt(torch.from_numpy(mcmc_cov[key])),
    )
    baseline = Normal(
        torch.from_numpy(imputed_avg_z[key]),
        torch.sqrt(imputed_avg_cov_z[key].cpu()),
    )
    kl_mean = kl_mean + kl_divergence(reference, baseline).sum()

N = len(internal_nodes)
kl_mean = kl_mean / N
print('Average Kl divergence {}'.format(kl_mean))

Average Kl divergence 5374.677030454518


## Evaluation 1.a.v: Likelihood (internal nodes)

In [70]:
from scipy.stats import multivariate_normal

def mean_posterior_lik(tree, predictive_mean_z, imputed_avg_z, imputed_mean_z, predictive_cov_z, imputed_avg_cov_z, imputed_cov_z, rng=None):
    """Average log-density, under the reference Gaussian, of one draw per internal node.

    For every internal (non-leaf) node of ``tree`` one sample is drawn from the
    VAE-baseline Gaussian and one from the TreeVAE Gaussian. Each sample is
    scored with the log-pdf of the reference Gaussian
    ``N(predictive_mean_z[name], diag(predictive_cov_z[name]))`` and the scores
    are averaged over the internal nodes.

    Parameters
    ----------
    tree : object exposing ``traverse('levelorder')`` whose nodes provide
        ``is_leaf()`` and ``name``.
    predictive_mean_z : mapping ``name -> 1-D array``, reference mean.
    imputed_avg_z : mapping ``name -> array``; row ``[0]`` is used as the
        baseline mean.
    imputed_mean_z : mapping ``name -> tensor`` (``.cpu().numpy()`` is called
        on it), TreeVAE mean.
    predictive_cov_z : mapping ``name -> 1-D array``, diagonal of the
        reference covariance.
    imputed_avg_cov_z : mapping ``name -> tensor``, diagonal of the baseline
        covariance.
    imputed_cov_z : mapping ``name -> scalar`` used as an isotropic TreeVAE
        variance (it is multiplied by a vector of ones).
    rng : optional ``numpy.random.RandomState`` / ``numpy.random.Generator``.
        When ``None`` (default) NumPy's global random state is used, exactly
        as before; pass a seeded generator for reproducible results.

    Returns
    -------
    list
        ``[vae_lik, treevae_lik]``, the two averaged log-densities.

    Raises
    ------
    ValueError
        If ``tree`` contains no internal node (nothing to average over).
    """
    sampler = np.random if rng is None else rng

    treevae_lik = 0
    vae_lik = 0
    N = 0
    for n in tree.traverse('levelorder'):
        if n.is_leaf():
            continue

        # means
        true_mean = predictive_mean_z[n.name]
        vae_mean = imputed_avg_z[n.name][0]
        treevae_mean = imputed_mean_z[n.name].cpu().numpy()

        # covariances (all diagonal)
        true_cov = np.diag(predictive_cov_z[n.name])
        vae_cov = np.diag(imputed_avg_cov_z[n.name].cpu().numpy())
        # The latent dimension is read from the mean vector itself instead of
        # a notebook-global `d`, so the function no longer depends on hidden
        # kernel state; the sampler requires both to agree anyway.
        latent_dim = treevae_mean.shape[0]
        treevae_cov = np.diag(imputed_cov_z[n.name] * np.ones(latent_dim))

        # One draw per model and per node.
        sample_treevae = sampler.multivariate_normal(mean=treevae_mean,
                                                     cov=treevae_cov)
        sample_vae = sampler.multivariate_normal(mean=vae_mean,
                                                 cov=vae_cov)

        # Score both draws under the reference Gaussian.
        treevae_lik += multivariate_normal.logpdf(sample_treevae,
                                                  true_mean,
                                                  true_cov)
        vae_lik += multivariate_normal.logpdf(sample_vae,
                                              true_mean,
                                              true_cov)
        N += 1

    if N == 0:
        raise ValueError("tree has no internal nodes; cannot average the likelihood")

    vae_lik /= N
    treevae_lik /= N
    return [vae_lik, treevae_lik]

# Displays [vae_lik, treevae_lik]. The function draws random samples, so the
# numbers change between runs unless NumPy's random state is seeded beforehand.
mean_posterior_lik(tree, predictive_mean_z, imputed_avg_z, imputed_mean_z, predictive_cov_z, imputed_avg_cov_z, imputed_cov_z)

[-461.48781196850234, -251.76246046200959]

## Evaluation 1.a.vi: (More) Latent space metrics 

### 1. Prior likelihood

In [71]:
# Latent encodings returned by the two posterior objects: tree_posterior for
# cascVI and posterior for scVI (both objects are created outside this section).
cascvi_latent = tree_posterior.get_latent()
scvi_latent = posterior.get_latent()

# Show both shapes side by side as a sanity check that they match.
scvi_latent.shape, cascvi_latent.shape

((100, 5), (100, 5))

In [72]:
# Run treevae's message passing with the scVI encodings as the leaf values and
# print the aggregated value (reported below as the likelihood of the encodings).
# The calls mutate treevae's internal state, so their order is kept as is.
scvi_dim = scvi_latent.shape[1]

treevae.initialize_visit()
treevae.initialize_messages(scvi_latent, cas_dataset.barcodes, scvi_dim)

# `tree & name` looks the root node up by name in the tree.
scvi_root = treevae.tree & treevae.root
treevae.perform_message_passing(scvi_root, scvi_dim, False)

mp_lik_vae = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of scVI encodings: ", mp_lik_vae.item())

Likelihood of scVI encodings:  -2826.691510321084


In [73]:
# Run treevae's message passing with the cascVI encodings as the leaf values
# and print the aggregated value (reported below as the likelihood of the encodings).
# The calls mutate treevae's internal state, so their order is kept as is.
cascvi_dim = cascvi_latent.shape[1]

treevae.initialize_visit()
treevae.initialize_messages(cascvi_latent, cas_dataset.barcodes, cascvi_dim)

# `tree & name` looks the root node up by name in the tree.
cascvi_root = treevae.tree & treevae.root
treevae.perform_message_passing(cascvi_root, cascvi_dim, False)

mp_lik_cascvi = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of cascVI encodings: ", mp_lik_cascvi.item())

Likelihood of cascVI encodings:  -369.6501111715011


In [74]:
# Run treevae's message passing with leaves_z as the leaf values and print the
# aggregated value (reported below as the likelihood of the observations).
# NOTE(review): the dimension is read from cascvi_latent although leaves_z is
# the array being passed; presumably both have the same number of columns -
# confirm, or switch to the dimension of leaves_z.
obs_dim = cascvi_latent.shape[1]

treevae.initialize_visit()
treevae.initialize_messages(leaves_z, cas_dataset.barcodes, obs_dim)

# `tree & name` looks the root node up by name in the tree.
obs_root = treevae.tree & treevae.root
treevae.perform_message_passing(obs_root, obs_dim, False)

mp_lik = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of observations: ", mp_lik.item())

Likelihood of observations:  -222.3884709247602


### 2. k-NN purity

In [121]:
# Combine each model's leaf encodings with its imputed values via
# construct_latent (imported from external.utils.baselines); presumably this
# yields one latent array covering every node of the tree - TODO confirm.
full_cascvi_latent = construct_latent(tree, cascvi_latent, imputed_z)
full_scvi_latent = construct_latent(tree, scvi_latent, imputed_avg_z)

In [143]:
print("Leaves Only")

# Embeddings compared by knn_purity: leaves_z under 'groundtruth', then the
# two model encodings. Key order is kept identical to the original cell.
data = dict(groundtruth=leaves_z, scVI=scvi_latent, cascVI=cascvi_latent)

scores = knn_purity(
    data=data,
    max_neighbors=30,
    plot=False,
    save_fig='/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/knn_purity_plot.png',
)

# Displayed value: mean of each model's score array, as an (scVI, cascVI) pair.
tuple(np.mean(scores[method]) for method in ('scVI', 'cascVI'))

Leaves Only
No handles with labels found to put in legend.


(0.446926282876483, 0.5216776947687367)

In [204]:
# Start from an empty table; a later cell appends df2 to it.
# Re-running this cell resets the accumulated results.
df = pd.DataFrame()
df

In [199]:
# One row per neighbourhood size K, with the matching scVI / cascVI scores.
# The hard-coded range has to contain as many values as each scores[...] array,
# otherwise the DataFrame constructor raises.
neighbors = list(range(2, 30))
data = {
    'K': neighbors,
    **{method: scores[method] for method in ('scVI', 'cascVI')},
}
df2 = pd.DataFrame(data)

In [205]:
# Append the scores of the current run (df2) to the accumulated table.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the equivalent that works on every pandas version.
# Re-running this cell appends df2 again.
df = pd.concat([df, df2])
df

Unnamed: 0,K,scVI,cascVI
0,2,0.492537,0.532567
1,3,0.431981,0.507538
2,4,0.449275,0.520913
3,5,0.432665,0.536098
4,6,0.413428,0.540436
5,7,0.398601,0.526718
6,8,0.41218,0.52381
7,9,0.413983,0.505017
8,10,0.424501,0.500375
9,11,0.422107,0.503759


In [142]:
print("Internal nodes Only")

# get_internal returns a 3-tuple; only its first element is needed for the
# two model embeddings, while all three are kept for the reference call.
internal_z, internal_idx, internal_mu = get_internal(ppca.z, ppca.mu, tree)
internal_scvi_z, _, _ = get_internal(full_scvi_latent, ppca.mu, tree)
internal_cascvi_z, _, _ = get_internal(full_cascvi_latent, ppca.mu, tree)

# Same key order as in the other k-NN purity cells.
data = dict(groundtruth=internal_z, scVI=internal_scvi_z, cascVI=internal_cascvi_z)

scores = knn_purity(
    data=data,
    max_neighbors=30,
    plot=False,
    save_fig='/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/knn_purity.png',
)

# Displayed value: mean of each model's score array, as an (scVI, cascVI) pair.
tuple(np.mean(scores[method]) for method in ('scVI', 'cascVI'))

Internal nodes Only
No handles with labels found to put in legend.


(0.36514020314439677, 0.45141649792859023)

In [141]:
print("Full tree")

# Compare the full latent arrays (ppca.z as reference) for both models.
data = dict(groundtruth=ppca.z, scVI=full_scvi_latent, cascVI=full_cascvi_latent)

scores = knn_purity(data=data, max_neighbors=30, plot=False)

# Displayed value: mean of each model's score array, as an (scVI, cascVI) pair.
tuple(np.mean(scores[method]) for method in ('scVI', 'cascVI'))

Full tree
No handles with labels found to put in legend.


(0.3584607764591899, 0.4458153596517597)