# 0. Standard imports

In [104]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import copy
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('WebAgg')
import numpy as np
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
cd ..

/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/external


In [3]:
cd ..

/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi


In [4]:
%reload_ext autoreload
%matplotlib inline

***import ete3 Tree***

In [5]:
from ete3 import Tree

# Newick file with the simulated topology used throughout the notebook.
tree_name = "/home/eecs/khalil.ouardini/cas_scvi_topologies/newick_objects/100cells/no_fitness/topology4.nwk"
# The second argument of ete3's Tree constructor is the newick `format` code.
tree = Tree(tree_name, format=1)

leaves = tree.get_leaves()

# Rename every node after its level-order rank and keep the rank as a
# numeric `index` feature as well.
for i, n in enumerate(tree.traverse('levelorder')):
    n.name = str(i)
    n.add_features(index=i)

In [157]:
# Data
from anndata import AnnData
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from external.dataset.tree import TreeDataset, GeneExpressionDataset
from external.dataset.ppca import PPCA
from external.dataset.anndataset import AnnDatasetFromAnnData

# Models
import scanpy as sc
from external.inference.gaussian_inference import GaussianTrainer
from external.inference.gaussian_tree_inference import GaussianTreeTrainer
from external.inference.gaussian_tree_inference import GaussianTreePosterior
from external.inference.gaussian_inference import GaussianPosterior
from external.models.treevae import TreeVAE
from external.models.gaussian_vae import GaussianVAE
from external.models.gaussian_treevae import GaussianTreeVAE

# Utils
from external.utils.data_util import get_leaves, get_internal
from external.utils.metrics import ks_pvalue, accuracy_imputation, correlations, mse, knn_purity, knn_purity_stratified
from external.utils.plots_util import plot_histograms, plot_scatter_mean, plot_ecdf_ks, plot_density, plot_embedding
from external.utils.plots_util import plot_losses, plot_elbo, plot_common_ancestor, plot_one_gene, training_dashboard
from external.utils.baselines import avg_weighted_baseline, scvi_baseline, scvi_baseline_z, cascvi_baseline_z, avg_baseline_z, construct_latent

# 1. Simulations (Gaussian Likelihood model)

We assume that the latent variables $z \in \mathbb{R}^{N \times D}$ are gaussian (correlated). A phylogenetic tree $\tau$ (with $N$ nodes) encodes the covariance $\Sigma$ of $z$. 

$$\mathbf{z}=(z_1, ..., z_N) \sim \mathcal{N}(0, \Sigma)$$

$z$ is partitioned into two groups:

- the leaves $\mathcal{L} = \{1, ..., L\}$
- the internal nodes $\mathcal{I} = \{L + 1, ..., N\}$

***

***We describe the generative model***:

Consider a dataset $X=\{x_n\}_{n=1}^{L}$ (also partitioned such that $\{1, ..., N\} = \mathcal{L} \cup \mathcal{I}$) such that $x_n \in \mathbb{R}^{P}$. We aim to represent each $x_n$ by a latent variable $z_n \in \mathbb{R}^{D}$ of lower dimension, with $D \ll P$.
We only observe data at the leaves. The generative model is defined $\forall n \in \mathcal{L}$.

The set of principal axes $W$ relates the latent variables to the data.

The corresponding data point is generated via a projection:

$$
\forall n \in \mathcal{L}, x_n =  W z_n + e_n
$$

with $W \in \mathbb{R}^{P \times D}$ and $e_n \sim \mathcal{N}(0, \sigma^2 I_P)$. Thus:


$$
\forall n \in \mathcal{L},  x_n | z_n \sim \mathcal{N}(W z_n, \sigma^2 I_P)
$$

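After marginalization over $z_n \sim \mathcal{N}(0, \Sigma_n)$, where $\Sigma_n$ is the marginal covariance of $z_n$:

$$
\forall n \in \mathcal{L}, x_n \sim \mathcal{N}(0, W \Sigma_n W^T + \sigma^2 I_P)
$$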

The posterior $p(z_n|x_n)$ for each $n$ is also ***tractable***, indeed

$\begin{pmatrix} x_n \\ z_n \end{pmatrix} = \begin{pmatrix} W z_n + e_n \\ z_n \end{pmatrix}$ is a Gaussian vector (because for any $a \in \mathbb{R}^{P}$, $b \in \mathbb{R}^{D}$, the linear combination $a^T(W z_n + e_n) + b^T z_n$ is still Gaussian) such that:

$$
\begin{pmatrix} x_n \\ z_n \end{pmatrix} \sim \mathcal{N}(\begin{pmatrix} 0 \\ 0 \end{pmatrix}, \begin{pmatrix} W \Sigma_n W^T + \sigma^2 I_P  & W\Sigma_n \\ (W\Sigma_n)^T & \Sigma_n \end{pmatrix})
$$

where $\Sigma_n$ is the marginalized covariance $\Sigma$ of $z_n$

Therefore we can use the conditioning formula to infer the mean and the covariance of the (gaussian) posterior $p(z_n|x_n)$:

$$
\mu_{z_n|x_n} = \Sigma_{n} W^{T} (W \Sigma_{n} W^{T} + \sigma^{2} I_{P})^{-1} x_{n} \\
\Sigma_{z_n|x_n} = \Sigma_n - \Sigma_{n} W^{T} (W \Sigma_{n} W^{T} + \sigma^{2} I_{P})^{-1} W \Sigma_{n}
$$

***

***Imputation at internal nodes***

Let $j \in \mathcal{I}$, and $X_{\mathcal{L}} = \{x_1, ..., x_L\}$ the set of leaves.
We want to infer $p(x_j|X_{\mathcal{L}})$. If we consider that the data at the internal nodes is "seen" and that the generative model is also known $\forall n \in \mathcal{I}$, we could easily (and accurately) compute $p(x_j|X_{\mathcal{L}})$ by using the gaussian conditioning formula on the gaussian vector:

$$
\begin{pmatrix} x_j \\ X_{\mathcal{L}} \end{pmatrix}
$$

In the case of unseen data at the internal nodes, one can estimate the posterior predictive density:

1. $$
p(x_j|X_{\mathcal{L}}) = p(x_j|x_1, ..., x_L) = \int p(x_j|z_j)p(z_j|z_1,...,z_L)\prod_{i=1}^{L}p(z_i|x_i)(dz_j,dz_1,...,dz_L)
$$

Therefore:
$$
p(x_j|x_1, ..., x_L) \approx  p(x_j|z_j)p(z_j|z_1,...,z_L)\prod_{i=1}^{L}p(z_i|x_i)
$$

$$
p(x_j|x_1, ..., x_L) \approx  \mathcal{N}(x_j|Wz_j, \sigma^2I_P)  \mathcal{N}(z_j|\mu_{j|\mathcal{L}}, \Sigma_{j|\mathcal{L}}) \prod_{i=1}^{L} \mathcal{N}(z_i|\mu_{z_i|x_i}, \Sigma_{z_i|x_i})
$$

2. $ p(x_j|X_{\mathcal{L}}) = Wp(z_j|X_{\mathcal{L}}) + p(e_j)$

In [7]:
print(tree)


                     /-54
                  /-|
               /-|   \-55
              |  |
              |   \-33
              |
              |   /-34
              |  |
            /-|  |               /-152
           |  |  |            /-|
           |  |  |           |   \-153
           |  |  |         /-|
           |  |  |        |  |   /-154
           |  |  |      /-|   \-|
           |  |  |     |  |      \-155
           |   \-|     |  |
           |     |     |   \-105
           |     |     |
           |     |     |            /-176
           |     |   /-|         /-|
           |     |  |  |      /-|   \-177
           |     |  |  |     |  |
           |     |  |  |   /-|   \-157
           |     |  |  |  |  |
           |     |  |  |  |  |   /-158
           |      \-|   \-|   \-|
           |        |     |      \-159
           |        |     |
           |        |      \-107
           |        |
           |        |   /-82
           |         \-|
          

***Branch Length***

In [8]:
eps = 1e-3

# Map node name -> length of the edge leading to that node.
# The root has no incoming edge, so it is given length 0.0.
branch_length = {}
for node in tree.traverse('levelorder'):
    branch_length[node.name] = 0.0 if node.is_root() else node.dist

# Extra entry stored under the 'prior_root' key.
branch_length['prior_root'] = 1.0

In [9]:
import torch
    
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f15805ba3f0>

In [10]:
# Settings for the PPCA simulation on the tree.
# Latent dimensionality (passed to PPCA as `latent`).
d = 5
# Observed dimensionality (passed to PPCA as `dim`).
p = 100
vis = True
leaves_only = False
# NOTE(review): `var` is only referenced by the commented-out positional call
# below; the keyword call that actually runs does not pass it.
var = 1.0
sigma_scale = 2.0

#ppca = PPCA(tree, p, d, vis, leaves_only, var, sigma_scale)
ppca = PPCA(tree=tree, 
            dim=p, 
            latent=d, 
            vis=vis, 
            only=leaves_only,
            branch_length=branch_length, 
            sigma_scale=sigma_scale
            )

# Presumably draws the latent vectors for the tree nodes — TODO confirm
# against PPCA.simulate_latent.
ppca.simulate_latent()

***Marginalization***

In [11]:
ppca.simulate_normal()
ppca.W.shape

(100, 5)

In [12]:
lik_tree = ppca.likelihood_obs(leaves_only=False)
lik_leaves = ppca.likelihood_obs(leaves_only=True)

print("Log-Likelihood of the tree {}".format(lik_tree))
print("LogLikelihood of the leaves {}".format(lik_leaves))

Log-Likelihood of the tree -37348.33439194971
LogLikelihood of the leaves -18650.1273364016


***Get data***

In [34]:
# Latent vectors
leaves_z, _, _ = get_leaves(ppca.z, ppca.mu, tree)

#FIXED training set
leaves_X, leaves_idx, mu = get_leaves(ppca.X, ppca.mu, tree)

# internal nodes data (for imputation)
internal_X, internal_idx, internal_mu = get_internal(ppca.X, ppca.mu, tree)

# internal nodes z
internal_z, _, _ = get_internal(ppca.z, ppca.mu, tree)

leaves_X.shape, mu.shape, internal_X.shape, internal_mu.shape, leaves_z.shape

((100, 100), (100, 100), (100, 100), (100, 100), (100, 5))

***Posterior Distributions***

***evidence***

In [13]:
evidence_leaves = ppca.get_evidence_leaves_levelorder(X=ppca.X, dim=ppca.dim)
evidence_leaves.shape

(10000,)

***Leaves covariance***

In [15]:
import time

t = time.time()
ppca.compute_leaves_covariance()

print('Data covariance computation + inversion took {}'.format(time.time() - t))

Data covariance computation + inversion took 57.30646324157715


In [16]:
#inverse_covariance_path = os.path.join('/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/external/dataset/#inverse_covariances'
#if os
#tree_name = 

***Posterior mean and covariance***

In [16]:
posterior_mean, posterior_cov = ppca.compute_posterior()

***Posterior predictive density***

In [17]:
#predictive_mean, predictive_cov = ppca.compute_posterior_predictive()

# Preliminary: Baselines

## Baseline 1: Unweighted Average of gene expression in Clade

The simple idea here is to impute the value of an internal node, with the (un)weighted average of the gene expression values of the leaves, taking the query internal node as the root of the subtree.

In [18]:
# Baseline 1: unweighted clade average, without rounding.
imputed_avg = avg_weighted_baseline(
    tree=tree,
    weighted=False,
    X=ppca.X,
    rounding=False,
)

# Stack the per-node imputations into an array with one column per gene,
# then keep only the rows of the internal nodes.
avg_X = np.array(list(imputed_avg.values())).reshape(-1, ppca.X.shape[1])
internal_avg_X, _, _ = get_internal(avg_X, ppca.mu, tree)

## Baseline 2: (groundtruth) posterior predictive density

In [19]:
#imputed_ppca = {}
#for n in tree.traverse('levelorder'):
#    if not n.is_leaf():
#        samples = np.array([np.random.multivariate_normal(mean=predictive_mean[n.name],
#                                                            cov=predictive_cov[n.name])
#                           for i in range(20)])
#        imputed_ppca[n.name] = np.mean(samples, axis=0)

#internal_ppca_X = np.array([x for x in imputed_ppca.values()]).reshape(-1, ppca.X.shape[1])

## Baseline 3: Approximation through Message Passing (Oracle)


i.e, 

1. sample $z_1, ..., z_n \sim p(z_1, ..., z_n|x_1, ..., x_n)$ (conditioning formula)
2. impute $z_i \sim p(z_i | z_1, ..., z_n)$ (Message Passing)
3. Decode $x_i | z_i \sim \mathcal{N}(W z_i, \sigma^2 I_P)$ (Generative model)

In [20]:
posterior_mean_corr, posterior_cov_corr = ppca.compute_correlated_posterior()

In [21]:
imputed_mp, imputed_z_mp, predictive_mean_z, predictive_cov_z  = ppca.compute_approx_posterior_predictive(iid=False, use_MP=False, sample_size=200)

  cov=posterior_cov).reshape(-1, self.latent) for i in range(sample_size)])


In [22]:
imputed_X = np.array([x for x in imputed_mp.values()]).reshape(-1, ppca.X.shape[1])

## Baseline 4: Approximation through Message Passing + iid posteriors

i.e, 

1. sample from the marginal conditionals $z_l \sim p(z_l|x_l) \; \forall l \in \{1, ..., L\}$ (conditioning formula)
2. impute $z_i \sim p(z_i | z_1, ..., z_n)$ (Message Passing)
3. Decode $x_i | z_i \sim \mathcal{N}(W z_i, \sigma^2 I_P)$ (Generative model)

In [23]:
imputed_mp2, imputed_z_mp2, predictive_mean_z2, predictive_cov_z2  = ppca.compute_approx_posterior_predictive(iid=True, use_MP=False, sample_size=200)

  cov=posterior_cov[k]) for i in range(sample_size)])


In [24]:
imputed_X2 = np.array([x for x in imputed_mp2.values()]).reshape(-1, ppca.X.shape[1])

## Baseline 5: Gaussian VAE decoded averaged latent space

In [158]:
# anndata
gene_dataset = GeneExpressionDataset()
gene_dataset.populate_from_data(leaves_X)

This dataset has some empty cells, this might fail scVI inference.Data should be filtered with `my_dataset.filter_cells_by_count()


In [159]:
n_epochs = 400

vae = GaussianVAE(gene_dataset.nb_genes,
                  n_hidden=64,
                  n_layers=1,
                  n_latent=ppca.latent,
                  sigma_ldvae=None
              )

#new_weight = torch.from_numpy(ppca.W).float()

#with torch.no_grad():
    #vae.decoder.factor_regressor.fc_layers[0][0].weight = torch.nn.Parameter(new_weight)
    
#for param in vae.decoder.factor_regressor.fc_layers[0][0].parameters():
#    param.requires_grad = False
    
#vae.decoder

In [160]:
from sklearn.metrics import mean_squared_error

# Decode the simulated leaf latents with the (still untrained) VAE decoder and
# measure how far the decoded means are from the simulated means `mu`.
cuda_z = torch.from_numpy(leaves_z).float()
p_m, p_v = vae.decoder.forward(cuda_z)
p_m = p_m.detach().cpu().numpy()

# Stored under its own name: assigning to `mse` would overwrite the `mse`
# helper imported from external.utils.metrics, which is called later on.
decoder_mse = mean_squared_error(p_m, mu)
print("the distance is {}".format(decoder_mse))

the distance is 4.171938935357741


In [161]:
use_cuda = False

trainer = GaussianTrainer(model=vae,
                          gene_dataset=gene_dataset,
                          train_size=1.0,
                          use_cuda=use_cuda,
                          frequency=10,
                          n_epochs_kl_warmup=None,
                         )

In [162]:
# train VAE
trainer.train(n_epochs=n_epochs, lr=1e-2) 

computing elbo
ELBO: 40738.41796875
computing elbo
ELBO: 40802.6875
computing elbo
ELBO: 40781.56640625
training:   0%|          | 0/400 [00:00<?, ?it/s]computing elbo
ELBO: 34870.8984375
computing elbo
ELBO: 34841.890625
computing elbo
ELBO: 34851.796875
training:   2%|▎         | 10/400 [00:00<00:04, 94.03it/s]computing elbo
ELBO: 28965.013671875
computing elbo
ELBO: 28946.02734375
computing elbo
ELBO: 29005.869140625
training:   6%|▌         | 22/400 [00:00<00:03, 106.12it/s]computing elbo
ELBO: 27393.78515625
computing elbo
ELBO: 27490.80078125
computing elbo
ELBO: 27400.142578125
training:   8%|▊         | 34/400 [00:00<00:03, 109.35it/s]computing elbo
ELBO: 25259.228515625
computing elbo
ELBO: 25235.6015625
computing elbo
ELBO: 25177.056640625
training:  12%|█▏        | 46/400 [00:00<00:03, 110.99it/s]computing elbo
ELBO: 23687.515625
computing elbo
ELBO: 23674.3125
computing elbo
ELBO: 23650.0625
training:  14%|█▍        | 58/400 [00:00<00:03, 112.06it/s]computing elbo
ELBO: 216

In [163]:
elbo_train = trainer.history["elbo_train_set"]

# Spread the recorded values over the epochs that were actually trained, so
# the x-axis is in epochs rather than in history-entry index.
# NOTE(review): assumes the history covers epochs 0..n_epochs evenly — confirm
# against the trainer's `frequency` setting.
x = np.linspace(0, n_epochs, len(elbo_train))

fig, ax = plt.subplots()
ax.plot(x, np.log(elbo_train),
        label="train", color='blue',
        linestyle=':',
        linewidth=3
        )

ax.set_xlabel('Epoch')
# The curve is the logarithm of the recorded values, so label it as such.
ax.set_ylabel("log ELBO")
ax.legend()
ax.set_title("Train history Gaussian VAE")
plt.show()

Press Ctrl+C to stop WebAgg server


RuntimeError: This event loop is already running

In [164]:
from sklearn.metrics import mean_squared_error

posterior =  trainer.create_posterior(model=vae,
                                      gene_dataset=gene_dataset
                                      )
                                      
qz_m, qz_v = posterior.get_latent(give_mean=True, give_cov=True)
mean_squared_error(qz_m, leaves_z)

2.2993859185397083

In [165]:
imputed_avg_vae, imputed_avg_z, imputed_avg_cov_z = avg_baseline_z(tree=tree,
                                 model=vae,
                                 posterior=posterior,
                                 weighted=False,
                                 n_samples_z=1,
                                 gaussian=True,
                                 use_cuda=False,
                                 give_cov=True
                                )

internal_vae_X = np.array([x for x in imputed_avg_vae.values()]).reshape(-1, ppca.X.shape[1])
internal_vae_X.shape    

(100, 100)

In [167]:
posterior.elbo()

computing elbo
ELBO: 15272.53515625


152.7253515625

# 3. Our Model: GaussianTreeVAE

In [43]:
import scanpy as sc

adata = AnnData(leaves_X)
adata.obs_names = [n.name for n in tree.traverse('levelorder') if n.is_leaf()]
scvi_dataset = AnnDatasetFromAnnData(adata, filtering=False)
scvi_dataset.initialize_cell_attribute('barcodes', adata.obs_names)

#TreeDataset
cas_dataset = TreeDataset(scvi_dataset, tree=tree, filtering=False)
cas_dataset

This dataset has some empty cells, this might fail scVI inference.Data should be filtered with `my_dataset.filter_cells_by_count()
X contains continuous and/or negative values. Please use raw UMI/read counts with scVI
This dataset has some empty cells, this might fail scVI inference.Data should be filtered with `my_dataset.filter_cells_by_count()
This dataset has some empty cells, this might fail scVI inference.Data should be filtered with `my_dataset.filter_cells_by_count()
This dataset has some empty cells, this might fail scVI inference.Data should be filtered with `my_dataset.filter_cells_by_count()
go


GeneExpressionDataset object with n_cells x nb_genes = 100 x 100
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'local_vars', 'labels', 'batch_indices', 'barcodes', 'local_means'
    cell_categorical_attribute_names: 'batch_indices', 'labels'

In [44]:
# Use the GPU only when one is available, so the notebook also runs on
# CPU-only machines instead of failing at training time.
use_cuda = torch.cuda.is_available()
use_MP = True

# Tree-structured Gaussian VAE with the same latent size as the simulation and
# the branch lengths computed above passed as `prior_t`.
treevae = GaussianTreeVAE(cas_dataset.nb_genes,
              tree = cas_dataset.tree,
              n_latent=ppca.latent,
              n_hidden=64,
              n_layers=1,
              prior_t = branch_length,
              use_MP=use_MP,
              sigma_ldvae=None
             )

***Freezing the decoder***

In [45]:
#new_weight = torch.from_numpy(ppca.W).float()

#with torch.no_grad():
    #treevae.decoder.factor_regressor.fc_layers[0][0].weight = torch.nn.Parameter(new_weight)
    
#for param in treevae.decoder.factor_regressor.fc_layers[0][0].parameters():
    #param.requires_grad = False
    
#treevae.decoder

In [46]:
#assert(treevae.decoder.factor_regressor.fc_layers[0][0].weight.numpy().all() == ppca.W.T.all())


***Are we able to generate the gene expression data by decoding the simulated latent space?***

In [47]:
#p_m, p_v = treevae.decoder.forward(torch.from_numpy(leaves_z).float())
#p_m = p_m.detach().numpy()
#p_m.shape, mu.shape

In [48]:
#mse = mean_squared_error(p_m, mu)
#print("the distance is {}".format(mse))

***Training***

In [49]:
n_epochs = 600
lr = 1e-2
lambda_ = 1.0
freq = 10

tree_trainer = GaussianTreeTrainer(
        model = treevae,
        gene_dataset = cas_dataset,
        lambda_ = lambda_,
        train_size=1.0,
        test_size=0,
        use_cuda=use_cuda,
        frequency=freq,
        n_epochs_kl_warmup=None
    )

train_leaves:  [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], [71], [72], [73], [74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [95], [96], [97], [98], [99]]
test_leaves:  []
validation leaves:  []


In [50]:
tree_trainer.train(n_epochs=n_epochs,
              lr=lr
              )

computing elbo
training:   2%|▏         | 9/600 [00:00<00:23, 25.16it/s]computing elbo
training:   3%|▎         | 18/600 [00:00<00:23, 24.89it/s]computing elbo
training:   4%|▍         | 27/600 [00:01<00:23, 24.70it/s]computing elbo
training:   6%|▋         | 39/600 [00:01<00:22, 25.11it/s]computing elbo
training:   8%|▊         | 48/600 [00:01<00:22, 24.76it/s]computing elbo
training:  10%|▉         | 57/600 [00:02<00:22, 23.85it/s]computing elbo
training:  12%|█▏        | 69/600 [00:02<00:21, 24.72it/s]computing elbo
training:  13%|█▎        | 78/600 [00:03<00:20, 24.86it/s]computing elbo
training:  14%|█▍        | 87/600 [00:03<00:20, 24.84it/s]computing elbo
training:  16%|█▋        | 99/600 [00:04<00:19, 25.18it/s]computing elbo
training:  18%|█▊        | 108/600 [00:04<00:19, 24.73it/s]computing elbo
training:  20%|█▉        | 117/600 [00:04<00:19, 24.62it/s]computing elbo
training:  22%|██▏       | 129/600 [00:05<00:18, 24.86it/s]computing elbo
training:  23%|██▎       | 138/600

In [53]:
training_dashboard(tree_trainer, treevae.encoder_variance)

In [52]:
tree_posterior = tree_trainer.create_posterior(model=treevae,
                                              gene_dataset=cas_dataset,
                                               clades=tree_trainer.clades,
                                               indices=np.arange(len(cas_dataset))
                                              )
tree_latent = tree_posterior.get_latent()
tree_latent.shape, internal_z.shape

((100, 5), (100, 5))

***Missing Value Imputation***

In [97]:
# CascVI imputations
imputed = {}
imputed_z = {}
imputed_cov_z = {}
imputed_mean_z = {}

for n in tree.traverse('levelorder'):
    if not n.is_leaf():
        imputed[n.name], z, imputed_mean_z[n.name], imputed_cov_z[n.name] = tree_posterior.imputation_internal(query_node=n,
                                                                                             pp_averaging=None,
                                                                                              z_averaging=None,
                                                                                              give_mean=True 
                                                                                            )  
        imputed_z[n.name] = z.cpu().numpy()
                                                                                                                                         

In [98]:
internal_treevae_X = [x for x in imputed.values()]
internal_treevae_X = np.array(internal_treevae_X).reshape(-1, cas_dataset.X.shape[1])

In [146]:
tree_posterior.compute_elbo(treevae)

computing elbo


tensor(150.2321, device='cuda:0', grad_fn=<DivBackward0>)

# Evaluation

## Evaluation 1.a.i: MSE/MAE L2/L1 error

In [48]:
from external.utils.metrics import mse

data = {'groundtruth': imputed_X, 'average': internal_avg_X,
        'gaussian VAE': internal_vae_X
        , 'gaussian treeVAE': internal_treevae_X
       }

In [49]:
results = mse(data=data, metric='MSE')
print('L2')
results

L2


Unnamed: 0,average,gaussian VAE,gaussian treeVAE
MSE,0.716805,0.817597,0.566842
std,0.308642,0.387916,0.224257


In [50]:
results = mse(data=data, metric='L1')
print('L1')
results

L1


Unnamed: 0,average,gaussian VAE,gaussian treeVAE
MSE,26.317424,47.25809,57.16029
std,0.0,1.421085e-14,2.131628e-14


## Evaluation 1.a.ii: Correlations 

In [57]:
data = {'groundtruth': imputed_X, 'average': internal_avg_X,
        'gaussian VAE': internal_vae_X
        , 'gaussian treeVAE': internal_treevae_X
       }

df1 = correlations(data, 'None', True)

In [58]:
# Average each correlation coefficient per method. The first key of `data`
# ('groundtruth') is the reference and is skipped.
data_dict = {}
methods = list(data.keys())[1:]
for method in methods:
    # numeric_only=True keeps the string `Method` column out of the mean;
    # without it, pandas >= 2.0 raises instead of silently dropping it.
    data_dict[method] = list(df1[df1.Method==method].mean(numeric_only=True))
results_corr = pd.DataFrame.from_dict(data_dict, orient='index', columns=['Spearman CC', 'Pearson CC', 'Kendal Tau CC'])

print('gene-gene correlation')
results_corr.head(10)

gene-gene correlation


Unnamed: 0,Spearman CC,Pearson CC,Kendal Tau CC
average,0.757535,0.781487,0.583554
gaussian VAE,0.771515,0.798094,0.598016
gaussian treeVAE,0.771032,0.806599,0.601248


## Evaluation 1.a.iii: MSE/MAE (L1/L2) of variance in latent space

***Compute MCMC estimates of mean and variance parameters of internal_nodes***

In [59]:
# Number of samples used for each estimate.
M = 5

# Estimates driven by the approximate posterior q(z|x) ...
imputed_mcmc_cov, imputed_mcmc_mean = {}, {}
# ... and by the ground-truth posterior p(z|x).
mcmc_mean, mcmc_cov = {}, {}

# The ground-truth posterior parameters are the same for every query node.
mean = posterior_mean_corr
cov = posterior_cov_corr

for n in tree.traverse('levelorder'):
    if n.is_leaf():
        continue

    # with Approximate posterior q(z|x)
    approx_mean, approx_cov = tree_posterior.mcmc_estimate(query_node=n,
                                                           n_samples=M)
    imputed_mcmc_mean[n.name] = approx_mean
    imputed_mcmc_cov[n.name] = approx_cov

    # with groundtruth posterior p(z|x)
    oracle_mean, oracle_cov = tree_posterior.mcmc_estimate(query_node=n,
                                                           n_samples=M,
                                                           known_latent_dist=(mean, cov))
    mcmc_mean[n.name] = oracle_mean
    mcmc_cov[n.name] = oracle_cov

  cov=cov).reshape(-1, self.model.n_latent)


In [60]:
data_var = {'groundtruth': mcmc_cov, 'gaussian VAE':imputed_avg_cov_z, 'gaussian treeVAE': imputed_mcmc_cov}
data_mean = {'groundtruth': mcmc_mean, 'gaussian VAE':imputed_avg_z, 'gaussian treeVAE': imputed_mcmc_mean}

In [61]:
def error_latent(tree, predictive_z, imputed_avg_z, imputed_z, do_variance=False):
    """Average MSE against a reference over the internal nodes of ``tree``.

    All three dictionaries are indexed by node name.

    Args:
        tree: object whose ``traverse('levelorder')`` yields nodes exposing
            ``is_leaf()`` and ``name``.
        predictive_z: reference value per node.
        imputed_avg_z: VAE-baseline value per node. With ``do_variance`` it is
            converted with ``.cpu().numpy()``; otherwise its first element
            (``[0]``) is used.
        imputed_z: treeVAE value per node.
        do_variance: selects how the ``imputed_avg_z`` entries are read.

    Returns:
        Tuple ``(mse_treevae, mse_vae)`` averaged over the internal nodes.
    """
    total_treevae = 0
    total_vae = 0
    n_internal = 0
    for node in tree.traverse('levelorder'):
        if node.is_leaf():
            continue
        key = node.name
        reference = predictive_z[key]
        if do_variance:
            vae_estimate = imputed_avg_z[key].cpu().numpy()
        else:
            vae_estimate = imputed_avg_z[key][0]
        total_treevae += mean_squared_error(reference, imputed_z[key])
        total_vae += mean_squared_error(reference, vae_estimate)
        n_internal += 1

    return total_treevae / n_internal, total_vae / n_internal

In [62]:
print('MCMC total Variance')
error_latent(tree, mcmc_cov, imputed_avg_cov_z, imputed_mcmc_cov, True)

(0.0004237482295142097, 0.00035844850200526855)

In [64]:
print('MCMC mean')
error_latent(tree, mcmc_mean, imputed_avg_z, imputed_mcmc_mean)

MCMC mean


(0.9143929862687368, 1.85150199980163)

## Evaluation 1.a.iv: Averaged KL divergence (internal nodes)

***TreeVAE***

In [67]:
from torch.distributions import Normal, kl_divergence
from sklearn.preprocessing import normalize

# Average, over internal nodes, of the summed element-wise KL divergence
# between the ground-truth MCMC estimate and the treeVAE MCMC estimate.
kl_mean = 0
N = 0
for n in tree.traverse('levelorder'):
    if n.is_leaf():
        continue

    # true distribution (Normal takes a scale, hence the square root)
    std_true = torch.from_numpy(mcmc_cov[n.name]).sqrt()
    dist_true = Normal(torch.from_numpy(mcmc_mean[n.name]), std_true)

    # Approx
    std_approx = torch.from_numpy(imputed_mcmc_cov[n.name]).sqrt()
    dist_approx = Normal(torch.from_numpy(imputed_mcmc_mean[n.name]), std_approx)

    kl_mean += kl_divergence(dist_true, dist_approx).sum()
    N += 1
kl_mean /= N
print('Average Kl divergence {}'.format(kl_mean))

Average Kl divergence 129776636.19200827


***VAE***

In [68]:
# Same averaged KL divergence as for the treeVAE, now against the VAE baseline.
kl_mean = 0
N = 0
for n in tree.traverse('levelorder'):
    if n.is_leaf():
        continue

    # reference distribution (Normal takes a scale, hence the square root)
    std_true = torch.from_numpy(mcmc_cov[n.name]).sqrt()
    dist_true = Normal(torch.from_numpy(mcmc_mean[n.name]), std_true)

    # VAE baseline: NumPy mean, torch variance moved to the host
    std_vae = imputed_avg_cov_z[n.name].cpu().sqrt()
    dist_approx = Normal(torch.from_numpy(imputed_avg_z[n.name]), std_vae)

    kl_mean += kl_divergence(dist_true, dist_approx).sum()
    N += 1
kl_mean /= N
print('Average Kl divergence {}'.format(kl_mean))

Average Kl divergence 5374.677030454518


## Evaluation 1.a.v: Likelihood (internal nodes)

In [70]:
from scipy.stats import multivariate_normal

def mean_posterior_lik(tree, predictive_mean_z, imputed_avg_z, imputed_mean_z, predictive_cov_z, imputed_avg_cov_z, imputed_cov_z):
    """Average log-density of one VAE and one treeVAE latent sample per internal node.

    For every internal node, one latent vector is drawn from the VAE
    distribution and one from the treeVAE distribution (diagonal covariances),
    and each is scored under the reference Gaussian for that node. The result
    is stochastic: it uses NumPy's global random state and a single draw per
    node.

    All dictionaries are keyed by node name.

    Args:
        tree: object whose ``traverse('levelorder')`` yields nodes exposing
            ``is_leaf()`` and ``name``.
        predictive_mean_z: reference mean per node.
        imputed_avg_z: VAE mean per node; element ``[0]`` is used.
        imputed_mean_z: treeVAE mean per node, a tensor converted with
            ``.cpu().numpy()``.
        predictive_cov_z: reference diagonal variances per node.
        imputed_avg_cov_z: VAE diagonal variances per node (tensor).
        imputed_cov_z: treeVAE variance per node, broadcast over the latent
            dimensions.

    Returns:
        ``[vae_lik, treevae_lik]``: mean log-densities over internal nodes.
    """
    treevae_lik = 0
    vae_lik = 0
    N = 0
    for n in tree.traverse('levelorder'):
        if n.is_leaf():
            continue

        # mean
        true_mean = predictive_mean_z[n.name]
        vae_mean = imputed_avg_z[n.name][0]
        treevae_mean = imputed_mean_z[n.name].cpu().numpy()

        # Take the latent size from the data rather than from the notebook
        # global `d`, so the function does not depend on hidden state.
        latent_dim = treevae_mean.shape[-1]

        # covariance
        true_cov = np.diag(predictive_cov_z[n.name])
        vae_cov = np.diag(imputed_avg_cov_z[n.name].cpu().numpy())
        treevae_cov = np.diag(imputed_cov_z[n.name] * np.ones(latent_dim))

        # Draw order (treeVAE first, then VAE) is kept so a fixed seed gives
        # the same samples as before.
        sample_treevae = np.random.multivariate_normal(mean=treevae_mean,
                                                       cov=treevae_cov)
        sample_vae = np.random.multivariate_normal(mean=vae_mean,
                                                   cov=vae_cov)

        treevae_lik += multivariate_normal.logpdf(sample_treevae,
                                                  true_mean,
                                                  true_cov)
        vae_lik += multivariate_normal.logpdf(sample_vae,
                                              true_mean,
                                              true_cov)

        N += 1

    vae_lik /= N
    treevae_lik /= N
    return [vae_lik, treevae_lik]

mean_posterior_lik(tree, predictive_mean_z, imputed_avg_z, imputed_mean_z, predictive_cov_z, imputed_avg_cov_z, imputed_cov_z)

[-461.48781196850234, -251.76246046200959]

## Evaluation 1.a.vi: (More) Latent space metrics 

#### 1. Prior likelihood

In [71]:
cascvi_latent = tree_posterior.get_latent()
scvi_latent = posterior.get_latent()

scvi_latent.shape, cascvi_latent.shape

((100, 5), (100, 5))

In [72]:
treevae.initialize_visit()
treevae.initialize_messages(scvi_latent, cas_dataset.barcodes, scvi_latent.shape[1])
treevae.perform_message_passing((treevae.tree & treevae.root), scvi_latent.shape[1], False)
mp_lik_vae = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of scVI encodings: ", mp_lik_vae.item())

Likelihood of scVI encodings:  -2826.691510321084


In [73]:
treevae.initialize_visit()
treevae.initialize_messages(cascvi_latent, cas_dataset.barcodes, cascvi_latent.shape[1])
treevae.perform_message_passing((treevae.tree & treevae.root), cascvi_latent.shape[1], False)
mp_lik_cascvi = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of cascVI encodings: ", mp_lik_cascvi.item())

Likelihood of cascVI encodings:  -369.6501111715011


In [74]:
# Prior likelihood of the simulated leaf latents. The latent size comes from
# `leaves_z` itself, not from the unrelated `cascvi_latent` array.
treevae.initialize_visit()
treevae.initialize_messages(leaves_z, cas_dataset.barcodes, leaves_z.shape[1])
treevae.perform_message_passing((treevae.tree & treevae.root), leaves_z.shape[1], False)
mp_lik = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of observations: ", mp_lik.item())

Likelihood of observations:  -222.3884709247602


### 2. k-nn purity

In [121]:
full_cascvi_latent = construct_latent(tree, cascvi_latent, imputed_z)
full_scvi_latent = construct_latent(tree, scvi_latent, imputed_avg_z)

In [143]:
print("Leaves Only")
data = {'groundtruth': leaves_z, 'scVI': scvi_latent,
        'cascVI': cascvi_latent
        }
scores = knn_purity(max_neighbors=30,
                    data=data,
                    plot=False,
                    save_fig='/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/knn_purity_plot.png'
                    )
np.mean(scores['scVI']), np.mean(scores['cascVI'])

Leaves Only
No handles with labels found to put in legend.


(0.446926282876483, 0.5216776947687367)

In [204]:
df = pd.DataFrame()
df

In [199]:
neighbors = list(range(2, 30))
data = {'K': neighbors, 'scVI': scores['scVI'], 'cascVI': scores['cascVI']}
df2 = pd.DataFrame(data)

In [205]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the rows of df2 under df.
df = pd.concat([df, df2])
df

Unnamed: 0,K,scVI,cascVI
0,2,0.492537,0.532567
1,3,0.431981,0.507538
2,4,0.449275,0.520913
3,5,0.432665,0.536098
4,6,0.413428,0.540436
5,7,0.398601,0.526718
6,8,0.41218,0.52381
7,9,0.413983,0.505017
8,10,0.424501,0.500375
9,11,0.422107,0.503759


In [142]:
print("Internal nodes Only")
internal_z, internal_idx, internal_mu = get_internal(ppca.z, ppca.mu, tree)
internal_scvi_z, _, _ = get_internal(full_scvi_latent, ppca.mu, tree)
internal_cascvi_z, _, _ = get_internal(full_cascvi_latent, ppca.mu, tree)

data = {'groundtruth': internal_z, 'scVI': internal_scvi_z,
        'cascVI': internal_cascvi_z
        }

scores = knn_purity(max_neighbors=30,
              data=data,
              plot=False,
              save_fig='/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/knn_purity.png'
              )

np.mean(scores['scVI']), np.mean(scores['cascVI'])

Internal nodes Only
No handles with labels found to put in legend.


(0.36514020314439677, 0.45141649792859023)

In [141]:
print("Full tree")
data = {'groundtruth': ppca.z, 'scVI': full_scvi_latent,
        'cascVI': full_cascvi_latent
        }
scores = knn_purity(max_neighbors=30,
              data=data,
              plot=False)

np.mean(scores['scVI']), np.mean(scores['cascVI'])

Full tree
No handles with labels found to put in legend.


(0.3584607764591899, 0.4458153596517597)