# 0. Imports

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('WebAgg')
import numpy as np
import pandas as pd

In [2]:
cd ..

/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/external


In [7]:
cd ..

/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi


***import ete3 Tree***

In [8]:
from ete3 import Tree

tree_name = "/home/eecs/khalil.ouardini/cas_scvi_topologies/newick_objects/500cells/no_fitness/topology0.nwk"
tree = Tree(tree_name, 1)

# Renaming nodes with levelorder indexing
for i, n in enumerate(tree.traverse('levelorder')):
    n.add_features(index=i)
    n.name = str(i)

k = 1
branch_length = {}
for node in tree.traverse('levelorder'):
    if node.name == '0':
        branch_length[node.name] = 0.0
        continue
    branch_length[node.name] = k * node.dist
branch_length['prior_root'] = 1.0

In [39]:
# Data
from anndata import AnnData
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from external.dataset.tree import TreeDataset, GeneExpressionDataset
from external.dataset.poisson_glm import Poisson_GLM
from external.dataset.anndataset import AnnDatasetFromAnnData

# Models
from external.models.vae import VAE
import scanpy as sc
from external.inference.tree_inference import TreeTrainer
from external.inference.inference import UnsupervisedTrainer
from external.models.treevae import TreeVAE

# Utils
from external.utils.data_util import get_leaves, get_internal
from external.utils.metrics import ks_pvalue, accuracy_imputation, correlations, knn_purity, knn_purity_stratified
from external.utils.plots_util import plot_histograms, plot_scatter_mean, plot_ecdf_ks, plot_density
from external.utils.plots_util import plot_losses, plot_elbo, plot_common_ancestor, plot_one_gene, training_dashboard
from external.utils.baselines import avg_weighted_baseline, scvi_baseline, scvi_baseline_z, cascvi_baseline_z, avg_baseline_z, construct_latent

In [None]:
import torch
    
np.random.seed(42)
torch.manual_seed(42)

# 1. Simulations (Poisson GLM)

In [10]:
d = 5
g = 100
vis = False
leaves_only = False
# Inverse dispersion parameter for negative binomial simulation
alpha=1.0

glm = Poisson_GLM(tree, g, d, vis, leaves_only, branch_length, alpha)

glm.simulate_latent()

***Generate gene expression count data***

In [15]:
glm.simulate_ge(negative_binomial=False)
# Quality Control (i.e Gene Filtering)
glm.gene_qc()

print("shape of full Gene expression matrix :{}".format(glm.X.shape))

shape of full Gene expression matrix :(1000, 99)


***Binomial thinning***

In [16]:
print("Proportion of dropouts before binomial thinning: {}".format(np.mean(glm.X == 0)))
glm.binomial_thinning(p=0.1)
print("Proportion of dropouts after binomial thinning: {}".format(np.mean(glm.X == 0)))

Proportion of dropouts before binomial thinning: 0.4025858585858586
Proportion of dropouts after binomial thinning: 0.8377474747474748


***Split dataset in leaves/internal nodes***

In [17]:
# Latent vectors
leaves_z, _, _ = get_leaves(glm.z, glm.mu, tree)

#FIXED training set
leaves_X, leaves_idx, mu = get_leaves(glm.X, glm.mu, tree)

# internal nodes data (for imputation)
internal_X, internal_idx, internal_mu = get_internal(glm.X, glm.mu, tree)

leaves_X.shape, mu.shape, internal_X.shape, internal_mu.shape, leaves_z.shape

((500, 99), (500, 99), (500, 99), (500, 99), (500, 5))

# 2. Fitting CascVI

In [18]:
import scanpy as sc

# anndata + gene and celle filtering
adata = AnnData(leaves_X)
leaves = [n for n in tree.traverse('levelorder') if n.is_leaf()]
adata.obs_names = [n.name for n in leaves]

***Create a TreeDataset object***

In [19]:
# treeVAE
import copy

tree_bis = copy.deepcopy(tree)
scvi_dataset = AnnDatasetFromAnnData(adata, filtering=False)
scvi_dataset.initialize_cell_attribute('barcodes', adata.obs_names)
cas_dataset = TreeDataset(scvi_dataset, tree=tree_bis, filtering=False)
cas_dataset

# No batches beacause of the message passing
use_cuda = True
use_MP = True
ldvae = False

go


***Initialize model***

In [20]:
treevae = TreeVAE(cas_dataset.nb_genes,
              tree = cas_dataset.tree,
              n_latent=glm.latent,
              n_hidden=128,
              n_layers=1,
              reconstruction_loss='poisson',
              prior_t = branch_length,
              ldvae = ldvae,
              use_MP=use_MP
             )

***Set freeze=True to set treeVAE decoder to the true linear decoder used in the simulations***

In [22]:
import torch

freeze = False
if freeze:
    new_weight = torch.from_numpy(glm.W).float()
    new_bias = torch.from_numpy(glm.beta).float()

    with torch.no_grad():
        treevae.decoder.factor_regressor.fc_layers[0][0].weight = torch.nn.Parameter(new_weight)
        treevae.decoder.factor_regressor.fc_layers[0][0].bias = torch.nn.Parameter(new_bias)
        
    for param in treevae.decoder.factor_regressor.fc_layers[0][0].parameters():
        param.requires_grad = False

***Hyperparameters***

In [24]:
n_epochs = 1000
lr = 1e-3
lambda_ = 1.0

***trainer***

In [25]:
freq = 100
trainer = TreeTrainer(
    model = treevae,
    gene_dataset = cas_dataset,
    lambda_ = lambda_,
    train_size=1.0,
    test_size=0,
    use_cuda=use_cuda,
    frequency=freq,
    n_epochs_kl_warmup=150
)

train_leaves:  [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], [71], [72], [73], [74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [95], [96], [97], [98], [99], [100], [101], [102], [103], [104], [105], [106], [107], [108], [109], [110], [111], [112], [113], [114], [115], [116], [117], [118], [119], [120], [121], [122], [123], [124], [125], [126], [127], [128], [129], [130], [131], [132], [133], [134], [135], [136], [137], [138], [139], [140], [141], [142], [143], [144], [145], [146], [147], [148], [149], [150], [151], [152], [153], [154], [155], [1

***Start training***

In [20]:
trainer.train(n_epochs=n_epochs,
              lr=lr
              )


training:  85%|████████▌ | 850/1000 [02:39<00:26,  5.74it/s]Encodings MP Likelihood: 2.7463123310357673
ELBO Loss: 336.73234241717694
training:  85%|████████▌ | 851/1000 [02:39<00:25,  5.77it/s]Encodings MP Likelihood: 3.099520838667402
ELBO Loss: 337.2308047584422
training:  85%|████████▌ | 852/1000 [02:39<00:25,  5.79it/s]Encodings MP Likelihood: 3.264432632881722
ELBO Loss: 337.422221495124
training:  85%|████████▌ | 853/1000 [02:40<00:25,  5.81it/s]Encodings MP Likelihood: 3.3498932028033472
ELBO Loss: 337.39394277172056
training:  85%|████████▌ | 854/1000 [02:40<00:25,  5.83it/s]Encodings MP Likelihood: 2.73421659805315
ELBO Loss: 336.66126181040016
training:  86%|████████▌ | 855/1000 [02:40<00:24,  5.84it/s]Encodings MP Likelihood: 3.4467494695228797
ELBO Loss: 337.78747749460524
training:  86%|████████▌ | 856/1000 [02:40<00:24,  5.84it/s]Encodings MP Likelihood: 2.9927994920736394
ELBO Loss: 337.5025317505975
training:  86%|████████▌ | 857/1000 [02:40<00:24,  5.84it/s]Encodings

***Loss Functions***

In [21]:
training_dashboard(trainer, treevae.encoder_variance)

### 3. Posterior and MV imputation

In [22]:
from sklearn.metrics import mean_squared_error

full_posterior = trainer.create_posterior(trainer.model, cas_dataset, trainer.clades,
                                indices=np.arange(len(cas_dataset))
                                         )
error = mean_squared_error(full_posterior.get_latent(), leaves_z)
print("the distance is {}".format(error))

the distance is 1.541032366102065


***Missing Value imputation By Posterior Predictive sampling***

In [23]:
empirical_l = np.mean(np.sum(glm.X, axis=1))

# CascVI impitations
imputed = {}
imputed_z = {}
imputed_mcmc_cov = {}
imputed_gt = {}

for n in tree.traverse('levelorder'):
    if not n.is_leaf():
        imputed[n.name], imputed_z[n.name] = full_posterior.imputation_internal(n,
                                                            give_mean=False,
                                                            library_size=empirical_l
                                                           )
        _, imputed_mcmc_cov[n.name] = full_posterior.mcmc_estimate(query_node=n,
                                                                    n_samples=20
                                                                    )
        imputed_gt[n.name] = glm.X[n.index]

In [24]:
imputed_X = [x for x in imputed.values()]
imputed_X = np.array(imputed_X).reshape(-1, cas_dataset.X.shape[1])
#plot_histograms(imputed_X, "Histogram of CasscVI imputed gene expression data")

***CascVI Baseline 1 (MP Oracle)***

In [25]:
# CascVI impitations
#imputed_cascvi_1 = {}
#imputed_cascvi_1_z ={}

#for n in tree.traverse('levelorder'):
#    if not n.is_leaf():
#        _, imputed_cascvi_1_z[n.name] = full_posterior.imputation_internal(n,
#                                                                    give_mean=False,
#                                                                    library_size=empirical_l,
#                                                                    known_latent=leaves_z
#        )
#        mu_z = np.clip(a=np.exp(glm.W @ imputed_cascvi_1_z[n.name].cpu().numpy() + glm.beta),
#                        a_min=0,
#                        a_max=1e8
#                        )
#        samples = np.array([np.random.poisson(mu_z) for i in range(100)])
#        imputed_cascvi_1[n.name] = np.clip(a=np.mean(samples, axis=0),
#                                           a_min=0,
#                                           a_max=1e8
#                                           )


***CascVI Baseline 2 (Reconstruction of Averaged latent space)***

In [26]:
#imputed_cascvi_2, imputed_cascvi_2_z = avg_baseline_z(tree=tree,
#                                   model=treevae,
#                                   posterior=full_posterior,
#                                   weighted=False,
#                                   n_samples_z=1,
#                                   library_size=empirical_l,
#                                   gaussian=False,
#                                   use_cuda=True
#                                  )

In [27]:
full_posterior.compute_elbo()

tensor(326.9408, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)

# 4. Baselines

### Baseline 1: Unweighted Average of gene expression in Clade

The simple idea here is to impute the value of an internal node, with the (un)weighted average of the gene expression values of the leaves, taking the query internal node as the root of the subtree.

In [28]:
weighted = False
imputed_avg = avg_weighted_baseline(tree, weighted, glm.X, rounding=True)

#get internal nodes
avg_X = np.array([x for x in imputed_avg.values()]).reshape(-1, glm.X.shape[1])
internal_avg_X, _, _ = get_internal(avg_X, glm.mu, tree)

### Baseline 2: (Un)weighted Average of decoded latent vectors, with scVI

We use the same averaging of the subtrees leaves in **Baseline 1**, only this time, the gene expression data is recovered with scVI

In [40]:
# anndata
gene_dataset = GeneExpressionDataset()
gene_dataset.populate_from_data(leaves_X)

In [41]:
import torch

n_epochs =500
use_batches = False

vae = VAE(gene_dataset.nb_genes,
                  n_batch=cas_dataset.n_batches * use_batches,
                  n_hidden=128,
                  n_layers=1,
                  reconstruction_loss='poisson',
                  n_latent=glm.latent,
                  ldvae=ldvae
              )

***Set freeze=True to set treeVAE decoder to the true linear decoder used in the simulations***

In [42]:
if freeze:
    new_weight = torch.from_numpy(glm.W).float()
    new_bias = torch.from_numpy(glm.beta).float()

    with torch.no_grad():
        vae.decoder.factor_regressor.fc_layers[0][0].weight = torch.nn.Parameter(new_weight)
        vae.decoder.factor_regressor.fc_layers[0][0].bias = torch.nn.Parameter(new_bias)
        
    for param in vae.decoder.factor_regressor.fc_layers[0][0].parameters():
        param.requires_grad = False

In [43]:
trainer_scvi = UnsupervisedTrainer(model=vae,
                              gene_dataset=gene_dataset,
                              train_size=1.0,
                              use_cuda=use_cuda,
                              frequency=10,
                              n_epochs_kl_warmup=None)


In [44]:
# train scVI
trainer_scvi.train(n_epochs=n_epochs, lr=1e-3) 
                                        
elbo_train_scvi = trainer_scvi.history["elbo_train_set"]
x = np.linspace(0, 100, (len(elbo_train_scvi)))
plt.plot(np.log(elbo_train_scvi), 
         label="train", color='blue',
         linestyle=':',
         linewidth=3
        )
        
plt.xlabel('Epoch')
plt.ylabel("ELBO")
plt.legend()
plt.title("Train history scVI")
plt.show()

training: 100%|██████████| 500/500 [00:34<00:00, 14.53it/s]


In [46]:
scvi_posterior = trainer_scvi.create_posterior()

scvi_posterior.elbo()

46.16246484375

***scVI Baseline 2 (Decoded Average Latent space)***

In [47]:
library_size = np.mean(np.sum(glm.X, axis=1))
scvi_latent = np.array([scvi_posterior.get_latent(give_mean=False)[0] for i in range(10)])

imputed_scvi_2, imputed_scvi_2_z = scvi_baseline_z(tree,
                                        posterior=scvi_posterior,
                                        model=vae,
                                        weighted=False,
                                        n_samples_z=1,
                                        library_size=library_size,
                                        use_cuda=True)


# 5. Likelihood Ratio

In [50]:
#cascvi_latent = full_posterior.get_latent()
scvi_latent = scvi_posterior.get_latent()[0]

scvi_latent.shape#, cascvi_latent.shape

(500, 5)

In [51]:
treevae.initialize_visit()
treevae.initialize_messages(scvi_latent, cas_dataset.barcodes, scvi_latent.shape[1])
treevae.perform_message_passing((treevae.tree & treevae.root), scvi_latent.shape[1], False)
mp_lik_scvi = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of scVI encodings: ", mp_lik_scvi.item())

Likelihood of scVI encodings:  -15505.747668543649


In [38]:
treevae.initialize_visit()
treevae.initialize_messages(cascvi_latent, cas_dataset.barcodes, cascvi_latent.shape[1])
treevae.perform_message_passing((treevae.tree & treevae.root), cascvi_latent.shape[1], False)
mp_lik_cascvi = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of cascVI encodings: ", mp_lik_cascvi.item())

Likelihood of cascVI encodings:  -518.9174727306946


In [39]:
treevae.initialize_visit()
treevae.initialize_messages(leaves_z, cas_dataset.barcodes, cascvi_latent.shape[1])
treevae.perform_message_passing((treevae.tree & treevae.root), cascvi_latent.shape[1], False)
mp_lik_cascvi = treevae.aggregate_messages_into_leaves_likelihood(d, add_prior=True)
print("Likelihood of observations: ", mp_lik_cascvi.item())

Likelihood of observations:  -1321.6885433836942


In [40]:
# Likelihood ratio
lambda_ = (mp_lik_cascvi - mp_lik_scvi)
print("Likelihood Ratio:", lambda_)

Likelihood Ratio: tensor(22892.6387, dtype=torch.float64)


# Evaluation Output

***Leaves variance cascVI***

In [41]:
qz_v_cascvi = full_posterior.empirical_qz_v(n_samples=100,
                                    norm=True
                                    )

***Leaves variance scVI***

In [42]:
latent = []
for n in range(100):
    latent.append(scvi_posterior.get_latent(give_mean=False)[0])
latent = np.array(latent)

qz_v_scvi = np.var(latent,
                axis=0,
                dtype=np.float64)

qz_v_scvi = [np.linalg.norm(v) for v in qz_v_scvi]

In [43]:
reconstructed_leaves = np.mean(full_posterior.generate_leaves(), axis=0)

In [44]:
reconstructed = scvi_posterior.generate()[0][:, :, 0]
reconstructed.shape

(500, 991)

In [45]:
reconstructed.shape

(500, 991)

reconstructed leaves

In [156]:
from sklearn.metrics import mean_squared_error as mse

output = []
columns = ['node ID', 'depth', 'MSE cascVI', 'MSE scVI', 'avg MSE', 'variance cascVI']
idx=0
idx_leaf = 0
for n in tree.traverse('levelorder'):
    depth = n.get_distance(tree)
    if not n.is_leaf():
        mse_scvi = mse(norm_scvi_X_2[idx], norm_internal_X[idx])
        mse_cascvi = mse(norm_imputed_X[idx], norm_internal_X[idx])
        mse_avg = mse(norm_avg_X[idx], norm_internal_X[idx])
        output.append([n.name, depth, mse_cascvi, mse_scvi, mse_avg, np.linalg.norm(imputed_mcmc_cov[n.name])])
        idx += 1
    else:
        mse_cascvi = mse(reconstructed_leaves[idx_leaf], leaves_X[idx_leaf])
        mse_scvi = mse(reconstructed[idx_leaf], leaves_X[idx_leaf])
        output.append([n.name, depth, mse_cascvi, mse_scvi, 0, qz_v_cascvi[idx_leaf]])
        idx_leaf += 1
df_cascvi = pd.DataFrame(data=output, columns=columns)

In [1]:
df_cascvi[900:]

NameError: name 'df_cascvi' is not defined

In [161]:
df_cascvi.to_csv('poisson_stats.txt')

# 6. Evaluation

***CPM Normalization (for sample-sample correlation)***

get imputations into an array

In [142]:
internal_scvi_X_2 = np.array([x for x in imputed_scvi_2.values()]).reshape(-1, glm.X.shape[1])
#internal_cascvi_X = np.array([x for x in imputed_cascvi_1.values()]).reshape(-1, glm.X.shape[1])
#internal_cascvi_X_2 = np.array([x for x in imputed_cascvi_2.values()]).reshape(-1, glm.X.shape[1])

#internal_cascvi_X_2.shape, internal_scvi_X_2.shape, imputed_X.shape, internal_avg_X.shape, internal_X.shape

In [143]:
from sklearn.preprocessing import normalize

norm_internal_X = sc.pp.normalize_total(AnnData(internal_X), target_sum=1e6, inplace=False)['X'] 
norm_scvi_X_2 = sc.pp.normalize_total(AnnData(internal_scvi_X_2), target_sum=1e6, inplace=False)['X']
norm_avg_X = sc.pp.normalize_total(AnnData(internal_avg_X), target_sum=1e6, inplace=False)['X']
norm_imputed_X = sc.pp.normalize_total(AnnData(imputed_X), target_sum=1e6, inplace=False)['X']
#norm_cascvi_X = sc.pp.normalize_total(AnnData(internal_cascvi_X), target_sum=1e6, inplace=False)['X']
#norm_cascvi_X_2 = sc.pp.normalize_total(AnnData(internal_cascvi_X_2), target_sum=1e6, inplace=False)['X']

norm_internal_X.shape

(500, 998)

## I. Sample-Sample Correlations

***1. Sample-Sample correlation (Without Normalization)***

We will use Scipy to compute a nonparametric rank correlation between the imputed and the groundtruth profiles. The correlation is based on the Spearman Correlation Coefficient.

In [71]:
data = {'groundtruth': internal_X.T, 'cascVI': imputed_X.T, 'scVI': internal_scvi_X_2.T,
        'Average': internal_avg_X.T , 'cascVI + Avg': internal_cascvi_X_2.T}
        #'MP Oracle': internal_cascvi_X.T
        #}
df1 = correlations(data, 'None', True)
#df1.head(5)
#plt.show()

***2. Sample-Sample correlation (With ScanPy Normalization)***

In [72]:
data = {'groundtruth': norm_internal_X.T, 'cascVI': norm_imputed_X.T, 'scVI': norm_scvi_X_2.T, 
        'Average': norm_avg_X.T , 'cascVI + Avg': norm_cascvi_X_2.T}
        #'MP Oracle': norm_cascvi_X.T
        #}

df2 = correlations(data, 'None', True)
#df2.head(5)
#plt.show()


## II. Gene-Gene Correlations

***2. Gene-Gene correlation (With Normalization)***

In [73]:
data = {'groundtruth': internal_X, 'cascVI': imputed_X, 'scVI': internal_scvi_X_2,
        'Average': internal_avg_X , 'cascVI + Avg': internal_cascvi_X_2}
        #,
        #'MP Oracle': internal_cascvi_X
        #}

df3 = correlations(data, 'None', True)
#df3.head(5)
#plt.show()



***2. Gene-Gene correlation (With Normalization)***

In [74]:
data = {'groundtruth': norm_internal_X, 'cascVI': norm_imputed_X, 'scVI': norm_scvi_X_2, 
        'Average': norm_avg_X , 'cascVI + Avg': norm_cascvi_X_2}
        #'MP Oracle': norm_cascvi_X
        #}

df4 = correlations(data, 'None', True)
#df4.head(5)
#plt.show()



***3. Gene-Gene correlation (With Rank Normalization)***

In [75]:
#data = {'groundtruth': norm_internal_X, 'cascVI': norm_imputed_X, 'scVI': norm_scvi_X_2, 
#        'Average': norm_avg_X , 'cascVI + Avg': norm_cascvi_X_2,
#        'MP Oracle': norm_cascvi_X
#        }

data = {'groundtruth': internal_X, 'cascVI': imputed_X, 'scVI': internal_scvi_X_2,
        'Average': internal_avg_X , 'cascVI + Avg': internal_cascvi_X_2}
        #'MP Oracle': internal_cascvi_X
        #}
        
df5 = correlations(data, 'rank', True)
#df5.head(5)
#plt.show()



### III. Table Summary

In [76]:
columns = ["Method", "Spearman CC", "Pearson CC", "Kendall Tau"]
data = [df1, df2, df3, df4, df5]
#data = [df2, df4]

data 
tables = [[] for i in range(len(data))]

#task = ["Sample-Sample (None)", "Sample-Sample (CPM)", "Gene-Gene (None)", 
           #"Gene-Gene(CPM)", "Gene-Gene (Rank)" ]

for (df, t) in zip(data, tables):
    for m in np.unique(df.Method):
        sub_df = np.round(df[df['Method'] == m].mean(), decimals=3)
        t.append([m, sub_df['Spearman CC'], sub_df['Pearson CC'], sub_df['Kendall Tau']])
        
# Create and style Data Frames
df_table1 = pd.DataFrame(tables[0], columns=columns)
df_table2 = pd.DataFrame(tables[1], columns=columns)
df_table3 = pd.DataFrame(tables[2], columns=columns)
df_table4 = pd.DataFrame(tables[3], columns=columns)
df_table5 = pd.DataFrame(tables[4], columns=columns)

In [77]:
print(" >>> Sample-Sample | No Normalization <<<")
df_table1.head(10)

 >>> Sample-Sample | No Normalization <<<


Unnamed: 0,Method,Spearman CC,Pearson CC,Kendall Tau
0,Average,0.497,0.814,0.479
1,cascVI,0.499,0.845,0.415
2,cascVI + Avg,0.493,0.837,0.415
3,scVI,0.487,0.83,0.41


In [78]:
print(">>> Sample-Sample | CPM Normalization <<<")
df_table2.head(10)

>>> Sample-Sample | CPM Normalization <<<


Unnamed: 0,Method,Spearman CC,Pearson CC,Kendall Tau
0,Average,0.497,0.814,0.479
1,cascVI,0.499,0.845,0.415
2,cascVI + Avg,0.493,0.837,0.415
3,scVI,0.487,0.83,0.41


In [79]:
print(">>> Gene-Gene | No Normalization <<<")
df_table3.head(10)

>>> Gene-Gene | No Normalization <<<


Unnamed: 0,Method,Spearman CC,Pearson CC,Kendall Tau
0,Average,0.254,0.322,0.238
1,cascVI,0.289,0.353,0.235
2,cascVI + Avg,0.28,0.34,0.23
3,scVI,0.274,0.33,0.226


In [80]:
print(">>> Gene-Gene | CPM Normalization <<<")
df_table4.head(10)

>>> Gene-Gene | CPM Normalization <<<


Unnamed: 0,Method,Spearman CC,Pearson CC,Kendall Tau
0,Average,0.238,0.268,0.213
1,cascVI,0.296,0.379,0.232
2,cascVI + Avg,0.288,0.367,0.227
3,scVI,0.281,0.353,0.222


In [87]:
print(">>> Gene-Gene | Rank Normalization <<<")
df_table5.head(10)

>>> Gene-Gene | Rank Normalization <<<


Unnamed: 0,Method,Spearman CC,Pearson CC,Kendall Tau
0,Average,0.254,0.254,0.238
1,cascVI,0.289,0.289,0.235
2,cascVI + Avg,0.28,0.28,0.23
3,scVI,0.274,0.274,0.226


# 8. Latent Space Analysis

### k-NN purity

***LEAVES only***

In [91]:
print("Leaves Only")
data = {'groundtruth': leaves_z, 'scVI': scvi_latent,
        'cascVI': cascvi_latent
        }
scores = knn_purity(max_neighbors=30,
                    data=data,
                    plot=False,
                    save_fig='/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/tmp_purtiy.png'
                    )

np.mean(scores['scVI']), np.mean(scores['cascVI'])

Leaves Only


(0.4698933815753866, 0.49297910553113355)

*** Internal nodes only***

In [92]:
print("Internal nodes Only")

full_cascvi_latent = construct_latent(tree, cascvi_latent, imputed_z)
full_scvi_latent = construct_latent(tree, scvi_latent, imputed_scvi_2_z)

internal_z, internal_idx, internal_mu = get_internal(glm.z, glm.mu, tree)
internal_scvi_z, _, _ = get_internal(full_scvi_latent, glm.mu, tree)
internal_cascvi_z, _, _ = get_internal(full_cascvi_latent, glm.mu, tree)

data = {'groundtruth': internal_z, 'scVI': internal_scvi_z,
        'cascVI': internal_cascvi_z
        }

scores = knn_purity(max_neighbors=30,
                    data=data,
                    plot=False,
                    save_fig='/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/tmp_purtiy.png'
                    )

np.mean(scores['scVI']), np.mean(scores['cascVI'])

Internal nodes Only


(0.44587087695578187, 0.5250488645477123)

***Full tree***

In [94]:
print("Full tree")
data = {'groundtruth': glm.z, 'scVI': full_scvi_latent,
        'cascVI': full_cascvi_latent
        }

scores = knn_purity(max_neighbors=30,
              data=data,
              plot=True)

np.mean(scores['scVI']), np.mean(scores['cascVI'])  

Full tree


(0.45329867389094985, 0.5212643774644873)