# Simulation results

This notebook recreates the results reported in Tables 1 and 2 of the paper "Reconstructing unobserved cellular states from paired single-cell lineage tracing and transcriptomics data".

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

In [2]:
def plot_boxplot(df_path):
    """Draw side-by-side box plots of three correlation metrics per method.

    Reads the CSV at ``df_path`` (its first column becomes the index) and
    builds a 1x3 figure with a shared y-axis. Each panel shows one of the
    'Spearman CC', 'Pearson CC' and 'Kendall Tau' columns grouped by the
    'Method' column, with outliers hidden.

    Nothing is returned; the figure is left on the current matplotlib state.
    """
    scores = pd.read_csv(df_path, index_col=0)
    metrics = ("Spearman CC", "Pearson CC", "Kendall Tau")

    fig, axes = plt.subplots(1, 3, figsize=(35, 8), sharey=True)

    # Every panel is titled with the name of the column it displays.
    for panel, metric in zip(axes, metrics):
        sns.boxplot(
            ax=panel,
            data=scores,
            orient="v",
            palette="Set2",
            showfliers=False,
            x='Method',
            y=metric,
        )
        panel.set_title(metric)

    plt.suptitle("Correlations", fontsize=16)

In [3]:
def table_correlation(df):
    """Average the score columns of ``df`` separately for each method.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Method' column. Every other numeric column is
        averaged per method; exactly three such columns are expected,
        because the result columns are labelled by position.

    Returns
    -------
    pandas.DataFrame
        One row per distinct method (in sorted order) with the columns
        'Spearman CC', 'Pearson CC' and 'Kendal Tau CC'.
    """
    # Spelling kept as-is: callers drop this column by this exact label.
    score_labels = ['Spearman CC', 'Pearson CC', 'Kendal Tau CC']

    data_dict = {}
    for method in np.unique(df['Method'].values):
        method_rows = df.loc[df['Method'] == method]
        # 'Method' holds strings; averaging it raises TypeError on pandas >= 2.0
        # (older versions dropped it silently), so exclude it explicitly.
        method_scores = method_rows.drop(columns='Method')
        data_dict[method] = list(method_scores.mean(numeric_only=True))

    results_corr = pd.DataFrame.from_dict(data_dict, orient='index', columns=score_labels)
    return results_corr

# 1. Table 1: Gaussian Experiments (100 leaves, 10 trees, $\sigma^2=2$)

## 1.a.  Comparison of the posterior predictive densities of internal nodes

### 1.a.i MSE (L2 error) for the mean estimate on feature space

In [40]:
# Feature-space MSE of each method in the Gaussian experiment.
df_mse = pd.read_csv("/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/results/gaussian/lambda2.0/100/low_fitness/MSE")

print('Mean Squared Error')
# 'Unnamed: 0' is presumably the row index written with the CSV (confirm);
# it is not a metric, so it is removed. iloc[1:3] keeps the 'mean' and 'std'
# rows of describe().
df_mse.describe().drop(columns=['Unnamed: 0']).iloc[1:3]

Mean Squared Error


Unnamed: 0,average,gaussian VAE,gaussian treeVAE
mean,0.767643,0.862912,0.540534
std,0.032233,0.107996,0.068426


### 1.a.ii. Correlations (Pearson / Spearman) across genes

In [57]:
# Per-method gene-gene correlation scores of the Gaussian experiment.
df = pd.read_csv('/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/results/gaussian/lambda2.0/100/low_fitness/correlations_ss')

# NOTE(review): iloc[1:] discards the first data row before averaging --
# confirm this is intended.
# Only Spearman and Pearson are reported, so the Kendall column is removed
# from the summary table.
df_corr = (
    table_correlation(df.iloc[1:].drop(columns=['Unnamed: 0']))
    .drop(columns=['Kendal Tau CC'])
)

print('Gene-Gene correlations')
print(df_corr)

Gene-Gene correlations
                  Spearman CC  Pearson CC
average              0.802985    0.828405
gaussian VAE         0.820835    0.846101
gaussian treeVAE     0.844116    0.869268


### 1.a.iv MSE (L2 error) for the mean estimate on the *latent space*

In [58]:
# Latent-space MSE results of the Gaussian experiment.
df_var = pd.read_csv('/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/results/gaussian/lambda2.0/100/low_fitness/MSE_mean')

# NOTE(review): the section heading and the file name refer to the *mean*
# estimate, while this message says "variance" -- confirm which is meant.
print("MSE of variance estimate in latent space")

# Columns are relabelled purely by position; this assumes the file stores the
# treeVAE column before the VAE column -- TODO confirm against the file header.
df_var.columns = ['Unnamed: 0', 'gaussian treeVAE', 'gaussian VAE']

# iloc[1:3] keeps the 'mean' and 'std' rows of describe().
df_var.drop(columns=['Unnamed: 0']).describe().iloc[1:3]

MSE of variance estimate in latent space


Unnamed: 0,gaussian treeVAE,gaussian VAE
mean,1.890859,2.284896
std,0.704215,0.719905


### 1.a.v. Cross-Entropy on the *latent space*

In [59]:
# Latent-space cross-entropy of each method in the Gaussian experiment.
df_lik = pd.read_csv('/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/results/gaussian/lambda2.0/100/low_fitness/Cross_Entropy')

# Remove the 'Unnamed: 0' column (presumably the saved row index -- confirm),
# then keep the 'mean' and 'std' rows of describe().
df_lik.drop(columns=['Unnamed: 0']).describe().iloc[1:3]

Unnamed: 0,gaussian VAE,gaussian treeVAE
mean,-2515.5431,-281.015742
std,1396.938066,50.549052


### 1.a.vi k-NN purity with K=5 neighbors

In [30]:
# k-NN purity results of the Gaussian experiment.
df_purity = pd.read_csv('/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/results/gaussian/lambda2.0/100/low_fitness/purity_full')

# Column-wise mean over the rows with K == 5. The 'K' column itself is part of
# the result (it averages to 5). The stray trailing comma that wrapped the
# result in a 1-tuple has been removed so the cell displays the Series itself.
df_purity[df_purity.K == 5].drop(columns=['Unnamed: 0']).mean()

(K         5.000000
 scVI      0.372408
 cascVI    0.450439
 dtype: float64,)

# 2. Table 2: Poisson Experiments (500 leaves, 10 trees, binomial thinning=0.1)

## 2.a. Comparison of the prior predictive against the posterior predictive densities of internal nodes

### 2.a.i Correlations (Pearson / Spearman / Kendall) across genes for all methods

In [60]:
# Per-method gene-gene correlation scores of the Poisson experiment.
df = pd.read_csv('/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/results/poisson/lambda1.0/500/no_fitness/bin0.1/correlations_ss')

# NOTE(review): iloc[1:] discards the first data row before averaging --
# confirm this is intended.
# The Kendall column is removed from the summary table before printing.
df_corr = (
    table_correlation(df.iloc[1:].drop(columns=['Unnamed: 0']))
    .drop(columns=['Kendal Tau CC'])
)

print('Gene-Gene correlations')
print(df_corr)

Gene-Gene correlations
         Spearman CC  Pearson CC
Average     0.314034    0.350478
cascVI      0.326861    0.412718
scVI        0.324289    0.402297


### 2.a.ii MSE in feature space

In [61]:
# Feature-space MSE of each method in the Poisson experiment.
df_mse = pd.read_csv('/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/results/poisson/lambda1.0/500/no_fitness/bin0.1/MSE')

# Remove the 'Unnamed: 0' column (presumably the saved row index -- confirm)
# and keep the 'mean' and 'std' rows of describe().
df_mse.describe().drop(columns=['Unnamed: 0']).iloc[1:3]

Unnamed: 0,cascVI,scVI,Average
mean,5800208.0,5813720.0,7527232.0
std,1603860.0,1388925.0,1368533.0


## 2.b Latent space metrics    

### 2.b.i k-NN purity with K=5 neighbors

In [28]:
# k-NN purity results of the Poisson experiment.
# NOTE(review): this path uses 'lambda1.000001' while the other Poisson cells
# read from 'lambda1.0' -- confirm the results come from the intended run.
df_purity = pd.read_csv('/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/results/poisson/lambda1.000001/500/no_fitness/bin0.1/purity_full')

# 'mean' and 'std' rows of describe() over the rows with K == 5. The stray
# trailing comma that wrapped the table in a 1-tuple has been removed so the
# cell displays the DataFrame itself.
df_purity[df_purity.K == 5].drop(columns=['Unnamed: 0']).describe().iloc[1:3]

(        K      scVI    cascVI
 mean  5.0  0.523079  0.617259
 std   0.0  0.007732  0.006807,)

### 2.b.ii. Cross Entropy

In [62]:
# Latent-space cross-entropy results of the Poisson experiment.
# NOTE(review): the stored output labels the columns 'gaussian VAE' /
# 'gaussian treeVAE' although this is the Poisson section -- confirm the
# labels in the results file.
df_ce = pd.read_csv('/home/eecs/khalil.ouardini/Cassiopeia_Transcriptome/scvi/results/poisson/lambda1.0/500/no_fitness/bin0.1/Cross_Entropy')

# Remove the 'Unnamed: 0' column (presumably the saved row index -- confirm)
# and keep the 'mean' and 'std' rows of describe().
df_ce.drop(columns=['Unnamed: 0']).describe().iloc[1:3]

Unnamed: 0,gaussian VAE,gaussian treeVAE
mean,-8481.557168,-1577.262068
std,1867.786217,646.187172
