In [1]:
import numpy as np, pandas as pd
from scipy.stats import trim_mean
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cdist
import warnings, numpy as np, scanpy as sc, pandas as pd
warnings.filterwarnings("ignore")
from tqdm.auto import tqdm

def calculate_centroids(X, labels):
    """Return the per-class mean of the rows of ``X``.

    Parameters
    ----------
    X : array-like
        One row per observation; indexed with a boolean mask built from ``labels``.
    labels : pandas.Series
        Class label of each row of ``X`` (must provide ``.unique()``).

    Returns
    -------
    dict
        Maps each distinct label, in order of first appearance, to
        ``np.mean`` over that class's rows (``axis=0``).
    """
    return {
        group: np.mean(X[labels == group], axis=0)
        for group in labels.unique()
    }


def calculate_trimmed_means(X, labels, trim_proportion=0.2, ignore_=None):
    """Return the per-class trimmed mean of the rows of ``X``.

    Parameters
    ----------
    X : numpy.ndarray or scipy sparse matrix
        One row per observation. Any scipy sparse format is densified first,
        because ``trim_mean`` needs a dense array.
    labels : pandas.Series
        Class label of each row of ``X`` (must provide ``.unique()``).
    trim_proportion : float, default 0.2
        Fraction of observations cut from *each* tail, per column, before
        averaging (passed to ``scipy.stats.trim_mean`` as ``proportiontocut``).
    ignore_ : container of labels, optional
        Labels to leave out of the result. ``None`` (the default) skips nothing.

    Returns
    -------
    dict
        Maps each retained label, in order of first appearance, to its
        column-wise trimmed mean.
    """
    # Local import: the file only imports csr_matrix, and the check should cover every sparse format.
    from scipy.sparse import issparse

    if issparse(X):
        X = X.toarray()
    # Avoid a shared mutable default; an empty tuple behaves the same for `in` checks.
    skipped = () if ignore_ is None else ignore_

    centroids = dict()
    for label in labels.unique():
        if label in skipped:
            continue
        # Plain boolean ndarray mask so row selection does not depend on how X handles a Series.
        members = np.asarray(labels == label)
        centroids[label] = trim_mean(X[members], proportiontocut=trim_proportion, axis=0)
    return centroids


def compute_classwise_distances(centroids):
    """Build the Euclidean distance matrix between class centroids.

    Parameters
    ----------
    centroids : dict
        Maps a class label to its centroid vector.

    Returns
    -------
    pandas.DataFrame
        Square frame of pairwise Euclidean distances whose rows and columns
        are the class labels in sorted order.
    """
    ordered_labels = sorted(centroids.keys())
    stacked = np.array([centroids[label] for label in ordered_labels])
    pairwise = cdist(stacked, stacked, "euclidean")
    return pd.DataFrame(pairwise, columns=ordered_labels, index=ordered_labels)

In [2]:
def corr_diff(df1, df2):
    """Pearson-correlate matching columns of two frames.

    For every column of ``df1`` the same-named column of ``df2`` is aligned on
    the index, rows with a NaN on either side are dropped, and the Pearson
    correlation of the remaining pairs is computed.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df1`` and a single column named
        ``"Pearson Correlation"``.
    """
    correlations = {}
    for column in df1.columns:
        # Side-by-side alignment on the index, keeping complete pairs only.
        complete_pairs = pd.concat([df1[column], df2[column]], axis=1).dropna()
        left = complete_pairs.iloc[:, 0]
        right = complete_pairs.iloc[:, 1]
        correlations[column] = left.corr(right, method="pearson")
    return pd.DataFrame.from_dict(correlations, orient="index", columns=["Pearson Correlation"])

In [3]:
from scipy.io import mmread
import anndata as an

# E12

In [3]:
# Simulated E12 matrices, one AnnData per benchmarked simulator, plus the real reference data.
sim_dir = '/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/simulated_matrix/h5ad_format/'
cyto = sc.read_h5ad(sim_dir + 'E12f_CytO.h5ad')
spla = sc.read_h5ad(sim_dir + 'E12f_splatter.h5ad')
simp = sc.read_h5ad(sim_dir + 'E12f_splatter_simple.h5ad')
scde = sc.read_h5ad(sim_dir + 'E12f_scDesign3.h5ad')
scva = sc.read_h5ad(sim_dir + 'E12f_scVAEDer.h5ad')
real = sc.read_h5ad('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/data_standard/Zygote_to_CS7.h5ad')

In [4]:
# Keep only the three stages used in this section. .copy() turns the slice (an AnnData
# view) into an independent object before its .var is modified below.
real = real[real.obs.time.isin(['E12', 'E10', 'E14'])].copy()
# Rename var column 'x' to 'features' and use it as the var index
# (presumably the gene names — TODO confirm against the file).
real.var = real.var.rename(columns={'x': 'features'})
real.var.index = real.var.features

In [5]:
# Stack all simulated datasets; the outer join keeps the union of genes and zero-fills gaps.
simulated_sets = [cyto, spla, simp, scde, scva]
adata_int = sc.concat(simulated_sets, join='outer', fill_value=0)

# Append the simulated cells to the real ones and record each cell's origin in obs['source'].
adata = sc.concat(
    [real, adata_int],
    label='source',
    keys=['real', 'simulated'],
    index_unique='-',
)

In [6]:
# Embed the combined real + simulated object: PCA, batch-balanced neighbours across
# obs['source'], then UMAP.
# NOTE(review): every later cell recomputes PCA/BBKNN/UMAP on its own subset, so this
# full-object embedding does not appear to be read by the visible downstream code.
sc.tl.pca(adata, svd_solver='arpack')
sc.external.pp.bbknn(adata, batch_key='source')
sc.tl.umap(adata)



In [7]:
# Snapshot the combined object; the next cell slices subsets out of adata_full and rebinds `adata`.
adata_full = adata.copy()

In [8]:
# For the real data and for each simulator in turn: keep the E12 population under test
# together with the E10 and E14 cells, re-embed that subset (PCA -> BBKNN -> UMAP), take
# the 5%-trimmed mean position of every stage and measure the distances between them.
# NOTE(review): despite the `pca` in the variable names, positions are read from
# obsm["X_umap"], i.e. distances are measured in UMAP space — confirm this is intended.
pairdist_by_method = {}
for method_key, target_label in [
    ('real', 'E12'),
    ('cyto', 'E12-CytOrigin'),
    ('spla', 'E12-Splatter'),
    ('simp', 'E12-Splatter-Simple'),
    ('scde', 'E12-scDesign3'),
    ('scva', 'E12-scVAEDer'),
]:
    # .copy() materialises the slice so scanpy writes its results into an independent
    # AnnData rather than into a view of adata_full.
    adata = adata_full[adata_full.obs.time.isin([target_label, 'E14', 'E10'])].copy()
    sc.tl.pca(adata, svd_solver='arpack')
    sc.external.pp.bbknn(adata, batch_key='source')
    sc.tl.umap(adata)
    centroids_pca = calculate_trimmed_means(
        adata.obsm["X_umap"],
        adata.obs["time"],
        trim_proportion=0.05,
    )
    pairdist_by_method[method_key] = compute_classwise_distances(centroids_pca)

# Keep the per-method names that the following cells use.
pca_pairdist_real = pairdist_by_method['real']
pca_pairdist_cyto = pairdist_by_method['cyto']
pca_pairdist_spla = pairdist_by_method['spla']
pca_pairdist_simp = pairdist_by_method['simp']
pca_pairdist_scde = pairdist_by_method['scde']
pca_pairdist_scva = pairdist_by_method['scva']



In [9]:
# Simulated stages carry a "-<Simulator>" suffix (e.g. 'E12-CytOrigin'). Strip it so every
# simulated distance matrix shares the real matrix's labels and corr_diff can pair columns.
stage_labels = ['E10', 'E12', 'E14']
for sim_pairdist in (pca_pairdist_cyto, pca_pairdist_scde, pca_pairdist_scva,
                     pca_pairdist_spla, pca_pairdist_simp):
    # The relabelling is positional, so first verify that the existing (sorted) labels
    # really are these stages once the suffix is removed; otherwise rows would be mislabelled.
    current_prefixes = [str(label).split('-')[0] for label in sim_pairdist.index]
    if current_prefixes != stage_labels:
        raise ValueError(
            f"unexpected stage labels {list(sim_pairdist.index)}; expected prefixes {stage_labels}"
        )
    sim_pairdist.index = list(stage_labels)
    sim_pairdist.columns = list(stage_labels)

In [10]:
# Correlate each simulator's stage-to-stage distance profile with the real one, column by column.
(index_pearson_cyto,
 index_pearson_scva,
 index_pearson_scde,
 index_pearson_spla,
 index_pearson_simp) = (
    corr_diff(pca_pairdist_real, simulated_pairdist)
    for simulated_pairdist in (pca_pairdist_cyto, pca_pairdist_scva, pca_pairdist_scde,
                               pca_pairdist_spla, pca_pairdist_simp)
)

In [11]:
# One column per simulator (rows = stages); the mapping keeps each name next to its table.
method_tables = {
    'CytOrigin': index_pearson_cyto,
    'scVAEDer': index_pearson_scva,
    'scDesign3': index_pearson_scde,
    'Splatter': index_pearson_spla,
    'Splatter-Simple': index_pearson_simp,
}
index_pearson = pd.concat(list(method_tables.values()), axis=1)
index_pearson.columns = list(method_tables.keys())
index_pearson.to_csv('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/E12_index_pearson.csv')

In [12]:
# For every method, express the E12-to-E10 and E12-to-E14 centroid distances as fractions
# of the E10-to-E14 distance; E12's distance to itself is entered as 0.
relative_by_method = {}
for method_key, pairdist in [
    ('real', pca_pairdist_real),
    ('cyto', pca_pairdist_cyto),
    ('scva', pca_pairdist_scva),
    ('scde', pca_pairdist_scde),
    ('spla', pca_pairdist_spla),
    ('simp', pca_pairdist_simp),
]:
    flank_span = pairdist.loc['E10', 'E14']  # distance between the two flanking stages
    relative_by_method[method_key] = pd.DataFrame.from_dict(
        {
            'E10': pairdist.loc['E10', 'E12'] / flank_span,
            'E12': 0,
            'E14': pairdist.loc['E14', 'E12'] / flank_span,
        },
        orient='index',
        columns=['E12'],
    )

pca_relative_real = relative_by_method['real']
pca_relative_cyto = relative_by_method['cyto']
pca_relative_scva = relative_by_method['scva']
pca_relative_scde = relative_by_method['scde']
pca_relative_spla = relative_by_method['spla']
pca_relative_simp = relative_by_method['simp']

In [13]:
# Correlate each simulator's relative-distance column with the real one.
(rela_pearson_cyto,
 rela_pearson_scva,
 rela_pearson_scde,
 rela_pearson_spla,
 rela_pearson_simp) = (
    corr_diff(pca_relative_real, simulated_relative)
    for simulated_relative in (pca_relative_cyto, pca_relative_scva, pca_relative_scde,
                               pca_relative_spla, pca_relative_simp)
)

In [16]:
# One column per simulator; the mapping keeps each name next to its table.
relative_tables = {
    'CytOrigin': rela_pearson_cyto,
    'scVAEDer': rela_pearson_scva,
    'scDesign3': rela_pearson_scde,
    'Splatter': rela_pearson_spla,
    'Splatter-Simple': rela_pearson_simp,
}
rela_pearson = pd.concat(list(relative_tables.values()), axis=1)
rela_pearson.columns = list(relative_tables.keys())
rela_pearson.to_csv('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/E12_relative_pearson.csv')

# E14

In [122]:
# Simulated E14 matrices, one AnnData per benchmarked simulator, plus the real reference data.
sim_dir = '/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/simulated_matrix/h5ad_format/'
cyto = sc.read_h5ad(sim_dir + 'E14f_CytO.h5ad')
spla = sc.read_h5ad(sim_dir + 'E14f_splatter.h5ad')
simp = sc.read_h5ad(sim_dir + 'E14f_splatter_simple.h5ad')
scde = sc.read_h5ad(sim_dir + 'E14f_scDesign3.h5ad')
scva = sc.read_h5ad(sim_dir + 'E14f_scVAEDer.h5ad')
real = sc.read_h5ad('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/data_standard/Zygote_to_CS7.h5ad')

In [124]:
# Keep only the three stages used in this section. .copy() turns the slice (an AnnData
# view) into an independent object before its .var is modified below.
real = real[real.obs.time.isin(['E12', 'CS7', 'E14'])].copy()
# Rename var column 'x' to 'features' and use it as the var index
# (presumably the gene names — TODO confirm against the file).
real.var = real.var.rename(columns={'x': 'features'})
real.var.index = real.var.features

In [125]:
# Stack all simulated datasets; the outer join keeps the union of genes and zero-fills gaps.
simulated_sets = [cyto, spla, simp, scde, scva]
adata_int = sc.concat(simulated_sets, join='outer', fill_value=0)

# Append the simulated cells to the real ones and record each cell's origin in obs['source'].
adata_full = sc.concat(
    [real, adata_int],
    label='source',
    keys=['real', 'simulated'],
    index_unique='-',
)

In [127]:
# For the real data and for each simulator in turn: keep the E14 population under test
# together with the CS7 and E12 cells, re-embed that subset (PCA -> BBKNN -> UMAP), take
# the 5%-trimmed mean position of every stage and measure the distances between them.
# NOTE(review): despite the `pca` in the variable names, positions are read from
# obsm["X_umap"], i.e. distances are measured in UMAP space — confirm this is intended.
pairdist_by_method = {}
for method_key, target_label in [
    ('real', 'E14'),
    ('cyto', 'E14-CytOrigin'),
    ('spla', 'E14-Splatter'),
    ('simp', 'E14-Splatter-Simple'),
    ('scde', 'E14-scDesign3'),
    ('scva', 'E14-scVAEDer'),
]:
    # .copy() materialises the slice so scanpy writes its results into an independent
    # AnnData rather than into a view of adata_full.
    adata = adata_full[adata_full.obs.time.isin([target_label, 'CS7', 'E12'])].copy()
    sc.tl.pca(adata, svd_solver='arpack')
    sc.external.pp.bbknn(adata, batch_key='source')
    sc.tl.umap(adata)
    centroids_pca = calculate_trimmed_means(
        adata.obsm["X_umap"],
        adata.obs["time"],
        trim_proportion=0.05,
    )
    pairdist_by_method[method_key] = compute_classwise_distances(centroids_pca)

# Keep the per-method names that the following cells use.
pca_pairdist_real = pairdist_by_method['real']
pca_pairdist_cyto = pairdist_by_method['cyto']
pca_pairdist_spla = pairdist_by_method['spla']
pca_pairdist_simp = pairdist_by_method['simp']
pca_pairdist_scde = pairdist_by_method['scde']
pca_pairdist_scva = pairdist_by_method['scva']



In [None]:
# Simulated stages carry a "-<Simulator>" suffix (e.g. 'E14-CytOrigin'). Strip it so every
# simulated distance matrix shares the real matrix's labels and corr_diff can pair columns.
stage_labels = ['CS7', 'E12', 'E14']
for sim_pairdist in (pca_pairdist_cyto, pca_pairdist_scde, pca_pairdist_scva,
                     pca_pairdist_spla, pca_pairdist_simp):
    # The relabelling is positional, so first verify that the existing (sorted) labels
    # really are these stages once the suffix is removed; otherwise rows would be mislabelled.
    current_prefixes = [str(label).split('-')[0] for label in sim_pairdist.index]
    if current_prefixes != stage_labels:
        raise ValueError(
            f"unexpected stage labels {list(sim_pairdist.index)}; expected prefixes {stage_labels}"
        )
    sim_pairdist.index = list(stage_labels)
    sim_pairdist.columns = list(stage_labels)

In [None]:
# Correlate each simulator's stage-to-stage distance profile with the real one, column by column.
(index_pearson_cyto,
 index_pearson_scva,
 index_pearson_scde,
 index_pearson_spla,
 index_pearson_simp) = (
    corr_diff(pca_pairdist_real, simulated_pairdist)
    for simulated_pairdist in (pca_pairdist_cyto, pca_pairdist_scva, pca_pairdist_scde,
                               pca_pairdist_spla, pca_pairdist_simp)
)

In [None]:
# One column per simulator (rows = stages); the mapping keeps each name next to its table.
method_tables = {
    'CytOrigin': index_pearson_cyto,
    'scVAEDer': index_pearson_scva,
    'scDesign3': index_pearson_scde,
    'Splatter': index_pearson_spla,
    'Splatter-Simple': index_pearson_simp,
}
index_pearson = pd.concat(list(method_tables.values()), axis=1)
index_pearson.columns = list(method_tables.keys())
index_pearson.to_csv('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/E14_index_pearson.csv')

Unnamed: 0,CytOrigin,scVAEDer,scDesign3,Splatter,Splatter-Simple
CS10,0.411863,0.393602,-0.007683,0.98413,0.999602
CS11,0.773717,0.233569,0.787159,0.996481,0.999859
CS9,0.960968,0.939361,0.791428,0.96634,0.999933


In [136]:
# For every method, express the E14-to-E12 and E14-to-CS7 centroid distances as fractions
# of the E12-to-CS7 distance; E14's distance to itself is entered as 0.
relative_by_method = {}
for method_key, pairdist in [
    ('real', pca_pairdist_real),
    ('cyto', pca_pairdist_cyto),
    ('scva', pca_pairdist_scva),
    ('scde', pca_pairdist_scde),
    ('spla', pca_pairdist_spla),
    ('simp', pca_pairdist_simp),
]:
    flank_span = pairdist.loc['E12', 'CS7']  # distance between the two flanking stages
    relative_by_method[method_key] = pd.DataFrame.from_dict(
        {
            'E12': pairdist.loc['E12', 'E14'] / flank_span,
            'E14': 0,
            'CS7': pairdist.loc['CS7', 'E14'] / flank_span,
        },
        orient='index',
        columns=['E14'],
    )

pca_relative_real = relative_by_method['real']
pca_relative_cyto = relative_by_method['cyto']
pca_relative_scva = relative_by_method['scva']
pca_relative_scde = relative_by_method['scde']
pca_relative_spla = relative_by_method['spla']
pca_relative_simp = relative_by_method['simp']

In [137]:
# Correlate each simulator's relative-distance column with the real one.
(rela_pearson_cyto,
 rela_pearson_scva,
 rela_pearson_scde,
 rela_pearson_spla,
 rela_pearson_simp) = (
    corr_diff(pca_relative_real, simulated_relative)
    for simulated_relative in (pca_relative_cyto, pca_relative_scva, pca_relative_scde,
                               pca_relative_spla, pca_relative_simp)
)

In [141]:
# One column per simulator; the mapping keeps each name next to its table.
relative_tables = {
    'CytOrigin': rela_pearson_cyto,
    'scVAEDer': rela_pearson_scva,
    'scDesign3': rela_pearson_scde,
    'Splatter': rela_pearson_spla,
    'Splatter-Simple': rela_pearson_simp,
}
rela_pearson = pd.concat(list(relative_tables.values()), axis=1)
rela_pearson.columns = list(relative_tables.keys())
rela_pearson.to_csv('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/E14_relative_pearson.csv')

# CS10

In [4]:
# Simulated CS10 matrices, one AnnData per benchmarked simulator.
sim_dir = '/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/simulated_matrix/h5ad_format/'
cyto = sc.read_h5ad(sim_dir + 'CS10f_CytO.h5ad')
spla = sc.read_h5ad(sim_dir + 'CS10f_splatter.h5ad')
simp = sc.read_h5ad(sim_dir + 'CS10f_splatter_simple.h5ad')
scde = sc.read_h5ad(sim_dir + 'CS10f_scDesign3.h5ad')
scva = sc.read_h5ad(sim_dir + 'CS10f_scVAEDer.h5ad')
# Only the real CS10 cells are read from h5ad; the standalone CS9.h5ad / CS11.h5ad files are
# not loaded (presumably those stages come from the CS9_CS11_2CS10 matrix read in a later cell).
real2 = sc.read_h5ad('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/data_standard/CS10.h5ad')

In [5]:
# Rename var column 'x' to 'features' and use it as the var index
# (presumably the gene names — TODO confirm against the file).
real2.var = real2.var.rename(columns = {'x':'features'})
real2.var.index = real2.var.features
# Tag every cell of this object with its stage so it can be selected through obs.time later.
real2.obs['time'] = 'CS10'

In [6]:
# Build an AnnData from the Matrix Market export: the matrix is transposed on load and
# densified, with the cell table as obs and the gene table as var.
source_prefix = '/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/source_matrix/CS9_CS11_2CS10'
count_matrix = mmread(source_prefix + '_counts.mtx').T
barcodes = pd.read_csv(source_prefix + '_cell.csv', index_col=0)
gene_ids = pd.read_csv(source_prefix + '_gene.csv', index_col=0)
real1 = an.AnnData(
    X=count_matrix.toarray(),
    obs=barcodes,
    var=gene_ids,
)

In [16]:
# Scale every cell of real1 to a total of 1e4, then log1p-transform; both calls modify real1.
# NOTE(review): not idempotent — re-running this cell transforms real1 a second time.
sc.pp.normalize_total(real1, target_sum=1e4)
sc.pp.log1p(real1)

In [17]:
# Combine the matrix-derived object (real1) with the CS10 h5ad (real2); the outer join keeps
# the union of genes and zero-fills entries a dataset lacks.
real = sc.concat([real1,real2],
                      join = 'outer',
                      fill_value = 0)

In [18]:
# Stack all simulated datasets; the outer join keeps the union of genes and zero-fills gaps.
simulated_sets = [cyto, spla, simp, scde, scva]
adata_int = sc.concat(simulated_sets, join='outer', fill_value=0)

# Append the simulated cells to the real ones and record each cell's origin in obs['source'].
adata_full = sc.concat(
    [real, adata_int],
    label='source',
    keys=['real', 'simulated'],
    index_unique='-',
)

In [28]:
# Inspect the combined object.
# NOTE(review): this cell's execution count (28) is higher than that of the HVG cell below
# (27), and the saved output already shows 8000 variables — the cells were run out of order.
adata_full

View of AnnData object with n_obs × n_vars = 120254 × 8000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'cell_type', 'time', 'merge_type', 'source'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg'
    obsm: 'X_pca', 'X_umap'

In [27]:
# Flag the 8000 most variable genes on the combined object and keep only those columns.
# NOTE(review): the slice rebinds adata_full to a view of itself, so re-running this cell
# recomputes the selection on the already-reduced matrix.
sc.pp.highly_variable_genes(adata_full, n_top_genes = 8000)
adata_full = adata_full[:, adata_full.var.highly_variable]

In [29]:
# For the real data and for each simulator in turn: keep the CS10 population under test
# together with the CS9 and CS11 cells, re-embed that subset (PCA -> BBKNN -> UMAP), take
# the 5%-trimmed mean position of every stage and measure the distances between them.
# NOTE(review): despite the `pca` in the variable names, positions are read from
# obsm["X_umap"], i.e. distances are measured in UMAP space — confirm this is intended.
pairdist_by_method = {}
for method_key, target_label in [
    ('real', 'CS10'),
    ('cyto', 'CS10-CytOrigin'),
    ('spla', 'CS10-Splatter'),
    ('simp', 'CS10-Splatter-Simple'),
    ('scde', 'CS10-scDesign3'),
    ('scva', 'CS10-scVAEDer'),
]:
    # .copy() materialises the slice so scanpy writes its results into an independent
    # AnnData rather than into a view of adata_full.
    adata = adata_full[adata_full.obs.time.isin([target_label, 'CS11', 'CS9'])].copy()
    sc.tl.pca(adata, svd_solver='arpack')
    sc.external.pp.bbknn(adata, batch_key='source')
    sc.tl.umap(adata)
    centroids_pca = calculate_trimmed_means(
        adata.obsm["X_umap"],
        adata.obs["time"],
        trim_proportion=0.05,
    )
    pairdist_by_method[method_key] = compute_classwise_distances(centroids_pca)

# Keep the per-method names that the following cells use.
pca_pairdist_real = pairdist_by_method['real']
pca_pairdist_cyto = pairdist_by_method['cyto']
pca_pairdist_spla = pairdist_by_method['spla']
pca_pairdist_simp = pairdist_by_method['simp']
pca_pairdist_scde = pairdist_by_method['scde']
pca_pairdist_scva = pairdist_by_method['scva']



In [30]:
# Simulated stages carry a "-<Simulator>" suffix (e.g. 'CS10-CytOrigin'). Strip it so every
# simulated distance matrix shares the real matrix's labels and corr_diff can pair columns.
# The order is the string-sorted one ('CS10' < 'CS11' < 'CS9'), not the developmental one.
stage_labels = ['CS10', 'CS11', 'CS9']
for sim_pairdist in (pca_pairdist_cyto, pca_pairdist_scde, pca_pairdist_scva,
                     pca_pairdist_spla, pca_pairdist_simp):
    # The relabelling is positional, so first verify that the existing (sorted) labels
    # really are these stages once the suffix is removed; otherwise rows would be mislabelled.
    current_prefixes = [str(label).split('-')[0] for label in sim_pairdist.index]
    if current_prefixes != stage_labels:
        raise ValueError(
            f"unexpected stage labels {list(sim_pairdist.index)}; expected prefixes {stage_labels}"
        )
    sim_pairdist.index = list(stage_labels)
    sim_pairdist.columns = list(stage_labels)

In [31]:
# Correlate each simulator's stage-to-stage distance profile with the real one, column by column.
(index_pearson_cyto,
 index_pearson_scva,
 index_pearson_scde,
 index_pearson_spla,
 index_pearson_simp) = (
    corr_diff(pca_pairdist_real, simulated_pairdist)
    for simulated_pairdist in (pca_pairdist_cyto, pca_pairdist_scva, pca_pairdist_scde,
                               pca_pairdist_spla, pca_pairdist_simp)
)

In [36]:
# One column per simulator (rows = stages); the mapping keeps each name next to its table.
# These values come from the run restricted to 8000 highly variable genes.
method_tables = {
    'CytOrigin': index_pearson_cyto,
    'scVAEDer': index_pearson_scva,
    'scDesign3': index_pearson_scde,
    'Splatter': index_pearson_spla,
    'Splatter-Simple': index_pearson_simp,
}
index_pearson = pd.concat(list(method_tables.values()), axis=1)
index_pearson.columns = list(method_tables.keys())
index_pearson.to_csv('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/CS10_index_pearson.csv')

In [33]:
# For every method, express the CS10-to-CS9 and CS10-to-CS11 centroid distances as fractions
# of the CS9-to-CS11 distance; CS10's distance to itself is entered as 0.
relative_by_method = {}
for method_key, pairdist in [
    ('real', pca_pairdist_real),
    ('cyto', pca_pairdist_cyto),
    ('scva', pca_pairdist_scva),
    ('scde', pca_pairdist_scde),
    ('spla', pca_pairdist_spla),
    ('simp', pca_pairdist_simp),
]:
    flank_span = pairdist.loc['CS9', 'CS11']  # distance between the two flanking stages
    relative_by_method[method_key] = pd.DataFrame.from_dict(
        {
            'CS9': pairdist.loc['CS9', 'CS10'] / flank_span,
            'CS10': 0,
            'CS11': pairdist.loc['CS11', 'CS10'] / flank_span,
        },
        orient='index',
        columns=['CS10'],
    )

pca_relative_real = relative_by_method['real']
pca_relative_cyto = relative_by_method['cyto']
pca_relative_scva = relative_by_method['scva']
pca_relative_scde = relative_by_method['scde']
pca_relative_spla = relative_by_method['spla']
pca_relative_simp = relative_by_method['simp']

In [34]:
# Correlate each simulator's relative-distance column with the real one.
(rela_pearson_cyto,
 rela_pearson_scva,
 rela_pearson_scde,
 rela_pearson_spla,
 rela_pearson_simp) = (
    corr_diff(pca_relative_real, simulated_relative)
    for simulated_relative in (pca_relative_cyto, pca_relative_scva, pca_relative_scde,
                               pca_relative_spla, pca_relative_simp)
)

In [37]:
# One column per simulator; the mapping keeps each name next to its table.
# These values come from the run restricted to 8000 highly variable genes.
relative_tables = {
    'CytOrigin': rela_pearson_cyto,
    'scVAEDer': rela_pearson_scva,
    'scDesign3': rela_pearson_scde,
    'Splatter': rela_pearson_spla,
    'Splatter-Simple': rela_pearson_simp,
}
rela_pearson = pd.concat(list(relative_tables.values()), axis=1)
rela_pearson.columns = list(relative_tables.keys())
rela_pearson.to_csv('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/CS10_relative_pearson.csv')

# CS11

In [38]:
# Simulated CS11 matrices, one AnnData per benchmarked simulator, plus the real CS11 cells.
sim_dir = '/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/simulated_matrix/h5ad_format/'
cyto = sc.read_h5ad(sim_dir + 'CS11f_CytO.h5ad')
spla = sc.read_h5ad(sim_dir + 'CS11f_splatter.h5ad')
simp = sc.read_h5ad(sim_dir + 'CS11f_splatter_simple.h5ad')
scde = sc.read_h5ad(sim_dir + 'CS11f_scDesign3.h5ad')
scva = sc.read_h5ad(sim_dir + 'CS11f_scVAEDer.h5ad')
real2 = sc.read_h5ad('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/data_standard/CS11.h5ad')

# Prepare real2 the way the CS10 section prepares its real2 (features as var index, stage
# tag in obs['time']). The execution counts in this section jump from 39 to 41, so the cell
# that did this was apparently deleted; on a fresh kernel the later obs.time selection may
# then find no real 'CS11' cells.
# NOTE(review): assumes CS11.h5ad has the same layout as CS10.h5ad — confirm. Each step is
# guarded so it is skipped when the file already has the expected fields.
if 'x' in real2.var.columns:
    real2.var = real2.var.rename(columns={'x': 'features'})
if 'features' in real2.var.columns:
    real2.var.index = real2.var.features
if 'time' not in real2.obs.columns:
    real2.obs['time'] = 'CS11'

In [39]:
# Build an AnnData from the Matrix Market export: the matrix is transposed on load and
# densified, with the cell table as obs and the gene table as var.
source_prefix = '/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/source_matrix/CS10_CS12_2CS11'
count_matrix = mmread(source_prefix + '_counts.mtx').T
barcodes = pd.read_csv(source_prefix + '_cell.csv', index_col=0)
gene_ids = pd.read_csv(source_prefix + '_gene.csv', index_col=0)
real1 = an.AnnData(
    X=count_matrix.toarray(),
    obs=barcodes,
    var=gene_ids,
)

In [41]:
# Scale every cell of real1 to a total of 1e4, then log1p-transform; both calls modify real1.
# NOTE(review): not idempotent — re-running this cell transforms real1 a second time.
sc.pp.normalize_total(real1, target_sum=1e4)
sc.pp.log1p(real1)

In [42]:
# Combine the matrix-derived object (real1) with the CS11 h5ad (real2); the outer join keeps
# the union of genes and zero-fills entries a dataset lacks.
real = sc.concat([real1,real2],
                      join = 'outer',
                      fill_value = 0)

In [43]:
# Stack all simulated datasets; the outer join keeps the union of genes and zero-fills gaps.
simulated_sets = [cyto, spla, simp, scde, scva]
adata_int = sc.concat(simulated_sets, join='outer', fill_value=0)

# Append the simulated cells to the real ones and record each cell's origin in obs['source'].
adata_full = sc.concat(
    [real, adata_int],
    label='source',
    keys=['real', 'simulated'],
    index_unique='-',
)

In [44]:
# Flag the 8000 most variable genes on the combined object and keep only those columns.
# NOTE(review): the slice rebinds adata_full to a subset of itself, so re-running this cell
# recomputes the selection on the already-reduced matrix.
sc.pp.highly_variable_genes(adata_full, n_top_genes = 8000)
adata_full = adata_full[:, adata_full.var.highly_variable]

In [45]:
# For the real data and for each simulator in turn: keep the CS11 population under test
# together with the CS10 and CS12 cells, re-embed that subset (PCA -> BBKNN -> UMAP), take
# the 5%-trimmed mean position of every stage and measure the distances between them.
# NOTE(review): despite the `pca` in the variable names, positions are read from
# obsm["X_umap"], i.e. distances are measured in UMAP space — confirm this is intended.
pairdist_by_method = {}
for method_key, target_label in [
    ('real', 'CS11'),
    ('cyto', 'CS11-CytOrigin'),
    ('spla', 'CS11-Splatter'),
    ('simp', 'CS11-Splatter-Simple'),
    ('scde', 'CS11-scDesign3'),
    ('scva', 'CS11-scVAEDer'),
]:
    # .copy() materialises the slice so scanpy writes its results into an independent
    # AnnData rather than into a view of adata_full.
    adata = adata_full[adata_full.obs.time.isin([target_label, 'CS12', 'CS10'])].copy()
    sc.tl.pca(adata, svd_solver='arpack')
    sc.external.pp.bbknn(adata, batch_key='source')
    sc.tl.umap(adata)
    centroids_pca = calculate_trimmed_means(
        adata.obsm["X_umap"],
        adata.obs["time"],
        trim_proportion=0.05,
    )
    pairdist_by_method[method_key] = compute_classwise_distances(centroids_pca)

# Keep the per-method names that the following cells use.
pca_pairdist_real = pairdist_by_method['real']
pca_pairdist_cyto = pairdist_by_method['cyto']
pca_pairdist_spla = pairdist_by_method['spla']
pca_pairdist_simp = pairdist_by_method['simp']
pca_pairdist_scde = pairdist_by_method['scde']
pca_pairdist_scva = pairdist_by_method['scva']



In [48]:
# Simulated stages carry a "-<Simulator>" suffix (e.g. 'CS11-CytOrigin'). Strip it so every
# simulated distance matrix shares the real matrix's labels and corr_diff can pair columns.
stage_labels = ['CS10', 'CS11', 'CS12']
for sim_pairdist in (pca_pairdist_cyto, pca_pairdist_scde, pca_pairdist_scva,
                     pca_pairdist_spla, pca_pairdist_simp):
    # The relabelling is positional, so first verify that the existing (sorted) labels
    # really are these stages once the suffix is removed; otherwise rows would be mislabelled.
    current_prefixes = [str(label).split('-')[0] for label in sim_pairdist.index]
    if current_prefixes != stage_labels:
        raise ValueError(
            f"unexpected stage labels {list(sim_pairdist.index)}; expected prefixes {stage_labels}"
        )
    sim_pairdist.index = list(stage_labels)
    sim_pairdist.columns = list(stage_labels)

In [49]:
# Correlate each simulator's stage-to-stage distance profile with the real one, column by column.
(index_pearson_cyto,
 index_pearson_scva,
 index_pearson_scde,
 index_pearson_spla,
 index_pearson_simp) = (
    corr_diff(pca_pairdist_real, simulated_pairdist)
    for simulated_pairdist in (pca_pairdist_cyto, pca_pairdist_scva, pca_pairdist_scde,
                               pca_pairdist_spla, pca_pairdist_simp)
)

In [51]:
# One column per simulator (rows = stages); the mapping keeps each name next to its table.
# These values come from the run restricted to 8000 highly variable genes.
method_tables = {
    'CytOrigin': index_pearson_cyto,
    'scVAEDer': index_pearson_scva,
    'scDesign3': index_pearson_scde,
    'Splatter': index_pearson_spla,
    'Splatter-Simple': index_pearson_simp,
}
index_pearson = pd.concat(list(method_tables.values()), axis=1)
index_pearson.columns = list(method_tables.keys())
index_pearson.to_csv('/slurm/home/yrd/liaolab/caohaoxue/embryo_work/benchmark/CS11_index_pearson.csv')