In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
 
import scvi
import scanpy as sc

Global seed set to 0


In [None]:
# Directory that holds both input .h5ad files. Keeping it in one constant means
# the notebook can be pointed at a different mount by editing a single line.
DATA_DIR = '/home/jupyter/mount/sanger_gdrive/YS/totalvi_review_1/YS_and_Liver_combined'

# Read the RNA and protein matrices of the combined YS + Liver CITE-seq dataset
# (per the file names, these are the raw, modality-intersected versions).
RNA = sc.read(f'{DATA_DIR}/YS_Liver_CITEseq_RNA_raw_modality_intersect_20220927.h5ad')
Protein = sc.read(f'{DATA_DIR}/YS_Liver_CITEseq_Protein_raw_modality_intersect_20220927.h5ad')

In [None]:
# Restrict both modalities to cells whose 'Organ' annotation is 'YS'.
# The masks are computed first, then applied, so each step can be inspected.
rna_is_ys = RNA.obs['Organ'].isin(['YS'])
protein_is_ys = Protein.obs['Organ'].isin(['YS'])

RNA = RNA[rna_is_ys]
Protein = Protein[protein_is_ys]

# totalVI

In [None]:
# Take an independent copy of the YS-only RNA object so that the in-place
# preprocessing applied to `adata1` afterwards does not modify `RNA`.
adata1 = RNA.copy()

In [None]:
# Wrap the protein matrix in a sparse DataFrame labelled by the Protein
# object's own cell and feature indices, then attach it to the RNA object.
# NOTE(review): this assumes Protein.X is a scipy sparse matrix and that the
# Protein cells are in the same order as adata1's cells - confirm.
protein_frame = pd.DataFrame.sparse.from_spmatrix(
    Protein.X,
    index=Protein.obs.index,
    columns=Protein.var.index,
)
adata1.obsm['protein_expression'] = protein_frame

# Keep an untouched copy of X in the "counts" layer before X is transformed.
adata1.layers["counts"] = adata1.X.copy()

# Library-size normalise to 1e4 per cell and log1p-transform X in place.
sc.pp.normalize_total(adata1, target_sum=1e4)
sc.pp.log1p(adata1)

# Snapshot the normalised, log-transformed object (all genes) in .raw before
# any gene subsetting happens.
adata1.raw = adata1

In [None]:
# Highly-variable-gene selection settings, gathered in one place.
# flavor="seurat_v3" works on count data, hence layer="counts" rather than
# the log-normalised X. Selection is done per 'identifier' batch and
# subset=True reduces adata1 to the selected genes in place.
hvg_settings = dict(
    n_top_genes=4000,
    flavor="seurat_v3",
    batch_key="identifier",
    subset=True,
    layer="counts",
)
sc.pp.highly_variable_genes(adata1, **hvg_settings)

In [None]:
# Separate copy used for totalVI setup and training; `adata1` is kept as it
# was after gene selection.
adata = adata1.copy()

# Set up totalVI inputs (organ can be added as an additional categorical covariate; it is commented out below)

In [None]:
# Tell totalVI where to find its inputs on `adata`:
#   - protein measurements in obsm["protein_expression"]
#   - RNA counts in layers["counts"]
#   - batch labels in obs["identifier"]
# The optional organ covariate (categorical_covariate_keys = ['Organ']) is
# not passed in this run.
totalvi_fields = dict(
    protein_expression_obsm_key="protein_expression",
    layer="counts",
    batch_key="identifier",
)
scvi.model.TOTALVI.setup_anndata(adata, **totalvi_fields)

In [None]:
# Build the totalVI model on the registered AnnData, using a normal
# distribution for the latent space.
vae = scvi.model.TOTALVI(adata, latent_distribution="normal")

In [None]:
# Fit the model; no arguments are overridden, so the library's default
# training settings apply.
vae.train()

In [None]:
# Plot the train and validation ELBO curves recorded in the model history so
# that convergence and over-fitting can be judged by eye.
# An explicit Figure/Axes pair is used instead of the implicit pyplot state,
# and both axes are labelled so the figure stands on its own.
fig, ax = plt.subplots()
ax.plot(vae.history["elbo_train"], label="train")
ax.plot(vae.history["elbo_validation"], label="validation")
ax.set_title("Negative ELBO over training epochs")
ax.set_xlabel("Epoch")
ax.set_ylabel("Negative ELBO")
#ax.set_ylim(1200, 1400)  # optional zoom on the converged region
ax.legend();  # trailing ';' keeps the Legend repr out of the cell output

In [None]:
# Latent embedding of every cell, stored alongside the data.
adata.obsm["X_totalVI"] = vae.get_latent_representation()

# Both posterior queries below share the same settings: 25 posterior samples,
# averaged, with expression transformed into every 'identifier' batch.
all_batches = list(adata.obs['identifier'].unique())
posterior_settings = dict(
    n_samples=25,
    return_mean=True,
    transform_batch=all_batches,
)

# Denoised RNA goes into a layer, denoised protein into obsm.
rna, protein = vae.get_normalized_expression(**posterior_settings)
adata.layers["denoised_rna"] = rna
adata.obsm["denoised_protein"] = protein

# Per-protein foreground probability, stored in obsm.
adata.obsm["protein_foreground_prob"] = vae.get_protein_foreground_probability(
    **posterior_settings
)

# Relabel the foreground-probability columns with the part of each protein
# name that precedes the first underscore.
parsed_protein_names = [
    full_name.split("_")[0]
    for full_name in adata.obsm["protein_expression"].columns
]
adata.obsm["protein_foreground_prob"].columns = parsed_protein_names

In [None]:
# Build the neighbour graph on the totalVI latent representation (not on X).
sc.pp.neighbors(adata, use_rep="X_totalVI")
# Compute the UMAP embedding from that graph.
sc.tl.umap(adata)

In [None]:
# UMAP coloured by the 'cell_labels_lvl2' annotation.
sc.pl.umap(adata, color='cell_labels_lvl2')

In [None]:
# UMAP coloured by 'Organ'.
# NOTE(review): the data were filtered to Organ == 'YS' earlier in this
# notebook, so this plot presumably shows a single category - confirm it is
# still wanted.
sc.pl.umap(adata, color='Organ')

In [None]:
# UMAP coloured by 'identifier', the column used as the batch key, to inspect
# how well batches mix in the latent space.
sc.pl.umap(adata, color='identifier')

In [None]:
# Persist the trained model to a run-specific directory.
# NOTE(review): no overwrite flag is passed; if this directory already exists
# from a previous run the call may fail - confirm the intended behaviour for
# re-runs.
vae.save("./totalvi_model_run_YS_20220928/")

In [None]:
# Export the per-cell annotation table (obs) as CSV.
adata.obs.to_csv('./obs_for_totalvi_model_run_YS_20220928.csv')

In [None]:
# Export the totalVI latent matrix as a comma-separated file. No header or
# index is passed, so only the values are written; rows presumably follow the
# same cell order as the exported obs table - verify before joining.
np.savetxt("./latent_X_totalVI_for_totalvi_model_run_YS_20220928.csv", adata.obsm['X_totalVI'], delimiter=",")

In [None]:
# Export the per-gene annotation table (var) as CSV.
adata.var.to_csv('./var_for_totalvi_model_run_YS_20220928.csv')

In [None]:
# Export the UMAP coordinates as a comma-separated file. No header or index
# is passed, so only the values are written.
np.savetxt("./totalvi_umap_for_totalvi_model_run_YS_20220928.csv", adata.obsm['X_umap'], delimiter=",")

In [None]:
# Export the denoised protein expression stored in obsm as CSV.
adata.obsm['denoised_protein'].to_csv('./denoised_protein_for_totalvi_model_run_YS_20220928.csv')

In [None]:
# Export the protein foreground probabilities stored in obsm as CSV.
adata.obsm['protein_foreground_prob'].to_csv('./protein_foreground_prob_for_totalvi_model_run_YS_20220928.csv')

In [None]:
# Export the denoised RNA layer as a comma-separated file. No header or index
# is passed, so gene and cell labels are not included in the file.
# NOTE(review): this writes a full cells x genes matrix as text, which may be
# large and slow - consider a binary format if that becomes a problem.
np.savetxt("./denoised_rna_for_totalvi_model_run_YS_20220928.csv", adata.layers['denoised_rna'], delimiter=",")