## Set path

In [1]:
import os
# Resolve the input and output locations relative to the current working directory.
dataset_dir = os.path.join(os.getcwd(), 'datasets/')
outputs_dir = os.path.join(os.getcwd(), 'outputs/')
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race
# of testing os.path.exists() before calling os.makedirs().
os.makedirs(outputs_dir, exist_ok=True)

# Everything this notebook writes goes into this sub-folder of the outputs directory.
save_dir = os.path.join(outputs_dir, "different samples/CITE-PBMC-Li-Group1toGroup2/TotalVI/")
os.makedirs(save_dir, exist_ok=True)

## Load necessary libraries

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scvi
import anndata

Global seed set to 0


## Load data

In [3]:
# Locate the two splits: Group1 is loaded as the training set, Group2 as the test set.
train_h5ad = os.path.join(dataset_dir, "different samples/CITE-PBMC-Li/Group1.h5ad")
test_h5ad = os.path.join(dataset_dir, "different samples/CITE-PBMC-Li/Group2.h5ad")

train_data = sc.read_h5ad(train_h5ad)
test_data = sc.read_h5ad(test_h5ad)

# Display both objects as a quick sanity check.
train_data, test_data

(AnnData object with n_obs × n_vars = 27329 × 20729
     obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT'
     uns: 'protein_name'
     obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap', 'protein_expression',
 AnnData object with n_obs × n_vars = 26035 × 20729
     obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT'
     uns: 'protein_name'
     obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap', 'protein_expression')

## Combine gene expression data from the training and test sets

In [4]:
# Densify each split's RNA matrix into a DataFrame (rows = cells, columns = genes).
# .toarray() yields a plain ndarray, whereas .todense() returns the discouraged np.matrix type.
train_rna_expression = pd.DataFrame(train_data.X.toarray(), columns=train_data.var.index, index=train_data.obs.index)
test_rna_expression = pd.DataFrame(test_data.X.toarray(), columns=test_data.var.index, index=test_data.obs.index)

# pd.concat aligns columns by name with an outer join, so a differing gene set would
# silently produce NaN-filled columns; fail early instead.
assert set(train_rna_expression.columns) == set(test_rna_expression.columns), \
    "train and test sets do not contain the same genes"

# Stack the training cells first, then the test cells.
rna_expression = pd.concat([train_rna_expression, test_rna_expression], axis=0)
rna_expression

Unnamed: 0,AL627309.1,AL669831.5,LINC00115,FAM41C,NOC2L,KLHL17,PLEKHN1,AL645608.8,HES4,ISG15,...,AC092718.2,AC087742.1,TMEM98,ANGPT4,CFAP61,AC016588.1,FAM83E,Z82244.2,AP001468.1,AP001469.2
L1_AAACGAAAGGCCCAAA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L1_AAAGAACCACCTCTAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L1_AAAGAACGTCGAATTC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L1_AAAGGATAGCTTCGTA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L1_AAAGGGCTCGTACACA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E2L8_TTTGACTGTCAATGGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
E2L8_TTTGACTGTCCCGCAA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
E2L8_TTTGATCAGATGTAGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
E2L8_TTTGGTTTCGTCAACA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Copy the column that holds the batch information (*donor*) to the *batch_index* column

In [5]:
# Stack the per-cell metadata (training cells first), duplicate the donor label into
# `batch_index`, and replace the barcode index with a positional 0..n-1 index.
cells = (
    pd.concat([train_data.obs, test_data.obs], axis=0)
    .assign(batch_index=lambda obs: obs["donor"])
    .reset_index(drop=True)
)
cells

Unnamed: 0,nCount_ADT,nFeature_ADT,nCount_RNA,nFeature_RNA,orig.ident,lane,donor,time,celltype.l1,celltype.l2,celltype.l3,Phase,nCount_SCT,nFeature_SCT,batch_index
0,8753.0,224,6866.0,2000,P4_0,L1,P4,0,Mono,CD14 Mono,CD14 Mono,G1,6092.0,1995,P4
1,6108.0,219,5079.0,1494,P4_0,L1,P4,0,B,B memory,B memory kappa,G1,5324.0,1489,P4
2,4242.0,212,3255.0,966,P4_0,L1,P4,0,other T,MAIT,MAIT,S,4815.0,985,P4
3,7982.0,216,7235.0,2241,P4_0,L1,P4,0,Mono,CD14 Mono,CD14 Mono,G1,6211.0,2239,P4
4,8531.0,217,6135.0,1933,P4_0,L1,P4,0,Mono,CD14 Mono,CD14 Mono,G1,5873.0,1932,P4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53359,5896.0,219,14229.0,3399,P8_0,E2L8,P8,0,Mono,CD14 Mono,CD14 Mono,G1,8419.0,2936,P8
53360,10907.0,219,10551.0,3023,P8_0,E2L8,P8,0,Mono,CD14 Mono,CD14 Mono,G1,8412.0,3005,P8
53361,3760.0,202,5481.0,1833,P8_0,E2L8,P8,0,CD4 T,CD4 TEM,CD4 TEM_1,G1,7061.0,1834,P8
53362,4557.0,205,9015.0,2320,P8_0,E2L8,P8,0,CD4 T,CD4 TEM,CD4 TEM_1,S,8178.0,2319,P8


## Combine protein expression data from the training and test sets

In [6]:
# Densify each split's protein matrix into a DataFrame (rows = cells, columns = protein names).
# .toarray() yields a plain ndarray, whereas .todense() returns the discouraged np.matrix type.
train_protein_expression = pd.DataFrame(train_data.obsm["protein_expression"].toarray(), columns=train_data.uns["protein_name"], index=train_data.obs.index)
test_protein_expression = pd.DataFrame(test_data.obsm["protein_expression"].toarray(), columns=test_data.uns["protein_name"], index=test_data.obs.index)

# pd.concat aligns columns by name with an outer join, so a differing protein panel would
# silently produce NaN-filled columns; fail early instead.
assert set(train_protein_expression.columns) == set(test_protein_expression.columns), \
    "train and test sets do not contain the same proteins"

# Stack the training cells first, then the test cells.
protein_expression = pd.concat([train_protein_expression, test_protein_expression], axis=0)
protein_expression

Unnamed: 0,CD39,Rat-IgG1-1,CD107a,CD62P,TCR-2,CD30,CD31,CD34,CD35,CD36,...,CD169,CD28,CD161,CD163,CD138-1,CD164,CD138-2,CD144,CD202b,CD11c
L1_AAACGAAAGGCCCAAA,34.0,15.0,37.0,357.0,14.0,4.0,360.0,24.0,722.0,12.0,...,5.0,32.0,4.0,14.0,9.0,2.0,18.0,19.0,23.0,70.0
L1_AAAGAACCACCTCTAC,64.0,12.0,26.0,94.0,6.0,3.0,38.0,8.0,940.0,1.0,...,1.0,29.0,6.0,4.0,5.0,48.0,9.0,8.0,12.0,21.0
L1_AAAGAACGTCGAATTC,3.0,5.0,16.0,62.0,22.0,5.0,61.0,6.0,64.0,1.0,...,2.0,42.0,25.0,5.0,5.0,1.0,7.0,5.0,8.0,7.0
L1_AAAGGATAGCTTCGTA,28.0,15.0,31.0,523.0,8.0,2.0,394.0,19.0,1128.0,12.0,...,1.0,26.0,3.0,11.0,9.0,2.0,6.0,12.0,14.0,50.0
L1_AAAGGGCTCGTACACA,50.0,16.0,35.0,161.0,19.0,2.0,411.0,8.0,781.0,6.0,...,4.0,24.0,5.0,7.0,10.0,2.0,11.0,8.0,13.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E2L8_TTTGACTGTCAATGGG,30.0,10.0,58.0,288.0,11.0,2.0,207.0,9.0,459.0,13.0,...,2.0,20.0,5.0,8.0,5.0,8.0,2.0,9.0,4.0,101.0
E2L8_TTTGACTGTCCCGCAA,52.0,6.0,50.0,504.0,11.0,0.0,354.0,12.0,1714.0,31.0,...,2.0,25.0,2.0,8.0,6.0,10.0,8.0,9.0,9.0,177.0
E2L8_TTTGATCAGATGTAGT,2.0,5.0,6.0,57.0,13.0,1.0,20.0,4.0,32.0,1.0,...,2.0,46.0,0.0,2.0,4.0,3.0,8.0,6.0,12.0,7.0
E2L8_TTTGGTTTCGTCAACA,1.0,4.0,6.0,80.0,12.0,0.0,28.0,1.0,41.0,0.0,...,1.0,33.0,2.0,1.0,3.0,4.0,5.0,2.0,6.0,8.0


## Save the test-set protein expression data for evaluation

In [7]:
# Write the test-set protein expression as a tab-separated file
# so it can serve as the reference during evaluation.
raw_protein_path = os.path.join(save_dir, "test_raw_protein_expression.txt")
test_protein_expression.to_csv(raw_protein_path, sep="\t")

## Convert the merged gene expression data and protein expression data to anndata format

In [8]:
# Assemble one AnnData object holding the training cells followed by the test cells.
# Gene annotations come from the training set; `cells` supplies the per-cell metadata.
data = anndata.AnnData(X=rna_expression.values, var=train_data.var, obs=cells)
# Store an explicit copy: `.values` can be a view onto `protein_expression` (a read-only
# one under pandas copy-on-write), and this matrix is overwritten in place later when
# the test-set proteins are masked.
data.obsm["protein_expression"] = protein_expression.to_numpy(copy=True)
# Keep the protein names (training-set order) alongside the matrix.
data.uns["protein_name"] = train_data.uns["protein_name"]
data



AnnData object with n_obs × n_vars = 53364 × 20729
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'batch_index'
    uns: 'protein_name'
    obsm: 'protein_expression'

## Read batch and cell name information

In [9]:
# Barcodes of the test cells, taken from the test-set protein table.
cell_names = test_protein_expression.index
# Batch label of every cell in the merged object.
batch = data.obs["batch_index"].values
batch, cell_names

(array(['P4', 'P4', 'P4', ..., 'P8', 'P8', 'P8'], dtype=object),
 Index(['L1_AAACCCATCTGCGGAC', 'L1_AAACGAAAGTTACTCG', 'L1_AAACGCTAGGTCGTCC',
        'L1_AAAGAACGTATCCTCC', 'L1_AAAGAACTCGCTTAAG', 'L1_AAAGGATAGTTCGCAT',
        'L1_AAAGGATCAATTAGGA', 'L1_AAAGGATCACCGTGGT', 'L1_AAAGGGCTCCTCTTTC',
        'L1_AAAGGTATCTTCCAGC',
        ...
        'E2L8_TTTCATGTCCGGCAAC', 'E2L8_TTTCCTCCACCAATTG',
        'E2L8_TTTCCTCGTATGTCCA', 'E2L8_TTTCGATAGATAGTCA',
        'E2L8_TTTGACTGTAGTTACC', 'E2L8_TTTGACTGTCAATGGG',
        'E2L8_TTTGACTGTCCCGCAA', 'E2L8_TTTGATCAGATGTAGT',
        'E2L8_TTTGGTTTCGTCAACA', 'E2L8_TTTGTTGGTTAGTTCG'],
       dtype='object', length=26035))

## Mask the protein expression data of the test set

In [10]:
# Rows from this offset onwards are the test cells (the training cells were stacked first).
n_train_cells = train_protein_expression.shape[0]
# Overwrite the test cells' protein measurements with zeros, in place.
masked_block = np.zeros(test_protein_expression.shape)
data.obsm["protein_expression"][n_train_cells:, :] = masked_block

## Select highly variable genes

In [11]:
# Select the top 4000 highly variable genes with the "seurat_v3" flavor, using
# `batch_index` as the batch key; subset=True restricts `data` to those genes in place.
sc.pp.highly_variable_genes(
    data,
    n_top_genes=4000,
    flavor="seurat_v3",
    batch_key="batch_index",
    subset=True,
)

## Register the data with scvi and create the TOTALVI model

In [12]:
# Tell scvi which obs column holds the batch labels and which obsm entry holds the proteins.
scvi.model.TOTALVI.setup_anndata(
    data,
    protein_expression_obsm_key="protein_expression",
    batch_key="batch_index",
)
# Build the TOTALVI model with a normal latent distribution and a two-layer decoder.
model = scvi.model.TOTALVI(
    data,
    n_layers_decoder=2,
    latent_distribution="normal",
)
model

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


[34mINFO    [0m Generating sequential column names                                                                        
[34mINFO    [0m Found batches with missing protein expression                                                             
[34mINFO    [0m Computing empirical prior initialization for protein background.                                          






## Train model

In [13]:
# Fit the TOTALVI model; no training options are overridden, so the library defaults apply.
model.train()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 150/150: 100%|███████████████████████████████████████████████████████████████████████| 150/150 [17:28<00:00,  8.62s/it, loss=602, v_num=1]

`Trainer.fit` stopped: `max_epochs=150` reached.


Epoch 150/150: 100%|███████████████████████████████████████████████████████████████████████| 150/150 [17:28<00:00,  6.99s/it, loss=602, v_num=1]


## Impute protein expression data for the test set
### The *transform_batch* parameter lists the batches of the training dataset

In [14]:
# Batches passed as `transform_batch` for the prediction.
train_batches = ['P4', 'P5', 'P6', 'P7']
# Only the protein part of the returned pair is used; the gene part is discarded.
_, protein_means = model.get_normalized_expression(n_samples=25, transform_batch=train_batches, include_protein_background=True,
                                                   sample_protein_mixing=False, return_mean=True)

# Rows follow the order of `data`: training cells first, then test cells.
n_train_cells = train_protein_expression.shape[0]
# Label the columns with the protein order the model was trained on (`data.uns["protein_name"]`,
# taken from the training set) rather than the test set's own list, so the labels stay
# correct even if the two lists are ordered differently.
predicted_protein = pd.DataFrame(protein_means.iloc[n_train_cells:, :].values, index=test_data.obs.index,
                                 columns=data.uns["protein_name"])
predicted_protein

Unnamed: 0,CD39,Rat-IgG1-1,CD107a,CD62P,TCR-2,CD30,CD31,CD34,CD35,CD36,...,CD169,CD28,CD161,CD163,CD138-1,CD164,CD138-2,CD144,CD202b,CD11c
L1_AAACCCATCTGCGGAC,3.217982,5.863663,9.637951,82.822777,15.601016,1.816032,30.650845,4.484689,54.967255,0.975970,...,1.527652,51.151249,1.688432,2.624852,4.319225,2.200038,5.452486,5.449646,5.818207,8.080776
L1_AAACGAAAGTTACTCG,3.083364,5.819772,9.515879,72.649200,17.688305,1.936408,29.147621,4.280104,83.723297,0.866827,...,1.542411,58.424171,5.302394,2.555518,4.083919,2.027368,5.511385,5.393757,5.930943,7.912839
L1_AAACGCTAGGTCGTCC,3.201144,5.773730,9.729861,78.907166,17.825184,1.990232,53.160816,4.628013,40.293346,0.871598,...,1.566572,20.297432,2.625561,2.652480,4.234827,1.882636,5.301095,5.418642,5.528088,7.936379
L1_AAAGAACGTATCCTCC,3.059830,5.973596,10.148655,85.112793,17.461294,2.115765,58.004238,4.656831,45.555054,0.997884,...,1.522672,33.279144,1.437754,2.721764,4.353410,1.855301,5.496924,5.161040,6.480829,7.905310
L1_AAAGAACTCGCTTAAG,3.243099,5.816129,10.336527,96.310860,19.872995,1.946483,31.975124,4.615089,56.014091,1.102726,...,1.555121,58.437973,3.460444,2.697168,3.923714,2.400779,5.536910,5.373223,6.142107,8.106462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E2L8_TTTGACTGTCAATGGG,26.614948,8.608111,32.375240,364.620300,10.338601,2.683698,254.997223,9.032499,673.731384,11.834036,...,8.514499,16.453428,1.965212,8.065777,6.753254,5.629142,9.667010,7.243454,8.965612,62.811989
E2L8_TTTGACTGTCCCGCAA,31.302197,8.379686,43.145729,545.026855,9.975262,2.738781,294.238251,10.667911,622.237610,14.211056,...,11.074026,15.872723,1.972059,8.625648,6.941046,7.401553,9.740988,7.463363,8.811541,76.052307
E2L8_TTTGATCAGATGTAGT,3.081809,5.914779,9.617463,78.453369,17.102219,1.874242,30.391136,4.215155,42.881012,0.838787,...,1.543136,52.348801,8.818273,2.657289,4.659856,1.855326,5.543586,5.953833,6.009220,7.912551
E2L8_TTTGGTTTCGTCAACA,3.117471,5.812196,10.144927,82.659950,18.376884,1.961650,29.179825,4.395793,57.019566,0.914505,...,1.517572,62.046104,6.286384,2.648449,4.627547,1.954712,5.709293,6.343831,6.386444,7.995493


## Save prediction and trained model

In [15]:
# Write the imputed test-set proteins as a tab-separated file.
prediction_path = os.path.join(save_dir, "test_protein_prediction.txt")
predicted_protein.to_csv(prediction_path, sep="\t")

# Store the trained model (together with its AnnData) in the same folder,
# replacing any previous save.
model.save(
    dir_path=save_dir,
    overwrite=True,
    save_anndata=True,
)