In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from matplotlib import pyplot
import os
from copy import deepcopy

from time import time

from math import ceil
from scipy.stats import spearmanr, gamma, poisson

from anndata import AnnData, read_h5ad
import scanpy as sc
from scanpy import read
import pandas as pd

from torch.utils.data import DataLoader, TensorDataset
from torch import tensor
from torch.cuda import is_available

from sciPENN.sciPENN_API import sciPENN_API

In [3]:
"""Read in Raw Data"""

# Training data: PBMC gene and protein matrices stored as .h5ad files.
adata_gene = sc.read("../Data/pbmc/pbmc_gene.h5ad")
adata_protein = sc.read("../Data/pbmc/pbmc_protein.h5ad")

# Test data: H1N1 gene matrix, transposed after loading. Gene names and
# per-cell metadata live in separate text files.
adata_gene_test = sc.read("../Data/H1N1/gene_data.mtx").T
h1n1_gene_names = pd.read_csv("../Data/H1N1/gene_names.txt", index_col = 0)
adata_gene_test.var.index = h1n1_gene_names.iloc[:, 0]
h1n1_meta = pd.read_csv("../Data/H1N1/meta_data.txt", sep = ',', index_col = 0)
adata_gene_test.obs = h1n1_meta

# Batches are keyed by 'donor' in the training set and by 'sample' in the test set.
sciPENN = sciPENN_API(
    [adata_gene],
    [adata_protein],
    adata_gene_test,
    train_batchkeys = ['donor'],
    test_batchkey = 'sample',
)

Searching for GPU
GPU detected, using GPU

QC Filtering Training Cells
QC Filtering Testing Cells

QC Filtering Training Genes
QC Filtering Testing Genes

Normalizing Training Cells
Normalizing Testing Cells

Log-Normalizing Training Data
Log-Normalizing Testing Data

Finding HVGs


... storing 'orig.ident' as categorical
... storing 'lane' as categorical
... storing 'donor' as categorical
... storing 'time' as categorical
... storing 'celltype.l1' as categorical
... storing 'celltype.l2' as categorical
... storing 'celltype.l3' as categorical
... storing 'Phase' as categorical
... storing 'batch' as categorical
... storing 'Dataset' as categorical
... storing 'barcode_check' as categorical
... storing 'tenx_lane' as categorical
... storing 'cohort' as categorical
... storing 'hash_maxID' as categorical
... storing 'hash_secondID' as categorical
... storing 'hto_classification' as categorical
... storing 'hto_classification_global' as categorical
... storing 'hash_ID' as categorical
... storing 'adjmfc.time' as categorical
... storing 'DMX_GLOBAL_BEST' as categorical
... storing 'DEMUXLET.BARCODE' as categorical
... storing 'sample' as categorical
... storing 'joint_classification_global' as categorical
... storing 'timepoint' as categorical
... storing 'K0' as ca


Normalizing Gene Training Data by Batch


100%|██████████| 8/8 [00:05<00:00,  1.44it/s]



Normalizing Protein Training Data by Batch


100%|██████████| 8/8 [00:02<00:00,  3.24it/s]



Normalizing Gene Testing Data by Batch


100%|██████████| 20/20 [00:01<00:00, 17.34it/s]


In [4]:
# Load the H1N1 protein matrix (transposed after reading) plus the shared cell metadata.
# NOTE(review): the same three statements are repeated verbatim in the later
# "Get test data" cell, and no cell in between reads `adata_protein_test`, so
# this cell looks redundant — confirm before removing.
adata_protein_test = sc.read("../Data/H1N1/protein_data.mtx").T
# Trim the trailing five characters from every protein name (presumably a fixed
# suffix; assumes every name is longer than five characters — TODO confirm).
adata_protein_test.var.index = [x[:len(x) - 5] for x in pd.read_csv("../Data/H1N1/protein_names.txt", index_col = 0).iloc[:,0]]
adata_protein_test.obs = pd.read_csv("../Data/H1N1/meta_data.txt", sep = ',', index_col = 0)

In [6]:
# Train the model. The epoch cap is far above what actually runs: going by the
# log printed below, training stops after a few learning-rate decays.
# ES_max / decay_max are presumably the patience settings for early stopping and
# for decaying the learning rate, decay_step the multiplicative decay factor, and
# weights_dir the location used for model weights — TODO confirm against the
# sciPENN documentation.
sciPENN.train(n_epochs = 10000, ES_max = 12, decay_max = 6, 
             decay_step = 0.1, lr = 10**(-3), weights_dir = "weights_dir/pbmc_to_h1n1")

Epoch 0 prediction loss = 1.398
Epoch 1 prediction loss = 0.886
Epoch 2 prediction loss = 0.880
Epoch 3 prediction loss = 0.877
Epoch 4 prediction loss = 0.874
Epoch 5 prediction loss = 0.873
Epoch 6 prediction loss = 0.873
Epoch 7 prediction loss = 0.871
Epoch 8 prediction loss = 0.870
Epoch 9 prediction loss = 0.870
Epoch 10 prediction loss = 0.869
Epoch 11 prediction loss = 0.870
Epoch 12 prediction loss = 0.867
Epoch 13 prediction loss = 0.868
Decaying loss to 0.0001
Epoch 14 prediction loss = 0.858
Epoch 15 prediction loss = 0.857
Epoch 16 prediction loss = 0.857
Epoch 17 prediction loss = 0.856
Epoch 18 prediction loss = 0.857
Epoch 19 prediction loss = 0.857
Decaying loss to 1e-05
Epoch 20 prediction loss = 0.856
Epoch 21 prediction loss = 0.856
Epoch 22 prediction loss = 0.856
Epoch 23 prediction loss = 0.856
Epoch 24 prediction loss = 0.856
Epoch 25 prediction loss = 0.855
Decaying loss to 1.0000000000000002e-06
Epoch 26 prediction loss = 0.856


In [7]:
# Run prediction on the test set. Later cells read `.X`, `.obs`, `.var` and
# `.layers` (including 'q10', 'q25', 'q75', 'q90') from the returned object.
imputed_test = sciPENN.predict()

In [8]:
# Fetch the object returned by sciPENN.embed() and write it to an .h5ad file
# in the current working directory.
embedding = sciPENN.embed()
embedding.write("scipenn_pbmctoh1n1embedding.h5ad")

... storing 'orig.ident' as categorical
... storing 'batch' as categorical


In [9]:
"""Get test data"""

# Load the measured H1N1 protein matrix (transposed after reading) and the cell metadata.
adata_protein_test = sc.read("../Data/H1N1/protein_data.mtx").T
# Trim the trailing five characters from every protein name (presumably a fixed
# suffix; assumes every name is longer than five characters — TODO confirm).
adata_protein_test.var.index = [x[:len(x) - 5] for x in pd.read_csv("../Data/H1N1/protein_names.txt", index_col = 0).iloc[:,0]]
adata_protein_test.obs = pd.read_csv("../Data/H1N1/meta_data.txt", sep = ',', index_col = 0)

# Densify, then keep an independent copy of the untouched values in the "raw"
# layer. Without the explicit copy the layer and X would share one buffer and
# the in-place normalisation below could silently modify the "raw" values.
adata_protein_test.X = adata_protein_test.X.toarray()
adata_protein_test.layers["raw"] = adata_protein_test.X.copy()

# Restrict to the cells present in the prediction output. The explicit copy
# turns the view into a real AnnData up front instead of relying on scanpy's
# implicit (and warning-emitting) view-to-actual conversion.
adata_protein_test = adata_protein_test[imputed_test.obs.index].copy()

sc.pp.normalize_total(adata_protein_test)
sc.pp.log1p(adata_protein_test)

# Only proteins present in both the measured and the imputed data can be compared.
common_proteins = np.intersect1d(imputed_test.var.index, adata_protein_test.var.index)

adata_protein_test = adata_protein_test[:, common_proteins].copy()
imputed_common = imputed_test[:, common_proteins]
adata_protein_test.layers['imputed'] = imputed_common.X
# Carry over every layer of the prediction output (later cells read 'q10', 'q25', 'q75', 'q90').
adata_protein_test.layers.update(imputed_common.layers)

patients = np.unique(adata_protein_test.obs['sample'].values)

# Scale the measured values separately within each patient (sample).
for patient in patients:
    patient_mask = (adata_protein_test.obs['sample'] == patient).to_numpy()
    sub_adata = adata_protein_test[patient_mask].copy()

    sc.pp.scale(sub_adata)
    adata_protein_test[patient_mask] = sub_adata.X

  view_to_actual(adata)
  view_to_actual(adata)


In [10]:
def corr2_coeff(A, B, pearson = True):
    """Correlate matching rows of two equally shaped 2-D arrays.

    Row ``i`` of ``A`` is correlated with row ``i`` of ``B``.

    Parameters
    ----------
    A, B : array-like, shape (n_rows, n_cols)
        Paired observations; both inputs must have the same shape.
    pearson : bool, default True
        If True compute Pearson correlations, otherwise Spearman rank
        correlations via ``scipy.stats.spearmanr``.

    Returns
    -------
    numpy.ndarray of shape (n_rows,) when ``pearson`` is True, otherwise a
    list of ``n_rows`` floats. A row with zero variance yields NaN.

    Raises
    ------
    ValueError
        If ``A`` and ``B`` do not have the same shape.
    """
    A = np.asarray(A)
    B = np.asarray(B)

    if A.shape != B.shape:
        raise ValueError(f"A and B must have the same shape, got {A.shape} and {B.shape}")

    if pearson:
        # Centre every row on its own mean.
        A_mA = A - A.mean(1)[:, None]
        B_mB = B - B.mean(1)[:, None]

        # Row-wise sums of squares.
        ssA = (A_mA**2).sum(1)
        ssB = (B_mB**2).sum(1)

        # Only the matched (row i, row i) pairs are needed, so compute them
        # directly instead of building the full n_rows x n_rows correlation
        # matrix and discarding everything but its diagonal.
        return (A_mA * B_mB).sum(1) / np.sqrt(ssA * ssB)

    return [spearmanr(A[i], B[i])[0] for i in range(A.shape[0])]

In [11]:
"""Compute correlation across patients"""

# After the transposes every row is one protein, so each imputed protein is
# correlated with its measured counterpart over all cells. Proteins whose
# correlation is undefined (NaN) are dropped.
per_protein_corr = corr2_coeff(adata_protein_test.layers["imputed"].T, adata_protein_test.X.T)
corrs = pd.DataFrame(per_protein_corr, index = adata_protein_test.var.index).dropna()

In [12]:
# Per-protein root-mean-squared error between measured and imputed values.
# The variable keeps its existing name `MSEs`, but note the square root: these are RMSEs.
squared_error = (adata_protein_test.X - adata_protein_test.layers["imputed"])**2
MSEs = squared_error.mean(axis = 0)**(1/2)

# Mean of the values stored in the "raw" layer, one row per protein.
raw_means = adata_protein_test.layers["raw"].mean(axis = 0, keepdims = True).T

# `corrs` had its NaN rows dropped, so it can be shorter than the per-protein
# arrays above. Restrict those arrays to the proteins still present in `corrs`;
# otherwise the concatenation fails with a shape mismatch.
kept = adata_protein_test.var.index.isin(corrs.index)

protein_table = pd.DataFrame(
    np.concatenate((corrs.to_numpy(), np.expand_dims(MSEs, axis = 1)[kept], raw_means[kept]), axis = 1),
    index = corrs.index,
    columns = ["Correlations", "RMSE", "Mean Expression"],
)

protein_table["Log-Mean Expression"] = np.log(protein_table["Mean Expression"])

In [13]:
def sq(x, y):
    """Return the element-wise squared difference ``(x - y)**2``."""
    return (x - y)**2

In [14]:
"""Compute correlations within patient"""

sample_labels = adata_protein_test.obs["sample"]
patient_ids = np.unique(sample_labels)

# One row per protein, one column per patient.
corrs_table = np.zeros((adata_protein_test.shape[1], len(patient_ids)))
sq_table = corrs_table.copy()

for col, patient in enumerate(patient_ids):
    in_patient = sample_labels == patient
    truth = adata_protein_test[in_patient].X.copy()
    imputed = adata_protein_test.layers["imputed"][in_patient].copy()

    # Transpose so that each row handed to corr2_coeff is one protein.
    corrs_table[:, col] = corr2_coeff(truth.T, imputed.T)
    sq_table[:, col] = sq(truth, imputed).mean(axis = 0)

# Undefined correlations are recorded as 0.
corrs_table[np.isnan(corrs_table)] = 0

In [15]:
# Attach labels to both result arrays: proteins on the rows, patients on the columns.
protein_labels = adata_protein_test.var.index
patient_labels = np.unique(adata_protein_test.obs["sample"])

corrs_table = pd.DataFrame(corrs_table, index = protein_labels, columns = patient_labels)
sq_table = pd.DataFrame(sq_table, index = protein_labels, columns = patient_labels)

In [16]:
# Mean correlation per patient (column), averaged over all proteins (rows).
corrs_table.mean(axis = 0)

200_d0    0.551671
201_d0    0.522775
205_d0    0.523219
207_d0    0.513245
209_d0    0.534473
212_d0    0.523326
215_d0    0.544069
229_d0    0.507505
233_d0    0.507746
234_d0    0.552363
236_d0    0.507167
237_d0    0.518251
245_d0    0.520775
250_d0    0.520766
256_d0    0.556987
261_d0    0.518480
268_d0    0.517075
273_d0    0.483878
277_d0    0.533272
279_d0    0.523227
dtype: float64

In [17]:
# Overall correlation: the average of the per-patient column means.
corrs_table.mean().mean()

0.52401347896025

In [18]:
# Create the output folder first so the export cannot fail on a fresh checkout.
os.makedirs('corrs_results', exist_ok = True)
corrs_table.to_csv('corrs_results/scipenn_pbmctoh1n1.csv')

In [19]:
# Mean squared error per patient (column), averaged over all proteins (rows).
sq_table.mean(axis = 0)

200_d0    0.639866
201_d0    0.677758
205_d0    0.670409
207_d0    0.675455
209_d0    0.639448
212_d0    0.645429
215_d0    0.642354
229_d0    0.677762
233_d0    0.679205
234_d0    0.612601
236_d0    0.691243
237_d0    0.683056
245_d0    0.678042
250_d0    0.661990
256_d0    0.620139
261_d0    0.671863
268_d0    0.663950
273_d0    0.719429
277_d0    0.649014
279_d0    0.669926
dtype: float64

In [20]:
# Overall mean squared error: the average of the per-patient column means.
sq_table.mean().mean()

0.6634469570371054

In [21]:
# Create the output folder first so the export cannot fail on a fresh checkout.
os.makedirs('mse_results', exist_ok = True)
sq_table.to_csv('mse_results/scipenn_pbmctoh1n1.csv')

In [22]:
# Empirical coverage of the nominal 50% prediction interval: the fraction of
# measured values lying strictly between the 'q25' and 'q75' layers.
below_q75 = adata_protein_test.X < adata_protein_test.layers['q75']
above_q25 = adata_protein_test.X > adata_protein_test.layers['q25']

print(f"Effective Coverage Probability for Nominal 50% PI: {(below_q75 & above_q25).mean():.3f}")

Effective Coverage Probability for Nominal 50% PI: 0.473


In [23]:
# Empirical coverage of the nominal 80% prediction interval: the fraction of
# measured values lying strictly between the 'q10' and 'q90' layers.
below_q90 = adata_protein_test.X < adata_protein_test.layers['q90']
above_q10 = adata_protein_test.X > adata_protein_test.layers['q10']

print(f"Effective Coverage Probability for Nominal 80% PI: {(below_q90 & above_q10).mean():.3f}")

Effective Coverage Probability for Nominal 80% PI: 0.776


In [24]:
# Write the evaluation object (scaled measured values in X plus the "raw",
# 'imputed' and quantile layers) to an .h5ad file in the working directory.
adata_protein_test.write("scipenn_pbmctoh1n1features.h5ad")

... storing 'orig.ident' as categorical
... storing 'tenx_lane' as categorical
... storing 'cohort' as categorical
... storing 'hash_maxID' as categorical
... storing 'hash_secondID' as categorical
... storing 'hto_classification' as categorical
... storing 'hto_classification_global' as categorical
... storing 'hash_ID' as categorical
... storing 'adjmfc.time' as categorical
... storing 'DMX_GLOBAL_BEST' as categorical
... storing 'DEMUXLET.BARCODE' as categorical
... storing 'sample' as categorical
... storing 'joint_classification_global' as categorical
... storing 'timepoint' as categorical
... storing 'K0' as categorical
... storing 'K1' as categorical
... storing 'K2' as categorical
... storing 'K3' as categorical
