In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from matplotlib import pyplot

from copy import deepcopy

from time import time

from math import ceil
from scipy.stats import spearmanr, gamma, poisson

from anndata import AnnData, read_h5ad
import scanpy as sc
from scanpy import read
import pandas as pd
from scipy.io import mmread
from sciPENN.sciPENN_API import sciPENN_API

In [2]:
# PBMC reference data (used further down as the training set):
# protein measurements first, then gene expression.
adata_protein, adata_gene = (
    sc.read_h5ad(h5ad_path)
    for h5ad_path in (
        '../Data/pbmc/pbmc_protein.h5ad',
        '../Data/pbmc/pbmc_gene.h5ad',
    )
)

In [3]:
# Load the MALT 10x feature-barcode matrix from its HDF5 export.
# NOTE(review): presumably only the gene-expression features are kept by
# `read_10x_h5` here (the antibody counts are read separately below) — confirm.
adata_malt_gene = sc.read_10x_h5("../Data/malt_10k_protein_v3_filtered_feature_bc_matrix.h5")

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [4]:
# Full MALT count matrix from the Matrix Market export, transposed after
# reading (presumably so that rows are cells — TODO confirm the file layout).
adata_malt = sc.read(
    "../Data/filtered_feature_bc_matrix/matrix.mtx"
).transpose()

# Headerless, tab-separated feature table that accompanies the matrix.
malt_features = pd.read_csv(
    "../Data/filtered_feature_bc_matrix/features.tsv",
    sep="\t",
    header=None,
)

In [5]:
# Cell names are borrowed from the 10x h5 object.
# NOTE(review): assumes both files list cells in the same order — confirm.
adata_malt.obs_names = adata_malt_gene.obs_names

# Column 0 of the feature table is used as the feature name,
# column 2 as the feature type.
feature_ids = list(malt_features[0])
adata_malt.var["feature_type"] = list(malt_features[2])
adata_malt.var['protein_names'] = feature_ids
adata_malt.var_names = feature_ids

In [6]:
# Keep only the antibody-derived features (a view onto `adata_malt`).
is_antibody = adata_malt.var['feature_type'] == 'Antibody Capture'
adata_malt_protein = adata_malt[:, is_antibody]

In [7]:
# Resolve the duplicated variable names that the loader warned about.
adata_malt_gene.var_names_make_unique()

In [8]:
# Independent copy of the MALT gene data; passed to sciPENN as the test set below.
adata_gene_test = adata_malt_gene.copy()

In [9]:
# Materialize the antibody subset so later in-place edits do not touch `adata_malt`.
adata_protein_test = adata_malt_protein.copy()

# All MALT cells are labelled as sample 1. The label vector is sized from the
# data itself rather than a hard-coded cell count, so this cell keeps working
# if the input file or any upstream filtering changes the number of cells.
adata_protein_test.obs['sample'] = [1] * adata_protein_test.n_obs
adata_protein_test

AnnData object with n_obs × n_vars = 8412 × 17
    obs: 'sample'
    var: 'feature_type', 'protein_names'

In [10]:
# Proteins present in both panels, listed in the order of the training panel.
ref = set(adata_protein_test.var.index)
prots = [name for name in adata_protein.var.index if name in ref]

prots

['CD19',
 'CD45RA',
 'CD8a',
 'CD14',
 'CD25',
 'CD45RO',
 'TIGIT',
 'CD127',
 'CD15',
 'CD16']

In [11]:
# Ratio of the mean protein signal (test over training) on the shared proteins.
test_mean = adata_protein_test[:, prots].X.mean()
train_mean = adata_protein[:, prots].X.mean()
test_mean / train_mean

4.0437083

In [12]:
# Sorted feature names shared by the training and test objects,
# computed for genes first and proteins second.
common_genes, common_proteins = (
    np.intersect1d(train_adata.var.index, test_adata.var.index)
    for train_adata, test_adata in (
        (adata_gene, adata_gene_test),
        (adata_protein, adata_protein_test),
    )
)

In [13]:
# Build the sciPENN model from the PBMC reference (gene + protein) and the
# MALT gene data as test input; 'donor' is the batch key for the training set.
sciPENN = sciPENN_API(
    [adata_gene],
    [adata_protein],
    adata_gene_test,
    train_batchkeys = ['donor'],
)

Searching for GPU
GPU detected, using GPU

QC Filtering Training Cells
QC Filtering Testing Cells

QC Filtering Training Genes
QC Filtering Testing Genes

Normalizing Training Cells
Normalizing Testing Cells

Log-Normalizing Training Data
Log-Normalizing Testing Data

Finding HVGs


... storing 'orig.ident' as categorical
... storing 'lane' as categorical
... storing 'donor' as categorical
... storing 'time' as categorical
... storing 'celltype.l1' as categorical
... storing 'celltype.l2' as categorical
... storing 'celltype.l3' as categorical
... storing 'Phase' as categorical
... storing 'batch' as categorical
... storing 'Dataset' as categorical
... storing 'feature_types-1' as categorical
... storing 'genome-1' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)



Normalizing Gene Training Data by Batch


100%|██████████| 8/8 [00:05<00:00,  1.51it/s]



Normalizing Protein Training Data by Batch


100%|██████████| 8/8 [00:02<00:00,  3.16it/s]



Normalizing Gene Testing Data by Batch


100%|██████████| 1/1 [00:00<00:00,  5.27it/s]


In [14]:
# Wall-clock timing covers training plus prediction; the elapsed seconds are
# the cell's displayed value.
start = time()

train_options = dict(
    n_epochs = 10000,
    ES_max = 12,
    decay_max = 6,
    decay_step = 0.1,
    lr = 10**(-3),
    weights_dir = "weights_dir/pbmc_to_malt",
)
sciPENN.train(**train_options)

imputed_test = sciPENN.predict()
time() - start

Epoch 0 prediction loss = 1.393
Epoch 1 prediction loss = 0.883
Epoch 2 prediction loss = 0.874
Epoch 3 prediction loss = 0.873
Epoch 4 prediction loss = 0.870
Epoch 5 prediction loss = 0.868
Epoch 6 prediction loss = 0.867
Epoch 7 prediction loss = 0.867
Epoch 8 prediction loss = 0.864
Epoch 9 prediction loss = 0.864
Epoch 10 prediction loss = 0.865
Decaying loss to 0.0001
Epoch 11 prediction loss = 0.854
Epoch 12 prediction loss = 0.854
Epoch 13 prediction loss = 0.853
Epoch 14 prediction loss = 0.853
Epoch 15 prediction loss = 0.854
Epoch 16 prediction loss = 0.853
Decaying loss to 1e-05
Epoch 17 prediction loss = 0.852
Epoch 18 prediction loss = 0.852
Epoch 19 prediction loss = 0.852
Epoch 20 prediction loss = 0.852
Epoch 21 prediction loss = 0.851
Epoch 22 prediction loss = 0.854
Decaying loss to 1.0000000000000002e-06
Epoch 23 prediction loss = 0.852


517.3066282272339

In [15]:
# Get the embedding from the trained model and save it to disk.
embedding = sciPENN.embed()
embedding.write("scipenn_maltembedding.h5ad")

... storing 'batch' as categorical


In [16]:
def corr2_coeff(A, B, pearson = True):
    """Correlate each row of ``A`` with the matching row of ``B``.

    Parameters
    ----------
    A, B : 2-D array-like (numpy arrays)
        Arrays of identical shape ``(n_rows, n_cols)``; row ``i`` of ``A`` is
        paired with row ``i`` of ``B``.
    pearson : bool, default True
        If True compute Pearson correlations, otherwise Spearman rank
        correlations via ``scipy.stats.spearmanr``.

    Returns
    -------
    Pearson: 1-D numpy array of length ``n_rows``.
    Spearman: list of length ``n_rows``.

    Notes
    -----
    In the Pearson branch a row with zero variance yields ``nan`` (0/0), which
    callers can remove with ``dropna``.
    """
    if pearson:
        # Row-wise mean of the input arrays, subtracted from the arrays themselves
        A_mA = A - A.mean(1)[:, None]
        B_mB = B - B.mean(1)[:, None]

        # Sum of squares across rows
        ssA = (A_mA**2).sum(1)
        ssB = (B_mB**2).sum(1)

        # Only the paired (diagonal) correlations are needed, so compute them
        # directly row by row instead of building the full n_rows x n_rows
        # correlation matrix and discarding everything off the diagonal.
        return (A_mA * B_mB).sum(1) / np.sqrt(ssA * ssB)

    else:
        # spearmanr returns (correlation, p-value); keep the correlation only.
        return [spearmanr(A[i], B[i])[0] for i in range(A.shape[0])]

In [17]:
# Prepare the measured MALT protein values so they can be compared with the
# sciPENN predictions in `imputed_test`.

# NOTE(review): `.toarray()` presumably densifies a sparse matrix; re-running
# this cell once X is already dense would then fail — confirm before re-running.
adata_protein_test.X = adata_protein_test.X.toarray()
# Stash the untransformed values under "raw" before X is normalized below.
adata_protein_test.layers["raw"] = adata_protein_test.X

# Restrict to the cells that appear in the prediction output, in its order.
adata_protein_test = adata_protein_test[imputed_test.obs.index]

# Total-count normalize, log1p-transform, then drop features below the count
# threshold. The filter runs after the transforms, so `min_counts` is evaluated
# on the transformed values.
sc.pp.normalize_total(adata_protein_test)
sc.pp.log1p(adata_protein_test)
sc.pp.filter_genes(adata_protein_test, min_counts = 1)

# Proteins available both as predictions and as measurements (sorted names).
common_proteins = np.intersect1d(imputed_test.var.index, adata_protein_test.var.index)

# Align the measured data to the shared proteins, then attach the predictions
# as the 'imputed' layer together with every layer carried by `imputed_test`
# (the coverage cells later read 'q10', 'q25', 'q75' and 'q90' from here).
adata_protein_test = adata_protein_test[:, common_proteins]
adata_protein_test.layers['imputed'] = imputed_test[:, common_proteins].X
adata_protein_test.layers.update(imputed_test[:, common_proteins].layers)

# Distinct sample labels; this notebook assigned the label 1 to every cell
# earlier, so the loop below runs once.
patients = np.unique(adata_protein_test.obs['sample'].values)

for patient in patients:
    # Boolean mask selecting the cells of the current sample.
    indices = [x == patient for x in adata_protein_test.obs['sample']]
    sub_adata = adata_protein_test[indices]

    # Scale within the sample and write the scaled X back into the full object.
    # NOTE(review): only X is written back; layers appear to be left as they
    # were — confirm this is intended.
    sc.pp.scale(sub_adata)
    adata_protein_test[indices] = sub_adata.X

  view_to_actual(adata)
  view_to_actual(adata)


In [18]:
# Per-protein Pearson correlation between predicted and measured values.
# Both matrices are transposed so that each row handed to corr2_coeff is one protein.
per_protein_r = corr2_coeff(
    adata_protein_test.layers["imputed"].T,
    adata_protein_test.X.T,
)

# One row per protein; proteins whose correlation is NaN are dropped.
corrs = pd.DataFrame(per_protein_r, index = adata_protein_test.var.index).dropna()
corrs

Unnamed: 0,0
CD127,0.524693
CD14,0.265639
CD15,0.447974
CD16,0.372127
CD19,0.623805
CD25,0.187476
CD45RA,0.698331
CD45RO,0.721276
CD8a,0.731953
TIGIT,0.474912


In [19]:
# Average of the per-protein correlations.
corrs.mean()

0    0.504819
dtype: float32

In [20]:
# Persist the per-protein correlations (the target directory must already exist).
corrs.to_csv('corrs_results/scipenn_malt.csv')

In [21]:
def sq(x, y):
    """Return the element-wise squared difference of ``x`` and ``y``."""
    return (x - y)**2


# Per-protein mean squared error between predicted and measured values
# (averaged over cells, i.e. along axis 0).
sqs = pd.DataFrame(
    sq(adata_protein_test.layers["imputed"], adata_protein_test.X).mean(axis = 0),
    index = adata_protein_test.var.index,
)
sqs

Unnamed: 0,0
CD127,0.773271
CD14,1.04364
CD15,0.815751
CD16,0.885526
CD19,0.64997
CD25,1.299394
CD45RA,0.513036
CD45RO,0.529429
CD8a,0.466745
TIGIT,0.799552


In [22]:
# Average of the per-protein mean squared errors.
sqs.mean()

0    0.777631
dtype: float32

In [23]:
# Persist the per-protein MSE values (the target directory must already exist).
sqs.to_csv('mse_results/scipenn_malt.csv')

In [24]:
# Empirical coverage of the q25-q75 prediction interval (nominal 50%).
# The names r95/l95 are historical: here they are the upper- and lower-bound
# checks of the 50% interval, not of a 95% interval.
l95 = (adata_protein_test.X > adata_protein_test.layers['q25'])
r95 = (adata_protein_test.X < adata_protein_test.layers['q75'])

# A value is covered when it lies strictly inside both bounds.
coverage = np.logical_and(r95, l95).mean()
print(f"Effective Coverage Probability for Nominal 50% PI: {coverage:.3f}")

Effective Coverage Probability for Nominal 50% PI: 0.395


In [25]:
# Empirical coverage of the q10-q90 prediction interval (nominal 80%).
# The names r95/l95 are historical: here they are the upper- and lower-bound
# checks of the 80% interval, not of a 95% interval.
l95 = (adata_protein_test.X > adata_protein_test.layers['q10'])
r95 = (adata_protein_test.X < adata_protein_test.layers['q90'])

# A value is covered when it lies strictly inside both bounds.
coverage = np.logical_and(r95, l95).mean()
print(f"Effective Coverage Probability for Nominal 80% PI: {coverage:.3f}")

Effective Coverage Probability for Nominal 80% PI: 0.672


In [26]:
# Save the measured protein data together with the attached prediction layers.
adata_protein_test.write("scipenn_maltfeatures.h5ad")

... storing 'feature_type' as categorical
