In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from matplotlib import pyplot
import os
from copy import deepcopy

from time import time

from math import ceil
from scipy.stats import spearmanr, gamma, poisson, norm
from scipy.sparse import csc_matrix

from anndata import AnnData, read_h5ad
import scanpy as sc
from scanpy import read
import pandas as pd
import seaborn as sns

from torch.utils.data import DataLoader, TensorDataset
from torch import tensor
from torch.cuda import is_available

from sciPENN.sciPENN_API import sciPENN_API

In [2]:
# Load the dataset that is split into the *_test gene/protein objects a few cells below.
adata_covid19 = read_h5ad("../Data/immunodeficiency_covid_PMBCs.h5ad")

In [3]:
# Apply round(2**x - 1) to every entry of X.
# NOTE(review): this presumably inverts a stored log2(count + 1) transform — confirm against the data source.
# `toarray()` densifies the whole matrix, so this step is memory-heavy.
back_to_counts = np.round(2**adata_covid19.X.toarray().astype('float32') - 1)
adata_covid19.X = csc_matrix(back_to_counts)
# Drop the reference to the dense array so it can be garbage-collected.
back_to_counts = None

In [4]:
# Partition the loaded object column-wise by modality, then release the combined object.
feature_kind = adata_covid19.var["feature_types"]
is_gene = feature_kind == "Gene Expression"
is_protein = feature_kind == "Antibody Capture"

adata_gene_test = adata_covid19[:, is_gene].copy()
adata_protein_test = adata_covid19[:, is_protein].copy()
del adata_covid19

In [5]:
# Load the dataset that is split into the *_train gene/protein objects in the next cell.
# NOTE(review): unlike the other inputs, this path is not under ../Data — confirm the file location.
adata_covid19 = read_h5ad('../Covid_Combined_SCE_raw.h5ad')

In [6]:
# Cast the matrix to float32 first so both modality subsets inherit that dtype.
adata_covid19.X = adata_covid19.X.astype('float32')

# Partition column-wise by modality, then release the combined object.
feature_kind = adata_covid19.var["feature_types"]
is_gene = feature_kind == "Gene Expression"
is_protein = feature_kind == "Antibody Capture"

adata_gene_train = adata_covid19[:, is_gene].copy()
adata_protein_train = adata_covid19[:, is_protein].copy()
del adata_covid19

In [7]:
# Drop the first three characters of every protein name in both protein objects.
# NOTE(review): presumably a fixed 3-character antibody prefix — confirm every name carries it.
for protein_adata in (adata_protein_train, adata_protein_test):
    protein_adata.var.index = [name[3:] for name in protein_adata.var.index]

In [8]:
# matched_proteins.csv is read without a header row; after transposing, its first
# column is unpacked as `imputed_match` and its second as `train_names`.
imputed_match, train_names = pd.read_csv("../Data/matched_proteins.csv", header = None).to_numpy().T

# Lookup from a name in the file's second column to its counterpart in the first.
mapping = dict(zip(train_names, imputed_match))

# Rename the test-set proteins that have an entry in the lookup; leave the rest unchanged.
# A fresh list is built instead of writing into `var.index.values`, which would mutate
# the buffer of the (nominally immutable) pandas Index in place.
adata_protein_test.var.index = [mapping.get(name, name) for name in adata_protein_test.var.index]

In [9]:
# Protein names measured in each dataset (after the renaming above).
train_proteins = adata_protein_train.var.index
test_proteins = adata_protein_test.var.index

# Proteins present in both datasets; np.intersect1d returns them sorted and de-duplicated.
all_proteins = np.intersect1d(train_proteins, test_proteins)
all_proteins

array(['B7-H4', 'BAFF', 'BAFFR', 'BTLA', 'CCR3', 'CCR4', 'CCR5', 'CCR6',
       'CCR7', 'CD101', 'CD123', 'CD14', 'CD141', 'CD15', 'CD158',
       'CD158b', 'CD16', 'CD161', 'CD163', 'CD19', 'CD1C', 'CD1a', 'CD1d',
       'CD2', 'CD20', 'CD209', 'CD21', 'CD22', 'CD226', 'CD235ab', 'CD24',
       'CD244', 'CD25', 'CD27', 'CD274', 'CD28', 'CD3', 'CD303', 'CD304',
       'CD33', 'CD34', 'CD36', 'CD38', 'CD4', 'CD40', 'CD44', 'CD45',
       'CD45RA', 'CD45RO', 'CD47', 'CD5', 'CD52', 'CD56', 'CD58', 'CD62L',
       'CD64', 'CD69', 'CD7', 'CD70', 'CD71', 'CD79b', 'CD8', 'CD80',
       'CD81', 'CD82', 'CD83', 'CD86', 'CD96', 'CD99', 'CLEC12A',
       'CLEC9A', 'CTLA4', 'CX3CR1', 'CXCR3', 'CXCR4', 'CXCR5', 'DR3',
       'FAS', 'FCRL4', 'FCRL5', 'FcERIa', 'HLA-ABC', 'HLA-A_2', 'HLA-DR',
       'HLA-F', 'IL2RB', 'IL4R', 'IL7R', 'IgA', 'IgD', 'IgM', 'KIR2DL5A',
       'KIR3DL1', 'LAG3', 'LAMP1', 'Mouse IgG1_K_Iso',
       'Mouse_IgG2a_K_Iso', 'Mouse_IgG2b_K_Iso', 'NECTIN2', 'NLRP2',
       'PD1',

In [10]:
# Fixed seed so the protein split below is reproducible; the order of the two
# np.random.choice calls matters for that reproducibility.
np.random.seed(123)
# A third of the shared proteins (sampled without replacement) stays visible in both datasets.
common = np.random.choice(all_proteins, len(all_proteins)//3, False)
# The remainder is split in two: proteins1 is kept only in the train object,
# proteins2 only in the test object.
proteins2 = np.setdiff1d(all_proteins, common)
proteins1 = np.random.choice(proteins2, len(proteins2)//2, False)
proteins2 = np.setdiff1d(proteins2, proteins1)

# Held-out ground truth: the proteins each object is about to lose.
testset1_truth_ = adata_protein_train[:, proteins2].copy()
testset2_truth_ = adata_protein_test[:, proteins1].copy()

# Restrict each protein object to its own exclusive proteins plus the common ones.
adata_protein_train = adata_protein_train[:, list(proteins1) + list(common)].copy()
adata_protein_test = adata_protein_test[:, list(proteins2) + list(common)].copy()

In [11]:
# Start the wall-clock timer that is read after training in the next cell.
start = time()
# Both gene/protein pairs are passed as training data, with one batch key per dataset
# (NOTE(review): presumably obs column names identifying batches — confirm against the sciPENN API).
sciPENN = sciPENN_API([adata_gene_train, adata_gene_test], [adata_protein_train, adata_protein_test],
                    train_batchkeys = ['sample_id', 'Donor Id'])

Searching for GPU
GPU detected, using GPU

QC Filtering Training Cells

QC Filtering Training Genes

Normalizing Training Cells

Log-Normalizing Training Data





Finding HVGs


... storing 'sample_id' as categorical
... storing 'full_clustering' as categorical
... storing 'initial_clustering' as categorical
... storing 'Resample' as categorical
... storing 'Collection_Day' as categorical
... storing 'Sex' as categorical
... storing 'Age_interval' as categorical
... storing 'Swab_result' as categorical
... storing 'Status' as categorical
... storing 'Smoker' as categorical
... storing 'Status_on_day_collection' as categorical
... storing 'Status_on_day_collection_summary' as categorical
... storing 'Site' as categorical
... storing 'time_after_LPS' as categorical
... storing 'Woest_Clinical_Status' as categorical
... storing 'Outcome' as categorical
... storing 'patient_id' as categorical
... storing 'Days_from_onset' as categorical
... storing 'batch' as categorical
... storing 'Dataset' as categorical
... storing 'PreCondition' as categorical
... storing 'Hospitalized' as categorical
... storing 'Disease_classification' as categorical
... storing 'Deconvolve


Normalizing Gene Training Data by Batch


100%|██████████| 154/154 [00:37<00:00,  4.13it/s]



Normalizing Protein Training Data by Batch


100%|██████████| 143/143 [00:33<00:00,  4.32it/s]
100%|██████████| 11/11 [00:20<00:00,  1.85s/it]


In [12]:
# NOTE(review): ES_max / decay_max / decay_step presumably control early stopping and
# learning-rate decay — confirm against the sciPENN documentation.
sciPENN.train(n_epochs = 10000, ES_max = 12, decay_max = 6, 
             decay_step = 0.1, lr = 10**(-3), weights_dir = "weights_dir/covid_to_covidI2")

# Seconds elapsed since `start` was set before constructing sciPENN_API,
# so this covers both the setup cell and training.
time() - start

684.2761077880859

In [13]:
# Imputation result; later cells read its .var['Dataset 1'/'Dataset 2'] flags,
# .obs['Dataset'] labels, .X and .layers.
imputed_test = sciPENN.impute()

In [14]:
# Compute the embedding and persist it to an .h5ad file in the working directory.
embedding = sciPENN.embed()
embedding.write("scipenn_covidintegrateembedding.h5ad")

... storing 'sample_id' as categorical
... storing 'full_clustering' as categorical
... storing 'initial_clustering' as categorical
... storing 'Resample' as categorical
... storing 'Collection_Day' as categorical
... storing 'Sex' as categorical
... storing 'Age_interval' as categorical
... storing 'Swab_result' as categorical
... storing 'Status' as categorical
... storing 'Smoker' as categorical
... storing 'Status_on_day_collection' as categorical
... storing 'Status_on_day_collection_summary' as categorical
... storing 'Site' as categorical
... storing 'time_after_LPS' as categorical
... storing 'Woest_Clinical_Status' as categorical
... storing 'Outcome' as categorical
... storing 'patient_id' as categorical
... storing 'Days_from_onset' as categorical
... storing 'batch' as categorical
... storing 'Dataset' as categorical
... storing 'PreCondition' as categorical
... storing 'Hospitalized' as categorical
... storing 'Disease_classification' as categorical
... storing 'Deconvolve

In [15]:
# Per-protein flags stored on the imputation result.
# NOTE(review): presumably True where the protein was measured in that dataset — confirm.
in_dataset1 = imputed_test.var['Dataset 1']
in_dataset2 = imputed_test.var['Dataset 2']
cell_dataset = imputed_test.obs['Dataset']

# Proteins flagged for Dataset 1 only, evaluated on the cells of Dataset 2.
only_in_dataset1 = [flag1 and (not flag2) for flag1, flag2 in zip(in_dataset1, in_dataset2)]
test_pro2 = imputed_test.var.index[only_in_dataset1]
test_set2imputed = imputed_test[cell_dataset == 'Dataset 2', test_pro2]

# Proteins flagged for Dataset 2 only, evaluated on the cells of Dataset 1.
only_in_dataset2 = [flag2 and (not flag1) for flag2, flag1 in zip(in_dataset2, in_dataset1)]
test_pro1 = imputed_test.var.index[only_in_dataset2]
test_set1imputed = imputed_test[cell_dataset == 'Dataset 1', test_pro1]

In [16]:
# Work on copies so the held-out truth objects created during the split stay untouched.
test_set1truth = testset1_truth_.copy()
test_set2truth = testset2_truth_.copy()

# Append "-0" to each cell name so the cells can be selected by the imputed object's obs index
# (NOTE(review): presumably sciPENN suffixes cell names with the dataset position — confirm),
# then subset/reorder to the imputed cells and proteins.
test_set1truth.obs.index = [x + "-0" for x in test_set1truth.obs.index]
test_set1truth = test_set1truth[test_set1imputed.obs.index, test_pro1].copy()

# Same alignment for the second dataset, using the "-1" suffix.
test_set2truth.obs.index = [x + "-1" for x in test_set2truth.obs.index]
test_set2truth = test_set2truth[test_set2imputed.obs.index, test_pro2].copy()

In [17]:
def corr2_coeff(A, B, pearson = True):
    """Correlate each row of ``A`` with the matching row of ``B``.

    Parameters
    ----------
    A, B : array-like, shape (n_rows, n_cols)
        Two equally shaped 2-D arrays; row ``i`` of ``A`` is paired with row ``i`` of ``B``.
    pearson : bool, default True
        If True, compute Pearson correlations; otherwise compute Spearman
        rank correlations with ``scipy.stats.spearmanr``.

    Returns
    -------
    numpy.ndarray of shape (n_rows,) when ``pearson`` is True, otherwise a list
    of length ``n_rows``. A row with zero variance yields ``nan``.

    Raises
    ------
    ValueError
        If ``A`` and ``B`` do not have the same shape.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    if A.shape != B.shape:
        raise ValueError(f"A and B must have the same shape, got {A.shape} and {B.shape}")

    if pearson:
        # Centre every row on its own mean.
        A_centered = A - A.mean(1)[:, None]
        B_centered = B - B.mean(1)[:, None]

        # Row-wise sums of squares (unnormalised variances).
        ssA = (A_centered**2).sum(1)
        ssB = (B_centered**2).sum(1)

        # Only the paired (diagonal) correlations are needed, so compute them
        # directly rather than building the full n_rows x n_rows matrix.
        return (A_centered * B_centered).sum(1) / np.sqrt(ssA * ssB)

    return [spearmanr(A[i], B[i])[0] for i in range(A.shape[0])]

In [18]:
"""Get test data"""

# Densify the held-out truth once; X gets its own copy of the values.
raw_counts = test_set1truth.X.toarray()
test_set1truth.X = raw_counts.copy()
# Store the untouched dense array as the "raw" layer. Assigning `test_set1truth.X`
# here would alias X's buffer, and the normalisation, log and per-patient write-back
# below may modify that buffer in place, overwriting the raw values.
test_set1truth.layers["raw"] = raw_counts

sc.pp.normalize_total(test_set1truth)
sc.pp.log1p(test_set1truth)

# Attach the imputed values and any extra layers carried by the imputed object
# (later cells read 'q10', 'q25', 'q75' and 'q90' from here).
test_set1truth.layers['imputed'] = test_set1imputed.X
test_set1truth.layers.update(test_set1imputed.layers)

patients = np.unique(test_set1truth.obs['sample_id'].values)

# Scale the truth within each patient separately and write the result back into X.
for patient in patients:
    indices = (test_set1truth.obs['sample_id'] == patient).values
    # Scale an explicit copy rather than a view of the parent object.
    sub_adata = test_set1truth[indices].copy()

    sc.pp.scale(sub_adata)
    test_set1truth[indices] = sub_adata.X

  view_to_actual(adata)


In [19]:
"""Compute correlation across patients"""

# One Pearson correlation per protein between imputed and true values, computed
# over all cells of the dataset at once; proteins with an undefined (NaN) value are dropped.
corrs = pd.DataFrame(
    corr2_coeff(test_set1truth.layers["imputed"].T, test_set1truth.X.T),
    index = test_set1truth.var.index,
).dropna()

In [20]:
# De-duplicate cell names on the original held-out object.
# NOTE(review): the cells that follow use `test_set2truth`, a copy taken from
# `testset2_truth_` earlier, so this call does not appear to affect them — confirm
# whether it was meant to run before that copy was made.
testset2_truth_.obs_names_make_unique()

In [21]:
def sq(x, y):
    """Return the element-wise squared difference ``(x - y)**2``."""
    return (x - y)**2

In [22]:
"""Compute correlations within patient"""

# Result tables: one row per protein, one column per patient (unique 'sample_id').
sample_ids = test_set1truth.obs["sample_id"]
patient_ids = np.unique(sample_ids)

corrs_table = np.zeros((test_set1truth.shape[1], len(patient_ids)))
sq_table = corrs_table.copy()

for col, patient in enumerate(patient_ids):
    in_patient = sample_ids == patient
    truth = test_set1truth[in_patient].X.copy()
    imputed = test_set1truth.layers["imputed"][in_patient].copy()

    # Per-protein correlation and mean squared difference within this patient.
    corrs_table[:, col] = corr2_coeff(truth.T, imputed.T)
    sq_table[:, col] = sq(truth, imputed).mean(axis = 0)

# Undefined correlations (NaN) are recorded as 0.
corrs_table[np.isnan(corrs_table)] = 0

corrs_table = pd.DataFrame(corrs_table, index = test_set1truth.var.index, columns = patient_ids)
sq_table = pd.DataFrame(sq_table, index = test_set1truth.var.index, columns = patient_ids)

  if sys.path[0] == '':


In [23]:
# Mean correlation per patient column, averaged over the protein rows.
corrs_table.mean(axis = 0)

AP1              0.325986
AP10             0.263195
AP11             0.431721
AP12             0.321762
AP2              0.399153
                   ...   
newcastle21v2    0.205912
newcastle49      0.172687
newcastle59      0.200463
newcastle65      0.214894
newcastle74      0.207301
Length: 143, dtype: float64

In [24]:
# Grand mean: the average of the per-patient column means.
corrs_table.mean().mean()

0.26130389046572183

In [25]:
# to_csv does not create missing parent directories, so make sure the output folder exists.
os.makedirs('corrs_results', exist_ok = True)
corrs_table.to_csv('corrs_results/scipenn_covidintegratebig.csv')

In [26]:
# Mean squared difference per patient column, averaged over the protein rows.
sq_table.mean(axis = 0)

AP1              0.849597
AP10             0.903663
AP11             0.758820
AP12             0.834841
AP2              0.824120
                   ...   
newcastle21v2    0.941082
newcastle49      0.977556
newcastle59      0.951111
newcastle65      0.926905
newcastle74      0.940298
Length: 143, dtype: float64

In [27]:
# Grand mean: the average of the per-patient column means.
sq_table.mean().mean()

0.8716399209566646

In [28]:
# to_csv does not create missing parent directories, so make sure the output folder exists.
os.makedirs('mse_results', exist_ok = True)
sq_table.to_csv('mse_results/scipenn_covidintegratebig.csv')

In [29]:
# Fraction of (cell, protein) entries whose true value lies strictly between
# the 'q25' and 'q75' layers.
below_upper = test_set1truth.X < test_set1truth.layers['q75']
above_lower = test_set1truth.X > test_set1truth.layers['q25']
inside_interval = below_upper & above_lower

print(f"Effective Coverage Probability for Nominal 50% PI: {inside_interval.mean():.3f}")

Effective Coverage Probability for Nominal 50% PI: 0.458


In [30]:
# Fraction of (cell, protein) entries whose true value lies strictly between
# the 'q10' and 'q90' layers.
below_upper = test_set1truth.X < test_set1truth.layers['q90']
above_lower = test_set1truth.X > test_set1truth.layers['q10']
inside_interval = below_upper & above_lower

print(f"Effective Coverage Probability for Nominal 80% PI: {inside_interval.mean():.3f}")

Effective Coverage Probability for Nominal 80% PI: 0.738


In [31]:
# Persist the evaluated object (scaled truth in X plus the layers attached above) to disk.
test_set1truth.write("scipenn_covidintegratebig.h5ad")

In [32]:
"""Get test data"""

# Densify the held-out truth once; X gets its own copy of the values.
raw_counts = test_set2truth.X.toarray()
test_set2truth.X = raw_counts.copy()
# Store the untouched dense array as the "raw" layer. Assigning `test_set2truth.X`
# here would alias X's buffer, and the normalisation, log and per-donor write-back
# below may modify that buffer in place, overwriting the raw values.
test_set2truth.layers["raw"] = raw_counts

sc.pp.normalize_total(test_set2truth)
sc.pp.log1p(test_set2truth)

# Attach the imputed values and any extra layers carried by the imputed object
# (later cells read 'q10', 'q25', 'q75' and 'q90' from here).
test_set2truth.layers['imputed'] = test_set2imputed.X
test_set2truth.layers.update(test_set2imputed.layers)

patients = np.unique(test_set2truth.obs['Donor Id'].values)

# Scale the truth within each donor separately and write the result back into X.
for patient in patients:
    indices = (test_set2truth.obs['Donor Id'] == patient).values
    # Scale an explicit copy rather than a view of the parent object.
    sub_adata = test_set2truth[indices].copy()

    sc.pp.scale(sub_adata)
    test_set2truth[indices] = sub_adata.X

  view_to_actual(adata)


In [33]:
"""Compute correlation across patients"""

# One Pearson correlation per protein between imputed and true values, computed
# over all cells of the dataset at once; proteins with an undefined (NaN) value are dropped.
corrs = pd.DataFrame(
    corr2_coeff(test_set2truth.layers["imputed"].T, test_set2truth.X.T),
    index = test_set2truth.var.index,
).dropna()

In [34]:
"""Compute correlations within patient"""

# Result tables: one row per protein, one column per donor (unique 'Donor Id').
donor_ids_per_cell = test_set2truth.obs["Donor Id"]
donor_ids = np.unique(donor_ids_per_cell)

corrs_table = np.zeros((test_set2truth.shape[1], len(donor_ids)))
sq_table = corrs_table.copy()

for col, patient in enumerate(donor_ids):
    in_donor = donor_ids_per_cell == patient
    truth = test_set2truth[in_donor].X.copy()
    imputed = test_set2truth.layers["imputed"][in_donor].copy()

    # Per-protein correlation and mean squared difference within this donor.
    corrs_table[:, col] = corr2_coeff(truth.T, imputed.T)
    sq_table[:, col] = sq(truth, imputed).mean(axis = 0)

# Undefined correlations (NaN) are recorded as 0.
corrs_table[np.isnan(corrs_table)] = 0

corrs_table = pd.DataFrame(corrs_table, index = test_set2truth.var.index, columns = donor_ids)
sq_table = pd.DataFrame(sq_table, index = test_set2truth.var.index, columns = donor_ids)

In [35]:
# Mean correlation per donor column, averaged over the protein rows.
corrs_table.mean(axis = 0)

GWAS_1     0.411999
GWAS_10    0.298133
GWAS_11    0.427452
GWAS_12    0.401904
GWAS_13    0.409972
GWAS_2     0.462028
GWAS_3     0.513221
GWAS_4     0.344739
GWAS_5     0.384215
GWAS_8     0.421537
Unknown    0.447011
dtype: float64

In [36]:
# Grand mean: the average of the per-donor column means.
corrs_table.mean().mean()

0.41111000069744286

In [37]:
# to_csv does not create missing parent directories, so make sure the output folder exists.
os.makedirs('corrs_results', exist_ok = True)
corrs_table.to_csv('corrs_results/scipenn_covidintegratesmall.csv')

In [38]:
# Mean squared difference per donor column, averaged over the protein rows.
sq_table.mean(axis = 0)

GWAS_1     0.781069
GWAS_10    0.862081
GWAS_11    0.783717
GWAS_12    0.794259
GWAS_13    0.775505
GWAS_2     0.734097
GWAS_3     0.693189
GWAS_4     0.824599
GWAS_5     0.826605
GWAS_8     0.780201
Unknown    0.776375
dtype: float64

In [39]:
# Grand mean: the average of the per-donor column means.
sq_table.mean().mean()

0.784699843761669

In [40]:
# to_csv does not create missing parent directories, so make sure the output folder exists.
os.makedirs('mse_results', exist_ok = True)
sq_table.to_csv('mse_results/scipenn_covidintegratesmall.csv')

In [41]:
# Fraction of (cell, protein) entries whose true value lies strictly between
# the 'q25' and 'q75' layers.
below_upper = test_set2truth.X < test_set2truth.layers['q75']
above_lower = test_set2truth.X > test_set2truth.layers['q25']
inside_interval = below_upper & above_lower

print(f"Effective Coverage Probability for Nominal 50% PI: {inside_interval.mean():.3f}")

Effective Coverage Probability for Nominal 50% PI: 0.498


In [42]:
# Fraction of (cell, protein) entries whose true value lies strictly between
# the 'q10' and 'q90' layers.
below_upper = test_set2truth.X < test_set2truth.layers['q90']
above_lower = test_set2truth.X > test_set2truth.layers['q10']
inside_interval = below_upper & above_lower

print(f"Effective Coverage Probability for Nominal 80% PI: {inside_interval.mean():.3f}")

Effective Coverage Probability for Nominal 80% PI: 0.801


In [43]:
# Persist the evaluated object (scaled truth in X plus the layers attached above) to disk.
test_set2truth.write("scipenn_covidintegratesmall.h5ad")