In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
import seaborn as sns
from matplotlib.gridspec import GridSpec
import scanpy as sc
import muon as mu
import graphtools
import magic
from matplotlib.pyplot import rc_context
from matplotlib import gridspec
import os


from datashader.mpl_ext import dsshow
import datashader as ds

import matplotlib.colors
# Custom colormap: take the first 70% of the HSV lookup table, reverse it,
# and re-interpolate back to the original number of colours.
hsv_cmap = plt.cm.hsv
n_keep = round(hsv_cmap.N * 0.7)
cmaplist = [hsv_cmap(idx) for idx in range(n_keep)][::-1]
cmap = matplotlib.colors.LinearSegmentedColormap.from_list('', cmaplist, hsv_cmap.N)

  from .autonotebook import tqdm as notebook_tqdm


# Process T cell counts matrix

In [7]:
# Load the T-cell counts from a 10x-format MTX directory into an AnnData object.
# (The directory name suggests a Seurat export holding RNA, ADT and HTO features.)
t_mtx_dir = '/data/srlab1/mcurtis/GSK/tcell_proliferation/TCellAnnotator/data/Sparks2023/T_fromSeurat_RNA_ADT_HTO/'
adata = sc.read_10x_mtx(t_mtx_dir)


In [10]:
# Sanity check: dimensions and annotation columns of the freshly loaded object.
adata

AnnData object with n_obs × n_vars = 336739 × 28543
    var: 'gene_ids', 'feature_types'

In [None]:
# Per-cell metadata table for the T-cell object (tab-separated; the first
# column becomes the index).
t_meta_path = '/data/srlab1/mcurtis/GSK/tcell_proliferation/TCellAnnotator/data/Sparks2023/T_fromSeurat_metadata.txt'
meta = pd.read_csv(t_meta_path, sep='\t', index_col=0)


In [19]:
# Attach the metadata by barcode label rather than by row position.
# Assigning a DataFrame to `.obs` only checks its length and then adopts the
# new frame's index as the cell names, so a metadata file in a different row
# order would silently mislabel cells. Selecting with `.loc` puts the rows in
# matrix order and raises a KeyError if any barcode is missing from `meta`.
adata.obs = meta.loc[adata.obs_names, :]

In [22]:
# Names of the three hashtag-oligo (HTO) features: HTO1, HTO2, HTO3.
htos = [f'HTO{i}' for i in (1, 2, 3)]

In [93]:
# Copy the hashtag counts out of the expression matrix into per-cell metadata,
# one column per HTO. `.todense()` assumes `.X` is a SciPy sparse matrix.
hto_counts = adata[:, htos].X.todense()
adata.obs[htos] = pd.DataFrame(hto_counts, columns=htos, index=adata.obs.index)

In [97]:
# Confirm the metadata columns and the three new HTO count columns are present.
adata.obs.head()

Unnamed: 0,alt.subject.id,visit,covid.diagnosis.start.date.to.sample.drawn,group,sex,age,race,ethnicity,flu.vax.count.10yr,vaccine,...,nFeature_HTO,nFeature_CITE,nFeature_SCT,percent.mt,percent.largest.gene,coarse.cell.type,cell.type,HTO1,HTO2,HTO3
Batch1_COVFLU_CITE_multi5P09_GCATGCGCAGCCTATA-1,Control,,,,,,,,,,...,2,106,1255,4.175894,2.214489,CD4,CD4_Naive,635.0,23.0,0.0
Batch1_COVFLU_CITE_multi5P02_TTGTAGGCAGTATCTG-1,Control,,,,,,,,,,...,3,131,1644,2.062801,2.887921,CD4,CD4_Naive,1612.0,73.0,3.0
Batch1_COVFLU_CITE_multi5P02_GATGCTACAGCTTAAC-1,Control,,,,,,,,,,...,3,131,2429,2.143354,2.477383,CD4,CD4_Naive,2549.0,153.0,4.0
Batch1_COVFLU_CITE_multi5P03_AGGGTGACATTAGGCT-1,Control,,,,,,,,,,...,2,121,1807,5.324459,2.59092,CD8,CD8_TEMRA,1519.0,41.0,0.0
Batch1_COVFLU_CITE_multi5P10_GAACCTACAACGCACC-1,Control,,,,,,,,,,...,3,114,930,11.634905,9.327984,CD4,CD4_Naive,1314.0,49.0,4.0


In [98]:
# Drop the hashtag (HTO) features from the matrix; their counts are kept in .obs.
# Slicing an AnnData returns a view of the parent. The following cells modify
# `adata_filt.var`, so materialize an independent object with `.copy()` instead
# of relying on anndata's implicit (and warned-about) copy-on-modify.
adata_filt = adata[:, ~adata.var_names.isin(htos)].copy()

In [99]:
# Rename protein features from the 'PROT-' prefix to 'AB_'.
# Assign through `var_names`, the AnnData setter, instead of mutating
# `.var.index` directly: the setter validates the new index and converts a
# view into a real object before changing it.
adata_filt.var_names = [name.replace('PROT-', 'AB_') for name in adata_filt.var_names]

In [100]:
# Keep 'gene_ids' in sync with the (renamed) feature names.
adata_filt.var['gene_ids'] = adata_filt.var_names

  """Entry point for launching an IPython kernel.


In [101]:
# Label antibody-derived features. Match 'AB_' as a prefix only, so that a
# feature whose name merely contains 'AB_' elsewhere is not relabelled.
is_antibody = adata_filt.var_names.str.startswith('AB_')
adata_filt.var.loc[is_antibody, 'feature_types'] = 'Antibody Capture'

In [102]:
# De-duplicate feature names in place.
# NOTE(review): 'gene_ids' was filled from the index before this call, so it
# keeps the pre-deduplication names if any duplicates exist — confirm intended.
adata_filt.var_names_make_unique()

In [103]:
# Inspect the filtered object: same cells, three fewer features (the HTOs).
adata_filt

AnnData object with n_obs × n_vars = 336739 × 28540
    obs: 'alt.subject.id', 'visit', 'covid.diagnosis.start.date.to.sample.drawn', 'group', 'sex', 'age', 'race', 'ethnicity', 'flu.vax.count.10yr', 'vaccine', 'vaccine.side.effects', 'long.covid.symptoms', 'Batch', 'nCount_RNA', 'nCount_HTO', 'nCount_CITE', 'nCount_RNA_largest.gene', 'nCount_SCT', 'nFeature_RNA', 'nFeature_HTO', 'nFeature_CITE', 'nFeature_SCT', 'percent.mt', 'percent.largest.gene', 'coarse.cell.type', 'cell.type', 'HTO1', 'HTO2', 'HTO3'
    var: 'gene_ids', 'feature_types'

In [122]:
# Save the filtered T-cell object (HTOs removed, antibody features relabelled) as h5ad.
t_h5ad_path = '/data/srlab1/mcurtis/GSK/tcell_proliferation/TCellAnnotator/data/Sparks2023/T_fromSeurat.h5ad'
sc.write(t_h5ad_path, adata_filt)

In [124]:
# NOTE: this binds a second name to the same AnnData object (no copy is made),
# so in-place changes made through `adata` are visible through `adata_orig` too.
adata_orig = adata

In [None]:
# Debugging aid: print the kernel's current working directory.
!pwd

In [153]:
# Load the un-normalised per-cell program usages (rows: cell barcodes,
# columns: programs) and rescale every row so that its usages sum to 1.
tcat_dir = '/data/srlab1/mcurtis/GSK/tcell_proliferation/TCellAnnotator/cache/'
tcat_fn = tcat_dir + 'rfusagesUnNorm_QueryCovidSparks.Ref_metaprgs_filtsingle.20230515.txt'

tcat_usage = pd.read_csv(tcat_fn, sep='\t', index_col=0)
row_totals = tcat_usage.sum(axis=1)
tcat_usage = tcat_usage.div(row_totals, axis=0)
tcat_usage.head()

Unnamed: 0,Metallothionein,KLRC2/GNLY_NK-like_gdT_gdT/Tk,HeatShock_HeatShock_NME1/FABP5_NME1/FABP5,Pan-Tissue:?METRNL/CREM/LDLRAD4_Single,Doublet_DC_HLA,Cytoskeleton,Healthy (Hao):?KLRC2/GNLY_Single,HeatShock,CD4_CM-2_PTPN13/TNFRSF4_Th17,EBI3,...,Cytotoxic_TE,?Doublet_Myeloid_?GutMyeloid2_MMP8_Doublet_Myeloid,Healthy (Hao):?NK-Like_Single,CD8_CD160high_CD8_class2_HLA,CRTH2/GATA3_KRT1/CCR4_Th2,YPEL5/CREM,TBRU:ICOS/CCR4_Single,AB_CD86/TIMD4_dnT1/CD38,Doublet_Platelet,Mito
Batch1_COVFLU_CITE_multi5P09_GCATGCGCAGCCTATA-1,0.0,0.0,0.0,0.0,0.00051,0.135716,0.0,0.0,0.0,0.0,...,0.0,0.0,0.004817,0.010046,0.0,0.083679,0.020967,0.0,0.000443,0.077506
Batch1_COVFLU_CITE_multi5P02_TTGTAGGCAGTATCTG-1,0.003602,0.0,0.0,0.0,0.017794,0.0,0.0,0.016624,0.150139,0.0,...,0.0,0.0204,0.0,0.0,0.0,0.033323,0.152673,0.011206,0.0,0.0
Batch1_COVFLU_CITE_multi5P02_GATGCTACAGCTTAAC-1,0.0,0.0,0.0,0.000409,0.0,0.060651,0.0,0.0,0.04263,0.0,...,0.0,0.003551,0.0,0.016123,0.0,0.139014,0.0,0.0,0.000955,0.012281
Batch1_COVFLU_CITE_multi5P03_AGGGTGACATTAGGCT-1,0.022641,0.069716,0.0,0.0,0.0,0.035203,0.053163,0.007829,0.0,0.0,...,0.065356,0.0,0.050786,0.553508,0.0,0.0,0.0,0.0,0.0,0.010539
Batch1_COVFLU_CITE_multi5P10_GAACCTACAACGCACC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079772


Identify which meta-programs have a dataset-specific counterpart, based on `clus_df`

In [154]:
# Table of program clusters: one row per meta-program; the non-null cells of a
# row are the member programs (the code below treats them as
# '<dataset>:<program>' strings).
clus_df = pd.read_csv(tcat_dir + 'cluster_groups.pairwiseCorr.Z_HVGunion.FilteredSingletons.20230515.txt',
                      sep='\t', index_col=0)
dataset = 'Healthy (Hao)'

# For each meta-program, collect the member program(s) that mention `dataset`
# and strip the '<dataset>:' prefix.
# NOTE(review): if a meta-program has more than one member from `dataset`,
# their names are concatenated into a single string — confirm that cannot happen.
prg_pairs = []
for metaprg in clus_df.index:
    members = clus_df.loc[(metaprg), ].dropna()
    joined = ''.join([subprg for subprg in members if dataset in subprg])
    prg_pairs.append((metaprg, joined.replace(dataset + ':', '')))
dataset_prgs = pd.DataFrame(prg_pairs, columns=['meta_prg', 'dataset_prg'])

# Keep only meta-programs that have a counterpart, indexed by the dataset-specific name.
has_match = dataset_prgs['dataset_prg'] != ''
dataset_prgs = dataset_prgs[has_match]
dataset_prgs.index = dataset_prgs['dataset_prg']

In [155]:
# Keep only the usage columns whose meta-program has a counterpart in
# `dataset`, then relabel those columns with the dataset-specific names.
matched_meta_prgs = dataset_prgs['meta_prg']
tcat_filt = tcat_usage.copy()[matched_meta_prgs]
tcat_filt.columns = dataset_prgs['dataset_prg']


In [184]:

# Show the meta-program -> dataset-specific program mapping.
dataset_prgs

Unnamed: 0_level_0,meta_prg,dataset_prg
dataset_prg,Unnamed: 1_level_1,Unnamed: 2_level_1
HLA,Doublet_DC_HLA,HLA
Cytoskeleton,Cytoskeleton,Cytoskeleton
?KLRC2/GNLY,Healthy (Hao):?KLRC2/GNLY_Single,?KLRC2/GNLY
Th17,CD4_CM-2_PTPN13/TNFRSF4_Th17,Th17
?BCL2/CDK6,?BCL2/CDK6_BCL2/CDK6,?BCL2/CDK6
?SOX4/MME,?SOX4/MME_SOX4/MME,?SOX4/MME
Doublet_Plasmablast,Healthy (Hao):Doublet_Plasmablast_Single,Doublet_Plasmablast
dnT/IL10+CD38+,AMP-RA_CD38_dnT_dnT/IL10+CD38+,dnT/IL10+CD38+
CD4_Naive,CD4_Naive_Ribo/T_Naive_T_Naive_Translation_Tra...,CD4_Naive
IEG,IEG_IEG1_KLF6/CXCR4,IEG


# Load PBMC counts matrix

In [6]:
# Load the PBMC counts from a 10x-format MTX directory.
# NOTE: this rebinds `adata`, replacing whatever object it referred to before.
pbmc_mtx_dir = '../../../Data/PerDataset/Sparks2023/PBMC_fromSeurat/'
adata = sc.read_10x_mtx(pbmc_mtx_dir)


In [8]:
# Sanity check: dimensions and annotation columns of the PBMC object.
adata

AnnData object with n_obs × n_vars = 632100 × 28543
    var: 'gene_ids', 'feature_types'

In [3]:
# Per-cell metadata for the PBMC object (tab-separated; the first column, the
# cell barcode, becomes the index).
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass rather than chunk by chunk, so a column that mixes numbers and
# text gets one consistent dtype instead of triggering a mixed-types warning.
pbmc_meta_path = '../../../Data/PerDataset/Sparks2023/PBMC_fromSeurat_metadata.txt'
meta = pd.read_csv(pbmc_meta_path, sep='\t', index_col=0, low_memory=False)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
# Peek at the first feature annotations (gene_ids, feature_types).
adata.var.head()

Unnamed: 0,gene_ids,feature_types
RP11-34P13.7,RP11-34P13.7,Gene Expression
AL627309.1,AL627309.1,Gene Expression
RP11-34P13.14,RP11-34P13.14,Gene Expression
RP11-34P13.9,RP11-34P13.9,Gene Expression
AP006222.2,AP006222.2,Gene Expression


In [10]:
# Before the metadata is attached, .obs holds only the cell barcodes (no columns).
adata.obs.head()

Batch1_COVFLU_CITE_multi5P12_ACTGATGGTTCAGCGC-1
Batch1_COVFLU_CITE_multi5P16_CCCAGTTTCGGCTTGG-1
Batch1_COVFLU_CITE_multi5P09_GCATGCGCAGCCTATA-1
Batch1_COVFLU_CITE_multi5P11_CATCCACAGCTCTCGG-1
Batch1_COVFLU_CITE_multi5P02_TTGTAGGCAGTATCTG-1


In [5]:
# The row count should equal the number of cells in `adata`.
meta.shape

(632100, 26)

In [4]:
# Preview the metadata columns; rows are indexed by cell barcode.
meta.head()

Unnamed: 0,alt.subject.id,visit,covid.diagnosis.start.date.to.sample.drawn,group,sex,age,race,ethnicity,flu.vax.count.10yr,vaccine,...,nCount_RNA_largest.gene,nCount_SCT,nFeature_RNA,nFeature_HTO,nFeature_CITE,nFeature_SCT,percent.mt,percent.largest.gene,coarse.cell.type,cell.type
Batch1_COVFLU_CITE_multi5P12_ACTGATGGTTCAGCGC-1,Control,,,,,,,,,,...,96,4584,1881,3,126,1881,3.149606,2.042988,Mono,Mono_Classical
Batch1_COVFLU_CITE_multi5P16_CCCAGTTTCGGCTTGG-1,Control,,,,,,,,,,...,96,4384,1882,3,127,1882,4.72387,2.19078,B,B_Naive
Batch1_COVFLU_CITE_multi5P09_GCATGCGCAGCCTATA-1,Control,,,,,,,,,,...,70,3973,1255,2,106,1255,4.175894,2.214489,CD4,CD4_Naive
Batch1_COVFLU_CITE_multi5P11_CATCCACAGCTCTCGG-1,Control,,,,,,,,,,...,25,3585,634,3,128,798,6.410256,2.003205,Mono,Mono_Classical
Batch1_COVFLU_CITE_multi5P02_TTGTAGGCAGTATCTG-1,Control,,,,,,,,,,...,126,4375,1644,3,131,1644,2.062801,2.887921,CD4,CD4_Naive


In [13]:
# Attach metadata by barcode: rows are taken in the matrix's cell order, and a
# barcode missing from `meta` raises a KeyError.
adata.obs = meta.loc[adata.obs_names]

In [14]:
# The protein (PROT-*) and hashtag (HTO*) features sit at the end of the
# feature list and are all labelled 'Gene Expression' at load time.
adata.var.tail()

Unnamed: 0,gene_ids,feature_types
PROT-CD169,PROT-CD169,Gene Expression
PROT-S1probe,PROT-S1probe,Gene Expression
HTO1,HTO1,Gene Expression
HTO2,HTO2,Gene Expression
HTO3,HTO3,Gene Expression


In [15]:
# Names of the three hashtag-oligo (HTO) features: HTO1, HTO2, HTO3.
htos = [f'HTO{i}' for i in (1, 2, 3)]

In [16]:
# Copy the hashtag counts out of the expression matrix into per-cell metadata,
# one column per HTO. `.todense()` assumes `.X` is a SciPy sparse matrix.
hto_counts = adata[:, htos].X.todense()
adata.obs[htos] = pd.DataFrame(hto_counts, columns=htos, index=adata.obs.index)

In [17]:
# Confirm the metadata columns and the three new HTO count columns are present.
adata.obs.head()

Unnamed: 0,alt.subject.id,visit,covid.diagnosis.start.date.to.sample.drawn,group,sex,age,race,ethnicity,flu.vax.count.10yr,vaccine,...,nFeature_HTO,nFeature_CITE,nFeature_SCT,percent.mt,percent.largest.gene,coarse.cell.type,cell.type,HTO1,HTO2,HTO3
Batch1_COVFLU_CITE_multi5P12_ACTGATGGTTCAGCGC-1,Control,,,,,,,,,,...,3,126,1881,3.149606,2.042988,Mono,Mono_Classical,2760.0,98.0,2.0
Batch1_COVFLU_CITE_multi5P16_CCCAGTTTCGGCTTGG-1,Control,,,,,,,,,,...,3,127,1882,4.72387,2.19078,B,B_Naive,1886.0,119.0,1.0
Batch1_COVFLU_CITE_multi5P09_GCATGCGCAGCCTATA-1,Control,,,,,,,,,,...,2,106,1255,4.175894,2.214489,CD4,CD4_Naive,635.0,23.0,0.0
Batch1_COVFLU_CITE_multi5P11_CATCCACAGCTCTCGG-1,Control,,,,,,,,,,...,3,128,798,6.410256,2.003205,Mono,Mono_Classical,2337.0,68.0,1.0
Batch1_COVFLU_CITE_multi5P02_TTGTAGGCAGTATCTG-1,Control,,,,,,,,,,...,3,131,1644,2.062801,2.887921,CD4,CD4_Naive,1612.0,73.0,3.0


In [18]:
# Drop the hashtag (HTO) features from the matrix; their counts are kept in .obs.
# Slicing an AnnData returns a view of the parent. The following cells modify
# `adata_filt.var`, so materialize an independent object with `.copy()` instead
# of relying on anndata's implicit (and warned-about) copy-on-modify.
adata_filt = adata[:, ~adata.var_names.isin(htos)].copy()

In [19]:
# Rename protein features from the 'PROT-' prefix to 'AB_'.
# Assign through `var_names`, the AnnData setter, instead of mutating
# `.var.index` directly: the setter validates the new index and converts a
# view into a real object before changing it.
adata_filt.var_names = [name.replace('PROT-', 'AB_') for name in adata_filt.var_names]

In [22]:
# Keep 'gene_ids' in sync with the (renamed) feature names.
adata_filt.var['gene_ids'] = adata_filt.var_names

  """Entry point for launching an IPython kernel.


In [24]:
# Label antibody-derived features. Match 'AB_' as a prefix only, so that a
# feature whose name merely contains 'AB_' elsewhere is not relabelled.
is_antibody = adata_filt.var_names.str.startswith('AB_')
adata_filt.var.loc[is_antibody, 'feature_types'] = 'Antibody Capture'

In [25]:
# De-duplicate feature names in place.
# NOTE(review): 'gene_ids' was filled from the index before this call, so it
# keeps the pre-deduplication names if any duplicates exist — confirm intended.
adata_filt.var_names_make_unique()

In [26]:
# Inspect the filtered PBMC object: same cells, three fewer features (the HTOs).
adata_filt

AnnData object with n_obs × n_vars = 632100 × 28540
    obs: 'alt.subject.id', 'visit', 'covid.diagnosis.start.date.to.sample.drawn', 'group', 'sex', 'age', 'race', 'ethnicity', 'flu.vax.count.10yr', 'vaccine', 'vaccine.side.effects', 'long.covid.symptoms', 'Batch', 'nCount_RNA', 'nCount_HTO', 'nCount_CITE', 'nCount_RNA_largest.gene', 'nCount_SCT', 'nFeature_RNA', 'nFeature_HTO', 'nFeature_CITE', 'nFeature_SCT', 'percent.mt', 'percent.largest.gene', 'coarse.cell.type', 'cell.type', 'HTO1', 'HTO2', 'HTO3'
    var: 'gene_ids', 'feature_types'

In [27]:
# Check the feature table after renaming: AB_* rows should now read 'Antibody Capture'.
adata_filt.var

Unnamed: 0,gene_ids,feature_types
RP11-34P13.7,RP11-34P13.7,Gene Expression
AL627309.1,AL627309.1,Gene Expression
RP11-34P13.14,RP11-34P13.14,Gene Expression
RP11-34P13.9,RP11-34P13.9,Gene Expression
AP006222.2,AP006222.2,Gene Expression
...,...,...
AB_CD57,AB_CD57,Antibody Capture
AB_CD303,AB_CD303,Antibody Capture
AB_CD226,AB_CD226,Antibody Capture
AB_CD169,AB_CD169,Antibody Capture


In [28]:
# Save the filtered PBMC object (HTOs removed, antibody features relabelled) as h5ad.
pbmc_h5ad_path = '../../../Data/PerDataset/Sparks2023/PBMC_fromSeurat.h5ad'
sc.write(pbmc_h5ad_path, adata_filt)

# Output B cell counts matrix

In [2]:
# Path of the PBMC object that the B-cell subset is taken from.
# NOTE(review): this '.ADTfixed.h5ad' file is not written anywhere in the
# visible notebook (the PBMC section saves 'PBMC_fromSeurat.h5ad');
# presumably a separate step produces it — confirm and document.
mergefn = '../../../Data/PerDataset/Sparks2023/PBMC_fromSeurat.ADTfixed.h5ad'

In [3]:
# Load the PBMC object from disk; this rebinds `adata`.
adata = sc.read(mergefn)

In [4]:
# Inspect the loaded object; .var carries extra '*_ADT_Fixed' annotation columns.
adata

AnnData object with n_obs × n_vars = 632100 × 28540
    obs: 'alt.subject.id', 'visit', 'covid.diagnosis.start.date.to.sample.drawn', 'group', 'sex', 'age', 'race', 'ethnicity', 'flu.vax.count.10yr', 'vaccine', 'vaccine.side.effects', 'long.covid.symptoms', 'Batch', 'nCount_RNA', 'nCount_HTO', 'nCount_CITE', 'nCount_RNA_largest.gene', 'nCount_SCT', 'nFeature_RNA', 'nFeature_HTO', 'nFeature_CITE', 'nFeature_SCT', 'percent.mt', 'percent.largest.gene', 'coarse.cell.type', 'cell.type', 'HTO1', 'HTO2', 'HTO3'
    var: 'gene_ids', 'feature_types', 'Original_Name', 'Name_ADT_Fixed', 'Gene_ADT_Fixed', 'Clone_ADT_Fixed'

In [7]:
# List every coarse cell-type label present in the PBMC object.
sorted(pd.unique(adata.obs['coarse.cell.type']))

['B',
 'CD4',
 'CD8',
 'HSPC',
 'ILC',
 'MAIT',
 'Mac-or-Mono',
 'Mono',
 'Mono-T-dblt',
 'NK',
 'Neut',
 'Plasmablast',
 'Platelet',
 'cDC',
 'gdT-Vd2',
 'pDC']

In [11]:
# Fine-grained labels found within the coarse 'B' population.
is_b = adata.obs['coarse.cell.type'] == 'B'
sorted(adata.obs.loc[is_b, 'cell.type'].unique())

['B_Mem', 'B_Naive', 'B_Naive_Intermediate']

In [12]:
# Fine-grained labels found within the coarse 'Plasmablast' population.
is_plasmablast = adata.obs['coarse.cell.type'] == 'Plasmablast'
sorted(adata.obs.loc[is_plasmablast, 'cell.type'].unique())

['Plasmablast']

In [21]:
# Barcodes of all cells labelled as B cells or plasmablasts.
b_lineage_labels = ['B', 'Plasmablast']
in_b_lineage = adata.obs['coarse.cell.type'].isin(b_lineage_labels)
ind = adata.obs.loc[in_b_lineage].index

In [24]:
# Extract the selected cells as an independent object (a copy, not a view of `adata`).
adata_B = adata[ind].copy()

In [25]:
# Inspect the B-cell/plasmablast subset.
adata_B

AnnData object with n_obs × n_vars = 43632 × 28540
    obs: 'alt.subject.id', 'visit', 'covid.diagnosis.start.date.to.sample.drawn', 'group', 'sex', 'age', 'race', 'ethnicity', 'flu.vax.count.10yr', 'vaccine', 'vaccine.side.effects', 'long.covid.symptoms', 'Batch', 'nCount_RNA', 'nCount_HTO', 'nCount_CITE', 'nCount_RNA_largest.gene', 'nCount_SCT', 'nFeature_RNA', 'nFeature_HTO', 'nFeature_CITE', 'nFeature_SCT', 'percent.mt', 'percent.largest.gene', 'coarse.cell.type', 'cell.type', 'HTO1', 'HTO2', 'HTO3'
    var: 'gene_ids', 'feature_types', 'Original_Name', 'Name_ADT_Fixed', 'Gene_ADT_Fixed', 'Clone_ADT_Fixed'

In [26]:
# Save the B-cell/plasmablast subset as h5ad.
b_h5ad_path = '../../../Data/PerDataset/Sparks2023/B_fromSeurat.ADTfixed.h5ad'
sc.write(b_h5ad_path, adata_B)