In [1]:
import pandas as pd
import numpy as np
import scanpy as sc

def save_df_to_npz(obj, filename):
    np.savez_compressed(filename, data=obj.values, index=obj.index.values, columns=obj.columns.values)

def load_df_from_npz(filename):
    with np.load(filename, allow_pickle=True) as f:
        obj = pd.DataFrame(**f)
    return obj

## Hao et al - T-cells

In [8]:
mergefn = "../../Data/PerDataset/HaoEtAl/haoetal_pbmc_multimodal.merged.T.h5ad" # AnnData containing RNA +/- ADT
feature_type_col = 'feature_types' # Set to None if only RNA is included
adt_feature_name = 'Antibody Capture' # all other genes assumed to be RNA
adata = sc.read(mergefn)

Only considering the two last: ['.T', '.h5ad'].
Only considering the two last: ['.T', '.h5ad'].


In [9]:
hao_var_adt = adata.var
hao_var_adt = hao_var_adt.loc[hao_var_adt[feature_type_col]==adt_feature_name, :]
hao_var_adt['Name'] = [x[3:] for x in hao_var_adt.index]
hao_var_adt.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hao_var_adt['Name'] = [x[3:] for x in hao_var_adt.index]


Unnamed: 0,features,Clone,Specificity,feature_types,Name
AB_CD39,AB_CD39,A1,CD39,Antibody Capture,CD39
AB_Rat-IgG1-1,AB_Rat-IgG1-1,RTK2071,"Rat IgG1, Œ∫ Isotype Control",Antibody Capture,Rat-IgG1-1
AB_CD107a,AB_CD107a,H4A3,CD107a (LAMP-1),Antibody Capture,CD107a
AB_CD62P,AB_CD62P,AK4,CD62P (P-Selectin),Antibody Capture,CD62P
AB_TCR-2,AB_TCR-2,IP26,TCR a/Œ≤,Antibody Capture,TCR-2


In [10]:
hao_var_adt['Clone'].value_counts().head()

BA5b      2
HP-MA4    1
IA6-2     1
ICRF44    1
IM7       1
Name: Clone, dtype: int64

In [11]:
hao_var_adt.loc[hao_var_adt['Clone']=='BA5b', :]

Unnamed: 0,features,Clone,Specificity,feature_types,Name
AB_CD26-2,AB_CD26-2,BA5b,CD26,Antibody Capture,CD26-2
AB_CD26-1,AB_CD26-1,BA5b,CD26,Antibody Capture,CD26-1


In [20]:
! cp /data/srlab1/dk718/Tcell_cNMF_Comparison/Data/HaoEtAl_PBMC/abinfo_1-s2.0-S0092867421005833-mmc1.xlsx ../../Data/PerDataset/HaoEtAl/

In [21]:
abdata = pd.ExcelFile('../../Data/PerDataset/HaoEtAl/abinfo_1-s2.0-S0092867421005833-mmc1.xlsx').parse()
abdata.index = [x.replace('_', '-') for x in abdata['#protein']]
abdata.head()

Unnamed: 0,#protein,Sequence,Category,Catalog,Specificity,Clone,Reactivity,Ensembl Gene Id
B7-H4,B7-H4,TGTATGTCTGCCTTG,TotalSeq-A,358114.0,B7-H4,MIH43,Human,ENSG00000134258
C5L2,C5L2,ACAATTTGTCTGCGA,TotalSeq-A,342407.0,C5L2,1D9-M12,Human,ENSG00000134830
Cadherin,Cadherin,CGTTGCCATTAACCA,TotalSeq-A,368715.0,Cadherin 11,16G5,Human,ENSG00000140937
CCR10,CCR10,ATCTGTATGTCACAG,,,CCR10,6588-5,Human,ENSG00000184451
CD102,CD102,TGACCTTCCTCTCCT,TotalSeq-A,328509.0,CD102,CBR-IC2/2,Human,ENSG00000108622


In [22]:
hao_var_adt_merged = pd.merge(left=hao_var_adt.drop(['Clone', 'Specificity'], axis=1), right=abdata, left_on='Name', right_index=True) # ['#protein'].isnull().value_counts()
hao_var_adt_merged.head()

Unnamed: 0,features,feature_types,Name,#protein,Sequence,Category,Catalog,Specificity,Clone,Reactivity,Ensembl Gene Id
AB_CD39,AB_CD39,Antibody Capture,CD39,CD39,TTACCTGGTATCCGT,TotalSeq-A,328233.0,CD39,A1,Human,ENSG00000138185
AB_Rat-IgG1-1,AB_Rat-IgG1-1,Antibody Capture,Rat-IgG1-1,Rat_IgG1_1,ATCAGATGCCCTCAT,TotalSeq-A,400459.0,"Rat IgG1, Œ∫ Isotype Control",RTK2071,,
AB_CD107a,AB_CD107a,Antibody Capture,CD107a,CD107a,CAGCCCACTGCAATA,TotalSeq-A,328647.0,CD107a (LAMP-1),H4A3,Human,ENSG00000185896
AB_CD62P,AB_CD62P,Antibody Capture,CD62P,CD62P,CCTTCCGTATCCCTT,TotalSeq-A,304933.0,CD62P (P-Selectin),AK4,Human,ENSG00000174175
AB_TCR-2,AB_TCR-2,Antibody Capture,TCR-2,TCR_2,CGTAACGTAGAGCGA,TotalSeq-A,306737.0,TCR a/Œ≤,IP26,Human,


In [23]:
hao_var_adt.shape, hao_var_adt_merged.shape

((228, 5), (228, 11))

In [24]:
hao_var_adt_merged['Catalog'] = hao_var_adt_merged['Catalog'].replace(np.nan, -1).astype(int)

In [25]:
hao_var_adt_merged

Unnamed: 0,features,feature_types,Name,#protein,Sequence,Category,Catalog,Specificity,Clone,Reactivity,Ensembl Gene Id
AB_CD39,AB_CD39,Antibody Capture,CD39,CD39,TTACCTGGTATCCGT,TotalSeq-A,328233,CD39,A1,Human,ENSG00000138185
AB_Rat-IgG1-1,AB_Rat-IgG1-1,Antibody Capture,Rat-IgG1-1,Rat_IgG1_1,ATCAGATGCCCTCAT,TotalSeq-A,400459,"Rat IgG1, Œ∫ Isotype Control",RTK2071,,
AB_CD107a,AB_CD107a,Antibody Capture,CD107a,CD107a,CAGCCCACTGCAATA,TotalSeq-A,328647,CD107a (LAMP-1),H4A3,Human,ENSG00000185896
AB_CD62P,AB_CD62P,Antibody Capture,CD62P,CD62P,CCTTCCGTATCCCTT,TotalSeq-A,304933,CD62P (P-Selectin),AK4,Human,ENSG00000174175
AB_TCR-2,AB_TCR-2,Antibody Capture,TCR-2,TCR_2,CGTAACGTAGAGCGA,TotalSeq-A,306737,TCR a/Œ≤,IP26,Human,
...,...,...,...,...,...,...,...,...,...,...,...
AB_CD164,AB_CD164,Antibody Capture,CD164,CD164,GAGGCACTTAACATA,TotalSeq-A,324809,CD164,67D2,Human,ENSG00000135535
AB_CD138-2,AB_CD138-2,Antibody Capture,CD138-2,CD138_2,GTATAGACCAAAGCC,TotalSeq-A,352325,CD138 (Syndecan-1),DL-101,Human,ENSG00000115884
AB_CD144,AB_CD144,Antibody Capture,CD144,CD144,TCCACTCATTCTGTA,TotalSeq-A,348517,CD144 (VE-cadherin),BV9,Human,ENSG00000179776
AB_CD202b,AB_CD202b,Antibody Capture,CD202b,CD202b,CGATCCCTTACCTAT,TotalSeq-A,334213,CD202b (Tie2/Tek),33.1 (Ab33),Human,ENSG00000120156


## Clone name corrections

AB_CD207
1000:'10E2'


AB_CD90
50000000000:'5E10'

AB_CD193
500000000:'5E8'


In [26]:
hao_var_adt_merged['Clone'] = hao_var_adt_merged['Clone'].replace({'1000':'10E2', '50000000000':'5E10', '500000000':'5E8'})

In [27]:
hao_var_adt_merged.to_csv('../../Data/PerDataset/HaoEtAl/haoetal_pbmc_multimodal.merged.T.ADTinfo.tsv', sep='\t', index=False)

In [28]:
hao_var_adt_merged = pd.read_csv('../../Data/PerDataset/HaoEtAl/haoetal_pbmc_multimodal.merged.T.ADTinfo.tsv', sep='\t')
hao_var_adt_merged.head()

Unnamed: 0,features,feature_types,Name,#protein,Sequence,Category,Catalog,Specificity,Clone,Reactivity,Ensembl Gene Id
0,AB_CD39,Antibody Capture,CD39,CD39,TTACCTGGTATCCGT,TotalSeq-A,328233,CD39,A1,Human,ENSG00000138185
1,AB_Rat-IgG1-1,Antibody Capture,Rat-IgG1-1,Rat_IgG1_1,ATCAGATGCCCTCAT,TotalSeq-A,400459,"Rat IgG1, Œ∫ Isotype Control",RTK2071,,
2,AB_CD107a,Antibody Capture,CD107a,CD107a,CAGCCCACTGCAATA,TotalSeq-A,328647,CD107a (LAMP-1),H4A3,Human,ENSG00000185896
3,AB_CD62P,Antibody Capture,CD62P,CD62P,CCTTCCGTATCCCTT,TotalSeq-A,304933,CD62P (P-Selectin),AK4,Human,ENSG00000174175
4,AB_TCR-2,Antibody Capture,TCR-2,TCR_2,CGTAACGTAGAGCGA,TotalSeq-A,306737,TCR a/Œ≤,IP26,Human,


In [29]:
hao_var_adt_merged.loc[hao_var_adt_merged['Clone'] =='BA5b', :]

Unnamed: 0,features,feature_types,Name,#protein,Sequence,Category,Catalog,Specificity,Clone,Reactivity,Ensembl Gene Id
103,AB_CD26-2,Antibody Capture,CD26-2,CD26_2,TTCCTGCACGAGGAT,,-1,CD26,BA5b,Human,ENSG00000197635
105,AB_CD26-1,Antibody Capture,CD26-1,CD26_1,GGTGGCTAGATAATG,TotalSeq-A,302720,CD26,BA5b,Human,ENSG00000197635


## Stephenson et al

In [33]:
mergefn = '../../Data/PerDataset/UKCOVID/haniffa21.RNAandADT.WNN.reprocessedDAK4.h5ad' 
adata = sc.read(mergefn)

Only considering the two last: ['.reprocessedDAK4', '.h5ad'].
Only considering the two last: ['.reprocessedDAK4', '.h5ad'].


In [34]:
adata

AnnData object with n_obs × n_vars = 612982 × 16002
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'pct_mito', 'IEG_GEP_Score', 'n_counts', 'FOS_expr', 'JUN_expr', 'JUNB_expr', 'ZFP36_expr', 'IEG_Usage', 'RNA_Modality_Weight', 'leiden_WNN', 'full_clustering_reduced', 'leiden_WNN_lab', 'leiden_WNN_lab_reduced'
    var: 'feature_types', 'n_cells', 'mt_filter', 'dot_filter', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'MI', 'MI_Rank'
    uns: 'Site_colors', 'full_clustering_reduced_colors', 'hvg', 'initial_clustering_colors', 'leiden_WNN_colors', 'leiden_WNN_lab_colors', 'leiden_WNN_lab_reduced_colors', 'neighbors', 'pca', 'rank_

In [35]:
feature_type_col = 'feature_types' # Set to None if only RNA is included
adt_feature_name = 'Antibody Capture' # all other genes assumed to be RNA

In [36]:
stephenson_var_adt = adata.raw.var
stephenson_var_adt = stephenson_var_adt.loc[stephenson_var_adt[feature_type_col]==adt_feature_name, :]
stephenson_var_adt['Name'] = list(stephenson_var_adt.index)
stephenson_var_adt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stephenson_var_adt['Name'] = list(stephenson_var_adt.index)


Unnamed: 0,feature_types,ADT_MI,ADT_MI_Rank,IEG_Filter,IEG_GEP_Score,Name
AB_CD80,Antibody Capture,0.008640,174.0,False,,AB_CD80
AB_CD86,Antibody Capture,0.105782,51.0,False,,AB_CD86
AB_CD274,Antibody Capture,0.014578,134.0,False,,AB_CD274
AB_PDCD1LG2,Antibody Capture,0.011461,151.0,False,,AB_PDCD1LG2
AB_ICOSLG,Antibody Capture,0.012773,144.0,False,,AB_ICOSLG
...,...,...,...,...,...,...
AB_Podocalyxin,Antibody Capture,0.008314,178.0,False,,AB_Podocalyxin
AB_GGT1,Antibody Capture,0.115249,47.0,False,,AB_GGT1
AB_c-Met,Antibody Capture,0.011313,155.0,False,,AB_c-Met
AB_LIGHT,Antibody Capture,0.008453,177.0,False,,AB_LIGHT


In [49]:
! ls /data/srlab1/dk718/Tcell_cNMF_Comparison/Data/COVID_PBMC/Haniffa

cNMF_Haniffa_NK_RNA_HarmonyGeneCorrect
cNMF_Haniffa_Tpubclust_RNA_HarmonyGeneCorrect
cNMF_RNA_Tpubclust_Filt20220224
cNMF_TpubClust_RNA_ADT
cNMF_TpubClust_RNA_ADT_Corrected
haniffa21.NK.ADT_and_RNA.CLR_and_TP10K.h5ad
haniffa21.NK.ADT.TP10K.VarNorm.leidenRNA_MI.tsv
haniffa21.NK.RNA.TP10K.VarNorm.HVG.h5ad
haniffa21.NK.RNA.VarNorm.HVG.HarmonyGeneCorrect.h5ad
haniffa21.RNAandADT.WNN.reprocessedDAK4.ADTinfo.tsv
haniffa21.T.ADT.TP10K.VarNorm.leidenRNA_MI.tsv
haniffa21.TpubClust.ADT_RNA.rawForcNMF.h5ad
haniffa21.TpubClust.ADT_RNA.rawForcNMF.norm.h5ad
haniffa21.TpubClust.ADT_RNA.rawForcNMF.norm.HarmonyGene.h5ad
haniffa21.TpubClust.ADT_RNA.rawForcNMF.norm.HarmonyGene.NoZeros.h5ad
haniffa21.T.RNA.TP10K.VarNorm.HVG.h5ad
haniffa21.T.RNA.VarNorm.HVG.HarmonyGeneCorrect.h5ad
StephensonEtAl_SupplementaryTables_41591_2021_1329_MOESM3_ESM.xlsx
TotalSeq_C_Human_Universal_Cocktail_v1_137_Antibodies_399905_Barcodes_Cleaned.xlsx


In [52]:
! cp /data/srlab1/dk718/Tcell_cNMF_Comparison/Data/COVID_PBMC/Haniffa/StephensonEtAl_SupplementaryTables_41591_2021_1329_MOESM3_ESM.xlsx ../../Data/PerDataset/UKCOVID
! cp /data/srlab1/dk718/Tcell_cNMF_Comparison/Data/COVID_PBMC/Haniffa/TotalSeq_C_Human_Universal_Cocktail_v1_137_Antibodies_399905_Barcodes_Cleaned.xlsx ../../Data/PerDataset/

In [51]:
stephenson_map = pd.ExcelFile('../../Data/PerDataset/UKCOVID/StephensonEtAl_SupplementaryTables_41591_2021_1329_MOESM3_ESM.xlsx').parse('Supp Table 1', skiprows=1)
stephenson_map.head()

Unnamed: 0,Antibody,Clone,Barcode
0,anti-human CD80,2D10,ACGAATCAATCTGTG
1,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA
2,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC
3,"anti-human CD273 (B7-DC, PD-L2)",24F.10C12,TCAACGCTTGGCTAG
4,"anti-human CD275 (B7-H2, ICOSL)",2D3,GTGCATTCAACAGTA


In [53]:
map2 = pd.ExcelFile('../../Data/PerDataset/TotalSeq_C_Human_Universal_Cocktail_v1_137_Antibodies_399905_Barcodes_Cleaned.xlsx').parse(skiprows=1).iloc[:, 1:]
map2.head()

Unnamed: 0,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name
0,C0006,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,ENSG00000114013,CD86
1,C0007,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,ENSG00000120217,CD274
2,C0020,"anti-human CD270 (HVEM, TR2)",122,TGATAGAAACAGACC,ENSG00000157873,TNFRSF14
3,C0023,anti-human CD155 (PVR),SKII.4,ATCACATCGTTGCCA,ENSG00000073008,PVR
4,C0024,anti-human CD112 (Nectin-2),TX31,AACCTTCCGTCTAAG,ENSG00000130202,NECTIN2


TCR clone mapping
 - anti-human TCR Œ±/Œ≤	IP26 AB_TCR
 - anti-human TCR VŒ¥2	B6  AB_TCR_Vg2
 - anti-human TCR Œ≥/Œ¥	B1 AB_TCRg_d
 - anti-human TCR VŒ±7.2	3C10 AB_TCR_Va7.2'
 - anti-human TCR VŒ≥9	B3 AB_TCR_Vg9
 - anti-human TCR VŒ±24-JŒ±18 (iNKT cell)	6B11 AB_TCR_Va24-Ja18
 - anti-human TCR VŒ≤13.1	H131 AB_TCR_VB_13_1

In [54]:
fillin = {'anti-human CD370 (CLEC9A/DNGR1)':'AB_CLEC9A',
'anti-human CD158f (KIR2DL5)':'AB_KIR2DL5A',
'anti-human CD141 (Thrombomodulin)':'AB_CD141',
'anti-human HLA-DR':'AB_HLA-DR', 
'anti-human CD158b (KIR2DL2/L3,  NKAT2)':'AB_CD158b',
'anti-human CD197 (CCR7)':'AB_CCR7',
'anti-human CD161':'AB_CD161',
'anti-human CD307e (FcRL5)':'AB_FCRL5',
'anti-Tau Phospho (Thr181)':'AB_phosphoTau',
'anti-human CD257 (BAFF, BLYS)':'AB_BAFF',
 'anti-human IgM':'AB_IgM',
 'anti-human CD324 (E-Cadherin)':'AB_CDH1',
 'anti-human CD150 (SLAM)':'AB_SLAMF1',
 'anti-human CD305 (LAIR1)':'AB_LAIR1',
 'anti-human CD184 (CXCR4)':'AB_CXCR4',
 'anti-human IgG Fc':'AB_IgG_Fc',
 'anti-human IgD': 'AB_IgD',
 'anti-human CD254 (TRANCE, RANKL)':'AB_RANKL',
 'anti-human CD45':'AB_CD45',
 'anti-human CD117 (c-kit)':'AB_KIT',
 'anti-human HLA-A,B,C':'AB_HLA-ABC',
    'anti-human CD307d (FcRL4)':'AB_FCRL4',
    'anti-human CD62L':'AB_CD62L',
    'anti-human CD357 (GITR)':'AB_GITR',
    'anti-human CD326 (Ep-CAM)':'AB_EPCAM',
    'anti-human HLA-A2':'AB_HLA-A_2',
    'anti-human CD20':'AB_CD20',
    'anti-human GARP (LRRC32)':'AB_LRRC32',
    'anti-human CD279 (PD-1)':'AB_PD1',
    'anti-human CD158 (KIR2DL1/S1/S3/S5)':'AB_CD158',
    'anti-human CD303 (BDCA-2)':'AB_CD303',
    'anti-human CD45RO':'AB_CD45RO',
    'anti-human CD3':'AB_CD3',
    'anti-human CD16':'AB_CD16',
    'anti-human CD8':'AB_CD8',
    'anti-human CD258 (LIGHT)':'AB_LIGHT',
    'anti-c-Met' : 'AB_c-Met',
    'anti-human CD267 (TACI)':'AB_TACI',
    'anti-human CD90 (Thy1)':'AB_THY1',
    'anti-human CD45RA':'AB_CD45RA',
    'anti-human CD123':'AB_CD123',
    'anti-human CD71':'AB_CD71',
    'anti-human CD193 (CCR3)':'AB_CCR3',
    'anti-human CD25':'AB_CD25',
    'anti-human CD144 (VE-Cadherin)':'AB_CDH5',
    'anti-human CD294 (CRTH2)':'AB_PTGDR2',
    'anti-human CD138 (Syndecan-1)':'AB_SDC1',
    'anti-human CD319 (CRACC)':'AB_SLAMF7',
    'anti-human CD336 (NKp44)':'AB_NCR2',
    'anti-human CD337 (NKp30)':'AB_NCR3',
    'anti-mouse/human CD207': 'AB_langerin',
    'anti-human CD66b':'AB_CEACAM8',
    'anti-human CD269 (BCMA)':'AB_TNFRSF17',
    'anti-mouse/human Mac-2 (Galectin-3)':'AB_LGALS3',
    'anti-human CD268 (BAFF-R)':'AB_BAFFR',
    'anti-human CD106':'AB_VCAM1',
    'anti-human CD309 (VEGFR2)':'AB_KDR',
    'anti-human CD273  (B7-DC, PD-L2)':'AB_PDCD1LG2',
    'anti-human CD275 (B7-H2, ICOSL)':'AB_ICOSLG',
    'anti-mouse/human CD11b':'AB_ITGAM',
    'anti-human CD252 (OX40L)':'AB_OX40L',
    'anti-human CD137L (4-1BB Ligand)':'AB_TNFSF9',
    'anti-human CD360 (IL-21R)':'AB_IL21R',
    'anti-human CD366 (Tim-3)':'AB_HAVCR2',
    'anti-human CD30':'AB_TNFRSF8',
    'anti-human CD66a/c/e':'AB_CEACAM1/5/6',
    'anti-human CD98':'AB_SLC3A2',
    'anti-human CD178 (Fas-L)':'AB_FASLG',
    'anti-human CD56 (NCAM)':'AB_CD56',
    'anti-human CD10':'AB_MME',
    'anti-human FcεRIα':'AB_FcERIa',
    'anti-human Ig light chain κ':'AB_Igkappa',
    'anti-human CD21':'AB_CD21',
    'anti-human CD206 (MMR)':'AB_MMR',
    'anti-human CD204':'AB_MSR1',
    'anti-human CD133':'AB_PROM1',
    'anti-human CD79b (Igβ)':'AB_CD79b',
    'anti-Human Podoplanin':'AB_podoplanin'
        
}

fillin_clone = {'IP26':'AB_TCR',
'B6':'AB_TCR_Vg2',
'B1':'AB_TCRg_d',
'3C10':'AB_TCR_Va7.2',
'B3':'AB_TCR_Vg9',
'6B11':'AB_TCR_Va24-Ja18',
'H131':'AB_TCR_VB_13_1'}

In [55]:
stephenson_map = pd.merge(left=stephenson_map, right=map2.drop('Barcode', axis=1), on='Clone', how='left')


stephenson_map['Final_Name'] = np.nan

ind = stephenson_map['Antibody'].isin(fillin.keys())
stephenson_map.loc[ind, 'Final_Name'] = stephenson_map['Antibody'].replace(fillin)
ind = stephenson_map['Clone'].isin(fillin_clone.keys())
stephenson_map.loc[ind, 'Final_Name'] = stephenson_map['Clone'].replace(fillin_clone)

ind = stephenson_map['Final_Name'].isnull() & ~stephenson_map['Gene name'].isnull()
stephenson_map.loc[ind, 'Final_Name'] = stephenson_map.loc[ind, 'Gene name']
stephenson_map.head()

Unnamed: 0,Antibody,Clone,Barcode,DNA_ID,Description,Ensemble ID,Gene name,Final_Name
0,anti-human CD80,2D10,ACGAATCAATCTGTG,,,,,
1,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,C0006,anti-human CD86,ENSG00000114013,CD86,CD86
2,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,C0007,"anti-human CD274 (B7-H1, PD-L1)",ENSG00000120217,CD274,CD274
3,"anti-human CD273 (B7-DC, PD-L2)",24F.10C12,TCAACGCTTGGCTAG,,,,,AB_PDCD1LG2
4,"anti-human CD275 (B7-H2, ICOSL)",2D3,GTGCATTCAACAGTA,,,,,AB_ICOSLG


In [56]:
stephenson_map['Name2'] = stephenson_map['Antibody'].apply(lambda x: x.replace('anti-human ', '').replace('anti-Human ', '').replace(', κ Isotype Ctrl', '_K_Iso').split(' (')[0])
stephenson_map['Name2'] = stephenson_map['Name2'].apply(lambda x: x.replace('anti-human/mouse ', '').replace('anti-mouse/human ', '').replace('anti-human/mouse/rat ', ''))
stephenson_map['Name2'] = stephenson_map['Name2'].apply(lambda x: x.split(' ')[0])
stephenson_map['Name2'] = stephenson_map['Name2'].apply(lambda x: x.replace(' ', '_'))
stephenson_map.head()

Unnamed: 0,Antibody,Clone,Barcode,DNA_ID,Description,Ensemble ID,Gene name,Final_Name,Name2
0,anti-human CD80,2D10,ACGAATCAATCTGTG,,,,,,CD80
1,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,C0006,anti-human CD86,ENSG00000114013,CD86,CD86,CD86
2,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,C0007,"anti-human CD274 (B7-H1, PD-L1)",ENSG00000120217,CD274,CD274,CD274
3,"anti-human CD273 (B7-DC, PD-L2)",24F.10C12,TCAACGCTTGGCTAG,,,,,AB_PDCD1LG2,CD273
4,"anti-human CD275 (B7-H2, ICOSL)",2D3,GTGCATTCAACAGTA,,,,,AB_ICOSLG,CD275


In [57]:
ind = stephenson_map['Final_Name'].isnull()
stephenson_map.loc[ind, 'Final_Name'] = stephenson_map.loc[ind, 'Name2']

In [58]:
ind = stephenson_map['Final_Name'].apply(lambda x: x[:3] != 'AB_')
stephenson_map.loc[ind, 'Final_Name'] = 'AB_' + stephenson_map.loc[ind, 'Final_Name']

In [59]:
stephenson_map['Final_Name_ForMerge'] = stephenson_map['Final_Name'].apply(lambda x: x.upper())

In [60]:
stephenson_map.head()

Unnamed: 0,Antibody,Clone,Barcode,DNA_ID,Description,Ensemble ID,Gene name,Final_Name,Name2,Final_Name_ForMerge
0,anti-human CD80,2D10,ACGAATCAATCTGTG,,,,,AB_CD80,CD80,AB_CD80
1,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,C0006,anti-human CD86,ENSG00000114013,CD86,AB_CD86,CD86,AB_CD86
2,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,C0007,"anti-human CD274 (B7-H1, PD-L1)",ENSG00000120217,CD274,AB_CD274,CD274,AB_CD274
3,"anti-human CD273 (B7-DC, PD-L2)",24F.10C12,TCAACGCTTGGCTAG,,,,,AB_PDCD1LG2,CD273,AB_PDCD1LG2
4,"anti-human CD275 (B7-H2, ICOSL)",2D3,GTGCATTCAACAGTA,,,,,AB_ICOSLG,CD275,AB_ICOSLG


In [61]:
stephenson_var_adt['NameForMerge'] = stephenson_var_adt['Name'].apply(lambda x: x.upper())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stephenson_var_adt['NameForMerge'] = stephenson_var_adt['Name'].apply(lambda x: x.upper())


In [62]:
merged_tmp = pd.merge(left=stephenson_map, right=stephenson_var_adt, how='left', left_on='Final_Name_ForMerge', right_on='NameForMerge')

In [63]:
merged_tmp['feature_types'].isnull().value_counts()

False    192
Name: feature_types, dtype: int64

In [64]:
merged_tmp.head()

Unnamed: 0,Antibody,Clone,Barcode,DNA_ID,Description,Ensemble ID,Gene name,Final_Name,Name2,Final_Name_ForMerge,feature_types,ADT_MI,ADT_MI_Rank,IEG_Filter,IEG_GEP_Score,Name,NameForMerge
0,anti-human CD80,2D10,ACGAATCAATCTGTG,,,,,AB_CD80,CD80,AB_CD80,Antibody Capture,0.00864,174.0,False,,AB_CD80,AB_CD80
1,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,C0006,anti-human CD86,ENSG00000114013,CD86,AB_CD86,CD86,AB_CD86,Antibody Capture,0.105782,51.0,False,,AB_CD86,AB_CD86
2,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,C0007,"anti-human CD274 (B7-H1, PD-L1)",ENSG00000120217,CD274,AB_CD274,CD274,AB_CD274,Antibody Capture,0.014578,134.0,False,,AB_CD274,AB_CD274
3,"anti-human CD273 (B7-DC, PD-L2)",24F.10C12,TCAACGCTTGGCTAG,,,,,AB_PDCD1LG2,CD273,AB_PDCD1LG2,Antibody Capture,0.011461,151.0,False,,AB_PDCD1LG2,AB_PDCD1LG2
4,"anti-human CD275 (B7-H2, ICOSL)",2D3,GTGCATTCAACAGTA,,,,,AB_ICOSLG,CD275,AB_ICOSLG,Antibody Capture,0.012773,144.0,False,,AB_ICOSLG,AB_ICOSLG


In [65]:
merged_tmp = merged_tmp.iloc[:,:12]

In [66]:
merged_tmp['Fixed_Name'] = ''
ind = ~merged_tmp['feature_types'].isnull()
merged_tmp.loc[ind, 'Fixed_Name'] = merged_tmp.loc[ind, 'Final_Name']
merged_tmp = merged_tmp.sort_values(by='Fixed_Name', ascending=True)
#merged_tmp.to_csv('tmp.tsv', sep='\t')

In [67]:
merged_tmp.loc[merged_tmp['feature_types'].isnull(), :]

Unnamed: 0,Antibody,Clone,Barcode,DNA_ID,Description,Ensemble ID,Gene name,Final_Name,Name2,Final_Name_ForMerge,feature_types,ADT_MI,Fixed_Name


In [68]:
stephenson_var_adt.loc[~stephenson_var_adt['Name'].isin(merged_tmp['Final_Name']), :].sort_values(by='Name').head(17)

Unnamed: 0,feature_types,ADT_MI,ADT_MI_Rank,IEG_Filter,IEG_GEP_Score,Name,NameForMerge


In [69]:
merged_tmp['Final_Name'].isin(stephenson_var_adt['Name']).value_counts()

True    192
Name: Final_Name, dtype: int64

In [70]:
stephenson_var_adt['Name'].isin(merged_tmp['Final_Name']).value_counts()

True    192
Name: Name, dtype: int64

In [71]:
stephenson_var_adt_final = pd.merge(left=stephenson_var_adt.drop('NameForMerge', axis=1),
         right=merged_tmp[['Antibody', 'Clone', 'Barcode', 'DNA_ID', 'Description', 'Ensemble ID', 'Final_Name']],
         how='left', left_on='Name', right_on='Final_Name')

stephenson_var_adt_final = stephenson_var_adt_final.drop('Final_Name', axis=1)
stephenson_var_adt_final.head()

Unnamed: 0,feature_types,ADT_MI,ADT_MI_Rank,IEG_Filter,IEG_GEP_Score,Name,Antibody,Clone,Barcode,DNA_ID,Description,Ensemble ID
0,Antibody Capture,0.00864,174.0,False,,AB_CD80,anti-human CD80,2D10,ACGAATCAATCTGTG,,,
1,Antibody Capture,0.105782,51.0,False,,AB_CD86,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,C0006,anti-human CD86,ENSG00000114013
2,Antibody Capture,0.014578,134.0,False,,AB_CD274,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,C0007,"anti-human CD274 (B7-H1, PD-L1)",ENSG00000120217
3,Antibody Capture,0.011461,151.0,False,,AB_PDCD1LG2,"anti-human CD273 (B7-DC, PD-L2)",24F.10C12,TCAACGCTTGGCTAG,,,
4,Antibody Capture,0.012773,144.0,False,,AB_ICOSLG,"anti-human CD275 (B7-H2, ICOSL)",2D3,GTGCATTCAACAGTA,,,


In [73]:
stephenson_var_adt_final.to_csv('../../Data/PerDataset/UKCOVID/haniffa21.RNAandADT.WNN.reprocessedDAK4.ADTinfo.tsv', sep='\t')

## Amp-RA

In [79]:
mergefn = '../../Data/PerDataset/AMPRA/AMP_ADT.RNA_counts.h5ad'
feature_type_col = 'feature_types' # Set to None if only RNA is included
adt_feature_name = 'Antibody Capture' # all other genes assumed to be RNA
adata = sc.read(mergefn)

In [80]:
adata.raw.var['feature_types'] = "Gene Expression" 
adata.raw.var.loc[adata.var['_index'].apply(lambda x: '-prot' in x), 'feature_types'] = 'Antibody Capture'

In [81]:
adata.raw.var.tail()

Unnamed: 0,_index,feature_types
33591,FR-beta-prot,Antibody Capture
33592,HLA-DR-prot,Antibody Capture
33593,IgG-Fc-prot,Antibody Capture
33594,IgM-prot,Antibody Capture
33595,Podoplanin-prot,Antibody Capture


In [82]:
ampra_var_adt = adata.raw.var
ampra_var_adt = ampra_var_adt.loc[ampra_var_adt[feature_type_col]==adt_feature_name, :]
ampra_var_adt.shape

(58, 2)

In [92]:
! cp /data/srlab1/dk718/Tcell_cNMF_Comparison/Data/AMPRA/AmpRA_CiteSeq_Supplementary_table2_corrected.xlsx ../../Data/PerDataset/AMPRA

In [17]:
absummary = pd.ExcelFile('../../Data/PerDataset/AMPRA/AmpRA_CiteSeq_Supplementary_table2_corrected.xlsx').parse()
absummary.tail()

Unnamed: 0,Category,Barcode,Specificity,Clone,Barcode Sequence,Per sample,Vendor,ADT_Name
53,TotalSeq™-A,427,Folate Receptor β (FR-β),94b/FOLR2,TGTGGCTAGTCAGTT,1ug,BioLegend,FR-beta-prot
54,TotalSeq™-A,159,HLA-DR,L243,AATAGCGAGCAAGTA,1ug,BioLegend,HLA-DR-prot
55,TotalSeq™-A,375,IgG Fc,M1310G05,CTGGAGCGATTAGAA,1/5 DILUTION (0.2ug),BioLegend,IgG-Fc-prot
56,TotalSeq™-A,136,IgM,MHM-88,TAGCGAGCCCGTATA,1/5 DILUTION (0.2ug),BioLegend,IgM-prot
57,TotalSeq™-A,127,Podoplanin,NC-08,GGTTACTCGTTGTGT,1ug,BioLegend,Podoplanin-prot


In [94]:
ampra_var_adt_fullinfo = pd.merge(left=ampra_var_adt, right=absummary, left_on='_index', right_on='ADT_Name', how='left')
ampra_var_adt_fullinfo.head()

Unnamed: 0,_index,feature_types,Category,Barcode,Specificity,Clone,Barcode Sequence,Per sample,Vendor,ADT_Name
0,CD107a/LAMP1-prot,Antibody Capture,TotalSeq™-A,155,CD107a (LAMP-1),H4A3,CAGCCCACTGCAATA,1/5 DILUTION (0.2ug),BioLegend,CD107a/LAMP1-prot
1,CD112/Nectin-2-prot,Antibody Capture,TotalSeq™-A,24,CD112 (Nectin-2),TX31,AACCTTCCGTCTAAG,1ug,BioLegend,CD112/Nectin-2-prot
2,CD119/IFN-gamma-R-alpha-chain-prot,Antibody Capture,TotalSeq™-A,219,CD119 (IFN-γ R α chain),GIR-208,TGTGTATTCCCTTGT,1/5 DILUTION (0.2ug),BioLegend,CD119/IFN-gamma-R-alpha-chain-prot
3,CD11b-prot,Antibody Capture,TotalSeq™-A,161,CD11b,ICRF44,GACAAGTGATCTGCA,1ug,BioLegend,CD11b-prot
4,CD11c-prot,Antibody Capture,TotalSeq™-A,53,CD11c,S-HCL-3,TACGCCTATAACTTG,1ug,BioLegend,CD11c-prot


In [95]:
ampra_var_adt_fullinfo.shape, ampra_var_adt.shape

((58, 10), (58, 2))

In [96]:
ampra_var_adt_fullinfo.to_csv('../../Data/PerDataset/AMPRA/ampra_adt_features.tsv', sep='\t')

## Combat

In [97]:
feature_type_col = 'feature_types' # Set to None if only RNA is included
adt_feature_name = 'Antibody Capture' # all other genes assumed to be RNA

In [101]:
mergefn = '../../Data/PerDataset/COMBAT/COMBAT-CITESeq-DATA.Raw.T.h5ad' # AnnData containing RNA +/- ADT
adata = sc.read(mergefn)
adata

Only considering the two last: ['.T', '.h5ad'].
Only considering the two last: ['.T', '.h5ad'].


AnnData object with n_obs × n_vars = 400925 × 20807
    obs: 'Annotation_cluster_name', 'Annotation_minor_subset', 'Annotation_major_subset', 'Annotation_cell_type', 'GEX_region', 'QC_ngenes', 'QC_total_UMI', 'QC_pct_mitochondrial', 'QC_scrub_doublet_scores', 'TCR_chain_composition', 'TCR_clone_ID', 'TCR_clone_count', 'TCR_clone_proportion', 'TCR_contains_unproductive', 'TCR_doublet', 'TCR_chain_TRA', 'TCR_v_gene_TRA', 'TCR_d_gene_TRA', 'TCR_j_gene_TRA', 'TCR_c_gene_TRA', 'TCR_productive_TRA', 'TCR_cdr3_TRA', 'TCR_umis_TRA', 'TCR_chain_TRA2', 'TCR_v_gene_TRA2', 'TCR_d_gene_TRA2', 'TCR_j_gene_TRA2', 'TCR_c_gene_TRA2', 'TCR_productive_TRA2', 'TCR_cdr3_TRA2', 'TCR_umis_TRA2', 'TCR_chain_TRB', 'TCR_v_gene_TRB', 'TCR_d_gene_TRB', 'TCR_j_gene_TRB', 'TCR_c_gene_TRB', 'TCR_productive_TRB', 'TCR_chain_TRB2', 'TCR_v_gene_TRB2', 'TCR_d_gene_TRB2', 'TCR_j_gene_TRB2', 'TCR_c_gene_TRB2', 'TCR_productive_TRB2', 'TCR_cdr3_TRB2', 'TCR_umis_TRB2', 'COMBAT_ID', 'scRNASeq_sample_ID', 'COMBAT_participant_t

In [102]:
combat_adt_info = adata.var.loc[adata.var[feature_type_col]==adt_feature_name, :]
combat_adt_info['Name'] = combat_adt_info.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combat_adt_info['Name'] = combat_adt_info.index


In [103]:
combat_adt_info.head()

Unnamed: 0,gene_ids,feature_types,Name
AB_CD80,C0005,Antibody Capture,AB_CD80
AB_CD86,C0006,Antibody Capture,AB_CD86
AB_CD274_B7_H1_PD_L1,C0007,Antibody Capture,AB_CD274_B7_H1_PD_L1
AB_CD273_B7_DC_PD_L2,C0008,Antibody Capture,AB_CD273_B7_DC_PD_L2
AB_CD275_B7_H2_ICOSL,C0009,Antibody Capture,AB_CD275_B7_H2_ICOSL


In [104]:
! cp  /data/srlab1/dk718/Tcell_cNMF_Comparison/Data/COMBAT_PBMC/TotalSeq_C_Human_Universal_Cocktail_v1_137_Antibodies_399905_Barcodes.xlsx ../../Data/PerDataset/COMBAT/

In [105]:
abinfo = pd.ExcelFile('../../Data/PerDataset/COMBAT/TotalSeq_C_Human_Universal_Cocktail_v1_137_Antibodies_399905_Barcodes.xlsx').parse(skiprows=1).iloc[:, 1:]
abinfo.head()

Unnamed: 0,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name
0,C0006,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,ENSG00000114013,CD86
1,C0007,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,ENSG00000120217,CD274
2,C0020,"anti-human CD270 (HVEM, TR2)",122,TGATAGAAACAGACC,ENSG00000157873,TNFRSF14
3,C0023,anti-human CD155 (PVR),SKII.4,ATCACATCGTTGCCA,ENSG00000073008,PVR
4,C0024,anti-human CD112 (Nectin-2),TX31,AACCTTCCGTCTAAG,ENSG00000130202,NECTIN2


In [106]:
combat_adt_info_merged = pd.merge(left=combat_adt_info, right=abinfo, left_on='gene_ids', right_on='DNA_ID', how='left')
combat_adt_info_merged.head()

Unnamed: 0,gene_ids,feature_types,Name,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name
0,C0005,Antibody Capture,AB_CD80,,,,,,
1,C0006,Antibody Capture,AB_CD86,C0006,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,ENSG00000114013,CD86
2,C0007,Antibody Capture,AB_CD274_B7_H1_PD_L1,C0007,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,ENSG00000120217,CD274
3,C0008,Antibody Capture,AB_CD273_B7_DC_PD_L2,,,,,,
4,C0009,Antibody Capture,AB_CD275_B7_H2_ICOSL,,,,,,


In [107]:
combat_adt_info_merged['DNA_ID'].isnull().value_counts()

False    122
True      70
Name: DNA_ID, dtype: int64

In [108]:
combat_adt_info_merged_hit = combat_adt_info_merged.loc[~combat_adt_info_merged['DNA_ID'].isnull(), :]
combat_adt_info_merged_nohit = combat_adt_info_merged.loc[combat_adt_info_merged['DNA_ID'].isnull(), :]
combat_adt_info_merged_nohit = combat_adt_info_merged_nohit.drop(['DNA_ID', 'Description', 'Clone','Barcode','Ensemble ID','Gene name'], axis=1)
combat_adt_info_merged_nohit.head(20)

Unnamed: 0,gene_ids,feature_types,Name
0,C0005,Antibody Capture,AB_CD80
3,C0008,Antibody Capture,AB_CD273_B7_DC_PD_L2
4,C0009,Antibody Capture,AB_CD275_B7_H2_ICOSL
5,C0014,Antibody Capture,AB_humanCD11b
6,C0021,Antibody Capture,AB_CD252_OX40L
7,C0022,Antibody Capture,AB_CD137L_4_1BBLigand
11,C0027,Antibody Capture,AB_CD70
12,C0028,Antibody Capture,AB_CD30
22,C0054,Antibody Capture,AB_CD34
23,C0056,Antibody Capture,AB_CD269_BCMA


In [110]:
! cp  /data/srlab1/dk718/Tcell_cNMF_Comparison/Data/COMBAT_PBMC/Combat_Antibody_Info.xlsx ../../Data/PerDataset/COMBAT/

In [111]:
abdata2 = pd.ExcelFile('../../Data/PerDataset/COMBAT/Combat_Antibody_Info.xlsx').parse()
abdata2['Clone'] = abdata2['Description'].apply(lambda x: x.split(' (clone ')[-1][:-1])

abdata2['Name'] = abdata2['Description'].apply(lambda x: ' '.join(x.split(' (clone ')[0].split(' ')[1:]))
abdata2['Name'] = abdata2['Name'].apply(lambda x: x.replace(' (', '_').replace('-', '_').replace(', ', '_').replace(')', ''))

abdata2['Name'] = abdata2['Name'].replace({'CD11b':'humanCD11b', 'CD137L_4_1BB Ligand':'CD137L_4_1BBLigand', 'TCR γ/δ':'gdTCR',
                                          'TCR Vα24_Jα18_iNKT cell':'TCR_Va24_Ja18', 'TCR Vγ9':'TCR_Vg9', 'TCR Vβ13.1':'TCR_Vb13_1',
                                          'CD66a/c/e':'CD66a_c_e', 'CD370_CLEC9A/DNGR1':'CD370_CLEC9A_DNGR1',
                                          'IgG Fc':'IgGFc', 'Phospho_Thr181':'TauPhospho_Thr181', 'Mac_2_Galectin_3':'humanMac_2_Galectin_3',
                                          })

abdata2.loc[abdata2['Description']=='anti-c-Met (clone 12.1)', 'Name'] = 'c_Met'
abdata2['Name'] = 'AB_' + abdata2['Name']
abdata2.head()

Unnamed: 0,Description,Source,Catalog,Clone,Name
0,anti-human CD80 (clone 2D10),BioLegend,Cat# 99814,2D10,AB_CD80
1,anti-human CD86 (clone IT2.2),BioLegend,Cat# 99814,IT2.2,AB_CD86
2,"anti-human CD274 (B7-H1, PD-L1) (clone 29E.2A3)",BioLegend,Cat# 99814,29E.2A3,AB_CD274_B7_H1_PD_L1
3,"anti-human CD273 (B7-DC, PD-L2) (clone 24F.10C12)",BioLegend,Cat# 99814,24F.10C12,AB_CD273_B7_DC_PD_L2
4,"anti-human CD275 (B7-H2, ICOSL) (clone 2D3)",BioLegend,Cat# 99814,2D3,AB_CD275_B7_H2_ICOSL


In [112]:
combat_adt_info_merged_nohit.loc[~combat_adt_info_merged_nohit['Name'].isin(abdata2['Name']), :]

Unnamed: 0,gene_ids,feature_types,Name


In [113]:
combat_adt_info_merged_nohit_fixed = pd.merge(left=combat_adt_info_merged_nohit, right=abdata2, on='Name', how='left')
combat_adt_info_merged_nohit_fixed.head()

Unnamed: 0,gene_ids,feature_types,Name,Description,Source,Catalog,Clone
0,C0005,Antibody Capture,AB_CD80,anti-human CD80 (clone 2D10),BioLegend,Cat# 99814,2D10
1,C0008,Antibody Capture,AB_CD273_B7_DC_PD_L2,"anti-human CD273 (B7-DC, PD-L2) (clone 24F.10C12)",BioLegend,Cat# 99814,24F.10C12
2,C0009,Antibody Capture,AB_CD275_B7_H2_ICOSL,"anti-human CD275 (B7-H2, ICOSL) (clone 2D3)",BioLegend,Cat# 99814,2D3
3,C0014,Antibody Capture,AB_humanCD11b,anti-mouse/human CD11b (clone M1/70),BioLegend,Cat# 99814,M1/70
4,C0021,Antibody Capture,AB_CD252_OX40L,anti-human CD252 (OX40L) (clone 11C3.1),BioLegend,Cat# 99814,11C3.1


In [114]:
combat_adt_info_merged_hit.head()

Unnamed: 0,gene_ids,feature_types,Name,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name
1,C0006,Antibody Capture,AB_CD86,C0006,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,ENSG00000114013,CD86
2,C0007,Antibody Capture,AB_CD274_B7_H1_PD_L1,C0007,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,ENSG00000120217,CD274
8,C0023,Antibody Capture,AB_CD155_PVR,C0023,anti-human CD155 (PVR),SKII.4,ATCACATCGTTGCCA,ENSG00000073008,PVR
9,C0024,Antibody Capture,AB_CD112_Nectin_2,C0024,anti-human CD112 (Nectin-2),TX31,AACCTTCCGTCTAAG,ENSG00000130202,NECTIN2
10,C0026,Antibody Capture,AB_CD47,C0026,anti-human CD47,CC2C6,GCATTCTGTCACCTA,ENSG00000196776,CD47


In [115]:
for c in ['DNA_ID', 'Barcode', 'Ensemble ID', 'Gene name']:
    combat_adt_info_merged_nohit_fixed[c] = np.nan

combat_adt_info_final = pd.concat([combat_adt_info_merged_hit,
           combat_adt_info_merged_nohit_fixed[['gene_ids', 'feature_types', 'Name', 'DNA_ID', 'Description', 'Clone',  'Barcode', 'Ensemble ID', 'Gene name']]
          ],
           axis=0)

In [116]:
combat_adt_info_final.to_csv('../../Data/PerDataset/COMBAT/Combat_ADT_info_final.tsv', sep='\t', index=False)

## TBRU

In [118]:
tbru = sc.read('../../Data/PerDataset/TBRU/tbru_exprs_raw.h5ad')

In [147]:
adts = [x for x in tbru.var.index if '_protein' in x]
tbru_adt_info = pd.DataFrame([adts], columns=adts).T
tbru_adt_info.columns = ['Dataset_Name']
tbru_adt_info['Target'] = tbru_adt_info['Dataset_Name'].apply(lambda x: x.split('_')[0])
tbru_adt_info.sort_index(axis=0)

Unnamed: 0,Dataset_Name,Target
CD127/IL-7R_protein,CD127/IL-7R_protein,CD127/IL-7R
CD161_protein,CD161_protein,CD161
CD183/CXCR3_protein,CD183/CXCR3_protein,CD183/CXCR3
CD194/CCR4_protein,CD194/CCR4_protein,CD194/CCR4
CD195/CCR5_protein,CD195/CCR5_protein,CD195/CCR5
CD196/CCR6_protein,CD196/CCR6_protein,CD196/CCR6
CD197/CCR7_protein,CD197/CCR7_protein,CD197/CCR7
CD244/2B4_protein,CD244/2B4_protein,CD244/2B4
CD25_protein,CD25_protein,CD25
CD26_protein,CD26_protein,CD26


In [148]:
adt_supp = pd.ExcelFile('../../Data/PerDataset/TBRU/Cleaned_41590_2021_933_MOESM4_ESM.xlsx').parse()
adt_supp.head()

Unnamed: 0,Dataset_Name,Supp_Target,Supp_Clone,Supp_Catalog,Supp_Concentration,Supp_Relevance,Supp_Notes
0,CD127/IL-7R_protein,CD127 (IL7R),A019D5,351352,0.25,Treg negative marker,
1,CD161_protein,CD161 (KLRB1),HP-3G10,339945,0.25,Th17/NK innate marker,
2,CD183/CXCR3_protein,CD183 (CXCR3),G025H7,353745,0.5,Th1/T Cell Homing,
3,CD194/CCR4_protein,CD194 (CCR4),L291H4,359423,0.25,Th2/Homing,
4,CD195/CCR5_protein,CD195 (CCR5),J418F1,359135,0.25,Homing,


In [149]:
tbru_adt_info = pd.merge(left=tbru_adt_info, right=adt_supp, on='Dataset_Name', how='left')

In [150]:
tbru_adt_info.to_csv('../../Data/PerDataset/TBRU/tbru_adt_info_merged.tsv', sep='\t')

## Sparks - T-cells

In [2]:
mergefn = '../../Data/PerDataset/Sparks2023/T_fromSeurat.h5ad'
adata = sc.read(mergefn)

In [3]:
feature_type_col = 'feature_types' # Set to None if only RNA is included
adt_feature_name = 'Antibody Capture' # all other genes assumed to be RNA

In [8]:
sparks_var_adt = adata.var
sparks_var_adt = sparks_var_adt.loc[sparks_var_adt[feature_type_col]==adt_feature_name, :]
sparks_var_adt.shape

(138, 2)

In [11]:
sparks_var_adt.tail()

Unnamed: 0,gene_ids,feature_types
AB_CD57,AB_CD57,Antibody Capture
AB_CD303,AB_CD303,Antibody Capture
AB_CD226,AB_CD226,Antibody Capture
AB_CD169,AB_CD169,Antibody Capture
AB_S1probe,AB_S1probe,Antibody Capture


In [18]:
absummary = pd.ExcelFile('../../Data/PerDataset/Sparks2023/Cleaned_41586_2022_5670_MOESM14_ESM.xlsx').parse()
absummary.tail()

Unnamed: 0,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name
134,C0944,anti-human CD101 (BB27),BB27,CTACTTCCCTGTCAA,ENSG00000134256,CD101
135,C1046,anti-human CD88 (C5aR),S5/1,GCCGCATGAGAAACA,ENSG00000197405,C5AR1
136,C1052,anti-human CD224,KF29,CTGATGAGATGTCAG,ENSG00000100031,GGT1
137,C0951,PE Streptavidin,,AACCTTTGCCACTGC,,
138,,Biotinylated Recombinant SARS-CoV-2 S Protein ...,,used with PE-Streptavidin,,


In [35]:
absummary['Name2'] = absummary['Description'].apply(lambda x: x.replace('anti-human ', '').replace('anti-Human ', '').replace(', κ Isotype Ctrl', '_K_Iso').split(' (')[0])
absummary['Name2'] = absummary['Name2'].apply(lambda x: x.replace('anti-human/mouse ', '').replace('anti-mouse/human ', '').replace('anti-human/mouse/rat ', ''))
absummary['Name2'] = absummary['Name2'].apply(lambda x: x.split(' ')[0])
absummary['Name2'] = absummary['Name2'].apply(lambda x: x.replace(' ', '_'))
absummary.head()

Unnamed: 0,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name,Name2
0,C0006,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,ENSG00000114013,CD86,CD86
1,C0007,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,ENSG00000120217,CD274,CD274
2,C0020,"anti-human CD270 (HVEM, TR2)",122,TGATAGAAACAGACC,ENSG00000157873,TNFRSF14,CD270
3,C0023,anti-human CD155 (PVR),SKII.4,ATCACATCGTTGCCA,ENSG00000073008,PVR,CD155
4,C0024,anti-human CD112 (Nectin-2),TX31,AACCTTCCGTCTAAG,ENSG00000130202,NECTIN2,CD112


In [107]:
absummary['Final_Name']=None


In [98]:
absummary.tail()

Unnamed: 0,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name,Name2,Final_Name
134,C0944,anti-human CD101 (BB27),BB27,CTACTTCCCTGTCAA,ENSG00000134256,CD101,CD101,
135,C1046,anti-human CD88 (C5aR),S5/1,GCCGCATGAGAAACA,ENSG00000197405,C5AR1,CD88,
136,C1052,anti-human CD224,KF29,CTGATGAGATGTCAG,ENSG00000100031,GGT1,CD224,
137,C0951,PE Streptavidin,,AACCTTTGCCACTGC,,,PE,
138,,Biotinylated Recombinant SARS-CoV-2 S Protein ...,,used with PE-Streptavidin,,,Biotinylated,


In [108]:
fillin = {
    'Armenian Hamster IgG Isotype Ctrl' : 'AB_ArmenianHamsterIgGiso',
    'anti-human FcεRIα' : 'AB_Fc-RI-',
    'anti-human HLA-A,B,C' : 'AB_HLA-ABC',
    'Mouse IgG1, κ isotype Ctrl' : 'AB_IgG1kiso',
    'Mouse IgG2a, κ isotype Ctrl' : 'AB_IgG2akiso',
    'Mouse IgG2b, κ isotype Ctrl' : 'AB_IgG2bkiso',
    'anti-human Ig light chain κ' : 'AB_IgKLightChain',
    'anti-human Ig light chain λ' : 'AB_IglightchainLamda',
    'Rat IgG1, κ isotype Ctrl' : 'AB_RatIgG1kiso',
    'Rat IgG2a, κ Isotype Ctrl' : 'AB_RatIgG2akiso',
    'Rat IgG2b, κ Isotype Ctrl' : 'AB_RatIgG2bkIso',
    'PE Streptavidin' : 'AB_S1probe',
    'anti-human TCR Vα7.2' : 'AB_TCRValpha7p2',
    'anti-human TCR Vδ2' : 'AB_TCRVdelta2',
    'anti-human TCR α/β' : 'AB_TCRab',
    'anti-human/mouse integrin β7' : 'AB_integrinBeta7',
}

In [109]:
absummary['Final_Name'] = absummary['Description'].map(fillin)

In [110]:
absummary['Final_Name']

0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
          ...    
134           NaN
135           NaN
136           NaN
137    AB_S1probe
138           NaN
Name: Final_Name, Length: 139, dtype: object

In [None]:
set(sparks_var_adt.index).difference(absummary['Final_Name'])

In [111]:
ind = absummary['Final_Name'].isna()
absummary.loc[ind, 'Final_Name'] = absummary.loc[ind, 'Name2']

ind = absummary['Final_Name'].apply(lambda x: x[:3] != 'AB_')
absummary.loc[ind, 'Final_Name'] = 'AB_' + absummary.loc[ind, 'Final_Name']

In [112]:
absummary['Final_Name']

0              AB_CD86
1             AB_CD274
2             AB_CD270
3             AB_CD155
4             AB_CD112
            ...       
134           AB_CD101
135            AB_CD88
136           AB_CD224
137         AB_S1probe
138    AB_Biotinylated
Name: Final_Name, Length: 139, dtype: object

In [113]:
absummary[absummary['Final_Name'].isin(set(absummary['Final_Name']).difference(sparks_var_adt.index))]

Unnamed: 0,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name,Name2,Final_Name
138,,Biotinylated Recombinant SARS-CoV-2 S Protein ...,,used with PE-Streptavidin,,,Biotinylated,AB_Biotinylated


In [114]:
set(absummary['Final_Name']).difference(sparks_var_adt.index)

{'AB_Biotinylated'}

In [115]:
set(sparks_var_adt.index).difference(absummary['Final_Name'])

set()

In [116]:
absummary[absummary['Final_Name'].duplicated()]

Unnamed: 0,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name,Name2,Final_Name


In [118]:
sparks_var_adt.shape, absummary.shape

((138, 2), (139, 8))

In [126]:
absummary_filt = absummary[-absummary['DNA_ID'].isna()].copy()

In [127]:
absummary_filt.shape

(138, 8)

In [128]:
absummary_filt.head()

Unnamed: 0,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name,Name2,Final_Name
0,C0006,anti-human CD86,IT2.2,GTCTTTGTCAGTGCA,ENSG00000114013,CD86,CD86,AB_CD86
1,C0007,"anti-human CD274 (B7-H1, PD-L1)",29E.2A3,GTTGTCCGACAATAC,ENSG00000120217,CD274,CD274,AB_CD274
2,C0020,"anti-human CD270 (HVEM, TR2)",122,TGATAGAAACAGACC,ENSG00000157873,TNFRSF14,CD270,AB_CD270
3,C0023,anti-human CD155 (PVR),SKII.4,ATCACATCGTTGCCA,ENSG00000073008,PVR,CD155,AB_CD155
4,C0024,anti-human CD112 (Nectin-2),TX31,AACCTTCCGTCTAAG,ENSG00000130202,NECTIN2,CD112,AB_CD112


In [130]:
sparks_var_adt

Unnamed: 0,gene_ids,feature_types
AB_CD29,AB_CD29,Antibody Capture
AB_CD11a,AB_CD11a,Antibody Capture
AB_CD44,AB_CD44,Antibody Capture
AB_HLA-ABC,AB_HLA-ABC,Antibody Capture
AB_CD18,AB_CD18,Antibody Capture
...,...,...
AB_CD57,AB_CD57,Antibody Capture
AB_CD303,AB_CD303,Antibody Capture
AB_CD226,AB_CD226,Antibody Capture
AB_CD169,AB_CD169,Antibody Capture


In [132]:
sparks_var_adt_fullinfo = pd.merge(left=sparks_var_adt, right=absummary_filt, left_index=True, 
                                   right_on='Final_Name', how='left')
sparks_var_adt_fullinfo.head()

Unnamed: 0,gene_ids,feature_types,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name,Name2,Final_Name
98,AB_CD29,Antibody Capture,C0369,anti-human CD29,TS2/16,GTATTCCCTCAGTCA,ENSG00000150093,ITGB1,CD29,AB_CD29
73,AB_CD11a,Antibody Capture,C0185,anti-human CD11a,TS2/4,TATATCCTTGTGAGC,ENSG00000005844,ITGAL,CD11a,AB_CD11a
24,AB_CD44,Antibody Capture,C0073,anti-mouse/human CD44,IM7,TGGCTTCAGGTCCTA,ENSG00000026508,CD44,CD44,AB_CD44
16,AB_HLA-ABC,Antibody Capture,C0058,"anti-human HLA-A,B,C",W6/32,TATGCGAGGCTTATC,ENSG00000206503,HLA-A,"HLA-A,B,C",AB_HLA-ABC
103,AB_CD18,Antibody Capture,C0385,anti-human CD18,TS1/18,TATTGGGACACTTCT,ENSG00000160255,ITGB2,CD18,AB_CD18


In [133]:
sparks_var_adt_fullinfo.index = sparks_var_adt_fullinfo['Final_Name']
sparks_var_adt_fullinfo['Final_Target'] = sparks_var_adt_fullinfo['Final_Name'].str.replace('AB_', '')

In [185]:
sparks_var_adt_fullinfo.shape, sparks_var_adt_fullinfo.shape

((138, 11), (138, 11))

In [186]:
sparks_var_adt_fullinfo.head()

Unnamed: 0_level_0,gene_ids,feature_types,DNA_ID,Description,Clone,Barcode,Ensemble ID,Gene name,Name2,Final_Name,Final_Target
Final_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AB_CD29,AB_CD29,Antibody Capture,C0369,anti-human CD29,TS2/16,GTATTCCCTCAGTCA,ENSG00000150093,ITGB1,CD29,AB_CD29,CD29
AB_CD11a,AB_CD11a,Antibody Capture,C0185,anti-human CD11a,TS2/4,TATATCCTTGTGAGC,ENSG00000005844,ITGAL,CD11a,AB_CD11a,CD11a
AB_CD44,AB_CD44,Antibody Capture,C0073,anti-mouse/human CD44,IM7,TGGCTTCAGGTCCTA,ENSG00000026508,CD44,CD44,AB_CD44,CD44
AB_HLA-ABC,AB_HLA-ABC,Antibody Capture,C0058,"anti-human HLA-A,B,C",W6/32,TATGCGAGGCTTATC,ENSG00000206503,HLA-A,"HLA-A,B,C",AB_HLA-ABC,HLA-ABC
AB_CD18,AB_CD18,Antibody Capture,C0385,anti-human CD18,TS1/18,TATTGGGACACTTCT,ENSG00000160255,ITGB2,CD18,AB_CD18,CD18


In [187]:
sparks_var_adt_fullinfo.to_csv('../../Data/PerDataset/Sparks2023/sparks_adt_features.tsv', sep='\t')

## Merge ADT labels on clone ID

In [188]:
tbru_adt_info = pd.read_csv('../../Data/PerDataset/TBRU/tbru_adt_info_merged.tsv', sep='\t', index_col=0)
tbru_adt_info = tbru_adt_info[['Dataset_Name', 'Supp_Clone', 'Target']]
tbru_adt_info.columns = ['Original', 'Clone', 'Target']

In [189]:
combat_adt_info = pd.read_csv('../../Data/PerDataset/COMBAT/Combat_ADT_info_final.tsv', sep='\t')
combat_adt_info = combat_adt_info[['Name', 'Clone', 'Name']] 
combat_adt_info.columns = ['Original', 'Clone', 'Target']
combat_adt_info['Target'] = combat_adt_info['Target'].apply(lambda x: x[3:])

In [190]:
ampra_adt_info = pd.read_csv('../../Data/PerDataset/AMPRA/ampra_adt_features.tsv', sep='\t')
ampra_adt_info = ampra_adt_info[['_index', 'Clone', 'ADT_Name']]
ampra_adt_info.columns = ['Original', 'Clone', 'Target']
ampra_adt_info['Target'] = ampra_adt_info['Target'].apply(lambda x: x[:-5])

In [191]:
ukcovid_adt_info = pd.read_csv('../../Data/PerDataset/UKCOVID/haniffa21.RNAandADT.WNN.reprocessedDAK4.ADTinfo.tsv', sep='\t')
ukcovid_adt_info = ukcovid_adt_info[['Name', 'Clone', 'Name']]
ukcovid_adt_info.columns = ['Original', 'Clone', 'Target']
ukcovid_adt_info['Target'] = ukcovid_adt_info['Target'].apply(lambda x: x[3:])

In [192]:
hao_adt_info= pd.read_csv('../../Data/PerDataset/HaoEtAl/haoetal_pbmc_multimodal.merged.T.ADTinfo.tsv', sep='\t')
hao_adt_info = hao_adt_info[['features', 'Clone', '#protein']]
hao_adt_info.columns = ['Original', 'Clone', 'Target']

In [193]:
sparks_adt_info= pd.read_csv('../../Data/PerDataset/Sparks2023/sparks_adt_features.tsv', sep='\t')
sparks_adt_info = sparks_adt_info[['Final_Name', 'Clone', 'Final_Target']]
sparks_adt_info.columns = ['Original', 'Clone', 'Target']

In [194]:
sparks_adt_info.tail(2)

Unnamed: 0,Original,Clone,Target
136,AB_CD169,7-239,CD169
137,AB_S1probe,,S1probe


In [199]:
sparks_adt_info = sparks_adt_info.loc[sparks_adt_info['Target']!= 'S1probe', :]

In [200]:
sparks_adt_info.tail(2)

Unnamed: 0,Original,Clone,Target
135,AB_CD226,11A8,CD226
136,AB_CD169,7-239,CD169


In [190]:
hao_adt_info.loc[hao_adt_info['Clone']=='BA5b', :]

Unnamed: 0,Original,Clone,Target
103,AB_CD26-2,BA5b,CD26_2
105,AB_CD26-1,BA5b,CD26_1


In [197]:
hao_adt_info = hao_adt_info.loc[hao_adt_info['Target']!= 'CD26_2', :]

In [201]:
merge = pd.merge(left=hao_adt_info.rename(columns={'Original':'Hao_Original', 'Target':'Hao_Target'}),
         right=combat_adt_info.rename(columns={'Original':'COMBAT_Original', 'Target':'COMBAT_Target'}),
         on='Clone', how='outer')


merge2 = pd.merge(left=merge,
         right=ukcovid_adt_info.rename(columns={'Original':'UKCOVID_Original', 'Target':'UKCOVID_Target'}),
         on='Clone', how='outer')

merge3 = pd.merge(left=merge2,
         right=ampra_adt_info.rename(columns={'Original':'AMPRA_Original', 'Target':'AMPRA_Target'}),
         on='Clone', how='outer')


merge4 = pd.merge(left=merge3,
         right=tbru_adt_info.rename(columns={'Original':'TBRU_Original', 'Target':'TBRU_Target'}),
         on='Clone', how='outer')


merge5 = pd.merge(left=merge4,
         right=sparks_adt_info.rename(columns={'Original':'Sparks_Original', 'Target':'Sparks_Target'}),
         on='Clone', how='outer')


In [202]:
merge5.index = merge5['Clone']
merge5['Final_Name'] = ''
merge5['Final_Target'] = ''
for i in merge5.index:
    for c in ['Hao_Original', 'COMBAT_Original', 'UKCOVID_Original', 'AMPRA_Original', 'TBRU_Original',
             'Sparks_Original']:
        if type(merge5.at[i, c]) is str:
            merge5.at[i, 'Final_Name'] = merge5.at[i, c]
            merge5.at[i, 'Final_Target'] = merge5.at[i, c.replace('Original', 'Target')]
            break

In [203]:
merge5.index

Index(['A1', 'RTK2071', 'H4A3', 'AK4', 'IP26', 'BY88', 'WM59', '581', 'E11',
       '5-271',
       ...
       'T5-39', 'JD3', '63D3', 'LG.3A10', 'AY13', '3D12', 'RTK2758', 'HTK888',
       'NT-7', 'CG4'],
      dtype='object', name='Clone', length=287)

In [204]:
merge5 = merge5.sort_values(by='Final_Target')

In [205]:
merge5.drop('Clone', axis=1).to_csv('../../Data/PerDataset/Merged_ADT_Info_withSparks_20240521.tsv', 
                                    sep='\t')

In [300]:
merged_adt_info_cleaned_orig = pd.read_csv('../../Data/PerDataset/Merged_ADT_Info_Cleaned.tsv', sep='\t')
merged_adt_info_cleaned_orig.head()

Unnamed: 0,Clone,Hao_Original,Hao_Target,COMBAT_Original,COMBAT_Target,UKCOVID_Original,UKCOVID_Target,AMPRA_Original,AMPRA_Target,TBRU_Original,TBRU_Target,Final_Name,Final_Target,Final_Target_Collapsed
0,MIH43,AB_B7-H4,B7-H4,AB_B7_H4,B7_H4,AB_B7-H4,B7-H4,,,,,AB_B7-H4,B7-H4,B7-H4
1,1D9-M12,AB_C5L2,C5L2,,,,,,,,,AB_C5L2,C5L2,C5L2
2,6588-5,AB_CCR10,CCR10,,,,,,,,,AB_CCR10,CCR10,CCR10
3,HI10a,,,AB_CD10,CD10,AB_MME,MME,,,,,AB_CD10,CD10,CD10
4,BB27,,,AB_CD101_BB27,CD101_BB27,AB_CD101,CD101,,,,,AB_CD101,CD101,CD101


In [345]:
merged_adt_info = pd.read_csv('../../Data/PerDataset/Merged_ADT_Info_withSparks_20240521.tsv', sep='\t')
merged_adt_info.head()

Unnamed: 0,Clone,Hao_Original,Hao_Target,COMBAT_Original,COMBAT_Target,UKCOVID_Original,UKCOVID_Target,AMPRA_Original,AMPRA_Target,TBRU_Original,TBRU_Target,Sparks_Original,Sparks_Target,Final_Name,Final_Target
0,HTK888,,,,,,,,,,,AB_ArmenianHamsterIgGiso,ArmenianHamsterIgGiso,AB_ArmenianHamsterIgGiso,ArmenianHamsterIgGiso
1,MIH43,AB_B7-H4,B7-H4,AB_B7_H4,B7_H4,AB_B7-H4,B7-H4,,,,,,,AB_B7-H4,B7-H4
2,1D9-M12,AB_C5L2,C5L2,,,,,,,,,,,AB_C5L2,C5L2
3,6588-5,AB_CCR10,CCR10,,,,,,,,,,,AB_CCR10,CCR10
4,HI10a,,,AB_CD10,CD10,AB_MME,MME,,,,,,,AB_CD10,CD10


In [308]:
set(merged_adt_info_cleaned_orig['Clone']).difference(merged_adt_info['Clone'])

{'5.00E+08', '5.00E+10', '9.00E+02'}

In [314]:
merged_adt_info_cleaned_orig['Clone'] = merged_adt_info_cleaned_orig['Clone'].replace({
    '5.00E+08':'5E8',
    '5.00E+10':'5E10',
    '9.00E+02':'9E2'
})

In [315]:
set(merged_adt_info_cleaned_orig['Clone']).difference(merged_adt_info['Clone'])

set()

In [321]:
merged_adt_info_cleaned_orig[merged_adt_info_cleaned_orig['Final_Target']!=merged_adt_info_cleaned_orig['Final_Target_Collapsed']]

Unnamed: 0,Clone,Hao_Original,Hao_Target,COMBAT_Original,COMBAT_Target,UKCOVID_Original,UKCOVID_Target,AMPRA_Original,AMPRA_Target,TBRU_Original,TBRU_Target,Final_Name,Final_Target,Final_Target_Collapsed
18,M1/70,AB_CD11b-1,CD11b_1,AB_humanCD11b,humanCD11b,AB_ITGAM,ITGAM,,,,,AB_CD11b-1,CD11b_1,CD11b
19,ICRF44,AB_CD11b-2,CD11b_2,,,,,CD11b-prot,CD11b,,,AB_CD11b-2,CD11b_2,CD11b
27,clone 7,AB_CD133-1,CD133_1,,,,,,,,,AB_CD133-1,CD133_1,CD133
28,S16016B,AB_CD133-2,CD133_2,AB_CD133,CD133,AB_PROM1,PROM1,,,,,AB_CD133-2,CD133_2,CD133
33,MI15,AB_CD138-1,CD138_1,,,,,,,,,AB_CD138-1,CD138_1,CD138
34,DL-101,AB_CD138-2,CD138_2,AB_CD138_Syndecan_1,CD138_Syndecan_1,AB_SDC1,SDC1,,,,,AB_CD138-2,CD138_2,CD138
94,TX25,AB_CD226,CD226,,,,,CD226/DNAM-1(11A8)-prot,CD226/DNAM-1(11A8),,,AB_CD226-1,CD226_1,CD226
95,11A8,,,AB_CD226_DNAM_1,CD226_DNAM_1,AB_CD226,CD226,,,,,AB_CD226-2,CD226_2,CD226
112,O323,AB_CD27,CD27,AB_CD27,CD27,AB_CD27,CD27,,,CD27_protein,CD27,AB_CD27-1,CD27_1,CD27
113,LG.3A10,,,,,,,CD27(LG.3A10)-prot,CD27(LG.3A10),,,AB_CD27-2,CD27_2,CD27


In [341]:
merged_adt_info_cleaned = merged_adt_info.copy()

# Update using cleaned names
for c in merged_adt_info_cleaned_orig['Clone']:
    print(c)
    merged_adt_info_cleaned.loc[merged_adt_info_cleaned['Clone']==c, 
                ['Final_Name', 'Final_Target']] = merged_adt_info_cleaned_orig.loc[merged_adt_info_cleaned_orig['Clone']==c, 
                                 ['Final_Name', 'Final_Target']].values

MIH43
1D9-M12
6588-5
HI10a
BB27
CBR-IC2/2
Ber-ACT8
43A3
STA
H4A3
W7C5
S16017E
TX31
9-4D2-1E4
104D2
GIR-208
TS2/4
m24
M1/70
ICRF44
S-HCL-3
TU27
6H6
G077F6
UV4
A019D5
WM15
clone 7
S16016B
Ber-ACT35 (ACT35)
BV10A4H2
4B4-1
5F4
MI15
DL-101
63D3
M5E2
16A1
18A2
M80
NY2
BV9
P1H12
W6D3
A12 (7D4)
BNI3
24-31
SKII.4
HP-MA4
DX27
DX9
UP-R1
3G8
HP-3G10
GHI/61
67D2
7-239
L1-OV198.5
15-414
MEM-166
NOK-1
TS1/18
G025H7
12G5
J252D4
K041E5
HIB19
K036C2
500000000
5E8
L291H4
J418F1
G034E3
G043H7
L053E8
HI149
L161
51.1
TS1/8
2H7
OX-104
RCR-401
33.1 (Ab33)
NP4D6
7C9C20
HD30
15-2
4C7
1000
9E9A8
Bu32
S-HCL-1
11C3C65
KF29
TX25
11A8
EBVCS-5
HI264
HIR2
ML5
4E3.16
C1.7
BC96
11C3.1
RIK-2
MIH24
1D6
T5-39
1A1
11C1
19F2
BA5b
O323
LG.3A10
122
ME20.4
MIH26
24F.10C12
29E.2A3
2D3
9F.8A4
C398.4A
EH12.2H7
CD28.2
HTA125
TS2/16
BM16
BY88
H037G3
201A
12C2
NKTA255
H5/FcRL3
413D12
509f6
7D4-6
WM59
1D11
162.1
FUN-2
67A4
8C11
9C4
6-434
P67.6
900
9E2
P44-8
P30-15
5D3
581
24D2
E11
TREM-26
108-17
5-271
4B2.9
F38-2E2
8F9
HIT2
HB-7
A1
UC

In [346]:
merged_adt_info_cleaned.drop('Clone', axis=1).to_csv('../../Data/PerDataset/Merged_ADT_Info_Cleaned_withSparks_20240521.tsv', 
                                    sep='\t')

In [347]:
overlap = set(merged_adt_info_cleaned['Clone']).intersection(merged_adt_info_cleaned_orig['Clone'])

orig = merged_adt_info_cleaned_orig[merged_adt_info_cleaned_orig['Clone'].isin(overlap)].drop(['Final_Target_Collapsed'], axis = 1).reset_index(drop=True).copy()
orig.index = orig['Clone']

new = (merged_adt_info_cleaned[merged_adt_info_cleaned['Clone'].isin(overlap)].drop(['Sparks_Original', 'Sparks_Target'], axis = 1)).reset_index(drop=True)
new.index = new['Clone']

In [351]:
orig.loc[overlap, :].equals(new.loc[overlap, :])

True

In [356]:
sorted(merged_adt_info_cleaned['Final_Name'])

['AB_ArmenianHamsterIgGiso',
 'AB_B7-H4',
 'AB_C5L2',
 'AB_CCR10',
 'AB_CCR3',
 'AB_CCR7',
 'AB_CD10',
 'AB_CD101',
 'AB_CD102',
 'AB_CD103',
 'AB_CD105',
 'AB_CD106',
 'AB_CD107a',
 'AB_CD109',
 'AB_CD110',
 'AB_CD112',
 'AB_CD115',
 'AB_CD117',
 'AB_CD119',
 'AB_CD11a',
 'AB_CD11a',
 'AB_CD11b-1',
 'AB_CD11b-2',
 'AB_CD11c',
 'AB_CD122',
 'AB_CD123',
 'AB_CD124',
 'AB_CD126',
 'AB_CD127',
 'AB_CD13',
 'AB_CD133-1',
 'AB_CD133-2',
 'AB_CD134',
 'AB_CD135',
 'AB_CD137',
 'AB_CD137L',
 'AB_CD138-1',
 'AB_CD138-2',
 'AB_CD14-1',
 'AB_CD14-2',
 'AB_CD140a',
 'AB_CD140b',
 'AB_CD141',
 'AB_CD142',
 'AB_CD144',
 'AB_CD146',
 'AB_CD15',
 'AB_CD150',
 'AB_CD152',
 'AB_CD154',
 'AB_CD155',
 'AB_CD158',
 'AB_CD158b',
 'AB_CD158e1',
 'AB_CD158f',
 'AB_CD16',
 'AB_CD161',
 'AB_CD163',
 'AB_CD164',
 'AB_CD169',
 'AB_CD171',
 'AB_CD172a',
 'AB_CD177',
 'AB_CD178',
 'AB_CD18',
 'AB_CD184',
 'AB_CD185',
 'AB_CD186',
 'AB_CD19',
 'AB_CD192',
 'AB_CD193',
 'AB_CD194',
 'AB_CD195',
 'AB_CD196',
 'AB_CD1

## Output final raw datasets with info appended

### Hao

In [313]:
mergefn = "../../Data/PerDataset/HaoEtAl/haoetal_pbmc_multimodal.merged.T.h5ad" # AnnData containing RNA +/- ADT
adata = sc.read(mergefn)
adata

Only considering the two last: ['.T', '.h5ad'].
Only considering the two last: ['.T', '.h5ad'].


AnnData object with n_obs × n_vars = 73259 × 20957
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'celltype.l3.merged'
    var: 'features', 'Clone', 'Specificity', 'feature_types'
    uns: 'neighbors'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'

In [314]:
adata = sc.AnnData(adata.raw.X, obs=adata.obs, var=adata.raw.var)

In [315]:
ind = ~merged_adt_info_cleaned['Hao_Original'].isnull()
renidx = dict(zip(merged_adt_info_cleaned.loc[ind, 'Hao_Original'], merged_adt_info_cleaned.loc[ind, 'Final_Name']))
renspec = dict(zip(merged_adt_info_cleaned.loc[ind, 'Hao_Original'], merged_adt_info_cleaned.loc[ind, 'Final_Target']))
renclone = dict(zip(merged_adt_info_cleaned.loc[ind, 'Hao_Original'], merged_adt_info_cleaned.loc[ind, 'Clone']))
adata.var['Name_ADT_Fixed'] = adata.var['features'].replace(renidx)
adata.var['Gene_ADT_Fixed'] = adata.var['features'].replace(renspec)
adata.var['Clone_ADT_Fixed'] = adata.var['features'].replace(renclone)
adata.var.index = adata.var['Name_ADT_Fixed']

In [316]:
adata.var['Name_ADT_Fixed'].value_counts().head()

AL627309.1    1
HSBP1L1       1
FAM110A       1
SRXN1         1
CSNK2A1       1
Name: Name_ADT_Fixed, dtype: int64

In [317]:
adata.var

Unnamed: 0_level_0,features,Clone,Specificity,feature_types,Name_ADT_Fixed,Gene_ADT_Fixed,Clone_ADT_Fixed
Name_ADT_Fixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AL627309.1,AL627309.1,Not_antibody,Not_antibody,Gene Expression,AL627309.1,AL627309.1,AL627309.1
AL669831.5,AL669831.5,Not_antibody,Not_antibody,Gene Expression,AL669831.5,AL669831.5,AL669831.5
LINC00115,LINC00115,Not_antibody,Not_antibody,Gene Expression,LINC00115,LINC00115,LINC00115
FAM41C,FAM41C,Not_antibody,Not_antibody,Gene Expression,FAM41C,FAM41C,FAM41C
NOC2L,NOC2L,Not_antibody,Not_antibody,Gene Expression,NOC2L,NOC2L,NOC2L
...,...,...,...,...,...,...,...
AB_CD164,AB_CD164,67D2,CD164,Antibody Capture,AB_CD164,CD164,67D2
AB_CD138-2,AB_CD138-2,DL-101,CD138 (Syndecan-1),Antibody Capture,AB_CD138-2,CD138_2,DL-101
AB_CD144,AB_CD144,BV9,CD144 (VE-cadherin),Antibody Capture,AB_CD144,CD144,BV9
AB_CD202b,AB_CD202b,33.1 (Ab33),CD202b (Tie2/Tek),Antibody Capture,AB_CD202b,CD202b,33.1 (Ab33)


In [270]:
sc.write( "../../Data/PerDataset/HaoEtAl/haoetal_pbmc_multimodal.merged.T.raw.ADTfixed.h5ad", adata)

Only considering the two last: ['.ADTfixed', '.h5ad'].
Only considering the two last: ['.ADTfixed', '.h5ad'].


### Hao et al - All cells

In [2]:
withfix = sc.read( "../../Data/PerDataset/HaoEtAl/haoetal_pbmc_multimodal.merged.T.raw.ADTfixed.h5ad")

Only considering the two last: ['.ADTfixed', '.h5ad'].
Only considering the two last: ['.ADTfixed', '.h5ad'].


In [3]:
adata = sc.read('../../Data/PerDataset/HaoEtAl/haoetal_pbmc_multimodal.merged.h5ad')

In [24]:
adata.var.loc[adata.var['feature_types']=='Antibody Capture', :].head()

Unnamed: 0,features,Clone,Specificity,feature_types
AB_CD39,AB_CD39,A1,CD39,Antibody Capture
AB_Rat-IgG1-1,AB_Rat-IgG1-1,RTK2071,"Rat IgG1, Œ∫ Isotype Control",Antibody Capture
AB_CD107a,AB_CD107a,H4A3,CD107a (LAMP-1),Antibody Capture
AB_CD62P,AB_CD62P,AK4,CD62P (P-Selectin),Antibody Capture
AB_TCR-2,AB_TCR-2,IP26,TCR a/Œ≤,Antibody Capture


In [35]:
newvar = pd.merge(left=adata.var[['features', 'feature_types']],
                  right=withfix.var.loc[withfix.var['feature_types'] == 'Antibody Capture', :].drop('feature_types', axis=1),
                  on='features', how='left')
ind = newvar['feature_types']=='Gene Expression'
newvar.loc[ind, 'Specificity'] = 'Not_antibody'
newvar.loc[ind, 'Clone'] = 'Not_antibody'
newvar.loc[ind, 'Name_ADT_Fixed'] = newvar.loc[ind, 'features']
newvar.loc[ind, 'Gene_ADT_Fixed'] = newvar.loc[ind, 'features']
newvar.loc[ind, 'Clone_ADT_Fixed'] = newvar.loc[ind, 'features']
newvar.index = newvar['Name_ADT_Fixed']
print((adata.var['features'] == newvar['features'].values).value_counts())
adata.var = newvar

True    20957
Name: features, dtype: int64


In [38]:
newvar = pd.merge(left=adata.raw.var[['features', 'feature_types']],
                  right=withfix.var.loc[withfix.var['feature_types'] == 'Antibody Capture', :].drop('feature_types', axis=1),
                  on='features', how='left')
ind = newvar['feature_types']=='Gene Expression'
newvar.loc[ind, 'Specificity'] = 'Not_antibody'
newvar.loc[ind, 'Clone'] = 'Not_antibody'
newvar.loc[ind, 'Name_ADT_Fixed'] = newvar.loc[ind, 'features']
newvar.loc[ind, 'Gene_ADT_Fixed'] = newvar.loc[ind, 'features']
newvar.loc[ind, 'Clone_ADT_Fixed'] = newvar.loc[ind, 'features']
newvar.index = newvar['Name_ADT_Fixed']
print((adata.raw.var['features'] == newvar['features'].values).value_counts())

True    20957
Name: features, dtype: int64


In [41]:
adata.raw = sc.AnnData(X=adata.raw.X, obs=adata.obs, var=newvar)

In [42]:
sc.write('../../Data/PerDataset/HaoEtAl/haoetal_pbmc_multimodal.merged.ADTfixed.h5ad', adata)

Only considering the two last: ['.ADTfixed', '.h5ad'].
Only considering the two last: ['.ADTfixed', '.h5ad'].


### Stephenson

In [293]:
mergefn = '../../Data/PerDataset/UKCOVID/haniffa21.RNAandADT.WNN.reprocessedDAK4.h5ad' 
adata = sc.read(mergefn)

Only considering the two last: ['.reprocessedDAK4', '.h5ad'].
Only considering the two last: ['.reprocessedDAK4', '.h5ad'].


In [294]:
adata = sc.AnnData(adata.raw.X, obs=adata.obs, var=adata.raw.var)

In [295]:
adata.var['Name'] = adata.var.index

In [297]:
dataset = 'UKCOVID'
ind = ~merged_adt_info_cleaned['%s_Original' % dataset].isnull()
renidx = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Name']))
renspec = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Target']))
renclone = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Clone']))
adata.var['Name_ADT_Fixed'] = adata.var['Name'].replace(renidx)
adata.var['Gene_ADT_Fixed'] = adata.var['Name'].replace(renspec)
adata.var['Clone_ADT_Fixed'] = adata.var['Name'].replace(renclone)
adata.var.index = adata.var['Name_ADT_Fixed'] 

In [298]:
adata.var.tail(50)

Unnamed: 0_level_0,feature_types,ADT_MI,ADT_MI_Rank,IEG_Filter,IEG_GEP_Score,Name,Name_ADT_Fixed,Gene_ADT_Fixed,Clone_ADT_Fixed
Name_ADT_Fixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AB_CD1a,Antibody Capture,0.008955,170.0,False,,AB_CD1a,AB_CD1a,CD1a,HI149
AB_CD304,Antibody Capture,0.028073,110.0,False,,AB_CD304,AB_CD304,CD304,12C2
AB_CD36,Antibody Capture,0.17768,30.0,False,,AB_CD36,AB_CD36,CD36,5-271
AB_CD158,Antibody Capture,0.046558,91.0,False,,AB_CD158,AB_CD158,CD158,HP-MA4
AB_CD207,Antibody Capture,0.008945,171.0,False,,AB_langerin,AB_CD207,CD207,4C7
AB_CD49d,Antibody Capture,0.040196,98.0,False,,AB_ITGA4,AB_CD49d,CD49d,9F10
AB_CD73,Antibody Capture,0.091376,58.0,False,,AB_NT5E,AB_CD73,CD73,AD2
AB_TCR-V-7.2,Antibody Capture,0.031666,105.0,False,,AB_TCR_Va7.2,AB_TCR-V-7.2,TCR_V_7.2,3C10
AB_TCR-Vd2,Antibody Capture,0.064227,73.0,False,,AB_TCR_Vg2,AB_TCR-Vd2,TCR_Vd2,B6
AB_TCR-V-9,Antibody Capture,0.021906,116.0,False,,AB_TCR_Vg9,AB_TCR-V-9,TCR_V_9,B3


In [299]:
mergefn = '../../Data/PerDataset/UKCOVID/haniffa21.RNAandADT.raw.ADTfixed.h5ad' 
sc.write(mergefn, adata)

Only considering the two last: ['.ADTfixed', '.h5ad'].
Only considering the two last: ['.ADTfixed', '.h5ad'].


### TBRU

In [300]:
adata = sc.read('../../Data/PerDataset/TBRU/tbru_exprs_raw.h5ad')

In [301]:
adata.var['Name'] = adata.var.index

In [302]:
dataset = 'TBRU'
ind = ~merged_adt_info_cleaned['%s_Original' % dataset].isnull()
renidx = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Name']))
renspec = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Target']))
renclone = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Clone']))
adata.var['Name_ADT_Fixed'] = adata.var['Name'].replace(renidx)
adata.var['Gene_ADT_Fixed'] = adata.var['Name'].replace(renspec)
adata.var['Clone_ADT_Fixed'] = adata.var['Name'].replace(renclone)
adata.var.index = adata.var['Name_ADT_Fixed'] 

In [303]:
adata.var

Unnamed: 0_level_0,Name,Name_ADT_Fixed,Gene_ADT_Fixed,Clone_ADT_Fixed
Name_ADT_Fixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG,MIR1302-2HG
FAM138A,FAM138A,FAM138A,FAM138A,FAM138A
OR4F5,OR4F5,OR4F5,OR4F5,OR4F5
AL627309.1,AL627309.1,AL627309.1,AL627309.1,AL627309.1
AL627309.3,AL627309.3,AL627309.3,AL627309.3,AL627309.3
...,...,...,...,...
AB_IgG1isotypctrl,MouseIgG_protein,AB_IgG1isotypctrl,IgG1isotypctrl,MOPC-21
AB_CD8a,CD8a_protein,AB_CD8a,CD8a,RPA-T8
AB_abTCR,TCRab_protein,AB_abTCR,abTCR,IP26
AB_CD48,CD48/SLAMF2_protein,AB_CD48,CD48,BJ40


In [304]:
mergefn = '../../Data/PerDataset/TBRU/tbru_exprs_raw.ADTfixed.h5ad'
sc.write(mergefn, adata)

### AMPRA

In [331]:
mergefn = '../../Data/PerDataset/AMPRA/AMP_ADT.RNA_counts.h5ad'
adata = sc.read(mergefn)

In [334]:
adata.raw.X.sum(axis=1) != adata.X.sum(axis=1)

matrix([[False],
        [False],
        [False],
        ...,
        [False],
        [False],
        [False]])

In [335]:
adata.raw = None

In [336]:
adata.var
dataset = 'AMPRA'
ind = ~merged_adt_info_cleaned['%s_Original' % dataset].isnull()
renidx = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Name']))
renspec = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Target']))
renclone = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Clone']))
adata.var['Name_ADT_Fixed'] = adata.var['_index'].replace(renidx)
adata.var['Gene_ADT_Fixed'] = adata.var['_index'].replace(renspec)
adata.var['Clone_ADT_Fixed'] = adata.var['_index'].replace(renclone)
adata.var.index = adata.var['Name_ADT_Fixed'] 

In [337]:
adata.var['feature_types'] = adata.var['_index'].apply(lambda x: '-prot' in x).replace({True:'Antibody Capture', False:'Gene Expression'})

In [338]:
adata.var = adata.var.rename(columns={'_index':'Original_Name'})

In [339]:
adata.var.tail()

Unnamed: 0_level_0,Original_Name,features,Name_ADT_Fixed,Gene_ADT_Fixed,Clone_ADT_Fixed,feature_types
Name_ADT_Fixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AB_Folate,FR-beta-prot,FR-beta-prot,AB_Folate,Folate,94b/FOLR2,Antibody Capture
AB_HLA-DR,HLA-DR-prot,HLA-DR-prot,AB_HLA-DR,HLA-DR,L243,Antibody Capture
AB_IgGFc,IgG-Fc-prot,IgG-Fc-prot,AB_IgGFc,IgGFc,M1310G05,Antibody Capture
AB_IgM,IgM-prot,IgM-prot,AB_IgM,IgM,MHM-88,Antibody Capture
AB_Podoplanin,Podoplanin-prot,Podoplanin-prot,AB_Podoplanin,Podoplanin,NC-08,Antibody Capture


In [340]:
mergefn = '../../Data/PerDataset/AMPRA/AMP_ADT.RNA_counts.raw.ADTfixed.h5ad'
sc.write(mergefn, adata)

Only considering the two last: ['.ADTfixed', '.h5ad'].
Only considering the two last: ['.ADTfixed', '.h5ad'].


### COMBAT

In [348]:
mergefn = '../../Data/PerDataset/COMBAT/COMBAT-CITESeq-DATA.Raw.T.h5ad' # AnnData containing RNA +/- ADT
adata = sc.read(mergefn)

Only considering the two last: ['.T', '.h5ad'].
Only considering the two last: ['.T', '.h5ad'].


In [349]:
adata.X.sum(axis=1)

matrix([[5423.],
        [5033.],
        [4577.],
        ...,
        [7431.],
        [4796.],
        [1499.]], dtype=float32)

In [351]:
dataset = 'COMBAT'
ind = ~merged_adt_info_cleaned['%s_Original' % dataset].isnull()
renidx = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Name']))
renspec = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Target']))
renclone = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Clone']))

In [353]:
adata.var['Original_Name'] = adata.var.index

In [354]:
adata.var['Name_ADT_Fixed'] = adata.var['Original_Name'].replace(renidx)
adata.var['Gene_ADT_Fixed'] = adata.var['Original_Name'].replace(renspec)
adata.var['Clone_ADT_Fixed'] = adata.var['Original_Name'].replace(renclone)
adata.var.index = adata.var['Name_ADT_Fixed'] 

In [356]:
adata.var.tail(50)

Unnamed: 0_level_0,gene_ids,feature_types,Original_Name,Name_ADT_Fixed,Gene_ADT_Fixed,Clone_ADT_Fixed
Name_ADT_Fixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AB_CD1a,C0402,Antibody Capture,AB_CD1a,AB_CD1a,CD1a,HI149
AB_CD304,C0406,Antibody Capture,AB_CD304_Neuropilin_1,AB_CD304,CD304,12C2
AB_CD36,C0407,Antibody Capture,AB_CD36,AB_CD36,CD36,5-271
AB_CD158,C0420,Antibody Capture,AB_CD158_KIR2DL1_S1_S3_S5,AB_CD158,CD158,HP-MA4
AB_CD207,C0437,Antibody Capture,AB_CD207,AB_CD207,CD207,4C7
AB_CD49d,C0576,Antibody Capture,AB_CD49d,AB_CD49d,CD49d,9F10
AB_CD73,C0577,Antibody Capture,AB_CD73,AB_CD73,CD73,AD2
AB_TCR-V-7.2,C0581,Antibody Capture,AB_TCR_Va7_2,AB_TCR-V-7.2,TCR_V_7.2,3C10
AB_TCR-Vd2,C0582,Antibody Capture,AB_TCR_Vd2,AB_TCR-Vd2,TCR_Vd2,B6
AB_TCR-V-9,C0583,Antibody Capture,AB_TCR_Vg9,AB_TCR-V-9,TCR_V_9,B3


In [358]:
mergefn = '../../Data/PerDataset/COMBAT/COMBAT-CITESeq-DATA.Raw.T.ADTfixed.h5ad' # AnnData containing RNA +/- ADT
sc.write(mergefn, adata)

Only considering the two last: ['.ADTfixed', '.h5ad'].
Only considering the two last: ['.ADTfixed', '.h5ad'].


### COMBAT - All cells

In [32]:
! ls /data/srlab1/dk718/Tcell_cNMF_Comparison/Data/

AMPRA
COMBAT_PBMC
COVID_PBMC
CrossDataset_Clustering
Data
HaoEtAl_PBMC
IntegrateATAC
merged_citeseq_ADT_info_20220827.tsv
MergedDataset
MI_analysis
Misc
Opt_Rep_Of_Usage_And_Spec
TCAT_Simulation
TCAT_Simulation_NormTPM
TCAT_Simulationquery_usage_unfiltered.tsv
TCAT_Simulation_scsim1
TCAT_Simulationsim2_query_usage_unfiltered.tsv
Test_Usage_Pvalues
XTissueImmuneAtlas


In [40]:
adata = sc.read('/data/srlab1/dk718/Tcell_cNMF_Comparison/Data/COMBAT_PBMC/COMBAT-CITESeq-DATA.h5ad')

In [41]:
adata = sc.AnnData(adata.layers['raw'], obs=adata.obs, var=adata.var, uns=adata.uns, obsm=adata.obsm)

In [42]:
adata.var['feature_types'].value_counts()

Gene Expression     20615
Antibody Capture      192
Name: feature_types, dtype: int64

In [43]:
dataset = 'COMBAT'
ind = ~merged_adt_info_cleaned['%s_Original' % dataset].isnull()
renidx = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Name']))
renspec = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Target']))
renclone = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Clone']))

In [44]:
adata.var['Original_Name'] = adata.var.index

In [45]:
adata.var['Name_ADT_Fixed'] = adata.var['Original_Name'].replace(renidx)
adata.var['Gene_ADT_Fixed'] = adata.var['Original_Name'].replace(renspec)
adata.var['Clone_ADT_Fixed'] = adata.var['Original_Name'].replace(renclone)
adata.var.index = adata.var['Name_ADT_Fixed'] 

In [47]:
adata.var.tail(10)

Unnamed: 0_level_0,gene_ids,feature_types,Original_Name,Name_ADT_Fixed,Gene_ADT_Fixed,Clone_ADT_Fixed
Name_ADT_Fixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AB_CD101,C0944,Antibody Capture,AB_CD101_BB27,AB_CD101,CD101,BB27
AB_CD360,C0985,Antibody Capture,AB_CD360_IL_21R,AB_CD360,CD360,4B2.9
AB_CD88,C1046,Antibody Capture,AB_CD88_C5aR,AB_CD88,CD88,S5/1
AB_HLA_F,C1047,Antibody Capture,AB_HLA_F,AB_HLA_F,HLA_F,3D11/HLA-F
AB_NLRP2,C1048,Antibody Capture,AB_NLRP2,AB_NLRP2,NLRP2,8F10B51
AB_Podocalyxin,C1051,Antibody Capture,AB_Podocalyxin,AB_Podocalyxin,Podocalyxin,mAb 84
AB_CD224,C1052,Antibody Capture,AB_CD224,AB_CD224,CD224,KF29
AB_c_Met,C1055,Antibody Capture,AB_c_Met,AB_c_Met,c_Met,12.1
AB_CD258,C1056,Antibody Capture,AB_CD258_LIGHT,AB_CD258,CD258,T5-39
AB_DR3_TRAMP,C1057,Antibody Capture,AB_DR3_TRAMP,AB_DR3_TRAMP,DR3_TRAMP,JD3


In [48]:
mergefn = '../../Data/PerDataset/COMBAT/COMBAT-CITESeq-DATA.Raw.ADTfixed.h5ad' # AnnData containing RNA +/- ADT
sc.write(mergefn, adata)

Only considering the two last: ['.ADTfixed', '.h5ad'].
Only considering the two last: ['.ADTfixed', '.h5ad'].


### Sparks

In [357]:
mergefn = '../../Data/PerDataset/Sparks2023/T_fromSeurat.h5ad'
adata = sc.read(mergefn)

'../../Data/PerDataset/Sparks2023/T_fromSeurat.h5ad'

In [360]:
merged_adt_info_cleaned

Unnamed: 0,Clone,Hao_Original,Hao_Target,COMBAT_Original,COMBAT_Target,UKCOVID_Original,UKCOVID_Target,AMPRA_Original,AMPRA_Target,TBRU_Original,TBRU_Target,Sparks_Original,Sparks_Target,Final_Name,Final_Target
0,HTK888,,,,,,,,,,,AB_ArmenianHamsterIgGiso,ArmenianHamsterIgGiso,AB_ArmenianHamsterIgGiso,ArmenianHamsterIgGiso
1,MIH43,AB_B7-H4,B7-H4,AB_B7_H4,B7_H4,AB_B7-H4,B7-H4,,,,,,,AB_B7-H4,B7-H4
2,1D9-M12,AB_C5L2,C5L2,,,,,,,,,,,AB_C5L2,C5L2
3,6588-5,AB_CCR10,CCR10,,,,,,,,,,,AB_CCR10,CCR10
4,HI10a,,,AB_CD10,CD10,AB_MME,MME,,,,,,,AB_CD10,CD10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,9D9F9,AB_VEGFR-3,VEGFR-3,,,,,,,,,,,AB_VEGFR-3,VEGFR-3
283,S15046E,AB_XCR1,XCR1,AB_XCR1,XCR1,AB_XCR1,XCR1,,,,,,,AB_XCR1,XCR1
284,12.1,,,AB_c_Met,c_Met,AB_c-Met,c-Met,,,,,,,AB_c_Met,c_Met
285,M3/38,,,AB_humanMac_2_Galectin_3,humanMac_2_Galectin_3,AB_LGALS3,LGALS3,,,,,,,AB_humanMac_2_Galectin_3,humanMac_2_Galectin_3


In [361]:
dataset = 'Sparks'
ind = ~merged_adt_info_cleaned['%s_Original' % dataset].isnull()
renidx = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Name']))
renspec = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Target']))
renclone = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Clone']))

In [362]:
adata.var['Original_Name'] = adata.var.index

In [363]:
adata.var['Name_ADT_Fixed'] = adata.var['Original_Name'].replace(renidx)
adata.var['Gene_ADT_Fixed'] = adata.var['Original_Name'].replace(renspec)
adata.var['Clone_ADT_Fixed'] = adata.var['Original_Name'].replace(renclone)
adata.var.index = adata.var['Name_ADT_Fixed'] 

In [364]:
adata.var.tail(50)

Unnamed: 0_level_0,gene_ids,feature_types,Original_Name,Name_ADT_Fixed,Gene_ADT_Fixed,Clone_ADT_Fixed
Name_ADT_Fixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AB_CD64,AB_CD64,Antibody Capture,AB_CD64,AB_CD64,CD64,10.1
AB_CD1c,AB_CD1c,Antibody Capture,AB_CD1c,AB_CD1c,CD1c,L161
AB_HLA-DR,AB_HLA-DR,Antibody Capture,AB_HLA-DR,AB_HLA-DR,HLA-DR,L243
AB_CD95,AB_CD95,Antibody Capture,AB_CD95,AB_CD95,CD95,DX2
AB_KLRG1_MAFA,AB_KLRG1,Antibody Capture,AB_KLRG1,AB_KLRG1_MAFA,KLRG1_MAFA,SA231A2
AB_CD69,AB_CD69,Antibody Capture,AB_CD69,AB_CD69,CD69,FN50
AB_CD103,AB_CD103,Antibody Capture,AB_CD103,AB_CD103,CD103,Ber-ACT8
AB_CD196,AB_CD196,Antibody Capture,AB_CD196,AB_CD196,CD196,G034E3
AB_CD32,AB_CD32,Antibody Capture,AB_CD32,AB_CD32,CD32,FUN-2
AB_CD146,AB_CD146,Antibody Capture,AB_CD146,AB_CD146,CD146,P1H12


In [366]:
mergefn = '../../Data/PerDataset/Sparks2023/T_fromSeurat.ADTfixed.h5ad' # AnnData containing RNA +/- ADT
sc.write(mergefn, adata)

### Sparks - All cells

In [367]:
mergefn = '../../Data/PerDataset/Sparks2023/PBMC_fromSeurat.h5ad'
adata = sc.read(mergefn)

In [368]:
dataset = 'Sparks'
ind = ~merged_adt_info_cleaned['%s_Original' % dataset].isnull()
renidx = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Name']))
renspec = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Final_Target']))
renclone = dict(zip(merged_adt_info_cleaned.loc[ind, '%s_Original' % dataset], merged_adt_info_cleaned.loc[ind, 'Clone']))

In [369]:
adata.var['Original_Name'] = adata.var.index

In [370]:
adata.var['Name_ADT_Fixed'] = adata.var['Original_Name'].replace(renidx)
adata.var['Gene_ADT_Fixed'] = adata.var['Original_Name'].replace(renspec)
adata.var['Clone_ADT_Fixed'] = adata.var['Original_Name'].replace(renclone)
adata.var.index = adata.var['Name_ADT_Fixed'] 

In [371]:
adata.var.tail(50)

Unnamed: 0_level_0,gene_ids,feature_types,Original_Name,Name_ADT_Fixed,Gene_ADT_Fixed,Clone_ADT_Fixed
Name_ADT_Fixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AB_CD64,AB_CD64,Antibody Capture,AB_CD64,AB_CD64,CD64,10.1
AB_CD1c,AB_CD1c,Antibody Capture,AB_CD1c,AB_CD1c,CD1c,L161
AB_HLA-DR,AB_HLA-DR,Antibody Capture,AB_HLA-DR,AB_HLA-DR,HLA-DR,L243
AB_CD95,AB_CD95,Antibody Capture,AB_CD95,AB_CD95,CD95,DX2
AB_KLRG1_MAFA,AB_KLRG1,Antibody Capture,AB_KLRG1,AB_KLRG1_MAFA,KLRG1_MAFA,SA231A2
AB_CD69,AB_CD69,Antibody Capture,AB_CD69,AB_CD69,CD69,FN50
AB_CD103,AB_CD103,Antibody Capture,AB_CD103,AB_CD103,CD103,Ber-ACT8
AB_CD196,AB_CD196,Antibody Capture,AB_CD196,AB_CD196,CD196,G034E3
AB_CD32,AB_CD32,Antibody Capture,AB_CD32,AB_CD32,CD32,FUN-2
AB_CD146,AB_CD146,Antibody Capture,AB_CD146,AB_CD146,CD146,P1H12


In [372]:
mergefn = '../../Data/PerDataset/Sparks2023/PBMC_fromSeurat.ADTfixed.h5ad' # AnnData containing RNA +/- ADT
sc.write(mergefn, adata)