In [1]:
import pandas as pd
import scanpy as sc
from scipy.io import mmread
import scipy.sparse as sp
import numpy as np

def read_sce(filepath_prefix):
    """Assemble an AnnData object from the three files sharing ``filepath_prefix``.

    Files read (suffix appended to ``filepath_prefix``):
      * ``_expression.mtx``     -- Matrix Market expression matrix
      * ``_cell_metadata.csv``  -- becomes ``obs`` (first column used as index)
      * ``_gene_metadata.csv``  -- becomes ``var`` (first column used as index)

    The matrix is transposed after loading (the export is assumed to be
    genes x cells, as is usual for R objects) and stored in CSR format.

    Returns the constructed AnnData.
    """
    # Load, transpose and convert the expression matrix in one step.
    counts = sp.csr_matrix(mmread(filepath_prefix + "_expression.mtx").T)

    # Per-cell and per-gene annotation tables.
    obs_table = pd.read_csv(filepath_prefix + "_cell_metadata.csv", index_col=0)
    var_table = pd.read_csv(filepath_prefix + "_gene_metadata.csv", index_col=0)

    return sc.AnnData(X=counts, obs=obs_table, var=var_table)

In [2]:
bases = ! ls ../../../Data/PerDataset/Pancancer/IdvDatasets/*_cell_metadata.csv
bases = [x.replace('_cell_metadata.csv', '') for x in bases]
adatas = []
for b in bases:
    adata = read_sce(b)
    adata.var.index = adata.var['display.name']
    #if ('BCL.thisStudy' in b) or ('BRCA.thisStudy' in b) or ('CHOL.thisStudy' in b):
    #    adata.var.index = adata.var['display.name']
    
    adatas.append(adata)    




In [3]:
# Report the number of genes (rows of .var) for each loaded dataset.
for prefix, loaded in zip(bases, adatas):
    print(prefix.split('/')[-1], loaded.var.shape[0])


AML.PeterVanGalen2019.CD4 27899
AML.PeterVanGalen2019.CD8 27899
BCC.KathrynEYost2019.CD4 23309
BCC.KathrynEYost2019.CD8 23309
BCL.thisStudy.CD4 28855
BCL.thisStudy.CD8 28855
BRCA.ElhamAzizi2018 10X.CD4 12908
BRCA.ElhamAzizi2018 10X.CD8 12908
BRCA.ElhamAzizi2018 Indrop.CD4 14854
BRCA.ElhamAzizi2018 Indrop.CD8 14854
BRCA.PeterSavas2018.CD4 15623
BRCA.PeterSavas2018.CD8 15623
BRCA.thisStudy.CD4 24148
BRCA.thisStudy.CD8 24148
CHOL.thisStudy.CD4 12582
CHOL.thisStudy.CD8 12582
CRC.LeiZhang2018.CD4 12582
CRC.LeiZhang2018.CD8 12582
CRC.LeiZhang2020 10X.CD4 16452
CRC.LeiZhang2020 10X.CD8 16452
ESCA.thisStudy.CD4 24148
ESCA.thisStudy.CD8 24148
FTC.thisStudy.CD4 24148
FTC.thisStudy.CD8 24148
HCC.ChunhongZheng2017.CD4 12582
HCC.ChunhongZheng2017.CD8 12582
HCC.QimingZhang2019 10X.CD4 58233
HCC.QimingZhang2019 10X.CD8 58233
HCC.QimingZhang2019 SS2.CD4 60682
HCC.QimingZhang2019 SS2.CD8 60682
LC.QianqianSong2019.CD4 33694
LC.QianqianSong2019.CD8 33694
LC.RapolasZilionis2019.CD4 41861
LC.RapolasZilioni

In [4]:
# Substring renames applied, in this order, to every dataset name.
lung_renames = (
    ('LC.RapolasZilionis2019', 'NSCLC.RapolasZilionis2019'),
    ('LC.QianqianSong2019', 'NSCLC.QianqianSong2019'),
    ('LC.XinyiGuo2018', 'NSCLC.XinyiGuo2018'),
)

# File name without directory, spaces turned into dots, then the renames above.
dataset_names = []
for prefix in bases:
    label = prefix.split('/')[-1].replace(' ', '.')
    for old, new in lung_renames:
        label = label.replace(old, new)
    dataset_names.append(label)

dataset_names

['AML.PeterVanGalen2019.CD4',
 'AML.PeterVanGalen2019.CD8',
 'BCC.KathrynEYost2019.CD4',
 'BCC.KathrynEYost2019.CD8',
 'BCL.thisStudy.CD4',
 'BCL.thisStudy.CD8',
 'BRCA.ElhamAzizi2018.10X.CD4',
 'BRCA.ElhamAzizi2018.10X.CD8',
 'BRCA.ElhamAzizi2018.Indrop.CD4',
 'BRCA.ElhamAzizi2018.Indrop.CD8',
 'BRCA.PeterSavas2018.CD4',
 'BRCA.PeterSavas2018.CD8',
 'BRCA.thisStudy.CD4',
 'BRCA.thisStudy.CD8',
 'CHOL.thisStudy.CD4',
 'CHOL.thisStudy.CD8',
 'CRC.LeiZhang2018.CD4',
 'CRC.LeiZhang2018.CD8',
 'CRC.LeiZhang2020.10X.CD4',
 'CRC.LeiZhang2020.10X.CD8',
 'ESCA.thisStudy.CD4',
 'ESCA.thisStudy.CD8',
 'FTC.thisStudy.CD4',
 'FTC.thisStudy.CD8',
 'HCC.ChunhongZheng2017.CD4',
 'HCC.ChunhongZheng2017.CD8',
 'HCC.QimingZhang2019.10X.CD4',
 'HCC.QimingZhang2019.10X.CD8',
 'HCC.QimingZhang2019.SS2.CD4',
 'HCC.QimingZhang2019.SS2.CD8',
 'NSCLC.QianqianSong2019.CD4',
 'NSCLC.QianqianSong2019.CD8',
 'NSCLC.RapolasZilionis2019.CD4',
 'NSCLC.RapolasZilionis2019.CD8',
 'NSCLC.XinyiGuo2018.CD4',
 'NSCLC.Xinyi

In [5]:
# Drop the last four characters of each name (the '.CD4' / '.CD8' suffix seen
# in the listing above), so both subsets of a dataset share one name.
dataset_names_merged = [label[:-4] for label in dataset_names]

In [6]:
# Per-cell metadata table; the first CSV column is used as the row index.
complete_meta = pd.read_csv(
    '../../../Data/PerDataset/Pancancer/Complete_Published_Metadata.csv',
    sep=',',
    index_col=0,
)

# Align the spelling of this one dataset label with the name derived from the
# file listing ('Indrop' rather than 'InDrop').
complete_meta['dataset'] = complete_meta['dataset'].replace(
    to_replace={'BRCA.ElhamAzizi2018.InDrop': 'BRCA.ElhamAzizi2018.Indrop'}
)
complete_meta.head()

  complete_meta = pd.read_csv('../../../Data/PerDataset/Pancancer/Complete_Published_Metadata.csv', sep=',', index_col=0)


Unnamed: 0,patient,cellID,libraryID,cancerType,loc,batchV,TCR,dataset,ClusterID,dataset.tech,...,dataset.old,sampleID,treatment,stype,patient.uid,usedForFreq,dataSource,tech,tech.cate,pub
1,BC9,s1_AAACCTGAGCAGACTG-1,BC9T,BRCA,T,BC9,,BRCA.ElhamAzizi2018.10X,BC.Elham2018.10X.C01,Elham2018.10X,...,BC.Elham2018.10X,BC9T,baseline,CD8,BRCA.ElhamAzizi2018.10X.BC9,Y,other labs,10X,Droplet,published
2,BC9,s1_AAACCTGAGGTCGGAT-1,BC9T,BRCA,T,BC9,,BRCA.ElhamAzizi2018.10X,BC.Elham2018.10X.C06,Elham2018.10X,...,BC.Elham2018.10X,BC9T,baseline,CD8,BRCA.ElhamAzizi2018.10X.BC9,Y,other labs,10X,Droplet,published
3,BC9,s1_AAACCTGAGTGTACTC-1,BC9T,BRCA,T,BC9,,BRCA.ElhamAzizi2018.10X,BC.Elham2018.10X.C01,Elham2018.10X,...,BC.Elham2018.10X,BC9T,baseline,CD8,BRCA.ElhamAzizi2018.10X.BC9,Y,other labs,10X,Droplet,published
4,BC9,s1_AAACCTGCAGATGGGT-1,BC9T,BRCA,T,BC9,,BRCA.ElhamAzizi2018.10X,BC.Elham2018.10X.C04,Elham2018.10X,...,BC.Elham2018.10X,BC9T,baseline,CD8,BRCA.ElhamAzizi2018.10X.BC9,Y,other labs,10X,Droplet,published
5,BC9,s1_AAACCTGGTAGCACGA-1,BC9T,BRCA,T,BC9,,BRCA.ElhamAzizi2018.10X,BC.Elham2018.10X.C00,Elham2018.10X,...,BC.Elham2018.10X,BC9T,baseline,CD8,BRCA.ElhamAzizi2018.10X.BC9,Y,other labs,10X,Droplet,published


In [7]:
# Unique (dataset, tech) pairs, indexed by dataset name so the technology can
# be looked up with tech_info.at[name, 'tech']; the 'dataset' column is kept.
tech_info = (
    complete_meta[['dataset', 'tech']]
    .drop_duplicates()
    .set_index('dataset', drop=False)
)

In [8]:
# Print every loaded dataset name that has no rows in the metadata table
# (no output means all names were found).
unmatched = [
    label for label in dataset_names_merged
    if label not in complete_meta['dataset'].values
]
for label in unmatched:
    print(label)

In [9]:
# One summary row per loaded AnnData: position, dataset name, gene count
# (columns of X), cell count (rows of X) and sequencing technology.
stats = pd.DataFrame(
    [
        [pos, label, adatas[pos].shape[1], adatas[pos].shape[0], tech_info.at[label, 'tech']]
        for pos, label in enumerate(dataset_names_merged)
    ],
    columns=['#', 'name', 'ngenes', 'ncells', 'tech'],
)
stats

Unnamed: 0,#,name,ngenes,ncells,tech
0,0,AML.PeterVanGalen2019,27899,879,SeqWell
1,1,AML.PeterVanGalen2019,27899,910,SeqWell
2,2,BCC.KathrynEYost2019,23309,10689,10X
3,3,BCC.KathrynEYost2019,23309,10322,10X
4,4,BCL.thisStudy,28855,4237,10X
...,...,...,...,...,...
57,57,SCC.KathrynEYost2019,18347,10925,10X
58,58,THCA.thisStudy,24148,23508,10X
59,59,THCA.thisStudy,24148,33450,10X
60,60,UCEC.thisStudy,24148,12729,10X


In [10]:
# Row masks over `stats`; `ind` (10X datasets with more than 20000 genes) is
# reused later to choose which datasets are merged.
is_10x = stats['tech']=='10X'
ind = is_10x & (stats['ngenes']>20000)

print('All cells: %d' % stats['ncells'].sum())
print('All 10X cells: %d' % stats.loc[is_10x, 'ncells'].sum())
print('All 10X cells w/ > 20K genes: %d' % stats.loc[ind, 'ncells'].sum())


All cells: 373380
All 10X cells: 325928
All 10X cells w/ > 20K genes: 228349


In [16]:
# Count metadata rows whose dataset label is (True) or is not (False) among
# the loaded dataset names.
in_loaded_datasets = complete_meta['dataset'].isin(dataset_names_merged)
in_loaded_datasets.value_counts()

True     373380
False     24430
Name: dataset, dtype: int64

In [60]:
# Metadata columns copied from complete_meta into each AnnData's .obs.
colstotransfer = [
    'sampleID',
    'treatment',
    'stype',
    'patient.uid',
    'tech',
]

In [61]:
# Copy the columns listed in `colstotransfer` from the metadata table into the
# .obs of every loaded AnnData, matching rows on cell ID.
for i, d in enumerate(dataset_names_merged):
    dmeta = complete_meta.loc[complete_meta['dataset']==d, :]

    # Cell IDs must be unique within a dataset, otherwise the label-based
    # lookup below would return several rows per cell. Raise a descriptive
    # error instead of calling sys.exit (sys is not imported in this notebook,
    # and exiting the kernel hides which dataset is at fault).
    if dmeta['cellID'].dropna().duplicated().any():
        raise ValueError('Duplicate cellID values in metadata for dataset %s' % d)

    # set_index returns a new frame, so the slice of complete_meta is not
    # mutated; drop=False keeps 'cellID' available as a column as before.
    dmeta = dmeta.set_index('cellID', drop=False)

    # .loc with the obs index raises KeyError if a cell has no metadata row.
    for c in colstotransfer:
        adatas[i].obs.loc[:, c] = dmeta.loc[adatas[i].obs.index, c]

In [62]:
# Positions (the '#' column of `stats`) of the datasets selected by `ind`.
kept_positions = set(stats.loc[ind, '#'].tolist())

# Keep the selected AnnData objects and their names, in the original order.
adatas_filt = [adatas[pos] for pos in range(len(adatas)) if pos in kept_positions]
names = [dataset_names_merged[pos] for pos in range(len(adatas)) if pos in kept_positions]

In [63]:
# Progressively intersect the gene names (var index) of all selected datasets,
# printing the running overlap size next to each dataset's own gene and cell
# counts.
overlapping_genes = set(adatas_filt[0].var.index)
print(len(overlapping_genes))
for i in range(1, len(adatas_filt)):
    overlapping_genes = overlapping_genes.intersection(adatas_filt[i].var.index)
    # The previous `len(...) < 0` test could never be true, so no dataset was
    # ever skipped. An empty overlap would leave nothing to merge, so stop
    # with a clear message instead of carrying on silently.
    if not overlapping_genes:
        raise ValueError('No genes left in common after intersecting with %s' % names[i])
    print(i, names[i], len(overlapping_genes), adatas_filt[i].shape[1], adatas_filt[i].shape[0])

23309
1 BCC.KathrynEYost2019 23309 23309 10322
2 BCL.thisStudy 23107 28855 4237
3 BCL.thisStudy 23107 28855 3482
4 BRCA.thisStudy 21623 24148 3063
5 BRCA.thisStudy 21623 24148 4291
6 ESCA.thisStudy 21623 24148 12358
7 ESCA.thisStudy 21623 24148 12526
8 FTC.thisStudy 21623 24148 270
9 FTC.thisStudy 21623 24148 767
10 HCC.QimingZhang2019.10X 21142 58233 4743
11 HCC.QimingZhang2019.10X 21142 58233 11303
12 NSCLC.QianqianSong2019 21142 33694 356
13 NSCLC.QianqianSong2019 21142 33694 849
14 MM.thisStudy 21142 28855 3645
15 MM.thisStudy 21142 28855 8629
16 OV.thisStudy 21142 24148 1006
17 OV.thisStudy 21142 24148 3517
18 PACA.JunyaPeng2019 18728 24005 918
19 PACA.JunyaPeng2019 18728 24005 728
20 PACA.thisStudy 18728 24148 3903
21 PACA.thisStudy 18728 24148 5957
22 RC.MatthewDYoung2018 18728 33694 1389
23 RC.MatthewDYoung2018 18728 33694 3139
24 RC.thisStudy 18728 24148 10105
25 RC.thisStudy 18728 24148 16544
26 THCA.thisStudy 18728 24148 23508
27 THCA.thisStudy 18728 24148 33450
28 UCEC.this

In [64]:
# Convert the set to a list in a deterministic order. Plain list(set) yields
# an arbitrary order that can change between kernel sessions, which would make
# the gene order of the merged object irreproducible. key=str keeps the sort
# well-defined even if a non-string label is present.
overlapping_genes = sorted(overlapping_genes, key=str)

In [65]:
# Restrict every selected dataset to the shared genes, in the shared order.
for pos, current in enumerate(adatas_filt):
    adatas_filt[pos] = current[:, overlapping_genes]

In [66]:
# Stack all gene-matched datasets into a single AnnData.
# NOTE(review): the recorded output of this cell points to the anndata concat
# tutorial, which suggests `concatenate` is deprecated. The following cell
# selects var columns named 'geneID-14' and 'seu.id-0', so it presumably relies
# on the per-dataset column suffixes this call produces -- confirm before
# migrating to a different concatenation API.
merged_adata = adatas_filt[0].concatenate(adatas_filt[1:])


See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


In [67]:
# Keep three gene-annotation columns and give them suffix-free names.
# NOTE(review): the '-14' / '-0' suffixes are hard-coded; presumably they
# identify which input dataset each column came from, so they would change if
# the set or order of merged datasets changes -- verify.
merged_adata.var = merged_adata.var[['display.name',  'geneID-14', 'seu.id-0']].rename(
    columns={'geneID-14': 'geneID', 'seu.id-0': 'seu.id'}
)

In [68]:
# Save the merged object as h5ad. Writing through AnnData.write_h5ad avoids
# scanpy's filename-extension parsing, which warned "Only considering the two
# last" because this file name contains more than two dots.
# compression='gzip' matches the default that sc.write applied.
merged_adata.write_h5ad(
    '../../../Data/PerDataset/Pancancer/pancancer_zheng.10X.Greater20KgeneDatasets.h5ad',
    compression='gzip',
)

Only considering the two last: ['.Greater20KgeneDatasets', '.h5ad'].
Only considering the two last: ['.Greater20KgeneDatasets', '.h5ad'].


In [69]:
# Keep only cells whose 'loc' annotation is not 'P'.
# NOTE(review): given the variable name, 'P' presumably marks peripheral-blood
# samples -- confirm against the metadata documentation.
not_loc_p = merged_adata.obs['loc'] != 'P'
merged_adata_noPBMC = merged_adata[not_loc_p, :]

In [70]:
# Save the filtered object as h5ad. Writing through AnnData.write_h5ad avoids
# scanpy's filename-extension parsing, which warned "Only considering the two
# last" because this file name contains more than two dots.
# compression='gzip' matches the default that sc.write applied.
merged_adata_noPBMC.write_h5ad(
    '../../../Data/PerDataset/Pancancer/pancancer_zheng.10X.Greater20KgeneDatasets.NoPBMC.h5ad',
    compression='gzip',
)

Only considering the two last: ['.NoPBMC', '.h5ad'].
Only considering the two last: ['.NoPBMC', '.h5ad'].


In [71]:
# Number of cells per value of the 'dataset' obs column in the filtered object.
# NOTE(review): the recorded output shows labels such as 'THCA.zhangLab5P' that
# differ from the names in dataset_names_merged; presumably this column comes
# from the per-dataset cell metadata files rather than from complete_meta --
# verify.
merged_adata_noPBMC.obs['dataset'].value_counts()

THCA.zhangLab5P          56958
UCEC.zhangLab5P          32655
RC.zhangLab5P            26649
ESCA.zhangLab5P          24884
BCC.KathrynEYost2019     21011
HCC.YaoHe10X             10966
PACA.zhangLab5P           9860
MM.zhangLab5P             8279
BC.zhangLab5P             7354
RC.MatthewDYoung2018      4528
OV.zhangLab5P             4523
BCL.zhangLab5P            3493
PACA.JunyaPeng2019        1646
LUNG.QianqianSong2019     1205
FTC.zhangLab5P            1037
Name: dataset, dtype: int64