# 10X Multiome dataset: Dataset preprocessing
|  | #cells | #genes | #peaks| #cell types|
| --- | --- | --- | --- | --- |
| Raw data | 11909 | 36601 |108377|NAN|
| Preprocessed data | 10412 | 21618 |105949|19|

In [1]:
from help_func import *

In [2]:
data = sc.read_10x_h5('./Raw data/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5', gex_only=False)
data

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 11909 × 144978
    var: 'gene_ids', 'feature_types', 'genome'

have both Gene Expression and Peak features

In [3]:
data.var['feature_types'].unique()

array(['Gene Expression', 'Peaks'], dtype=object)

filter scATAC data

In [4]:
keep_peaks = data.var[data.var['feature_types']=='Peaks']
keep_peaks = keep_peaks[['chr' in keep_peaks['gene_ids'].iloc[i] for i in range(keep_peaks.shape[0])]]
ATAC_data = filter_peaks(data, keep_peaks.index)
ATAC_data

AnnData object with n_obs × n_vars = 11909 × 108344
    var: 'gene_ids', 'feature_types', 'genome'

filter scRNA data

In [5]:
keep_genes = data.var[data.var['feature_types']=='Gene Expression']
RNA_data = filter_peaks(data, keep_genes.index)
RNA_data

AnnData object with n_obs × n_vars = 11909 × 36621
    var: 'gene_ids', 'feature_types', 'genome'

#### adding annotation information of peaks，including mapping gene

In [6]:
atac_peak_annotation = pd.read_table('./Raw data/pbmc_granulocyte_sorted_10k_atac_peak_annotation.tsv', sep='\t')
atac_peak_annotation = atac_peak_annotation[['chr' in atac_peak_annotation['peak'].iloc[i] for i in range(atac_peak_annotation.shape[0])]]
peaks_rename = []
for i in range(atac_peak_annotation.shape[0]):
    str_temp = re.split('_', atac_peak_annotation['peak'].iloc[i])
    peaks_rename.append(str_temp[0] + ':' + str_temp[1] + '-'+str_temp[2])
atac_peak_annotation['peak'] = peaks_rename
atac_peak_annotation.index = list(atac_peak_annotation['peak'])
atac_peak_annotation.head()

Unnamed: 0,peak,gene,distance,peak_type
chr1:10109-10357,chr1:10109-10357,MIR1302-2HG,-19197,distal
chr1:180730-181630,chr1:180730-181630,AL627309.5,-6869,distal
chr1:191491-191736,chr1:191491-191736,AL627309.5,-17630,distal
chr1:267816-268196,chr1:267816-268196,AP006222.2,962,distal
chr1:586028-586373,chr1:586028-586373,AC114498.1,-1256,distal


In [7]:
atac_peak_annotation = atac_peak_annotation.join(
    pd.DataFrame([re.split(r'[:,-]', atac_peak_annotation['peak'].iloc[i]) 
                  for i in range(atac_peak_annotation.shape[0])],
                 columns=['chr', 'from', 'to'],
                 index=atac_peak_annotation.index))
atac_peak_annotation.head()

Unnamed: 0,peak,gene,distance,peak_type,chr,from,to
chr1:10109-10357,chr1:10109-10357,MIR1302-2HG,-19197,distal,chr1,10109,10357
chr1:180730-181630,chr1:180730-181630,AL627309.5,-6869,distal,chr1,180730,181630
chr1:191491-191736,chr1:191491-191736,AL627309.5,-17630,distal,chr1,191491,191736
chr1:267816-268196,chr1:267816-268196,AP006222.2,962,distal,chr1,267816,268196
chr1:586028-586373,chr1:586028-586373,AC114498.1,-1256,distal,chr1,586028,586373


In [8]:
ATAC_data.var = atac_peak_annotation.loc[ATAC_data.var['gene_ids']]
ATAC_data

AnnData object with n_obs × n_vars = 11909 × 108344
    var: 'peak', 'gene', 'distance', 'peak_type', 'chr', 'from', 'to'

#### adding annotation information of cells，including cell type

In [9]:
cell_clust = pd.read_csv('./Raw data/wnn_meta_data.csv', index_col=0)
cell_clust.head(2)

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,nCount_ATAC,nFeature_ATAC,nCount_SCT,nFeature_SCT,SCT.weight,ATAC.weight,wsnn_res.0.8,seurat_clusters,sub.cluster,celltype
AAACAGCCAAGGAATC-1,SeuratProject,8380,3308,7.470167,55550,13867,4780,2754,0.438258,0.561742,1,1,1,CD4 Naive
AAACAGCCAATCCCTT-1,SeuratProject,3771,1896,10.527711,20485,7247,3781,1895,0.506867,0.493133,4,4,4,CD4 TCM


adding to scATAC data

In [10]:
ATAC_data = filter_cells(ATAC_data, cell_clust.index)
ATAC_data.obs['celltype'] = cell_clust.loc[ATAC_data.obs.index]['celltype']
ATAC_data

AnnData object with n_obs × n_vars = 10412 × 108344
    obs: 'celltype'
    var: 'peak', 'gene', 'distance', 'peak_type', 'chr', 'from', 'to'

adding to scRNA data

In [11]:
RNA_data = filter_cells(RNA_data, cell_clust.index)
RNA_data.obs['celltype'] = cell_clust.loc[RNA_data.obs.index]['celltype']
RNA_data

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 10412 × 36621
    obs: 'celltype'
    var: 'gene_ids', 'feature_types', 'genome'

#### save data

In [12]:
sc.pp.filter_genes(RNA_data, min_cells=10)
sc.pp.filter_cells(RNA_data, min_genes=1)
RNA_data

AnnData object with n_obs × n_vars = 10412 × 21618
    obs: 'celltype', 'n_genes'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells'

In [13]:
sc.pp.filter_genes(ATAC_data, min_cells=10)
sc.pp.filter_cells(ATAC_data, min_genes=1)
ATAC_data

AnnData object with n_obs × n_vars = 10412 × 105949
    obs: 'celltype', 'n_genes'
    var: 'peak', 'gene', 'distance', 'peak_type', 'chr', 'from', 'to', 'n_cells'

In [14]:
RNA_data.write('./Processed data/10X_pbmc_RNA.h5ad')
ATAC_data.write('./Processed data/10X_pbmc_ATAC.h5ad')

#### filtering cell types included T cells, B cells and monocytes to comparing with PCHIC data

In [15]:
keep_celltype = ['CD4 Naive', 'CD4 TCM', 'CD8 Naive', 'CD16 Mono', 'CD14 Mono', 'CD8 TEM_1', 'Naive B', 'CD4 TEM', 'Memory B', 'CD8 TEM_2']
keep_cell = ATAC_data.obs[ATAC_data.obs['celltype'].isin(keep_celltype)].index
ATAC_data_for_pchic = filter_cells(ATAC_data, keep_cell)

In [16]:
sc.pp.filter_cells(ATAC_data_for_pchic, min_genes=10)
sc.pp.filter_genes(ATAC_data_for_pchic, min_cells=1)
ATAC_data_for_pchic

AnnData object with n_obs × n_vars = 8794 × 105948
    obs: 'celltype', 'n_genes'
    var: 'peak', 'gene', 'distance', 'peak_type', 'chr', 'from', 'to', 'n_cells'

In [17]:
ATAC_data_for_pchic.write('./Processed data/10X_pbmc_ATAC_celltype_filtered.h5ad')