## Run this notebook to preprocess scATAC-seq data and reorder the peaks
The chromosomes are first sorted in order from 1 to 24, and then the peaks within each chromosome are sorted.

In [1]:
import sys
# Extend the import path with the sibling Scarp directory
# (presumably where the data_preprocessing module lives — verify).
sys.path.append('../Scarp/')

from data_preprocessing import sort_peaks, construct_3col_to_sparse_mat
import re
from scipy.sparse import csc_matrix
import scanpy as sc
import numpy as np
import pandas as pd
import warnings
# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

### for the data downloaded from https://github.com/jsxlei/SCALE

In [2]:
# Names of the datasets to process; each name is turned into a
# '<name>.h5ad' path when the datasets are read and written.
Data_name = [
    'InSilico',
    'GM12878vsHEK',
    'Breast_Tumor',
    'Leukemia',
    'GM12878vsHL',
    'Forebrain',
    'Splenocyte',
]

In [3]:
# For every dataset in Data_name: load the raw .h5ad file, report its size,
# reorder the peaks with sort_peaks, and write the result to './Processed data/'.
for data_name in Data_name:
    data_file = './Raw data/' + data_name + '.h5ad'
    save_file = './Processed data/' + data_name + '.h5ad'

    print('============================%s================================' % data_name)
    data = sc.read_h5ad(data_file)

    Cells = data.obs.index
    labels = data.obs['celltype'].astype('category')

    # Breast_Tumor: prefix each cell's label with the second '-'-separated
    # field of its cell name, giving labels of the form '<field>(<celltype>)'.
    if data_name == 'Breast_Tumor':
        name_fields = [cell.split('-')[1] for cell in Cells]
        # Pair names and labels by position with zip. Writing labels[i] with
        # an integer i is avoided: the Series is indexed by cell-name strings,
        # so an integer key relies on pandas' deprecated positional fallback.
        labels = pd.Series(
            [field + '(' + label + ')' for field, label in zip(name_fields, labels)],
            index=labels.index, name='label',
        ).astype('category')
        data.obs['celltype'] = labels

    # AnnData stores observations (cells) as rows and variables (peaks) as columns.
    Cells_num, Peaks_num = data.X.shape
    cluster_num = np.unique(labels).shape[0]
    print('Number of Peaks:', Peaks_num)
    print('Number of Cells:', Cells_num)
    print('Number of Clusters: ', cluster_num)

    data = sort_peaks(data)
    data.write(save_file)

Number of Peaks: 13668
Number of Cells: 828
Number of Clusters:  6
Number of Peaks: 12938
Number of Cells: 526
Number of Clusters:  3
Number of Peaks: 27884
Number of Cells: 384
Number of Clusters:  4
Number of Peaks: 7602
Number of Cells: 391
Number of Clusters:  6
Number of Peaks: 10431
Number of Cells: 597
Number of Clusters:  3
Number of Peaks: 11285
Number of Cells: 2088
Number of Clusters:  8
Number of Peaks: 77453
Number of Cells: 3166
Number of Clusters:  12


### for blood2K dataset

In [4]:
# Load the four tab-separated blood2K input files.
# Count table: read with its header row.
Count_df = pd.read_table('./Raw data/blood2K/Filter_df.txt')
# Per-cell metadata: no header row; first column is the index, the remaining column is named 'label'.
metadata = pd.read_table('./Raw data/blood2K/metadata.txt',
                         header=None, index_col=0, names=['label'])
# Cell and peak tables: no header row, columns numbered from 0.
Cells = pd.read_table('./Raw data/blood2K/Filter_Cells.txt', header=None)
Peaks = pd.read_table('./Raw data/blood2K/Filter_Peaks.txt', header=None)

In [5]:
# Take the largest value in the 'Peaks' and 'Cells' columns as the peak/cell counts
# (presumably these columns hold 1-based indices, so max == count — verify).
Peaks_num, Cells_num = (Count_df[column].max() for column in ('Peaks', 'Cells'))
print('Number of Peaks:', Peaks_num)
print('Number of Cells:', Cells_num)

Number of Peaks: 134962
Number of Cells: 2034


In [6]:
# Turn the count table into a sparse matrix with normalization and
# binarization switched off.
# NOTE(review): 'CP' presumably requests a cells x peaks layout — confirm
# against construct_3col_to_sparse_mat.
sparse_mat = construct_3col_to_sparse_mat(
    Count_df,
    normalization=False,
    binarization=False,
    return_shape='CP',
)

In [7]:
# Wrap the sparse count matrix in an AnnData object and attach peak names and cell labels.
adata = sc.AnnData(sparse_mat)
# NOTE(review): this runs before the peak names are attached on the next line, so it
# only affects whatever var names AnnData assigned at construction — confirm whether
# it was meant to follow the assignment below.
adata.var_names_make_unique()
# Use the first column of the Peaks table as the var (peak) index.
adata.var = pd.DataFrame(index=list(Peaks[0]))
# NOTE(review): metadata is attached as-is; presumably its row order matches the cell
# order of sparse_mat — verify (the `Cells` table loaded earlier is not consulted here).
adata.obs = metadata
# Rename the single metadata column from 'label' to 'celltype'.
adata.obs.columns = ['celltype']
adata

AnnData object with n_obs × n_vars = 2034 × 134962
    obs: 'celltype'

In [8]:
# Reorder the peaks, then save the processed blood2K object to disk.
adata = sort_peaks(adata)
adata.write(filename='./Processed data/blood2K.h5ad')

### for SOX10 knockdown dataset

In [9]:
# Tab-separated count table for the SOX10 knockdown data; the first column becomes the row index.
file = './Raw data/SOX10/GSE114557_MM057_MM087_Sox10KD_GoodCells_mergedPeaks_correctformat.counts.txt'
count_data = pd.read_csv(file, sep='\t', index_col=0)

In [10]:
# Derive a label for every cell from its column name: split the name on
# underscores and runs of non-word characters, then join fields 2 and 4 with '_'.
cell_info = count_data.columns.to_list()
# The pattern is a raw string: in a plain literal '\W' is an invalid escape
# sequence, which Python reports as a DeprecationWarning/SyntaxWarning.
cell_info = np.array([re.split(r'_|\W+', name) for name in cell_info])
cell_type = [fields[2] + '_' + fields[4] for fields in cell_info]

In [11]:
# Build the AnnData from the transposed count table, so the table's columns
# become observations and its rows become variables.
adata = sc.AnnData(csc_matrix(count_data.T))
adata.obs.index = count_data.columns
adata.var.index = count_data.index
# Attach the parsed labels as a categorical column, keyed by the table's column names.
adata.obs['celltype'] = pd.Series(
    cell_type,
    index=count_data.columns,
    dtype='category',
)
adata

AnnData object with n_obs × n_vars = 598 × 78661
    obs: 'celltype'

In [12]:
# Reorder the peaks, then save the processed Sox10KD object to disk.
adata = sort_peaks(adata)
adata.write(filename='./Processed data/Sox10KD.h5ad')