In [1]:
# Parameters
# Run-level settings for this notebook. None of these three names is read by
# the cells visible here — presumably they are consumed by whatever launches
# the notebook (TODO confirm).
group_name: str = "Isocortex"
cpu: int = 1
mem_gb: int = 1


## Import

In [2]:
from ALLCools.clustering import *
from ALLCools.mcds import MCDS
from wmb import brain, cemba_atac, mm10
import numpy as np
import pandas as pd
import anndata

import matplotlib.pyplot as plt
from ALLCools.plot import *

In [3]:
# Feature-filtering thresholds used further down in this notebook.
# `n_cell` and `zscore_abs_cutoff` are forwarded unchanged to filter_regions().
zscore_abs_cutoff: int = 3
n_cell: int = 5
# Quantile of the per-feature total counts; features at or below it are
# discarded when the matrix has more than 50000 features.
remove_lower_features: float = 0.2

## Select cells

In [4]:
# Cell IDs live in the first column of a header-less text file; that column is
# used as the index and labelled 'cell'.
cells = pd.read_csv('atac_cells.txt', header=None, index_col=0).index.rename('cell')
# Show how many cells were selected.
len(cells)

500000

## Get adata with basic feature selection

In [5]:
# Open the ATAC dataset for the selected cells only, using 'chrom5k' as the
# feature (var) dimension. NOTE(review): the path constant's name suggests a
# zarr store and 'chrom5k' suggests 5 kb genomic bins — confirm against wmb.
atac_ds = MCDS.open(
    cemba_atac.CEMBA_ATAC_ZARR_PATH,
    var_dim='chrom5k',
    use_obs=cells,
)

In [6]:
# Display the dataset repr (array shape, chunking and dtype) as a sanity check.
atac_ds

Unnamed: 0,Array,Chunk
Bytes,254.35 GiB,520.90 MiB
Shape,"(500000, 546206)","(1000, 546206)"
Count,501 Tasks,500 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 254.35 GiB 520.90 MiB Shape (500000, 546206) (1000, 546206) Count 501 Tasks 500 Chunks Type uint8 numpy.ndarray",546206  500000,

Unnamed: 0,Array,Chunk
Bytes,254.35 GiB,520.90 MiB
Shape,"(500000, 546206)","(1000, 546206)"
Count,501 Tasks,500 Chunks
Type,uint8,numpy.ndarray


In [7]:
# Convert the 'chrom5k_da' array into an AnnData object with a sparse matrix.
# Cells are read in batches of `loading_chunk`; the method prints one
# "Loading chunk ..." line per batch.
adata = atac_ds.get_count_adata(
    da_name='chrom5k_da',
    loading_chunk=30000,
    sparse=True,
    binarize_cutoff=0,
)

Loading chunk 0-30000/500000


Loading chunk 30000-60000/500000


Loading chunk 60000-90000/500000


Loading chunk 90000-120000/500000


Loading chunk 120000-150000/500000


Loading chunk 150000-180000/500000


Loading chunk 180000-210000/500000


Loading chunk 210000-240000/500000


Loading chunk 240000-270000/500000


Loading chunk 270000-300000/500000


Loading chunk 300000-330000/500000


Loading chunk 330000-360000/500000


Loading chunk 360000-390000/500000


Loading chunk 390000-420000/500000


Loading chunk 420000-450000/500000


Loading chunk 450000-480000/500000


Loading chunk 480000-500000/500000


## Basic Feature Filtering

In [8]:
# Restrict the features to chr1 .. chr19; regions on any other chromosome are
# removed from `adata` by remove_chromosomes().
chroms = [f'chr{number}' for number in range(1, 20)]

remove_chromosomes(adata, include_chromosomes=chroms)

492558 regions remained.


In [9]:
# Region-level filtering driven by the two thresholds defined near the top of
# the notebook. NOTE(review): exact semantics of `n_cell` and
# `zscore_abs_cutoff` are defined by ALLCools.clustering.filter_regions — confirm.
filter_regions(
    adata,
    zscore_abs_cutoff=zscore_abs_cutoff,
    n_cell=n_cell,
)

470522 regions remained.


In [10]:
# Remove features that overlap the blacklist regions referenced by
# mm10.ENCODE_BLACKLIST_PATH.
remove_black_list_region(
    adata,
    black_list_path=mm10.ENCODE_BLACKLIST_PATH,
)

14529 features removed due to overlapping (bedtools intersect -f 0.2) with black list regions.


In [11]:
# When more than 50000 features remain, drop the least-covered ones: a feature
# is kept only if its total count over all cells is strictly greater than the
# `remove_lower_features` quantile of those totals.
if adata.shape[1] > 50000:
    # np.asarray(...).ravel() flattens the per-feature sums whether `.sum`
    # returns a (1, n_vars) np.matrix or a plain ndarray; `.A1` exists only on
    # np.matrix and raises AttributeError otherwise.
    feature_sum = np.asarray(adata.X.sum(axis=0)).ravel()
    use_features = feature_sum > np.quantile(feature_sum, remove_lower_features)
    # .copy() materialises the subset as a standalone AnnData. Without it
    # `adata` stays a view that keeps a reference to the full parent object,
    # and that view is what later cells write out and display.
    adata = adata[:, use_features].copy()

In [12]:
# Save the filtered AnnData object to disk in h5ad format.
adata.write_h5ad('atac_input.h5ad')

In [13]:
# Write the retained feature names to a text file, one per line, with neither
# an index column nor a header row. `index` is documented as a bool, so pass
# False explicitly rather than relying on None being falsy.
pd.Series(adata.var_names).to_csv('atac_features.txt', index=False, header=False)

In [14]:
# Display the final AnnData summary (dimensions plus obs/var column names).
adata

View of AnnData object with n_obs × n_vars = 500000 × 364670
    obs: 'read_count'
    var: 'chrom', 'end', 'start'