# Process peaks anndata

- Check integrity
- Subset to donors also in genotype data
- Subset to cell-type
- Pseudo-bulk to:
    - cell-type-donor level (mean-aggregation across cells)
    - donor-level for a specific cell-type (mean-aggregation across cells)

## Setup

In [1]:
import os
import sys
import gc
import anndata as ad
import scanpy as sc

In [2]:
# Ensure cwd is the project root and that <project root>/code is on sys.path

import os
import sys

# Relative path to the project root, keyed by the basename of the launch directory.
_PROJECT_DIR_BY_CWD = {
    'footprintQTL': '.',                  # already at the project root
    'code': '..',                         # launched from the code/ sub-directory
    'fichtner': 'projects/footprintQTL',  # project root sits two levels below
}

cwd = os.path.basename(os.getcwd())

# Any other launch directory falls back to the literal 'manual'.
# NOTE(review): presumably a placeholder meant to be edited by hand - confirm.
PROJECT_DIR = _PROJECT_DIR_BY_CWD.get(cwd, 'manual')

# Move to the project root, then expose its code/ directory for the helper imports.
os.chdir(PROJECT_DIR)
sys.path.append(f'{os.getcwd()}/code')

In [3]:
from helpers.python.utils import create_dir
from helpers.python.anndata_utils import check_anndata, subset_common_donors, port_obs_adata

### Variables

In [4]:
from glob_vars import ATAC_PEAKS_H5AD_NEW, ATAC_PEAKS_PROCESSED_H5AD, GENOTYPES_TSV, GENOTYPE_PCS_TSV, ATAC_CHROM_ACCESS_DIR, CT_MAP_ID

## Load anndata

In [5]:
# Read the peak matrix from the path held in ATAC_PEAKS_H5AD_NEW.
# The bare name on the last line displays the AnnData summary as the cell output.
adata = ad.read_h5ad(ATAC_PEAKS_H5AD_NEW)
adata

AnnData object with n_obs × n_vars = 265053 × 736845
    obs: 'BlacklistRatio', 'nDiFrags', 'nFrags', 'nMonoFrags', 'nMultiFrags', 'NucleosomeRatio', 'PassQC', 'PromoterRatio', 'ReadsInBlacklist', 'ReadsInPromoter', 'ReadsInTSS', 'Sample', 'TSSEnrichment', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'donor_id', 'clone', 'organoid', 'donor', 'leiden', 'stressed_vs_unstressed_celltypes', 'ReadsInPeaks', 'FRIP', 'barcode', 'batch', 'sample', 'barcode_batch', 'cell_type', 'cell_type_custom'
    var: 'chr', 'start', 'end', 'strand', 'peak_name', 'score', 'GC', 'nearest_gene', 'peak_type'

In [6]:
# Check anndata integrity
# (check_anndata is a project helper from helpers.python.anndata_utils; the exact checks
#  it performs are not visible in this notebook)
check_anndata(adata)

## Process anndata

In [7]:
# Format

# Make .obs: cell_type_custom --> cell_type
# Guarded so that re-running the cell is a no-op once 'cell_type_custom' has been promoted.
# Without the guard a second run would drop the promoted 'cell_type' column and then fail
# with a KeyError on the missing 'cell_type_custom', losing the annotation.
if 'cell_type_custom' in adata.obs.columns:

    # Drop the original annotation first so the promoted column is appended as the last column
    adata.obs.drop(columns=['cell_type'], inplace=True, errors='ignore')
    adata.obs['cell_type'] = adata.obs['cell_type_custom']
    adata.obs.drop(columns=['cell_type_custom'], inplace=True)

In [8]:
# Number of distinct 'donor_id' values before subsetting to donors with genotype data
adata.obs['donor_id'].nunique()

72

In [9]:
# Remove donors not found in genotype data
# (subset_common_donors is a project helper; it receives the genotype and genotype-PC table
#  paths - presumably it keeps only donors present in those tables, confirm in anndata_utils)
adata_processed = subset_common_donors(adata, GENOTYPES_TSV, GENOTYPE_PCS_TSV)

# Release the unfiltered object to free memory. After this, cells that reference `adata`
# cannot be re-run without reloading it first.
del adata
# As the last expression, gc.collect()'s return value (number of unreachable objects found)
# is shown as the cell output.
gc.collect()

973

In [10]:
# Number of distinct 'donor_id' values left after the genotype-based donor subset
adata_processed.obs['donor_id'].nunique()

70

In [11]:
# (n_obs, n_vars) after the donor subset
adata_processed.shape

(260972, 736845)

In [12]:
# Drop every cell annotated as 'Discard' and make sure the label no longer
# appears among the categories of .obs['cell_type'].
if 'Discard' in adata_processed.obs['cell_type'].cat.categories:

    # Boolean mask over cells: True where the cell carries the 'Discard' label
    is_discard = adata_processed.obs['cell_type'] == 'Discard'
    adata_processed = adata_processed[~is_discard, :].copy()

    # The category may still be registered after subsetting; remove it only if it is
    remaining_cell_types = adata_processed.obs['cell_type']
    if 'Discard' in remaining_cell_types.cat.categories:

        adata_processed.obs['cell_type'] = remaining_cell_types.cat.remove_categories('Discard')

# Show the cell-type categories that remain
print(adata_processed.obs['cell_type'].cat.categories)

Index(['DL-EN', 'Glia', 'Midbrain-EN', 'Neural-progenitors', 'UL-EN'], dtype='object')


In [13]:
# (n_obs, n_vars) after removing 'Discard' cells
adata_processed.shape

(195931, 736845)

### Export

In [14]:
# create_dir is a project helper and is given the output *file* path here
# NOTE(review): presumably it creates the missing parent directory - confirm in helpers.python.utils
create_dir(ATAC_PEAKS_PROCESSED_H5AD)
# Write the processed (donor-subset, 'Discard'-free) object as a gzip-compressed h5ad file
adata_processed.write(ATAC_PEAKS_PROCESSED_H5AD, compression='gzip')

## Subset and/or pseudobulk anndata

In [15]:
# Cell-type-donor -level pseudobulk
# REMINDER: sc.get.aggregate() removes groups without any members

# Mean across cells within each ('cell_type', 'donor') group
adata_ctdonors = sc.get.aggregate(adata_processed,
                                  by=['cell_type', 'donor'],
                                  func=['mean'],
                                  axis='obs')

# The aggregate is returned in the 'mean' layer; move it into .X and drop the layer
adata_ctdonors.X = adata_ctdonors.layers['mean'].copy()
del adata_ctdonors.layers['mean']

# Write next to the processed h5ad. os.path.join keeps the path relative when the
# processed path has no directory component (an f-string with '/' would point at
# the filesystem root in that case).
adata_ctdonors_out_path = os.path.join(os.path.dirname(ATAC_PEAKS_PROCESSED_H5AD),
                                       'peak_matrix_cell-type-donors-pseudobulk.h5ad')
create_dir(adata_ctdonors_out_path)
adata_ctdonors.write(adata_ctdonors_out_path, compression='gzip')

print(f'n_ct_donor_groups after pseudo-bulking: {adata_ctdonors.n_obs}')

# Free the pseudobulk object; it is not needed by the per-cell-type processing
del adata_ctdonors
gc.collect()

n_ct_donor_groups after pseudo-bulking" 316


1079

In [16]:
# For every cell type present in adata_processed, write two files:
#   1. the cell-level subset for that cell type
#   2. the donor-level pseudobulk (mean across that cell type's cells, per donor)
print(f'n_cells before: {adata_processed.shape[0]}')

for ct in adata_processed.obs['cell_type'].unique():

    print(f'Processing cell-type: {ct}')

    # Output directory shared by both files of this cell type
    ct_out_dir = f'{ATAC_CHROM_ACCESS_DIR}/adata/{CT_MAP_ID}/{ct}'


    # Subset to cell-type
    # NOTE(review): this copy is only released when the name is rebound in the next
    # iteration, so two per-cell-type copies coexist briefly. Consider `del adata_ct_cells`
    # at the end of the loop body if nothing downstream needs it.
    adata_ct_cells = adata_processed[adata_processed.obs['cell_type'] == ct, :].copy()

    adata_ct_cells_out_path = f'{ct_out_dir}/peak_matrix_cells_{ct}.h5ad'
    create_dir(adata_ct_cells_out_path)
    adata_ct_cells.write(adata_ct_cells_out_path, compression='gzip')

    print(f'n_cells after cell-type subset: {adata_ct_cells.shape[0]}')


    # Cell-type specific donor-level pseudobulk
    adata_ct_donors = sc.get.aggregate(adata_ct_cells,
                                       by=['donor'],
                                       func=['mean'],
                                       axis='obs')

    # The aggregate is returned in the 'mean' layer; move it into .X and drop the layer
    adata_ct_donors.X = adata_ct_donors.layers['mean'].copy()
    del adata_ct_donors.layers['mean']


    # Port .obs from adata_processed ('donor_id' important for downstream QTL testing)
    # (port_obs_adata is a project helper; the meaning of these options is defined in
    #  helpers.python.anndata_utils and is not visible here)
    kwargs = {'grouping_col': 'donor', 'filter_col': 'cell_type', 'filter_key': ct, 'obs_map_col': 'index'}
    adata_ct_donors = port_obs_adata(adata_ct_donors, adata_processed, suffix='_all_cells', neg_filter_ref=['leiden'], kwargs=kwargs)

    adata_ct_donors_out_path = f'{ct_out_dir}/peak_matrix_donors-pseudobulk_{ct}.h5ad'
    create_dir(adata_ct_donors_out_path)
    adata_ct_donors.write(adata_ct_donors_out_path, compression='gzip')

    print(f'n_donors after pseudo-bulking: {adata_ct_donors.n_obs}')

    # Free the pseudobulk before the next cell type is processed
    del adata_ct_donors
    gc.collect()
    


n_cells before: 195931
Processing cell-type: DL-EN


n_cells after cell-type subset: 30210


n_donors after pseudo-bulking" 66
Processing cell-type: Glia


n_cells after cell-type subset: 52409


n_donors after pseudo-bulking" 70
Processing cell-type: UL-EN


n_cells after cell-type subset: 41200


n_donors after pseudo-bulking" 52
Processing cell-type: Neural-progenitors


n_cells after cell-type subset: 52780


n_donors after pseudo-bulking" 59
Processing cell-type: Midbrain-EN


n_cells after cell-type subset: 19332


n_donors after pseudo-bulking" 69
