# Process peaks anndata

- Check integrity
- Subset to donors also in genotype data
- Subset to cell-type
- ~~Remove Discard cells~~
- Pseudo-bulk to:
    - cell-type-donor level (mean-aggregation across cells)
    - donor-level for a specific cell-type (mean-aggregation across cells)

## Setup

In [None]:
import os
import sys
import gc
import anndata as ad
import scanpy as sc

In [None]:
# Make the project root the working directory and put <root>/code on sys.path

import os
import sys

cwd = os.path.basename(os.getcwd())

# Relative path to the project root, looked up by the name of the launch directory.
# Any launch directory not listed here falls back to the literal 'manual'.
PROJECT_DIR = {
    'footprintQTL': '.',
    'code': '..',
    'fichtner': 'projects/footprintQTL',
}.get(cwd, 'manual')

os.chdir(PROJECT_DIR)
sys.path.append(os.getcwd() + '/code')

In [None]:
from helpers.python.utils import create_dir
from helpers.python.anndata_utils import check_anndata, subset_common_donors, port_obs_adata

### Variables

In [None]:
from glob_vars import ATAC_PEAKS_H5AD_NEW, ATAC_PEAKS_PROCESSED_H5AD, GENOTYPES_TSV, GENOTYPE_PCS_TSV, ATAC_CHROM_ACCESS_DIR, CT_MAP_ID

In [None]:
# User
# All cts: cell-level | Minimum number of cells in which a peak must be detected for the peak
# to be retained (required for functions to work). Passed as `min_cells` to sc.pp.filter_genes.
min_cells = 10

## Load anndata

In [None]:
# Read the peaks AnnData from the path imported from glob_vars
adata = ad.read_h5ad(ATAC_PEAKS_H5AD_NEW)
adata  # last expression of the cell: displays the object summary

In [None]:
# Check anndata integrity
# NOTE(review): check_anndata is a project helper (helpers.python.anndata_utils); which checks it
# runs and whether it raises or only reports is not visible here — confirm.
check_anndata(adata)

## Process generic anndata

In [None]:
# Format

# .obs: replace the existing 'cell_type' column with the values of 'cell_type_custom',
# leaving a single 'cell_type' column (appended last) and no 'cell_type_custom'
del adata.obs['cell_type']
adata.obs['cell_type'] = adata.obs.pop('cell_type_custom')

# Number of unique donors before any subsetting (shown as the cell output)
adata.obs['donor_id'].nunique()

In [None]:
# Remove donors not found in genotype data
# NOTE(review): subset_common_donors is a project helper; presumably it keeps only donors present
# in both the genotype and genotype-PC tables — confirm against helpers.python.anndata_utils.
adata_processed = subset_common_donors(adata, GENOTYPES_TSV, GENOTYPE_PCS_TSV)

# Drop the reference to the unfiltered object and ask the garbage collector to run
del adata
gc.collect()

# Number of unique donors after subsetting (shown as the cell output)
adata_processed.obs['donor_id'].nunique()

In [None]:
# Export complete adata
# create_dir is given the output file path itself.
# NOTE(review): presumably it creates the parent directory of that path — confirm.
create_dir(ATAC_PEAKS_PROCESSED_H5AD)
adata_processed.write(ATAC_PEAKS_PROCESSED_H5AD, compression='gzip')  # gzip-compressed h5ad

In [None]:
# ct-donor pseudobulk: mean across cells for every (cell_type, donor) group
# REMINDER: sc.get.aggregate() removes groups without any members

adata_ctdonors = sc.get.aggregate(adata_processed,
                                by=['cell_type', 'donor'],
                                func=['mean'],
                                axis='obs')

# The aggregated values are stored in .layers['mean']; move them into .X
adata_ctdonors.X = adata_ctdonors.layers['mean'].copy()
del adata_ctdonors.layers['mean']


# Export
# NOTE(review): 'averag-agg' in this file name differs from the 'average-agg' spelling used for
# the donor-level output; left unchanged because other code may read this exact path.
adata_ctdonors_out_path = f'{os.path.dirname(ATAC_PEAKS_PROCESSED_H5AD)}/peak_matrix_cell-type-donor-lvl_averag-agg.h5ad'
create_dir(adata_ctdonors_out_path)
adata_ctdonors.write(adata_ctdonors_out_path, compression='gzip')

# Fixed: the message previously contained a stray '"' where ':' was intended
print(f'n_ct_donor_groups after pseudo-bulking: {adata_ctdonors.n_obs}')

# Release the pseudobulk object before the per-cell-type processing
del adata_ctdonors
gc.collect()

## Process cell-type subsets of anndata

In [None]:
def _pseudobulk_mean(adata, by):
    """Mean-aggregate `adata` across cells for each group in `by`; the means end up in .X."""
    # REMINDER: sc.get.aggregate() removes groups without any members
    adata_agg = sc.get.aggregate(adata, by=by, func=['mean'], axis='obs')

    # The aggregated values are stored in .layers['mean']; move them into .X
    adata_agg.X = adata_agg.layers['mean'].copy()
    del adata_agg.layers['mean']

    return adata_agg


def _write_adata(adata, path):
    """Call create_dir on `path`, then write `adata` there as a gzip-compressed h5ad."""
    create_dir(path)
    adata.write(path, compression='gzip')


print(f'Shape before: {adata_processed.shape[0]}\n')


## Remove cell-types marked 'Discard'
## Done once, before the loop: the result does not depend on the cell-type being processed,
## so rebuilding it per iteration only repeated a full copy of the data.

if 'Discard' in adata_processed.obs['cell_type'].cat.categories:

    adata_no_discard = adata_processed[~(adata_processed.obs['cell_type'] == 'Discard'), :].copy()

    # Drop the category itself in case it is still registered after subsetting
    if 'Discard' in adata_no_discard.obs['cell_type'].cat.categories:

        adata_no_discard.obs['cell_type'] = adata_no_discard.obs['cell_type'].cat.remove_categories('Discard')

else:

    # Nothing to remove; the object is only subset (never modified) below, so no copy is needed
    adata_no_discard = adata_processed


for ct in adata_processed.obs['cell_type'].unique():

    if ct == 'Discard':

        continue


    print(f'Processing cell-type: {ct}')

    # All outputs of this cell-type go into the same directory
    ct_dir = f'{ATAC_CHROM_ACCESS_DIR}/adata/subset/{CT_MAP_ID}/{ct}'


    ## Subset to cell-type
    adata_ct_cells = adata_no_discard[adata_no_discard.obs['cell_type'] == ct, :].copy()


    ## Remove peaks with positive chromatin accessibility in less than X cells
    ##   Required for highly variable peak calculation

    sc.pp.filter_genes(adata_ct_cells, min_cells=min_cells)


    ## Remove filtered peaks from cell-level anndata
    adata_processed_all_cts = adata_no_discard[:, adata_ct_cells.var_names].copy()


    ## Save Subsets

    # All cell-types: cell-level
    _write_adata(adata_processed_all_cts, f'{ct_dir}/peak_matrix_all-cell-types_cell-lvl.h5ad')

    # All cell-types: ct-donor-level pseudobulk
    # NOTE(review): 'averag-agg' differs from the 'average-agg' spelling of the donor-level file
    # below; left unchanged because other code may read this exact path.
    adata_ctdonors = _pseudobulk_mean(adata_processed_all_cts, by=['cell_type', 'donor'])
    _write_adata(adata_ctdonors, f'{ct_dir}/peak_matrix_all-cell-types_cell-type-donor-lvl_averag-agg.h5ad')

    # Fixed: the message previously contained a stray '"' where ':' was intended
    print(f'n_ct_donor_groups after pseudo-bulking: {adata_ctdonors.n_obs}')

    del adata_ctdonors
    gc.collect()


    # Cell-type subset: cell-level
    _write_adata(adata_ct_cells, f'{ct_dir}/peak_matrix_ct-filtered_cell-lvl.h5ad')

    print(f'n_cells after cell-type subset: {adata_ct_cells.shape[0]}')


    # Cell-type subset: donor-level pseudobulk
    adata_ct_donors = _pseudobulk_mean(adata_ct_cells, by=['donor'])

    # Port .obs from adata_processed ('donor_id' important for downstream QTL testing)
    kwargs = {'grouping_col': 'donor', 'filter_col': 'cell_type', 'filter_key': ct, 'obs_map_col': 'index'}
    adata_ct_donors = port_obs_adata(adata_ct_donors, adata_processed, suffix='_all_cells', neg_filter_ref=['leiden'], kwargs=kwargs)

    _write_adata(adata_ct_donors, f'{ct_dir}/peak_matrix_ct-filtered_donor-lvl_average-agg.h5ad')

    # Fixed: same stray '"' as above
    print(f'n_donors after pseudo-bulking: {adata_ct_donors.n_obs}')

    del adata_ct_donors
    gc.collect()


# Release the Discard-free object (only unbinds the name if it aliases adata_processed)
del adata_no_discard
gc.collect()
    
