# Select peaks

Filter the MACS2 called peaks down to those which have a hypothesized higher probability of being involved in genetic effects.

Also create Matrix eQTL chromatin accessibility phenotype input

**Input**:

## Setup

In [None]:
import os
import sys
import gc
import json
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc

In [None]:
# Make the project root the working directory and put <root>/code on sys.path

# Relative path to the project root, keyed by the name of the directory
# the notebook was started from.
project_dir_by_cwd = {
    'footprintQTL': '.',
    'code': '..',
    'fichtner': 'projects/footprintQTL',
}

cwd = os.path.basename(os.getcwd())

# Any other start directory falls back to the literal 'manual'.
# NOTE(review): presumably a placeholder to be edited by hand; os.chdir
# raises unless a directory with that name exists.
PROJECT_DIR = project_dir_by_cwd.get(cwd, 'manual')

os.chdir(PROJECT_DIR)
sys.path.append(f'{os.getcwd()}/code')

In [None]:
from helpers.python.utils import create_dir, list_dirs, list_files_and_links, \
                                 ct_format, ct_format_alt
from helpers.python.anndata_utils import check_anndata

## Variables

In [None]:
# User variables

from glob_vars import ATAC_PEAKS_PROCESSED_H5AD, \
                      ATAC_CHROM_ACCESS_DIR, \
                      CT_MAP_ID, CT_MAP_JSON, \
                      SELECT_PEAKS_TSV_DIR, \
                      MATRIX_EQTL_INPUT_DIR, \
                      PRECOMPUTED_EQTLS_TSV, \
                      MAIN_ENV

# Cell type processed by this notebook (raw label; formatted further below)
cell_type = 'DL-EN'

In [None]:
# --- A1. Highly variable peaks (1 ct: cell-level) ---
# Top n ranked peaks to keep as highly variable peaks
n_top_hvps = 10000
# Lower bound (exclusive) on the peak mean across all cells and donors within ct
min_mean_acells = 0.0125
# Upper bound (exclusive) on the peak mean across all cells and donors within ct
max_mean_acells = 3

# --- A2. Marker peaks (all cts: donor-level, cell agg.) ---
# Cutoff passed as `pval_cutoff` when extracting differentially accessible peaks
min_pval = 0.01
# Top n ranked marker peaks to keep
n_top_markers = 10000

# --- B. Quality filters (1 ct: donor-level, cell agg.) ---
# Peak mean across donors must be above this threshold
min_mean_adonors = 0.002
# Fraction of donors with a non-zero value must be above this (peak sparsity filter)
min_donors = 0.26
# Minimum peak score, i.e. -log10(q-val) of peak calling with MACS2
min_score = 9

In [None]:
# Put the bin directory of the project's micromamba environment first on PATH
# before importing pybedtools.
# NOTE(review): presumably so pybedtools picks up that environment's bedtools binary - confirm.
env_bin_dir = f'/omics/groups/OE0540/internal_temp/users/fichtner/micromamba/envs/{MAIN_ENV}/bin'
os.environ['PATH'] = f"{env_bin_dir}:{os.environ['PATH']}"
from pybedtools import BedTool

In [None]:
# Derive both label formats from the raw label; `cell_type` itself is
# overwritten with its ct_format() version, so the raw value is captured first.
raw_cell_type = cell_type
cell_type_alt = ct_format_alt(raw_cell_type)
cell_type = ct_format(raw_cell_type)

In [None]:
# Load the mapping {grouped cell-type: [annotated cell-types]}
with open(CT_MAP_JSON, 'r') as handle:
    ct_map = json.load(handle)

# ct_format_alt() versions, built from the raw mapping:
#   ct_map_alt:   grouped ct   -> list of annotated cts
#   ct_map_i_alt: annotated ct -> grouped ct (inverse mapping)
ct_map_alt = {}
ct_map_i_alt = {}
for grouped_ct, annotated_cts in ct_map.items():
    grouped_alt = ct_format_alt(grouped_ct)
    members_alt = [ct_format_alt(annotated_ct) for annotated_ct in annotated_cts]
    ct_map_alt[grouped_alt] = members_alt
    for member_alt in members_alt:
        ct_map_i_alt[member_alt] = grouped_alt

# ct_format() versions; ct_map itself is replaced by its formatted form,
# and the inverse mapping is derived from that formatted form.
ct_map = {
    ct_format(grouped_ct): [ct_format(annotated_ct) for annotated_ct in annotated_cts]
    for grouped_ct, annotated_cts in ct_map.items()
}
ct_map_i = {}
for grouped_ct, annotated_cts in ct_map.items():
    for annotated_ct in annotated_cts:
        ct_map_i[annotated_ct] = grouped_ct

## Load data

In [None]:
# All four input matrices are read from the same per-cell-type subset directory
subset_dir = f'{ATAC_CHROM_ACCESS_DIR}/adata/subset/{CT_MAP_ID}/{cell_type}'

# All cell types, cell level
adata_all_cts_cells = ad.read_h5ad(f'{subset_dir}/peak_matrix_all-cell-types_cell-lvl.h5ad')
# All cell types, aggregated per cell type and donor
# NOTE(review): this file name ends in 'averag-agg' while the donor-level one below
# uses 'average-agg'; kept exactly as found - confirm against the files on disk.
adata_all_cts_ctd = ad.read_h5ad(f'{subset_dir}/peak_matrix_all-cell-types_cell-type-donor-lvl_averag-agg.h5ad')
# Selected cell type only, cell level
adata_ct_cells = ad.read_h5ad(f'{subset_dir}/peak_matrix_ct-filtered_cell-lvl.h5ad')
# Selected cell type only, aggregated per donor
adata_ct_donors = ad.read_h5ad(f'{subset_dir}/peak_matrix_ct-filtered_donor-lvl_average-agg.h5ad')

# Donor-level matrix as a DataFrame (used by the group B filters)
df_ct_donors = adata_ct_donors.to_df()
df_ct_donors

## Filter peaks

**Approach**

Peak group A = 
1. Highly variable peaks (HVPs) within cell-type OR
2. Marker peaks (DAPs) for the cell-type OR
3. Overlapping with pre-computed eQTLs OR
4. Near eGenes

Peak group B =
1. Min. nr. of donors within cell-type
2. Min. mean within cell-type
3. Min. score

Peak selection = A intersect B

## A1. HVPs (cell-type specific)

In [None]:
# Drop cells that belong to batches with fewer than `min_cells_per_batch` cells
min_cells_per_batch = 10

# Size of the batch each cell belongs to (one value per cell)
cell_counts_per_donor_batch = adata_ct_cells.obs.groupby('batch').transform('size')
in_large_enough_batch = cell_counts_per_donor_batch >= min_cells_per_batch
adata_ct_cells_batch_filtered = adata_ct_cells[in_large_enough_batch, :].copy()

In [None]:
# Score peak variability per batch with the 'seurat' flavor
# (this flavor expects log-transformed data).
# n_top_genes equals the number of peaks here; the top-n cut is applied
# in the next cell instead.
sc.pp.highly_variable_genes(
    adata_ct_cells_batch_filtered,
    flavor='seurat',
    batch_key='batch',
    n_top_genes=adata_ct_cells_batch_filtered.n_vars,
)
sc.pl.highly_variable_genes(adata_ct_cells_batch_filtered)

In [None]:
# Select the top n highly variable peaks

# Per-peak statistics written to .var by sc.pp.highly_variable_genes,
# joined on the peak index (absolute value of the normalized dispersion).
peak_var = adata_ct_cells_batch_filtered.var
hvp_stats = (
    peak_var['means'].to_frame()
    .join(peak_var['highly_variable_nbatches'], how='inner')
    .join(peak_var['dispersions_norm'].abs(), how='inner')
)

# Keep peaks whose mean lies strictly between the two bounds
mean_in_range = hvp_stats['means'].gt(min_mean_acells) & hvp_stats['means'].lt(max_mean_acells)
hvp_stats = hvp_stats.loc[mean_in_range]

# Rank: first by number of batches in which the peak is highly variable,
# then by absolute normalized dispersion (both descending)
hvp_stats = hvp_stats.sort_values(
    by=['highly_variable_nbatches', 'dispersions_norm'],
    ascending=[False, False],
)
peaks_hvp = set(hvp_stats.iloc[:n_top_hvps].index)

## A2. DAPs (cell-type specific, computed on all cell-types)

In [None]:
# Differential accessibility of each cell type on the cell-type x donor
# aggregated matrix (t-test, ranked by absolute score)
sc.tl.rank_genes_groups(
    adata_all_cts_ctd,
    groupby='cell_type',
    method='t-test',
    rankby_abs=True,
)
# Top 25 ranked peaks per group, each panel with its own y-axis
sc.pl.rank_genes_groups(
    adata_all_cts_ctd,
    n_genes=25,
    sharey=False,
)

In [None]:
# Ranked marker peaks of the selected cell type that pass the p-value cutoff
daps = sc.get.rank_genes_groups_df(
    adata_all_cts_ctd,
    group=cell_type,
    pval_cutoff=min_pval,
)
# Keep the first n_top_markers peak names
top_marker_names = daps['names'].iloc[:n_top_markers]
peaks_markers = set(top_marker_names)
daps

In [None]:
# The cell-type x donor matrix is not used below this point: drop it and collect garbage
del adata_all_cts_ctd
gc.collect()

## A3. Peaks overlapping with pre-computed eQTLs

### Make peaks bed

In [None]:
# BED-style table (chr, start, end, name) of every peak in the all-cell-types matrix.
# NOTE(review): 'start' is taken as stored in .var, without any offset -
# confirm it already follows the 0-based BED convention.
bed_columns = ['chr', 'start', 'end', 'peak_name']
peaks_df = adata_all_cts_cells.var[bed_columns].copy()

peaks_bed = BedTool(peaks_df.values.tolist())

print(len(peaks_bed))
peaks_bed

### Make eQTLs bed

In [None]:
# Read the pre-computed eQTL table (tab-separated, header in the first row).
# The column at position 21 becomes the index.
# NOTE(review): presumably the 'QTL' identifier column renamed to 'id' further below - confirm.
eqtls = pd.read_csv(PRECOMPUTED_EQTLS_TSV, sep='\t', header=0, index_col=21)
eqtls.columns

In [None]:
# Build a BED-style table (chr, start, end, id) with one 1-bp interval per eQTL SNP

# eQTLs from cell types listed under 'Discard' are left out
not_discarded = ~eqtls['celltype'].isin(ct_map_alt['Discard'])

eqtls_df = (
    eqtls.loc[not_discarded, ['snp_chromosome', 'snp_position', 'beta', 'celltype']]
    .reset_index()  # the index becomes a regular column
    .assign(
        # 0-based start; the SNP position itself is the (exclusive) end
        start=lambda df: df['snp_position'] - 1,
        chr=lambda df: 'chr' + df['snp_chromosome'].astype(str),
    )
    .rename(columns={'snp_position': 'end', 'QTL': 'id'})
    .loc[:, ['chr', 'start', 'end', 'id']]
    .sort_values(by=['chr', 'start'], ascending=[True, True])
)

eqtls_bed = BedTool(eqtls_df.values.tolist())

len(eqtls_bed)

### Peaks intersecting w eQTLs

In [None]:
# u=True: each peak is reported once if it overlaps at least one eQTL
peaks_eqtl_bed = peaks_bed.intersect(eqtls_bed, u=True)

# Field 3 is the 4th BED column, i.e. the peak name
peaks_eqtl = [interval[3] for interval in peaks_eqtl_bed]
print(len(peaks_eqtl))

peaks_eqtl = set(peaks_eqtl)

## A4. Peaks near eGenes

In [None]:
# Distinct gene names in the eQTL table (rows without a gene name are ignored)
eGenes = set(eqtls['gene_name'].dropna().unique())

In [None]:
# Number of distinct eGenes
len(eGenes)

In [None]:
# Peaks whose annotated nearest gene is one of the eGenes
nearest_is_egene = adata_ct_cells.var['nearest_gene'].isin(eGenes)
peaks_eGenes = set(adata_ct_cells.var.loc[nearest_is_egene].index)

len(peaks_eGenes)

In [None]:
# The cell-level matrix of this cell type is not used below this point: drop it and collect garbage
del adata_ct_cells
gc.collect()

## B1. Peaks with min. number of donors with non-zero CA count (intra-cell-type)

In [None]:
# Per peak: fraction of donors with a non-zero value
non0s = df_ct_donors.ne(0).mean()

non0s.describe()

In [None]:
# Histogram of the per-peak fraction of donors with a non-zero value
# (50 equal-width bins on [0, 1])
non0s.plot(kind='hist',
           bins=np.linspace(0, 1, 51),
           title="Non-zero donor count distr.",
           ylabel="#")

In [None]:
# Peaks found (non-zero) in more than `min_donors` of the donors
peaks_min_donors = set(non0s[non0s > min_donors].index)

In [None]:
# Density of CA: fraction of non-zero entries in the donor-level matrix

ca_matrix = adata_ct_donors.X

# Sparse matrices expose their own count_nonzero(); np.count_nonzero is used
# for dense arrays, which have no such method.
if hasattr(ca_matrix, 'count_nonzero'):
    non0 = ca_matrix.count_nonzero()
else:
    non0 = np.count_nonzero(ca_matrix)

# np.prod replaces np.product, a deprecated alias removed in NumPy 2.0
all_ = np.prod(ca_matrix.shape)
general_ca_density = round(non0 / all_, 2)
print(f'General matrix density: {general_ca_density}')

## B2. Peaks with min. mean across donors (intra-cell-type)

In [None]:
# Per peak: mean value across donors
agg_mean = df_ct_donors.mean()

In [None]:
# Summary statistics of the per-peak means
agg_mean.describe()

In [None]:
# Histogram of the per-peak means over their full range (300 bins)
agg_mean.plot(kind='hist',
              bins=300, title="Peak mean across donors distr.",
              xlabel="",
              ylabel="#")

In [None]:
# Same histogram restricted to [0, 0.015] (30 equal-width bins)
agg_mean.plot(kind='hist', 
              bins=np.linspace(0, 0.015, 31),
              title="Peak mean across donors ZOOM distr.",
              xlabel="",
              ylabel="#")

In [None]:
# Density estimate of the per-peak means, x-axis limited to [0, 0.015]
agg_mean.plot(kind='density',
              xlim=[0, 0.015],
              title="Peak mean across donors ZOOM distr.",
              xlabel="")

In [None]:
# Peaks whose mean across donors is above `min_mean_adonors`
peaks_min_mean = set(agg_mean[agg_mean > min_mean_adonors].index)

## B3. Peaks with min peak score value

In [None]:
# Per-peak 'score' annotation of the donor-level matrix
peak_scores = adata_ct_donors.var['score']

In [None]:
# Summary statistics of the peak scores
peak_scores.describe()

In [None]:
# Histogram of peak scores on [0, 60] with unit-width bins
peak_scores.plot(kind='hist',
                 bins=np.linspace(0, 60, 61),
                 figsize=(14,3),
                 title="Peak score distr.",
                 ylabel="#")

In [None]:
# Peaks whose score is at least `min_score`
score_ok = adata_ct_donors.var['score'] >= min_score
peaks_min_score = set(adata_ct_donors.var.index[score_ok.to_numpy()])

In [None]:
# The donor-level DataFrame is not used below this point: drop it and collect garbage
del df_ct_donors
gc.collect()

## Export final peaks

In [None]:
# For every peak set: print its size and up to four example members
peak_sets = (
    ('peaks_hvp', peaks_hvp),
    ('peaks_markers', peaks_markers),
    ('peaks_eqtl', peaks_eqtl),
    ('peaks_eGenes', peaks_eGenes),
    ('peaks_min_mean', peaks_min_mean),
    ('peaks_min_donors', peaks_min_donors),
    ('peaks_min_score', peaks_min_score),
)

for label, peak_set in peak_sets:
    print(f'Len {label} = {len(peak_set)}\n'
          f'{label} = {list(peak_set)[0:4]}')

In [None]:
# Final selection = group A intersect group B (see "Approach")

# Group A: a peak qualifies if ANY line of evidence supports it
peaks_group_a = peaks_hvp | peaks_markers | peaks_eqtl | peaks_eGenes

# Group B: quality filters; a peak must pass ALL of them. Taking the union
# here would let a peak through that satisfies only a single filter.
# TODO: check whether peaks_min_mean or peaks_min_donors is redundant and can be removed
peaks_group_b = peaks_min_mean & peaks_min_donors & peaks_min_score

# Sorted so that the peak order (and every file derived from it) is the same
# on every run; the iteration order of a set of strings is not.
final_peaks = sorted(peaks_group_a & peaks_group_b)


# Export bed

final_peaks_df = adata_all_cts_cells.var.loc[final_peaks, ['chr', 'start', 'end', 'peak_name']].copy()
final_peaks_bed = BedTool(final_peaks_df.values.tolist()).sort()

final_peaks_bed_path = f'{SELECT_PEAKS_TSV_DIR}/{cell_type}/peaks_ca-qtls.bed'
# NOTE(review): create_dir receives the file path; presumably it creates the parent directory - confirm
create_dir(final_peaks_bed_path)
final_peaks_bed.saveas(final_peaks_bed_path)

print(len(final_peaks))
final_peaks

## Export final adata

In [None]:
# Donor-level matrix of the selected cell type, restricted to the final peak set (independent copy)
adata_ca = adata_ct_donors[:, final_peaks].copy() # Filtered for final peak set
adata_ca

In [None]:
# Peak names kept in the final matrix
adata_ca.var_names

### Make caPCs

In [None]:
# PCA on the filtered donor-level matrix with scanpy's default settings; the result is stored in adata_ca
sc.pp.pca(adata_ca)

In [None]:
# Write the filtered AnnData next to the input matrices, gzip-compressed
adata_ca_path = f'{ATAC_CHROM_ACCESS_DIR}/adata/subset/{CT_MAP_ID}/{cell_type}/peak_matrix_ca-qtls.h5ad'
# NOTE(review): create_dir receives the file path; presumably it creates the parent directory - confirm
create_dir(adata_ca_path)
adata_ca.write(adata_ca_path, compression='gzip')

## Export caQTL phenotype tsv

In [None]:
# Build the phenotype matrix: peaks as rows, donors as columns

# Get donor -> donor_id mapping
# TODO: annotate adata_ca in process_peaks_adata

# Distinct donor_id values observed for each donor label
ids_by_donor = adata_all_cts_cells.obs[['donor', 'donor_id']].groupby('donor')['donor_id'].unique()
# Every donor must map to exactly one donor_id
assert all(len(ids) == 1 for ids in ids_by_donor)
donor_id_map = {donor: ids[0] for donor, ids in ids_by_donor.items()}

# Transpose to peaks x donors, relabel the donor columns with their donor_id
# and name the row index 'peak_id'
pheno_out = (
    adata_ca.to_df().T
    .rename(columns=donor_id_map)
    .rename_axis('peak_id')
)

pheno_out

In [None]:
# Export peak chromatin accessibility
# Written tab-separated, values rounded to 6 decimals, missing values as 'NaN'
pheno_out_path = f'{MATRIX_EQTL_INPUT_DIR}/chromatin-accessibility/{CT_MAP_ID}/{cell_type}/bulk-tests/phenotype.tsv'
# NOTE(review): create_dir receives the file path; presumably it creates the parent directory - confirm
create_dir(pheno_out_path)
pheno_out.round(6).to_csv(pheno_out_path, sep='\t', na_rep='NaN')

In [None]:
# Drop the remaining large AnnData objects and collect garbage
del adata_all_cts_cells
del adata_ct_donors
gc.collect()