# Make matrix-eqtl input for footprint-QTLs (bulk-, single- and peak-tests)

Inputs:
- FOOTPRINTS_DIR/ footprints_<CT>_processed.h5ad
- GENOTYPES_PROCESSED_TSV, GENOTYPE_PCS_TSV

Outputs:
- MATRIX_EQTL_INPUT_DIR
- MATRIX_EQTL_OUTPUT_DIR

Difference to old file:
- n_gt_PCs: 20 --> 10, parametrized
- Dropped repeated snps from genotype
- Changed SNP loc approach to source from GT tsv
- Join cov matrices before output
- Add insertions per donor as covariate
- bulk/single-test parameter

## Setup

In [None]:
import os
import sys
import json
import gc
from datetime import date
import numpy as np
import pandas as pd
import anndata as ad
import pyBigWig

In [None]:
# Resolve the project root from the name of the current directory, switch to
# it, and make <project root>/code importable.

cwd = os.path.basename(os.getcwd())

# Known launch directories -> relative path to the project root.
# Any other directory name falls back to the 'manual' placeholder.
project_dir_by_cwd = {
    'footprintQTL': '.',
    'code': '..',
    'fichtner': 'projects/footprintQTL',
}

PROJECT_DIR = project_dir_by_cwd.get(cwd, 'manual')

os.chdir(PROJECT_DIR)
sys.path.append(os.getcwd() + '/code')

In [None]:
from helpers.python.utils import create_dir, list_dirs, ct_format, ct_format_alt, list_files_and_links
from helpers.python.anndata_utils import check_anndata

## Variables

In [None]:
# User variables

from glob_vars import FOOTPRINTS_DIR, FOOTPRINTS_METADATA_DIR, \
                      DATASET, CT_MAP_JSON, CT_MAP_ID, \
                      GENOTYPES_PROCESSED_TSV, GENOTYPE_PCS_TSV, SNP_LOCS_BED, \
                      MATRIX_EQTL_INPUT_DIR, MATRIX_EQTL_OUTPUT_DIR, \
                      GROUPED_BIGWIG_FILES_DIR, \
                      MAIN_ENV

cell_type = str("DL-EN")

# Run identifier: fixed prefix, today's date (ISO), cell-type map id, dataset
RUN_ID = f"meqtl_io_{date.today():%Y-%m-%d}_{CT_MAP_ID}_{DATASET}"

# One of {bulk-tests, single-tests, peak-tests}:
# perform all tests in bulk and leverage matrix operations, or have
# fine-tuned control over the individual tests
mode = 'peak-tests'

n_ca_pcs = 2   # Nr of chromatin accessibility PCs to use as covariate
n_gt_pcs = 10  # Nr of genotype PCs to use as covariate

# This should be 0 since this filter should be applied before footprint computation
# NOTE(review): the value is 200 although the note above says 0 -- confirm which is intended
min_cells_donor = 200

In [None]:
# Put the main environment's bin directory first on PATH before importing
# pybedtools (presumably so it finds the bedtools binary there -- confirm).
env_bin_dir = f'/omics/groups/OE0540/internal_temp/users/fichtner/micromamba/envs/{MAIN_ENV}/bin'
os.environ['PATH'] = f"{env_bin_dir}:{os.environ['PATH']}"
from pybedtools import BedTool

In [None]:
# Derive both formatted spellings from the raw cell-type label; the right-hand
# side is evaluated in full before `cell_type` is rebound.
cell_type_alt, cell_type = ct_format_alt(cell_type), ct_format(cell_type)

In [None]:
# Load the cell-type grouping map {group: [member cell types]} and derive:
#   ct_map_alt   - same map, labels passed through ct_format_alt
#   ct_map_i_alt - inverse map {member: group}, labels passed through ct_format_alt
#   ct_map       - same map, labels passed through ct_format
#   ct_map_i     - inverse map {member: group}, labels passed through ct_format

with open(CT_MAP_JSON, 'r') as f:
    ct_map = json.load(f)

ct_map_alt = {}
ct_map_i_alt = {}

for group, members in ct_map.items():

    group_alt = ct_format_alt(group)
    members_alt = [ct_format_alt(member) for member in members]

    ct_map_alt[group_alt] = members_alt

    for member_alt in members_alt:
        ct_map_i_alt[member_alt] = group_alt

ct_map = {ct_format(group): [ct_format(member) for member in members] for group, members in ct_map.items()}

ct_map_i = {}

for group, members in ct_map.items():
    for member in members:
        ct_map_i[member] = group

In [None]:
# Variable checks
#
# `mode` selects which branches of the export cells below run, so it must be
# exactly one of the supported values. Raising (instead of os._exit) stops
# execution with a traceback and leaves the kernel alive.

VALID_MODES = ('bulk-tests', 'single-tests', 'peak-tests')

if mode not in VALID_MODES:

    raise ValueError(
        f'Invalid mode {mode!r}: must be one of {{{", ".join(VALID_MODES)}}}. ABORTING.'
    )

# Process adata

### Load data

In [None]:
# Phenotype: processed footprint AnnData for the selected cell type.
# Rows (obs) are treated as donors and columns (var) as peaks below.

adata_path = os.path.join(FOOTPRINTS_DIR, f'footprints_{cell_type}_processed.h5ad')
adata = ad.read_h5ad(adata_path)

donors = adata.obs_names.to_list()
donor_ids = adata.obs['donor_id'].to_list()
peaks = adata.var_names.to_list()
n_donors, n_peaks = adata.n_obs, adata.n_vars

adata

In [None]:
# Checks: every obs row must have more than `min_cells_donor` cells, and
# check_anndata is called with min_obs=20.

def _all_above_min_cells(n_cells_col):
    """Return True when every value in the column exceeds min_cells_donor."""
    return (n_cells_col > min_cells_donor).all()


obs_criteria_kwargs = [{'col': 'n_cells', 'func': _all_above_min_cells}]

check_anndata(adata, min_obs=20, obs_criteria_kwargs=obs_criteria_kwargs)

In [None]:
# Genotype matrix: first column becomes the index (named 'snp_id'); columns
# are restricted to, and ordered like, the donor ids of the AnnData object.

gt = pd.read_table(GENOTYPES_PROCESSED_TSV, header=0, index_col=0)
gt = gt.rename_axis('snp_id').loc[:, donor_ids]
gt

In [None]:
# All SNP labels of the genotype matrix, in row order
snps = gt.index.tolist()

In [None]:
# Checks: SNP labels must be unique, since rows are later selected by label
n_snps_total = len(snps)
n_snps_unique = len(set(snps))
assert n_snps_unique == n_snps_total, f'SNP labels not unique: total = {n_snps_total}, unique = {n_snps_unique}'

### Compute snp-peak pairs for single-tests and peak-tests modes

In [None]:
# True when per-test inputs are needed. Exact membership is used because
# `mode in option` on two strings is a substring test.
mode in ('single-tests', 'peak-tests')

In [None]:
# Exact membership: `mode in option` on two strings would be a substring test
if mode in ('single-tests', 'peak-tests'):

    # snp -> peak pairs for testing: every SNP interval overlapping a peak
    # interval, reported with the fields of both records (wa + wb)

    snps_bed = BedTool(SNP_LOCS_BED)
    peaks_bed = BedTool(f'{FOOTPRINTS_METADATA_DIR}/{cell_type}/peaks.bed')

    sp_pairs = snps_bed.intersect(peaks_bed, wa=True, wb=True)


    # Save pairs bed (same content next to the inputs and next to the outputs)

    sp_pairs_out1 = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/tests_snp_peak_pairs.bed'
    create_dir(sp_pairs_out1)
    sp_pairs.saveas(sp_pairs_out1)

    sp_pairs_out2 = f'{MATRIX_EQTL_OUTPUT_DIR}/{cell_type}/{mode}/tests_snp_peak_pairs.bed'
    create_dir(sp_pairs_out2)
    sp_pairs.saveas(sp_pairs_out2)


    # Unique snps & peaks, plus the peak -> snps map, in a single pass.
    #
    # The map is built for both modes: the SNP-location export reads
    # `peak_snps_map` in 'single-tests' as well as in 'peak-tests'.
    #
    # Field 3 is taken as the SNP id and field 7 as the peak id.
    # NOTE(review): assumes both BED files have exactly 4 columns
    # (chr, start, end, name) -- confirm.

    peak_snps_map = {}  # {peak: [snps]}
    unique_snps = set()

    for pair in sp_pairs:

        snp_id, peak_id = pair.fields[3], pair.fields[7]

        unique_snps.add(snp_id)
        peak_snps_map.setdefault(peak_id, []).append(snp_id)

    snps_test = list(unique_snps)
    peaks_test = list(peak_snps_map)  # first-appearance order, deterministic

# Matrix eQTL IO

## Genotype

In [None]:
# Full genotype matrix: the direct input in bulk mode, otherwise kept as the
# '_source' table from which the per-test files below are cut.

gt_filename = 'genotype_NA.tsv' if mode == 'bulk-tests' else 'genotype_NA_source.tsv'
gt_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/{gt_filename}'
create_dir(gt_path)
gt.to_csv(gt_path, sep='\t', na_rep='NaN')


# Per-test genotype files: {file label: SNP rows written to that file}
#   single-tests -> one file per SNP, holding that SNP only
#   peak-tests   -> one file per peak, holding all SNPs paired with the peak

if mode == 'single-tests':
    gt_rows_by_label = {snp: [snp] for snp in snps_test}
elif mode == 'peak-tests':
    gt_rows_by_label = {peak: peak_snps_map[peak] for peak in peaks_test}
else:
    gt_rows_by_label = {}

for label, snp_rows in gt_rows_by_label.items():

    # Individual
    gt_snp_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/genotypes/genotype_NA%{label}.tsv'
    create_dir(gt_snp_path)
    gt.loc[snp_rows, :].to_csv(gt_snp_path, sep='\t', na_rep='NaN')

## Phenotype matrix

In [None]:
# Phenotype matrix: peaks as rows ('peak_id'), donors as columns, with the
# obs index labels replaced by their 'donor_id' values.
donor_id_map = adata.obs['donor_id'].to_dict()

pheno_out = (
    adata.to_df()
    .T
    .rename(columns=donor_id_map)
    .rename_axis('peak_id')
)

pheno_out

In [None]:
# Round once: the original re-rounded the whole matrix for every peak file.
pheno_rounded = pheno_out.round(6)

if mode == 'bulk-tests':

    pheno_out_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/phenotype.tsv'
    create_dir(pheno_out_path)
    pheno_rounded.to_csv(pheno_out_path, sep='\t', na_rep='NaN')

# Exact membership: `mode in option` on two strings would be a substring test
elif mode in ('single-tests', 'peak-tests'):

    # Source: full matrix
    pheno_out_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/phenotype_source.tsv'
    create_dir(pheno_out_path)
    pheno_rounded.to_csv(pheno_out_path, sep='\t', na_rep='NaN')

    for peak in peaks_test:

        # Individual: one single-row file per tested peak
        phe_peak_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/phenotypes/phenotype%{peak}.tsv'
        create_dir(phe_peak_path)
        pheno_rounded.loc[[peak], :].to_csv(phe_peak_path, sep='\t', na_rep='NaN')

# Drop the rounded copy so it does not outlive this cell
del pheno_rounded

In [None]:
# Phenotype matrix has been written; drop it and run a garbage collection pass
del pheno_out
gc.collect()

## Peak locations

In [None]:
# Peak location table: name, chromosome and integer start/end per peak
peak_locs = adata.var[['peak_name', 'chr', 'start', 'end']].astype({'start': int, 'end': int})

# Peak naming is in 1-based fully closed notation. Format to 0-based half-open
# notation, which is what is used in the genotype matrix.
peak_locs['start'] -= 1

peak_locs

In [None]:
if mode == 'bulk-tests':

    peak_loc_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/peak_location.tsv'
    create_dir(peak_loc_path)
    peak_locs.to_csv(peak_loc_path, sep='\t', index=False, na_rep='NaN')

# Exact membership: `mode in option` on two strings would be a substring test
elif mode in ('single-tests', 'peak-tests'):

    # Source: full table
    peak_loc_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/peak_location_source.tsv'
    create_dir(peak_loc_path)
    peak_locs.to_csv(peak_loc_path, sep='\t', index=False, na_rep='NaN')

    # Round once, outside the loop (the original re-rounded the whole table
    # for every peak file)
    peak_locs_rounded = peak_locs.round(6)

    for peak in peaks_test:

        # Individual: one single-row file per tested peak
        peak_loc_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/peak_locations/peak_location%{peak}.tsv'
        create_dir(peak_loc_path)
        peak_locs_rounded.loc[[peak], :].to_csv(peak_loc_path, sep='\t', index=False, na_rep='NaN')

    # Drop the rounded copy so it does not outlive this cell
    del peak_locs_rounded

In [None]:
# Peak locations have been written; drop the table and run a garbage collection pass
del peak_locs
gc.collect()

## SNP locations

In [None]:
# SNP location table derived from the SNP labels themselves: the label is
# split on '_' and the first two parts are used as chromosome and position.
snp_locs = gt.index.to_frame()

label_parts = snp_locs.index.str.split('_')

snp_locs['chr'] = label_parts.str[0]
# 1-based fully closed (snp) -> 0-based half-open (downstream, matrix-eQTL)
snp_locs['pos'] = label_parts.str[1].astype(int) - 1

snp_locs

In [None]:
if mode == 'bulk-tests':

    snp_locs_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/snp_location.tsv'
    create_dir(snp_locs_path)
    snp_locs.to_csv(snp_locs_path, sep='\t', index=False, na_rep='NaN')

# Exact membership: `mode in option` on two strings would be a substring test
elif mode in ('single-tests', 'peak-tests'):

    # Source: full table
    snp_loc_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/snp_location_source.tsv'
    create_dir(snp_loc_path)
    snp_locs.to_csv(snp_loc_path, sep='\t', index=False, na_rep='NaN')

    # Round once, outside the loop (the original re-rounded the whole table
    # for every peak file)
    snp_locs_rounded = snp_locs.round(6)

    # `peak_snps_map` ({peak: [snps]}) is expected to have been built by the
    # snp-peak pair cell for the active mode.
    for peak in peaks_test:

        # Individual: locations of all SNPs paired with this peak
        snp_loc_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/snp_locations/snp_location%{peak}.tsv'
        create_dir(snp_loc_path)
        snp_locs_rounded.loc[peak_snps_map[peak], :].to_csv(snp_loc_path, sep='\t', index=False, na_rep='NaN')

    # Drop the rounded copy so it does not outlive this cell
    del snp_locs_rounded

In [None]:
# Genotype matrix and SNP locations have been written; release both
del gt, snp_locs
gc.collect()

## Covariates

### Phenotype PCs

In [None]:
# Leading phenotype PCs from adata.obsm['X_pca'], reshaped to
# rows = 'phe_PC_<k>' (row axis named 'id'), columns = donor ids.
top_pcs = adata.obsm['X_pca'][:, :n_ca_pcs]
pc_labels = [f'phe_PC_{k + 1}' for k in range(top_pcs.shape[1])]

phe_pcs = (
    pd.DataFrame(top_pcs, columns=pc_labels)
    .set_index(adata.obs['donor_id'])
    .T
    .rename_axis('id')
)

phe_pcs

### Genotype PCs

In [None]:
# Genotype PCs: transpose the table, keep the first n_gt_pcs rows, prefix the
# row labels with 'gt_', and order the columns like donor_ids.
# NOTE(review): assumes the TSV has donors as rows and PCs as columns -- confirm.
gt_pcs = pd.read_csv(GENOTYPE_PCS_TSV, sep="\t", index_col=0).T.iloc[:n_gt_pcs]

gt_pcs = gt_pcs.rename_axis('id').rename(index=lambda pc_label: f'gt_{pc_label}')

gt_pcs = gt_pcs.loc[:, donor_ids]

gt_pcs

### Nr of insertions per donor

In [None]:
# Single-row covariate: obs['n_insertions'] per donor, with donor ids as columns
n_frags = adata.obs.set_index('donor_id')[['n_insertions']].T
n_frags

### Joint covariates

In [None]:
# Stack all covariate rows (genotype PCs, phenotype PCs, insertions per donor)
# into one table with the row axis named 'id', rounded to 7 decimals.
covariate_blocks = [gt_pcs, phe_pcs, n_frags]
covs = pd.concat(covariate_blocks).rename_axis(index='id').round(7)
covs

In [None]:
if mode == 'bulk-tests':

    covs_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/covariates.tsv'
    create_dir(covs_path)
    covs.to_csv(covs_path, sep='\t', na_rep='NaN')

# Exact membership: `mode in option` on two strings would be a substring test
elif mode in ('single-tests', 'peak-tests'):

    # Parse every peak label once instead of once per donor.
    # Labels are split on ':' and the first three parts are taken as
    # chromosome, start, end.
    # NOTE(review): start/end are passed to pyBigWig unchanged; if the label
    # coordinates are 1-based closed, the first base of each peak is skipped
    # (pyBigWig intervals are 0-based half-open) -- confirm.

    peak_coords = {}  # {peak: (chrom, start, end)}

    for peak in peaks_test:

        chrom, start, end = peak.split(':')[0:3]
        peak_coords[peak] = (chrom, int(start), int(end))


    # Populate peak-donor-insertions map
    #   = extended peak-specific covariate rows

    peak_donor_ins_map = {peak: {} for peak in peaks_test}
                       # {'peak': {'donor1' : n_insertions, ...}}

    for donor, donor_id in zip(adata.obs['donor'], adata.obs['donor_id']):

        bw = pyBigWig.open(f'{GROUPED_BIGWIG_FILES_DIR}/{cell_type}/{donor}_{cell_type}.bw')

        # try/finally so the handle is closed even if a lookup fails
        try:

            for peak, (chrom, start, end) in peak_coords.items():

                profile = bw.values(chrom, start, end)
                # NaN positions are ignored in the sum
                n_ins = np.nansum(profile).astype(int)
                peak_donor_ins_map[peak][donor_id] = n_ins

        finally:

            bw.close()


    # For CBPNet populate new map snp: donors: score and also extend covs in next section


    # Export covs_extended

    # Source: shared covariates, without any peak-specific row
    covs_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/covariates_source.tsv'
    create_dir(covs_path)
    covs.to_csv(covs_path, sep='\t', na_rep='NaN')

    for peak in peaks_test:

        covs_extended = covs.copy()

        # The Series is aligned to the covariate columns by donor id
        row_label = f'n_insertions_{peak}'
        covs_extended.loc[row_label] = pd.Series(peak_donor_ins_map[peak])

        # Individual: shared covariates plus this peak's insertion counts
        covs_extended_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/{mode}/covariates/covariates%{peak}.tsv'
        create_dir(covs_extended_path)
        covs_extended.to_csv(covs_extended_path, sep='\t', na_rep='NaN')


        

In [None]:
# Covariates have been written; release the intermediate tables
del phe_pcs, gt_pcs, covs
gc.collect()

# QTL mapping stats

In [None]:
# Gather stats: per-donor summary and totals of cells and fragments

stats = adata.obs[['n_cells', 'n_fragments']].describe()

# Rows 1 and 2 of the describe() table are taken as mean and std
mean_cells_donor, std_cells_donor = stats['n_cells'].iloc[1:3].to_list()
mean_frags_donor, std_frags_donor = stats['n_fragments'].iloc[1:3].to_list()

total_cells, total_frags = adata.obs['n_cells'].sum(), adata.obs['n_fragments'].sum()

In [None]:
# Append one tab-separated record for this run to the shared metadata table.
# The record starts with a newline and ends with a trailing tab.
stats_out_path = f'{MATRIX_EQTL_OUTPUT_DIR}/qtl_testing_metadata.tsv'

create_dir(stats_out_path)

with open(stats_out_path, 'a') as f:

    record_fields = [
        f'{RUN_ID}',
        f'{cell_type}',
        f'{n_donors}',
        f'{n_peaks}',
        f'{total_cells}',
        f'{mean_cells_donor:.1f}',
        f'{std_cells_donor:.1f}',
        f'{total_frags}',
        f'{mean_frags_donor:.1f}',
        f'{std_frags_donor:.1f}',
    ]

    msg = '\n' + '\t'.join(record_fields) + '\t'

    f.write(msg)
    print(msg)