# Make matrix-eqtl input for footprint-QTLs (bulk-tests)
Inputs:
- FOOTPRINTS_DIR/footprints_<CT>_processed.h5ad
- GENOTYPES_PROCESSED_TSV, GENOTYPE_PCS_TSV

Outputs:
- MATRIX_EQTL_INPUT_DIR
- MATRIX_EQTL_OUTPUT_DIR

Difference to old file:
- n_gt_PCs: 20 --> 10, parametrized
- Dropped repeated snps from genotype
- Changed SNP loc approach to source from GT tsv
- Join cov matrices before output
- Add insertions per donor as covariate
- bulk/single-test parameter

## Setup

In [1]:
import os
import sys
import json
import gc
from datetime import date
import numpy as np
import pandas as pd
import anndata as ad
import pyBigWig

In [2]:
# Make the project root the working directory and put <root>/code on sys.path

# Relative location of the project root, keyed by the name of the directory
# this notebook was started from
_root_by_launch_dir = {
    'footprintQTL': '.',
    'code': '..',
    'fichtner': 'projects/footprintQTL',
}

cwd = os.path.basename(os.getcwd())

# Any other start directory falls back to the literal 'manual'.
# NOTE(review): looks like a placeholder meant to be edited by hand — confirm.
PROJECT_DIR = _root_by_launch_dir.get(cwd, 'manual')

os.chdir(PROJECT_DIR)
sys.path.append(f'{os.getcwd()}/code')

In [3]:
from helpers.python.utils import create_dir, list_dirs, ct_format, ct_format_alt, list_files_and_links
from helpers.python.anndata_utils import check_anndata

## Variables

In [None]:
# User variables

from glob_vars import FOOTPRINTS_DIR, FOOTPRINTS_METADATA_DIR, \
                      DATASET, CT_MAP_JSON, CT_MAP_ID, \
                      GENOTYPES_PROCESSED_TSV, GENOTYPE_PCS_TSV, SNP_LOCS_BED, \
                      MATRIX_EQTL_INPUT_DIR, MATRIX_EQTL_OUTPUT_DIR, \
                      GROUPED_BIGWIG_FILES_DIR, \
                      MAIN_ENV

# Cell type to process; it is reformatted further down via ct_format / ct_format_alt
cell_type = str("DL-EN")
# Identifier of this run: today's date plus the cell-type map id and the dataset name
RUN_ID = 'meqtl_io_' + f"{date.today().strftime('%Y-%m-%d')}_{CT_MAP_ID}_{DATASET}"

mode = 'bulk-tests' # {bulk-tests, single-tests} Perform all tests in bulk and leverage matrix operations, or have fine-tuned control over the individual tests

n_ca_pcs = 2 # Nr of chromatin accessibility PCs to use as covariate
n_gt_pcs = 10 # Nr of genotype PCs to use as covariate
min_cells_donor = 200 # This should be 0 since this filter should be applied before footprint computation

In [5]:
# Prepend the bin directory of the MAIN_ENV micromamba environment to PATH
# before importing pybedtools.
# NOTE(review): presumably needed so pybedtools can locate the bedtools
# executables — confirm. The environment root is hard-coded to one user's path.
os.environ['PATH'] = f'/omics/groups/OE0540/internal_temp/users/fichtner/micromamba/envs/{MAIN_ENV}/bin:' + os.environ['PATH']
from pybedtools import BedTool

In [6]:
# Derive both naming variants from the raw label in a single assignment: the
# whole right-hand side is evaluated before `cell_type` is rebound
cell_type_alt, cell_type = ct_format_alt(cell_type), ct_format(cell_type)

In [7]:
# Load the cell-type grouping map and derive forward / inverse lookups in both
# label formats

with open(CT_MAP_JSON, 'r') as f:
    ct_map = json.load(f)

# "alt"-formatted variants, built from the raw labels of the JSON file
ct_map_alt = {
    ct_format_alt(group): [ct_format_alt(member) for member in members]
    for group, members in ct_map.items()
}
ct_map_i_alt = {
    ct_format_alt(member): ct_format_alt(group)
    for group, members in ct_map.items()
    for member in members
}

# Standard-formatted variants; the inverse map is derived from the already
# formatted forward map, so it needs no further formatting
ct_map = {
    ct_format(group): [ct_format(member) for member in members]
    for group, members in ct_map.items()
}
ct_map_i = {
    member: group
    for group, members in ct_map.items()
    for member in members
}

In [8]:
# Variable checks

# Every mode-dependent cell below branches on exactly these two values. Any
# other value (including an empty one) would make all of them silently do
# nothing while the run would still be logged, so reject it up front.
valid_modes = ('bulk-tests', 'single-tests')

if mode not in valid_modes:

    # Raise instead of os._exit(): the message is guaranteed to be reported and
    # the interpreter / notebook kernel is not killed without cleanup.
    raise ValueError(f'The mode must be one of {valid_modes}, got {mode!r}. ABORTING.')

# Process adata

### Load data

In [9]:
# Phenotype: processed footprint AnnData of the selected cell type

footprints_h5ad = os.path.join(FOOTPRINTS_DIR, f'footprints_{cell_type}_processed.h5ad')
adata = ad.read_h5ad(footprints_h5ad)

# Observations are donors, variables are peaks
n_donors, n_peaks = adata.shape
donors = adata.obs.index.to_list()
donor_ids = adata.obs['donor_id'].to_list()
peaks = adata.var.index.to_list()

adata

AnnData object with n_obs × n_vars = 23 × 52184
    obs: 'cell_type', 'donor', 'donor_id', 'n_fragments', 'n_insertions', 'n_cells', 'mean_BlacklistRatio', 'mean_nDiFrags', 'mean_nFrags', 'mean_nMonoFrags', 'mean_nMultiFrags', 'mean_NucleosomeRatio', 'mean_PassQC', 'mean_PromoterRatio', 'mean_ReadsInBlacklist', 'mean_ReadsInPromoter', 'mean_ReadsInTSS', 'mean_TSSEnrichment', 'mean_n_genes_by_counts', 'mean_total_counts', 'mean_total_counts_mt', 'mean_ReadsInPeaks', 'mean_FRIP', 'leiden'
    var: 'peak_name', 'chr', 'start', 'end', 'length', 'strand', 'score', 'GC', 'nearest_gene', 'peak_type', 'mean', 'std', 'var', 'min', 'max', '25%', '50%', '75%', 'highly_variable_std', 'std_rank'
    uns: 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [10]:
# Checks

def _all_above_min_cells(x):
    """Return True when every value of `x` is strictly greater than `min_cells_donor`."""
    return (x > min_cells_donor).all()


# Criterion applied to the 'n_cells' column of adata.obs
obs_criteria_kwargs = [{'col': 'n_cells', 'func': _all_above_min_cells}]

check_anndata(adata, min_obs=20, obs_criteria_kwargs=obs_criteria_kwargs)

In [None]:
# Genotype: SNP x donor matrix, restricted to the donors of adata and put in
# the same donor order

gt = (
    pd.read_csv(GENOTYPES_PROCESSED_TSV, sep='\t', header=0, index_col=0)
    .rename_axis('snp_id')
    .loc[:, donor_ids]
)
gt

Unnamed: 0_level_0,SAMEA2518325,SAMEA2698309,SAMEA3735541,SAMEA2547619,SAMEA2445784,SAMEA2627577,SAMEA2609971,SAMEA2445779,SAMEA2536410,SAMEA2547637,...,SAMEA3853161,SAMEA2536416,SAMEA3964906,SAMEA2658084,SAMEA2609965,SAMEA3962524,SAMEA3973854,SAMEA2627141,SAMEA3448738,SAMEA2613912
snp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr10_100000235_C_T,2.0,1.0,1.0,2.0,1.0,2.0,1.0,0.0,2.0,2.0,...,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
chr10_100000943_G_A,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,...,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
chr10_100000979_T_C,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
chr10_100002628_A_C,1.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,2.0,1.0,...,0.0,1.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0
chr10_100002875_A_G,1.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr9_9999539_A_G,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0
chr9_99997250_C_T,1.0,2.0,2.0,2.0,0.0,1.0,2.0,1.0,2.0,1.0,...,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0
chr9_99998141_T_C,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0
chr9_99998283_G_C,1.0,2.0,2.0,2.0,0.0,1.0,2.0,1.0,2.0,1.0,...,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0


In [12]:
# Plain list of all SNP labels (row index of the genotype matrix)
snps = gt.index.tolist()

In [13]:
# Checks: every SNP label must occur exactly once
n_snps_total = len(snps)
n_snps_unique = len(set(snps))
assert n_snps_unique == n_snps_total, f'SNP labels not unique: total = {n_snps_total}, unique = {n_snps_unique}'

### Compute snp-peak pairs for single-tests mode

In [None]:
if mode == 'single-tests':

    # Intersect SNP positions with peak intervals; wa/wb keep the original
    # record of both sides in every reported pair
    snps_bed = BedTool(SNP_LOCS_BED)
    peaks_bed = BedTool(f'{FOOTPRINTS_METADATA_DIR}/{cell_type}/peaks.bed')
    sp_pairs = snps_bed.intersect(peaks_bed, wa=True, wb=True)

    # Persist the overlapping pairs
    sp_pairs_out = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/single-tests/tests_snp_peak_pairs.bed'
    create_dir(sp_pairs_out)
    sp_pairs.saveas(sp_pairs_out)

    # Distinct SNP and peak labels taking part in at least one pair.
    # NOTE(review): assumes field 3 holds the SNP label and field 7 the peak
    # label (i.e. a 4-column SNP BED) — confirm against the BED files.
    unique_snps = set()
    unique_peaks = set()

    for pair in sp_pairs:

        record = pair.fields
        unique_snps.add(record[3])
        unique_peaks.add(record[7])

    snps_test = list(unique_snps)
    peaks_test = list(unique_peaks)

# Matrix eQTL IO

## Genotype

In [None]:
# Write the genotype matrix: a single file in bulk mode, otherwise one
# single-row file per tested SNP
if mode == 'bulk-tests':

    gt_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/bulk-tests/genotype_NA.tsv'
    create_dir(gt_path)
    gt.to_csv(gt_path, sep='\t', na_rep='NaN')

elif mode == 'single-tests':

    genotypes_dir = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/single-tests/genotypes'

    for snp in snps_test:

        gt_snp_path = f'{genotypes_dir}/genotype_NA%{snp}.tsv'
        create_dir(gt_snp_path)

        # Double brackets keep the selection a one-row DataFrame
        gt_snp = gt.loc[[snp], :]
        gt_snp.to_csv(gt_snp_path, sep='\t', na_rep='NaN')

## Phenotype matrix

In [40]:
# Phenotype matrix: peaks as rows, donors (labelled by donor_id) as columns
donor_id_map = adata.obs['donor_id'].to_dict()

pheno_out = (
    adata.to_df()
    .T
    .rename(columns=donor_id_map)
    .rename_axis('peak_id')
)

pheno_out

Unnamed: 0_level_0,SAMEA2518325,SAMEA2698309,SAMEA3735541,SAMEA2547619,SAMEA2445784,SAMEA2627577,SAMEA2609971,SAMEA2445779,SAMEA2536410,SAMEA2547637,...,SAMEA3853161,SAMEA2536416,SAMEA3964906,SAMEA2658084,SAMEA2609965,SAMEA3962524,SAMEA3973854,SAMEA2627141,SAMEA3448738,SAMEA2613912
peak_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1:959075:959575:501:*:81,0.369150,0.284992,0.086813,0.283657,0.215781,0.158809,0.178541,0.285842,0.180357,0.319397,...,0.165601,0.345571,0.333956,0.176368,0.141357,0.218932,0.255028,0.181181,0.108867,0.025896
chr1:959772:960272:501:*:82,0.859094,0.758506,0.409222,0.859974,0.726152,0.576395,0.742753,0.747131,0.656477,0.794033,...,0.503435,0.789781,0.735878,0.866385,0.570566,0.708728,0.669340,0.835997,0.518255,0.284295
chr1:960337:960837:501:*:83,0.555698,0.338364,0.156712,0.437346,0.265091,0.256447,0.321558,0.406971,0.307299,0.431725,...,0.262010,0.496904,0.512544,0.346679,0.227104,0.324324,0.419280,0.353228,0.237525,0.066721
chr1:960860:961360:501:*:84,0.708207,0.629761,0.331899,0.654841,0.452320,0.419059,0.687515,0.727912,0.471406,0.697660,...,0.465409,0.791356,0.737451,0.573263,0.365411,0.588251,0.547160,0.616845,0.533662,0.166791
chr1:961452:961952:501:*:85,0.881593,0.771192,0.658151,0.856320,0.737430,0.731270,0.853962,0.956734,0.800825,0.894686,...,0.808239,0.890791,0.815718,0.816977,0.492239,0.846082,0.784308,0.797614,0.629941,0.286941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX:154516091:154516591:501:*:19548,0.440028,0.247756,0.129557,0.305002,0.264777,0.173648,0.264370,0.300399,0.243560,0.386333,...,0.190471,0.289234,0.423039,0.246019,0.150159,0.286693,0.272647,0.223457,0.122888,0.035727
chrX:154547055:154547555:501:*:19562,0.537267,0.267354,0.125402,0.351725,0.276141,0.245184,0.272343,0.329193,0.260819,0.423217,...,0.271091,0.398001,0.463659,0.257727,0.165330,0.338005,0.323550,0.269279,0.150696,0.048670
chrX:154762463:154762963:501:*:19586,0.314002,0.257951,0.103098,0.301015,0.215991,0.149035,0.244136,0.228619,0.209028,0.293792,...,0.172669,0.356641,0.324691,0.182265,0.138984,0.210996,0.272778,0.187136,0.124103,0.028794
chrX:155026619:155027119:501:*:19611,0.578542,0.363728,0.174877,0.405894,0.334562,0.206473,0.353087,0.352630,0.352215,0.419695,...,0.279740,0.445492,0.509894,0.334709,0.248623,0.303026,0.412987,0.305069,0.191285,0.052918


In [None]:
# Write the phenotype matrix (rounded to 6 decimals): a single file in bulk
# mode, otherwise one single-row file per tested peak
if mode == 'bulk-tests':

    pheno_out_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/bulk-tests/footprints.tsv'
    create_dir(pheno_out_path)
    pheno_out.round(6).to_csv(pheno_out_path, sep='\t')

elif mode == 'single-tests':

    # Rounding does not depend on the peak: round the full matrix once instead
    # of once per exported peak
    pheno_rounded = pheno_out.round(6)

    for peak in peaks_test:

        phe_peak_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/single-tests/phenotypes/footprints%{peak}.tsv'
        create_dir(phe_peak_path)
        pheno_rounded.loc[[peak], :].to_csv(phe_peak_path, sep='\t', na_rep='NaN')

    # Drop the temporary rounded copy again
    del pheno_rounded

In [None]:
# The phenotype matrix is not used past this point: drop the reference and
# trigger a garbage collection pass
del pheno_out
gc.collect()

## Peak locations

In [52]:
# Peak location table: label, chromosome and integer start/end coordinates
peak_locs = adata.var[['peak_name', 'chr', 'start', 'end']].copy()
peak_locs = peak_locs.astype({'start': int, 'end': int})

# Peak naming is in 1-based fully closed notation. Shift the start by one to
# get 0-based half-open notation, which is what is used in the genotype matrix
peak_locs['start'] -= 1

peak_locs

Unnamed: 0,peak_name,chr,start,end
chr1:959075:959575:501:*:81,chr1:959075:959575:501:*:81,chr1,959074,959575
chr1:959772:960272:501:*:82,chr1:959772:960272:501:*:82,chr1,959771,960272
chr1:960337:960837:501:*:83,chr1:960337:960837:501:*:83,chr1,960336,960837
chr1:960860:961360:501:*:84,chr1:960860:961360:501:*:84,chr1,960859,961360
chr1:961452:961952:501:*:85,chr1:961452:961952:501:*:85,chr1,961451,961952
...,...,...,...,...
chrX:154516091:154516591:501:*:19548,chrX:154516091:154516591:501:*:19548,chrX,154516090,154516591
chrX:154547055:154547555:501:*:19562,chrX:154547055:154547555:501:*:19562,chrX,154547054,154547555
chrX:154762463:154762963:501:*:19586,chrX:154762463:154762963:501:*:19586,chrX,154762462,154762963
chrX:155026619:155027119:501:*:19611,chrX:155026619:155027119:501:*:19611,chrX,155026618,155027119


In [None]:
# Write the peak locations: a single file in bulk mode, otherwise one
# single-row file per tested peak
if mode == 'bulk-tests':

    peak_locs_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/bulk-tests/peak_locations.tsv'
    create_dir(peak_locs_path)
    peak_locs.to_csv(peak_locs_path, sep='\t', index=False)

elif mode == 'single-tests':

    # Rounding does not depend on the peak: do it once instead of once per
    # exported peak
    peak_locs_rounded = peak_locs.round(6)

    # NOTE(review): unlike the bulk file, these files are written with the row
    # index, so the peak label appears to be written twice (index and
    # 'peak_name' column) — confirm this is what the downstream reader expects.
    for peak in peaks_test:

        peak_loc_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/single-tests/peak_locations/peak_location%{peak}.tsv'
        create_dir(peak_loc_path)
        peak_locs_rounded.loc[[peak], :].to_csv(peak_loc_path, sep='\t', na_rep='NaN')

    # Drop the temporary rounded copy again
    del peak_locs_rounded

In [None]:
# The peak location table is not used past this point: drop the reference and
# trigger a garbage collection pass
del peak_locs
gc.collect()

## SNP locations

In [65]:
# SNP location table derived from the SNP labels themselves
snp_locs = gt.index.to_frame()

# Labels are '_'-separated with chromosome first and position second
# (e.g. chr10_100000235_C_T); split once and reuse the pieces
label_parts = snp_locs.index.str.split('_')

snp_locs['chr'] = label_parts.str[0]
snp_locs['pos'] = label_parts.str[1].astype(int) - 1  # 1-based fully closed (snp) -> 0-based half-open (downstream, matrix-eQTL)
snp_locs

Unnamed: 0_level_0,snp_id,chr,pos
snp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chr10_100000235_C_T,chr10_100000235_C_T,chr10,100000234
chr10_100000943_G_A,chr10_100000943_G_A,chr10,100000942
chr10_100000979_T_C,chr10_100000979_T_C,chr10,100000978
chr10_100002628_A_C,chr10_100002628_A_C,chr10,100002627
chr10_100002875_A_G,chr10_100002875_A_G,chr10,100002874
...,...,...,...
chr9_9999539_A_G,chr9_9999539_A_G,chr9,9999538
chr9_99997250_C_T,chr9_99997250_C_T,chr9,99997249
chr9_99998141_T_C,chr9_99998141_T_C,chr9,99998140
chr9_99998283_G_C,chr9_99998283_G_C,chr9,99998282


In [None]:
# Write the SNP locations: a single file in bulk mode, otherwise one
# single-row file per tested SNP
if mode == 'bulk-tests':

    snp_locs_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/bulk-tests/snp_locations.tsv'
    create_dir(snp_locs_path)
    snp_locs.to_csv(snp_locs_path, index=False, header=False, sep='\t')

elif mode == 'single-tests':

    # Rounding does not depend on the SNP: round the (large) table once instead
    # of once per exported SNP
    snp_locs_rounded = snp_locs.round(6)

    # NOTE(review): unlike the bulk file, these files are written with header
    # and row index, so the SNP label is written twice (index and 'snp_id'
    # column) — confirm this is what the downstream reader expects.
    for snp in snps_test:

        snp_loc_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/single-tests/snp_locations/snp_location%{snp}.tsv'
        create_dir(snp_loc_path)
        snp_locs_rounded.loc[[snp], :].to_csv(snp_loc_path, sep='\t', na_rep='NaN')

    # Drop the temporary rounded copy again
    del snp_locs_rounded

In [None]:
# Genotype matrix and SNP locations are not used past this point: drop both
# references and trigger a garbage collection pass
del gt, snp_locs
gc.collect()

## Covariates

### Phenotype PCs

In [15]:
# Leading n_ca_pcs columns of the stored PCA embedding, indexed by donor_id
leading_pcs = adata.obsm['X_pca'][:, :n_ca_pcs]
phe_pcs = pd.DataFrame(leading_pcs).set_index(adata.obs['donor_id'])

# Label the components phe_PC_1, phe_PC_2, ... and flip to covariates x donors
phe_pcs.columns = [f'phe_PC_{pc_nr}' for pc_nr in range(1, phe_pcs.shape[1] + 1)]
phe_pcs = phe_pcs.T.rename_axis('id')

phe_pcs

donor_id,SAMEA2518325,SAMEA2698309,SAMEA3735541,SAMEA2547619,SAMEA2445784,SAMEA2627577,SAMEA2609971,SAMEA2445779,SAMEA2536410,SAMEA2547637,...,SAMEA3853161,SAMEA2536416,SAMEA3964906,SAMEA2658084,SAMEA2609965,SAMEA3962524,SAMEA3973854,SAMEA2627141,SAMEA3448738,SAMEA2613912
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
phe_PC_1,19.103226,4.928822,-25.653589,14.462182,3.712495,-11.100549,4.46753,12.559984,1.670928,17.06031,...,-2.470864,15.112147,17.886436,4.270626,-13.447748,5.235102,11.609981,4.867715,-16.268799,-69.867424
phe_PC_2,-11.89787,-0.740765,9.634737,-6.001926,3.113373,4.356004,1.680385,-4.049774,4.190908,-9.890193,...,5.517859,-6.469783,-8.999649,3.752746,8.278536,2.967491,-1.952761,3.907892,9.599813,-19.424572


### Genotype PCs

In [16]:
# Genotype PCs, transposed so that components are rows and donors are columns
gt_pcs = pd.read_csv(GENOTYPE_PCS_TSV, sep="\t", index_col=0).T

# Keep the first n_gt_pcs components and prefix their labels with 'gt_'
gt_pcs = gt_pcs.iloc[:n_gt_pcs]
gt_pcs = gt_pcs.rename(index=lambda label: f'gt_{label}').rename_axis('id')

# Same donors, in the same order, as the phenotype data
gt_pcs = gt_pcs.loc[:, donor_ids]

gt_pcs

iid,SAMEA2518325,SAMEA2698309,SAMEA3735541,SAMEA2547619,SAMEA2445784,SAMEA2627577,SAMEA2609971,SAMEA2445779,SAMEA2536410,SAMEA2547637,...,SAMEA3853161,SAMEA2536416,SAMEA3964906,SAMEA2658084,SAMEA2609965,SAMEA3962524,SAMEA3973854,SAMEA2627141,SAMEA3448738,SAMEA2613912
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gt_PC1,-424.3459,-587.08734,25.649767,13.56072,1183.7717,253.76321,-11.026092,3.230694,-6.109256,50.206284,...,-26.086426,-141.93439,-23.225708,-7.199219,227.28793,12.186271,-8.161153,24.835945,-14.844159,-44.876938
gt_PC2,-333.00827,367.2751,9.594588,11.334282,-210.52095,372.03177,-19.358488,-20.55713,-7.918764,-8.613843,...,12.337655,-963.92786,13.609195,13.413182,292.4741,42.351612,4.912662,39.3655,27.149958,-26.285961
gt_PC3,1047.887,587.0468,-17.569567,32.36879,36.929684,613.6229,30.143745,28.402182,-13.293022,35.816673,...,8.423677,-71.293526,46.57533,46.895866,-196.69269,10.10384,28.715384,17.270079,19.48369,-132.76807
gt_PC4,201.11601,372.0228,8.361859,-7.992949,-11.427331,-397.1996,61.010986,-6.043996,-17.017483,-11.012879,...,-15.73029,18.946665,4.890167,-19.546944,-481.97995,-94.01307,10.677946,9.076994,1.502355,-639.104
gt_PC5,313.46286,49.229893,-33.705082,25.187931,1074.165,-318.9338,18.0747,-36.364742,0.630368,2.294241,...,12.673084,-371.3634,-3.822149,-9.918574,-821.47046,-3.526762,-35.125046,-8.983247,-26.14677,-85.408226
gt_PC6,916.07495,-182.05522,-64.38575,-1.900083,50.214584,264.8937,-9.483054,-60.09101,-47.462048,-30.207642,...,-49.79958,-577.1243,-56.745644,6.438612,-27.338608,5.925176,16.42505,8.160939,-25.984467,-492.4777
gt_PC7,72.117065,-386.22073,3.560066,4.146451,-168.47487,727.96436,69.76369,-45.26098,3.281757,-9.353193,...,-11.844399,1123.6448,-22.716658,23.455002,-599.69073,-43.594402,-5.632242,-16.682304,-15.710847,156.3713
gt_PC8,-516.169,-129.46304,7.14193,38.281017,-135.52444,1303.6456,-15.064022,25.995516,-40.015465,15.043926,...,18.833464,-458.67288,29.864983,14.72087,-20.858706,-6.027317,-17.070618,-12.322556,27.067438,-407.64465
gt_PC9,559.65314,-532.2907,-27.625359,20.118303,-454.74533,-102.773254,12.667098,-5.670022,-24.788172,-3.889826,...,23.22557,-610.45953,19.842443,-2.595212,-9.215776,17.676556,-4.42947,-31.535606,-25.920094,1240.3632
gt_PC10,-287.536,1099.4442,20.833864,1.073739,157.9394,-210.30444,-44.82611,44.781776,-10.254386,-29.418983,...,30.223679,108.15007,10.092315,-12.110616,533.96716,20.932838,-8.381093,34.299362,18.092451,161.46457


### Nr of insertions per donor

In [17]:
# One-row frame (row 'n_insertions', one column per donor_id) with the
# `n_insertions` value of every donor
n_frags = adata.obs.set_index('donor_id')[['n_insertions']].T
n_frags

donor_id,SAMEA2518325,SAMEA2698309,SAMEA3735541,SAMEA2547619,SAMEA2445784,SAMEA2627577,SAMEA2609971,SAMEA2445779,SAMEA2536410,SAMEA2547637,...,SAMEA3853161,SAMEA2536416,SAMEA3964906,SAMEA2658084,SAMEA2609965,SAMEA3962524,SAMEA3973854,SAMEA2627141,SAMEA3448738,SAMEA2613912
n_insertions,5368766,11215234,34317158,7233664,12626196,22325368,12465486,7229716,13601230,5397060,...,15180932,6602092,5617938,11468300,25433056,11601442,8360264,10821210,25591292,105719158


### Joint covariates

In [18]:
# Stack all covariate rows (genotype PCs, phenotype PCs, insertions per donor)
# into one covariates x donors table, rounded to 7 decimals
covs = (
    pd.concat([gt_pcs, phe_pcs, n_frags])
    .rename_axis(index='id')
    .round(7)
)
covs

Unnamed: 0_level_0,SAMEA2518325,SAMEA2698309,SAMEA3735541,SAMEA2547619,SAMEA2445784,SAMEA2627577,SAMEA2609971,SAMEA2445779,SAMEA2536410,SAMEA2547637,...,SAMEA3853161,SAMEA2536416,SAMEA3964906,SAMEA2658084,SAMEA2609965,SAMEA3962524,SAMEA3973854,SAMEA2627141,SAMEA3448738,SAMEA2613912
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gt_PC1,-424.3459,-587.0873,25.64977,13.56072,1183.772,253.7632,-11.02609,3.230694,-6.109256,50.20628,...,-26.08643,-141.9344,-23.22571,-7.199219,227.2879,12.18627,-8.161153,24.83594,-14.84416,-44.87694
gt_PC2,-333.0083,367.2751,9.594588,11.33428,-210.5209,372.0318,-19.35849,-20.55713,-7.918764,-8.613843,...,12.33765,-963.9279,13.60919,13.41318,292.4741,42.35161,4.912662,39.3655,27.14996,-26.28596
gt_PC3,1047.887,587.0468,-17.56957,32.36879,36.92968,613.6229,30.14374,28.40218,-13.29302,35.81667,...,8.423677,-71.29353,46.57533,46.89587,-196.6927,10.10384,28.71538,17.27008,19.48369,-132.7681
gt_PC4,201.116,372.0228,8.361859,-7.992949,-11.42733,-397.1996,61.01099,-6.043996,-17.01748,-11.01288,...,-15.73029,18.94666,4.890167,-19.54694,-481.9799,-94.01307,10.67795,9.076994,1.502355,-639.104
gt_PC5,313.4629,49.22989,-33.70508,25.18793,1074.165,-318.9338,18.0747,-36.36474,0.6303679,2.294241,...,12.67308,-371.3634,-3.822149,-9.918574,-821.4705,-3.526762,-35.12505,-8.983247,-26.14677,-85.40823
gt_PC6,916.0749,-182.0552,-64.38575,-1.900083,50.21458,264.8937,-9.483054,-60.09101,-47.46205,-30.20764,...,-49.79958,-577.1243,-56.74564,6.438612,-27.33861,5.925176,16.42505,8.160939,-25.98447,-492.4777
gt_PC7,72.11706,-386.2207,3.560066,4.146451,-168.4749,727.9644,69.76369,-45.26098,3.281757,-9.353193,...,-11.8444,1123.645,-22.71666,23.455,-599.6907,-43.5944,-5.632242,-16.6823,-15.71085,156.3713
gt_PC8,-516.169,-129.463,7.14193,38.28102,-135.5244,1303.646,-15.06402,25.99552,-40.01546,15.04393,...,18.83346,-458.6729,29.86498,14.72087,-20.85871,-6.027317,-17.07062,-12.32256,27.06744,-407.6447
gt_PC9,559.6531,-532.2907,-27.62536,20.1183,-454.7453,-102.7733,12.6671,-5.670022,-24.78817,-3.889826,...,23.22557,-610.4595,19.84244,-2.595212,-9.215776,17.67656,-4.42947,-31.53561,-25.92009,1240.363
gt_PC10,-287.536,1099.444,20.83386,1.073739,157.9394,-210.3044,-44.82611,44.78178,-10.25439,-29.41898,...,30.22368,108.1501,10.09231,-12.11062,533.9672,20.93284,-8.381093,34.29936,18.09245,161.4646


In [None]:
# Write the covariates: the joint table in bulk mode; in single-tests mode one
# file per tested peak, extended by a peak-specific insertion-count row
if mode == 'bulk-tests':

    covs_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/bulk-tests/covariates.tsv'
    create_dir(covs_path)
    covs.to_csv(covs_path, sep='\t')

elif mode == 'single-tests':

    # Parse every peak label once (not once per donor). Peak labels are
    # '<chr>:<start>:<end>:...' in 1-based fully closed notation, whereas
    # pyBigWig's values() takes 0-based half-open coordinates, so the start is
    # shifted by one; otherwise the first base of each peak would be skipped.
    # `chrom` is used instead of `chr` to avoid shadowing the builtin.
    peak_regions = {}

    for peak in peaks_test:

        chrom, start, end = peak.split(':')[0:3]
        peak_regions[peak] = (chrom, int(start) - 1, int(end))

    # Map to populate
    # = New peak-specific rows
    peak_donor_ins_map = {peak: {} for peak in peaks_test}
                       # {'peak': {'donor1' : n_insertions, ...}}

    # Populate
    for donor, donor_id in zip(adata.obs['donor'], adata.obs['donor_id']):

        bw = pyBigWig.open(f'{GROUPED_BIGWIG_FILES_DIR}/{cell_type}/{donor}_{cell_type}.bw')

        # Make sure the file handle is released even if a query fails
        try:

            for peak, (chrom, start0, end) in peak_regions.items():

                profile = bw.values(chrom, start0, end)

                # Sum of the signal over the peak, ignoring NaN positions
                n_ins = np.nansum(profile).astype(int)
                peak_donor_ins_map[peak][donor_id] = n_ins

        finally:

            bw.close()


    # Export covs_extended
    for peak in peaks_test:

        covs_extended = covs.copy()

        # The Series is aligned to the donor columns by its donor_id keys
        row_label = f'n_insertions_{peak}'
        covs_extended.loc[row_label] = pd.Series(peak_donor_ins_map[peak])

        covs_extended_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/single-tests/covariates/covariates%{peak}.tsv'
        create_dir(covs_extended_path)
        covs_extended.to_csv(covs_extended_path, sep='\t')

In [None]:
# The covariate tables are not used past this point: drop all three references
# and trigger a garbage collection pass
del phe_pcs, gt_pcs, covs
gc.collect()

# QTL mapping stats

In [None]:
# Gather stats

# Summary statistics of cells and fragments over donors; the rows of the
# describe() table are labelled 'count', 'mean', 'std', ...
stats = adata.obs[['n_cells', 'n_fragments']].describe()

mean_cells_donor, std_cells_donor = stats.loc[['mean', 'std'], 'n_cells'].to_list()
mean_frags_donor, std_frags_donor = stats.loc[['mean', 'std'], 'n_fragments'].to_list()

# Totals over all donors
total_cells, total_frags = (adata.obs[col].sum() for col in ('n_cells', 'n_fragments'))

In [None]:
stats_out_path = f'{MATRIX_EQTL_OUTPUT_DIR}/qtl_testing_metadata.tsv'

create_dir(stats_out_path)

# Append one tab-separated record for this run to the metadata file; the
# record starts with a newline and ends with a trailing tab
with open(stats_out_path, 'a') as f:

    fields = [
        f'{RUN_ID}',
        f'{cell_type}',
        f'{n_donors}',
        f'{n_peaks}',
        f'{total_cells}',
        f'{mean_cells_donor:.1f}',
        f'{std_cells_donor:.1f}',
        f'{total_frags}',
        f'{mean_frags_donor:.1f}',
        f'{std_frags_donor:.1f}',
    ]
    msg = '\n' + '\t'.join(fields) + '\t'

    f.write(msg)
    print(msg)