# Make Matrix eQTL input for caQTLs (bulk-tests)

Inputs:
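- `{ATAC_CHROM_ACCESS_DIR}/adata/subset/{CT_MAP_ID}/{cell_type}/peak_matrix_ca-qtls.h5ad`: peak matrix used for caQTL testing (peak coordinates and accessibility PCs are read from it)
- `{ATAC_CHROM_ACCESS_DIR}/adata/subset/{CT_MAP_ID}/{cell_type}/peak_matrix_all-cell-types_cell-lvl.h5ad`: reference AnnData used to map `donor` to `donor_id` and to count cells per donor
- `GENOTYPES_PROCESSED_TSV`: genotype table (SNPs x donors)
- `GENOTYPE_PCS_TSV`: genotype principal components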

Outputs:
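- Files written to `{MATRIX_EQTL_INPUT_DIR}/chromatin-accessibility/{CT_MAP_ID}/{cell_type}/{mode}/`:
  - `genotype_NA.tsv`
  - `peak_location.tsv`
  - `snp_location.tsv`
  - `covariates.tsv`
  - `qtl_testing_metadata.tsv` (one line appended per run)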

## Setup

In [None]:
import os
import sys
import gc
from datetime import date
import pandas as pd
import anndata as ad

In [None]:
# Make the project root the working directory and expose <root>/code on sys.path

import os
import sys

# Relative path to the project root, keyed by the name of the directory the
# notebook was started from
_ROOT_BY_CWD = {
    'footprintQTL': '.',
    'code': '..',
    'fichtner': 'projects/footprintQTL',
}

cwd = os.path.basename(os.getcwd())

# Unknown start directories fall back to 'manual' -- presumably a placeholder to
# be edited by hand; os.chdir fails unless a directory with that name exists
PROJECT_DIR = _ROOT_BY_CWD.get(cwd, 'manual')

os.chdir(PROJECT_DIR)
sys.path.append(os.getcwd() + '/code')

### Variables

In [None]:
from helpers.python.utils import create_dir

In [None]:
# User variables

from glob_vars import CT_MAP_ID, \
                      ATAC_CHROM_ACCESS_DIR, \
                      GENOTYPES_PROCESSED_TSV, GENOTYPE_PCS_TSV, SNP_LOCS_BED, \
                      MATRIX_EQTL_INPUT_DIR, MATRIX_EQTL_OUTPUT_DIR, \
                      MAIN_ENV

# Cell type to process (used to locate the input files and to name the output directory)
cell_type = 'DL-EN'

# Testing mode, one of {bulk-tests, single-tests, peak-tests}:
# perform all tests in bulk and leverage matrix operations, or have
# fine-tuned control over the individual tests
mode = 'bulk-tests'

# Number of principal components to use as covariates
n_gt_pcs = 5  # genotype PCs
n_ca_pcs = 5  # chromatin accessibility PCs

In [None]:
# Variable checks

# Fail fast when no mode is configured.
# An exception is raised instead of print + os._exit(1): os._exit terminates the
# process without flushing stdio buffers or running cleanup handlers, so the
# message could be lost and a notebook kernel would be killed outright.
if not mode:

    raise ValueError('The mode has not been specified {bulk-tests, single-tests, peak-tests}. ABORTING.')

In [None]:
# Variable checks

# Only the bulk-tests mode is implemented in this notebook.
# An exception is raised instead of print + os._exit(1): os._exit terminates the
# process without flushing stdio buffers or running cleanup handlers, so the
# message could be lost and a notebook kernel would be killed outright.
if mode != 'bulk-tests':

    raise NotImplementedError(f'The mode {mode!r} has not been implemented yet (only bulk-tests is supported). ABORTING.')

## Load data

In [None]:
# Phenotype

# Both AnnData files are read from the same per-cell-type subset directory
subset_dir = f'{ATAC_CHROM_ACCESS_DIR}/adata/subset/{CT_MAP_ID}/{cell_type}'
adata = ad.read_h5ad(f'{subset_dir}/peak_matrix_ca-qtls.h5ad')
adata_ref = ad.read_h5ad(f'{subset_dir}/peak_matrix_all-cell-types_cell-lvl.h5ad')


# Port donor_id
# TODO: annotate adata_ca in process_peaks_adata
ids_per_donor = adata_ref.obs[['donor', 'donor_id']].groupby('donor')['donor_id'].unique().to_dict()

# Every donor must resolve to exactly one donor_id
assert all(len(ids) == 1 for ids in ids_per_donor.values())
donor_id_map = {donor: ids[0] for donor, ids in ids_per_donor.items()}

# The obs index of adata is looked up in the donor -> donor_id map
adata.obs['donor_id'] = adata.obs.index.map(donor_id_map)

n_donors, n_peaks = adata.shape
peaks = adata.var['peak_name'].to_list()
donor_ids = adata.obs['donor_id'].to_list()
donors = adata.obs['donor'].to_list()

adata

In [None]:
# Display the donor / donor_id pairs to check the ported mapping visually
adata.obs[['donor', 'donor_id']]

In [None]:
# Display the ported donor_id column on its own
adata.obs['donor_id']

In [None]:
# Genotype

# Read the genotype table (first column becomes the 'snp_id' index) and keep
# only the donor_id columns, in the donor order of adata.obs
gt = (
    pd.read_csv(GENOTYPES_PROCESSED_TSV, sep='\t', header=0, index_col=0)
    .rename_axis('snp_id')
    .loc[:, donor_ids]
)
gt

In [None]:
# SNP identifiers, in the row order of the genotype table
snps = gt.index.to_list()

In [None]:
# Checks
# SNP labels must be unique
n_snps_total = len(snps)
n_snps_unique = len(set(snps))
assert n_snps_unique == n_snps_total, f'SNP labels not unique: total = {n_snps_total}, unique = {n_snps_unique}'

# Matrix eQTL IO

## Genotype

In [None]:
if mode == 'bulk-tests':

    # <MATRIX_EQTL_INPUT_DIR>/chromatin-accessibility/<CT_MAP_ID>/<cell_type>/<mode>/genotype_NA.tsv
    gt_rel_parts = ('chromatin-accessibility', CT_MAP_ID, cell_type, mode, 'genotype_NA.tsv')
    gt_path = os.path.join(MATRIX_EQTL_INPUT_DIR, *gt_rel_parts)

    # create_dir comes from helpers.python.utils; presumably it creates the
    # parent directory of the given path -- verify against the helper
    create_dir(gt_path)

    # Tab-separated, SNP ids as index column, missing values written as 'NaN'
    gt.to_csv(gt_path, sep='\t', na_rep='NaN')

# else:

#     # Source
#     gt_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, 'genotype_NA_source.tsv')
#     create_dir(gt_path)
#     gt.to_csv(gt_path, sep='\t', na_rep='NaN')


# if mode == 'single-tests':

#     for snp in snps_test:

#         # Individual
#         gt_snp_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, f'genotypes/genotype_NA%{snp}.tsv')
#         create_dir(gt_snp_path)
#         gt.loc[[snp], :].to_csv(gt_snp_path, sep='\t', na_rep='NaN')

# elif mode == 'peak-tests':

#     for peak in peaks_test:

#         # Individual
#         gt_snp_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, f'genotypes/genotype_NA%{peak}.tsv')
#         create_dir(gt_snp_path)
#         gt.loc[peak_snps_map[peak], :].to_csv(gt_snp_path, sep='\t', na_rep='NaN')

## Phenotype matrix

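Performed in `select_peaks...` (the phenotype matrix is not written by this notebook; the commented-out cells below are kept for reference only).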

In [None]:
# Create pheno matrix
# donor_id_map = adata.obs['donor_id'].to_dict()

# pheno_out = adata.to_df().T
# pheno_out = pheno_out.rename(columns=donor_id_map)
# pheno_out = pheno_out.rename_axis('peak_id')

# pheno_out

In [None]:
# if mode == 'bulk-tests':

#     pheno_out_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, 'phenotype.tsv')
#     create_dir(pheno_out_path)
#     pheno_out.round(6).to_csv(pheno_out_path, sep='\t', na_rep='NaN')

# elif any(mode in option for option in ['single-tests', 'peak-tests']):

#     # Source
#     pheno_out_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, 'phenotype_source.tsv')
#     create_dir(pheno_out_path)
#     pheno_out.round(6).to_csv(pheno_out_path, sep='\t', na_rep='NaN')

#     for peak in peaks_test:

#         # Individual
#         phe_peak_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, f'phenotypes/phenotype%{peak}.tsv')
#         create_dir(phe_peak_path)
#         pheno_out.round(6).loc[[peak], :].to_csv(phe_peak_path, sep='\t', na_rep='NaN')

In [None]:
# del pheno_out
# gc.collect()

## Peak locations

In [None]:
# Peak id and coordinates, with start/end cast to integers
# Coordinate system: 0-based half-open
peak_locs = adata.var[['peak_name', 'chr', 'start', 'end']].astype({'start': int, 'end': int})

peak_locs

In [None]:
if mode == 'bulk-tests':

    # <MATRIX_EQTL_INPUT_DIR>/chromatin-accessibility/<CT_MAP_ID>/<cell_type>/<mode>/peak_location.tsv
    peak_loc_rel_parts = ('chromatin-accessibility', CT_MAP_ID, cell_type, mode, 'peak_location.tsv')
    peak_loc_path = os.path.join(MATRIX_EQTL_INPUT_DIR, *peak_loc_rel_parts)

    # create_dir comes from helpers.python.utils; presumably it creates the
    # parent directory of the given path -- verify against the helper
    create_dir(peak_loc_path)

    # The index is dropped: the peak id is already stored in the 'peak_name' column
    peak_locs.to_csv(peak_loc_path, sep='\t', index=False, na_rep='NaN')

# elif any(mode in option for option in ['single-tests', 'peak-tests']):

#     # Source
#     peak_loc_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, 'peak_location_source.tsv')
#     create_dir(peak_loc_path)
#     peak_locs.to_csv(peak_loc_path, sep='\t', index=False, na_rep='NaN')

#     for peak in peaks_test:

#         # Individual
#         peak_loc_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, f'peak_locations/peak_location%{peak}.tsv')
#         create_dir(peak_loc_path)
#         peak_locs.round(6).loc[[peak], :].to_csv(peak_loc_path, sep='\t', index=False, na_rep='NaN')

In [None]:
# Drop the peak-location table and trigger a garbage-collection pass to free its memory
del peak_locs
gc.collect()

## SNP locations

In [None]:
# SNP ids are split on '_': the first field is used as chromosome, the second as position
snp_id_fields = gt.index.str.split('_')

snp_locs = gt.index.to_frame()
snp_locs['chr'] = snp_id_fields.str[0]
# 1-based fully closed (snp) -> 0-based half-open (downstream, matrix-eQTL)
snp_locs['pos'] = snp_id_fields.str[1].astype(int) - 1
snp_locs

In [None]:
if mode == 'bulk-tests':

    # <MATRIX_EQTL_INPUT_DIR>/chromatin-accessibility/<CT_MAP_ID>/<cell_type>/<mode>/snp_location.tsv
    snp_loc_rel_parts = ('chromatin-accessibility', CT_MAP_ID, cell_type, mode, 'snp_location.tsv')
    snp_locs_path = os.path.join(MATRIX_EQTL_INPUT_DIR, *snp_loc_rel_parts)

    # create_dir comes from helpers.python.utils; presumably it creates the
    # parent directory of the given path -- verify against the helper
    create_dir(snp_locs_path)

    # The index is dropped: the SNP id is already stored in the 'snp_id' column
    snp_locs.to_csv(snp_locs_path, sep='\t', index=False, na_rep='NaN')

# elif any(mode in option for option in ['single-tests', 'peak-tests']):

#     # Source
#     snp_loc_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, 'snp_location_source.tsv')
#     create_dir(snp_loc_path)
#     snp_locs.to_csv(snp_loc_path, sep='\t', index=False, na_rep='NaN')

#     for peak in peaks_test:

#         # Individual
#         snp_loc_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, f'snp_locations/snp_location%{peak}.tsv')
#         create_dir(snp_loc_path)
#         snp_locs.round(6).loc[peak_snps_map[peak], :].to_csv(snp_loc_path, sep='\t', index=False, na_rep='NaN')

In [None]:
# Drop the genotype matrix and the SNP-location table, then trigger a
# garbage-collection pass to free their memory before building the covariates
del gt
del snp_locs
gc.collect()

## Covariates

### Phenotype PCs

In [None]:
# First n_ca_pcs columns of the PCA embedding in adata.obsm['X_pca'], indexed by donor_id
phe_pcs = pd.DataFrame(adata.obsm['X_pca'][:, :n_ca_pcs])
phe_pcs = phe_pcs.set_index(adata.obs['donor_id'])

# Label the components phe_PC_1, phe_PC_2, ...
phe_pcs.columns = [f'phe_PC_{rank}' for rank in range(1, phe_pcs.shape[1] + 1)]

# Transpose so that each PC is a row and each donor a column
phe_pcs = phe_pcs.T.rename_axis('id')

phe_pcs

### Genotype PCs

In [None]:
# Read the genotype PCs, transpose them, keep the first n_gt_pcs rows, prefix
# the row labels with 'gt_' and select the donor_id columns in the donor order
# of adata.obs
gt_pcs = (
    pd.read_csv(GENOTYPE_PCS_TSV, sep="\t", index_col=0)
    .T
    .iloc[:n_gt_pcs]
    .rename(index=lambda pc_label: f'gt_{pc_label}')
    .rename_axis('id')
    .loc[:, donor_ids]
)

gt_pcs

### Joint covariates

In [None]:
# Stack genotype PCs on top of phenotype PCs, name the row axis 'id'
# and round the values to 7 decimals
covs = pd.concat([gt_pcs, phe_pcs]).rename_axis(index='id').round(7)
covs

In [None]:
if mode == 'bulk-tests':

    # <MATRIX_EQTL_INPUT_DIR>/chromatin-accessibility/<CT_MAP_ID>/<cell_type>/<mode>/covariates.tsv
    covs_rel_parts = ('chromatin-accessibility', CT_MAP_ID, cell_type, mode, 'covariates.tsv')
    covs_path = os.path.join(MATRIX_EQTL_INPUT_DIR, *covs_rel_parts)

    # create_dir comes from helpers.python.utils; presumably it creates the
    # parent directory of the given path -- verify against the helper
    create_dir(covs_path)

    # Tab-separated, covariate ids as index column, missing values written as 'NaN'
    covs.to_csv(covs_path, sep='\t', na_rep='NaN')

# elif any(mode in option for option in ['single-tests', 'peak-tests']):

#     # Populate peak-donor-insertions map
#     #   = extended peak-specific covariate rows

#     peak_donor_ins_map = {peak: {} for peak in peaks_test}
#                        # {'peak': {'donor1' : n_insertions, ...}}

    
#     for donor, donor_id in zip(adata.obs['donor'], adata.obs['donor_id']):

#         bw = pyBigWig.open(f'{GROUPED_BIGWIG_FILES_DIR}/{cell_type}/{donor}.bw')

#         for peak in peaks_test:

#             chr, start, end = peak.split(':')[0:3]

#             profile = bw.values(chr, int(start), int(end))
#             n_ins = np.nansum(profile).astype(int)
#             peak_donor_ins_map[peak][donor_id] = n_ins

#         bw.close()


#     # For CBPNet populate new map snp: donors: score and also extend covs in next section


#     # Export covs_extended

#     # Source
#     covs_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, 'covariates_source.tsv')
#     create_dir(covs_path)
#     covs.to_csv(covs_path, sep='\t', na_rep='NaN')

#     for peak in peaks_test:

#         covs_extended = covs.copy()

#         row_label = f'n_insertions_{peak}'
#         covs_extended.loc[row_label] = pd.Series(peak_donor_ins_map[peak])

#         # Individual
#         covs_extended_path = os.path.join(MATRIX_EQTL_INPUT_DIR, 'footprints', category, mode, f'covariates/covariates%{peak}.tsv')
#         create_dir(covs_extended_path)
#         covs_extended.to_csv(covs_extended_path, sep='\t', na_rep='NaN')


        

In [None]:
# Drop the covariate tables and trigger a garbage-collection pass to free their memory
del phe_pcs
del gt_pcs
del covs
gc.collect()

# QTL mapping stats

In [None]:
# Gather stats

category = f'{CT_MAP_ID}/{cell_type}/{mode}'

# Number of rows of the reference AnnData per donor, looked up by the obs index of adata
cells_per_donor = adata_ref.obs.groupby('donor').size().to_dict()
adata.obs['n_cells'] = adata.obs.index.map(cells_per_donor)

stats = adata.obs[['n_cells']].describe()
# stats = adata.obs[['n_cells', 'n_fragments']].describe()

# Mean and standard deviation of cells per donor, taken from the describe() summary
n_cells_summary = stats['n_cells']
mean_cells_donor = float(n_cells_summary['mean'])
std_cells_donor = float(n_cells_summary['std'])

total_cells = adata.obs['n_cells'].sum()

# Fragment statistics are not computed; 'NaN' placeholders are used instead
mean_frags_donor = 'NaN' # stats['n_fragments'].to_list()[1:3]
std_frags_donor = 'NaN'
total_frags = 'NaN' # adata.obs['n_fragments'].sum()

In [None]:
# <MATRIX_EQTL_INPUT_DIR>/chromatin-accessibility/<CT_MAP_ID>/<cell_type>/<mode>/qtl_testing_metadata.tsv
stats_rel_parts = ('chromatin-accessibility', CT_MAP_ID, cell_type, mode, 'qtl_testing_metadata.tsv')
stats_out_path = os.path.join(MATRIX_EQTL_INPUT_DIR, *stats_rel_parts)

# create_dir comes from helpers.python.utils; presumably it creates the
# parent directory of the given path -- verify against the helper
create_dir(stats_out_path)

# Append mode: every run adds one tab-separated line; no header is written
with open(stats_out_path, 'a') as f:

    fields = [
        date.today().strftime("%Y-%m-%d"),
        category,
        cell_type,
        n_donors,
        n_peaks,
        total_cells,
        f'{mean_cells_donor:.1f}',
        f'{std_cells_donor:.1f}',
        total_frags,
        mean_frags_donor,
        std_frags_donor,
    ]
    msg = '\t'.join(f'{field}' for field in fields) + '\n'

    f.write(msg)
    print(msg)