# Setup

In [None]:
import os
import sys
import json
import gc
from datetime import date
import numpy as np
import pandas as pd
import anndata as ad

In [None]:
# Resolve the project root from the name of the launch directory, move into
# it, and put <project root>/code on the import path.

cwd = os.path.basename(os.getcwd())

# Known launch directories mapped to the project root relative to them; any
# other launch directory falls back to the 'manual' placeholder value.
PROJECT_DIR = {
    'footprintQTL': '.',
    'code': '..',
    'fichtner': 'projects/footprintQTL',
}.get(cwd, 'manual')

os.chdir(PROJECT_DIR)
sys.path.append(f'{os.getcwd()}/code')

In [None]:
from helpers.python.utils import create_dir, ct_format, ct_format_alt, parse_vcf
from helpers.python.anndata_utils import check_anndata

## Variables

In [None]:
# User variables

from glob_vars import FOOTPRINTS_DIR, DATASET, CT_MAP_JSON, CT_MAP_ID, GENOTYPES_VCF, GENOTYPES_TSV, GENOTYPE_PCS_TSV, MATRIX_EQTL_INPUT_DIR, MATRIX_EQTL_OUTPUT_DIR

cell_type = "DL-EN"
# Run identifier: fixed prefix, today's date, cell-type map id and dataset name
RUN_ID = f"meqtl_io_{date.today():%Y-%m-%d}_{CT_MAP_ID}_{DATASET}"

n_ca_PCs = 2  # Nr of chromatin accessibility PCs used as covariate
min_cells_donor = 200  # This should be 0 since this filter should be applied before footprint computation

In [None]:
# Derive both formatted spellings from the raw label in a single step, so the
# "alt" variant is still computed from the unformatted value.
cell_type_alt, cell_type = ct_format_alt(cell_type), ct_format(cell_type)

In [None]:
# Load the cell-type grouping map and derive its formatted variants

with open(CT_MAP_JSON, 'r') as ct_map_file:
    ct_map = json.load(ct_map_file)

# "alt" formatting: group -> members, plus the inverse member -> group lookup
ct_map_alt = {
    ct_format_alt(group): [ct_format_alt(member) for member in members]
    for group, members in ct_map.items()
}
ct_map_i_alt = {
    ct_format_alt(member): ct_format_alt(group)
    for group, members in ct_map.items()
    for member in members
}

# Standard formatting. ct_map is overwritten first, so the inverse lookup
# below is built from the already formatted names.
ct_map = {
    ct_format(group): [ct_format(member) for member in members]
    for group, members in ct_map.items()
}
ct_map_i = {
    member: group
    for group, members in ct_map.items()
    for member in members
}

# Process adata

In [None]:
# Load the processed footprint AnnData for the selected cell type
adata = ad.read_h5ad(
    os.path.join(FOOTPRINTS_DIR, f'footprints_{cell_type}_processed.h5ad')
)

# Axis sizes and labels reused by the following cells
n_donors, n_peaks = adata.shape
donors = adata.obs.index.tolist()
donor_ids = adata.obs['donor_id'].tolist()
peaks = adata.var.index.tolist()

adata

In [None]:
# Sanity checks on the loaded AnnData

# Criterion on the 'n_cells' obs column: every value must exceed min_cells_donor.
# NOTE(review): how check_anndata applies 'col'/'func' is defined in the
# project helper -- confirm there.
obs_criteria_kwargs = [
    dict(col='n_cells', func=lambda n_cells: (n_cells > min_cells_donor).all()),
]

check_anndata(adata, obs_criteria_kwargs=obs_criteria_kwargs, min_obs=20)

# Matrix eQTL IO

## Phenotype matrix

In [None]:
# Build the phenotype matrix: variables as rows, one column per donor id
donor_id_map = adata.obs['donor_id'].to_dict()  # obs index label -> donor_id

pheno_out = (
    adata.to_df()
    .T
    .rename(columns=donor_id_map)
    .rename_axis('peak_id')
)

pheno_out

In [None]:
# Write the phenotype matrix, rounded to 6 decimals, as a tab-separated file
pheno_out_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/footprints.tsv'

# NOTE(review): create_dir is a project helper; presumably it makes sure the
# directory for this path exists -- confirm.
create_dir(pheno_out_path)

pheno_out.round(decimals=6).to_csv(path_or_buf=pheno_out_path, sep='\t')

In [None]:
# Drop the phenotype matrix now that it has been written out, then run a
# garbage-collection pass.
del pheno_out
gc.collect()

## COV: peak locations

In [None]:
# Peak coordinates from adata.var, with start/end cast to integers
peak_locs = (
    adata.var[['peak_name', 'chr', 'start', 'end']]
    .copy()
    .astype({'start': int, 'end': int})
)

# Peak naming is in 1-based fully closed notation. Shift the start to get
# 0-based half-open notation, which is what is used in the genotype matrix.
peak_locs['start'] = peak_locs['start'].sub(1)

peak_locs

In [None]:
# Write the peak locations as a tab-separated file without the index column
peak_locs_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/peak_locations.tsv'

# NOTE(review): create_dir is a project helper; presumably it makes sure the
# directory for this path exists -- confirm.
create_dir(peak_locs_path)

peak_locs.to_csv(path_or_buf=peak_locs_path, index=False, sep='\t')

In [None]:
# Drop the peak-location table now that it has been written out, then run a
# garbage-collection pass.
del peak_locs
gc.collect()

## COV: CA PCs

In [None]:
# First n_ca_PCs columns of the stored PCA embedding, indexed by donor id
phe_pcs = pd.DataFrame(adata.obsm['X_pca'][:, :n_ca_PCs]).set_index(adata.obs['donor_id'])

# Name the components phe_PC_1, phe_PC_2, ...
phe_pcs.columns = [f'phe_PC_{k}' for k, _ in enumerate(phe_pcs.columns, start=1)]

# Covariates as rows, donors as columns; the row axis is labelled 'id'
phe_pcs = phe_pcs.T.rename_axis('id')

phe_pcs

In [None]:
# Write the chromatin accessibility PCs as a tab-separated file
phe_pcs_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/footprint_pcs.tsv'

# NOTE(review): create_dir is a project helper; presumably it makes sure the
# directory for this path exists -- confirm.
create_dir(phe_pcs_path)

phe_pcs.to_csv(path_or_buf=phe_pcs_path, sep='\t')

In [None]:
# Drop the chromatin accessibility PC table now that it has been written out,
# then run a garbage-collection pass.
del phe_pcs
gc.collect()

## COV: Genotype PCs

In [None]:
# Genotype PCs: read the table, flip it so the original columns become rows
# labelled 'geno_<name>' under an 'id' axis, and keep the donor columns in
# donor_ids order.
gt_pcs = (
    pd.read_csv(GENOTYPE_PCS_TSV, sep="\t", index_col=0)
    .T
    .rename_axis('id')
    .rename(index=lambda pc_label: f'geno_{pc_label}')
    .loc[:, donor_ids]
)

gt_pcs

In [None]:
# Write the genotype PCs as a tab-separated file
gt_pcs_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/genotype_pcs.tsv'

# NOTE(review): create_dir is a project helper; presumably it makes sure the
# directory for this path exists -- confirm.
create_dir(gt_pcs_path)

gt_pcs.to_csv(path_or_buf=gt_pcs_path, sep='\t')

In [None]:
# Drop the genotype PC table now that it has been written out, then run a
# garbage-collection pass.
del gt_pcs
gc.collect()

## Genotype

In [None]:
# Genotype matrix: SNPs as rows (index named 'snp_id'), donors as columns
gt = pd.read_csv(GENOTYPES_TSV, sep='\t', header=0, index_col=0).rename_axis('snp_id')

# Prefix every SNP id with 'chr'
gt = gt.rename(index=lambda x: 'chr' + str(x))

# Keep only the first row for each SNP id. Selecting with
# `gt.loc[gt.index.drop_duplicates(), :]` does not do this: `.loc` returns
# every row that carries a repeated label, so the duplicates would survive.
gt = gt.loc[~gt.index.duplicated(keep='first'), :]

# Restrict to the donors present in the AnnData, in the same order
gt = gt[donor_ids]
gt

In [None]:
# SNP ids in genotype-matrix row order
snps = gt.index.tolist()

In [None]:
# Write the genotype matrix as a tab-separated file; missing calls are written as 'NaN'
gt_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/genotype_NA.tsv'

# NOTE(review): create_dir is a project helper; presumably it makes sure the
# directory for this path exists -- confirm.
create_dir(gt_path)

gt.to_csv(path_or_buf=gt_path, na_rep='NaN', sep='\t')

In [None]:
# Drop the genotype matrix now that it has been written out, then run a
# garbage-collection pass.
del gt
gc.collect()

## SNP locations

In [None]:
# SNP id / chromosome / position columns from the VCF, renamed and indexed by SNP id
snp_locs = (
    parse_vcf(GENOTYPES_VCF)[['ID', 'CHROM', 'POS']]
    .rename(columns={'ID': 'snp_id', 'CHROM': 'chr', 'POS': 'pos'})
    .set_index('snp_id')
)

In [None]:
# Keep only the SNPs that are in the genotype matrix. There are repeated
# indexes, so collapse each SNP id to a single row by taking the first one.
snp_locs = snp_locs.loc[snps, :].groupby(level=0).first()

# groupby sorts the ids; put the rows back in genotype-matrix order.
# (The original referenced the undefined name `snps_locs` here, which raised
# a NameError.)
snp_locs = snp_locs.reindex(snps)
snp_locs

In [None]:
# Write the SNP locations as a tab-separated file
snps_path = f'{MATRIX_EQTL_INPUT_DIR}/{cell_type}/snp_locations.tsv'

# NOTE(review): create_dir is a project helper; presumably it makes sure the
# directory for this path exists -- confirm.
create_dir(snps_path)

snp_locs.to_csv(path_or_buf=snps_path, sep='\t')

In [None]:
# Drop the SNP location table now that it has been written out, then run a
# garbage-collection pass.
del snp_locs
gc.collect()

# QTL mapping stats

In [None]:
# Collect per-donor summary statistics

stats = adata.obs[['n_cells', 'n_frags']].describe()

# Mean and standard deviation per donor, selected by their describe() row labels
mean_cells_donor, std_cells_donor = stats.loc[['mean', 'std'], 'n_cells'].to_list()
mean_frags_donor, std_frags_donor = stats.loc[['mean', 'std'], 'n_frags'].to_list()

# Totals across all donors
total_cells = adata.obs['n_cells'].sum()
total_frags = adata.obs['n_frags'].sum()

In [None]:
# Append one tab-separated record for this run to the shared metadata file
stats_out_path = f'{MATRIX_EQTL_OUTPUT_DIR}/qtl_testing_metadata.tsv'

# NOTE(review): create_dir is a project helper; presumably it makes sure the
# directory for this path exists -- confirm.
create_dir(stats_out_path)

with open(stats_out_path, 'a') as stats_file:

    # The record starts on a new line and every field, including the last
    # one, is followed by a tab.
    msg = '\n' + ''.join(
        f'{field}\t'
        for field in (
            RUN_ID,
            cell_type,
            n_donors,
            n_peaks,
            total_cells,
            f'{mean_cells_donor:.1f}',
            f'{std_cells_donor:.1f}',
            total_frags,
            f'{mean_frags_donor:.1f}',
            f'{std_frags_donor:.1f}',
        )
    )

    stats_file.write(msg)
    print(msg)