# Setup

In [13]:
import json
import warnings

import numpy as np
import pandas as pd

In [14]:
# Ensure cwd is project root and that /code is in path

import os
import sys

# Relative path from each known launch directory (identified by its basename)
# to the project root.
_PROJECT_DIR_BY_CWD = {
    'footprintQTL': '.',
    'code': '..',
    'fichtner': 'projects/footprintQTL',
}

cwd = os.path.basename(os.getcwd())

# An unrecognised launch directory falls back to the placeholder 'manual':
# os.chdir() below will then fail unless a directory of that name exists,
# so set PROJECT_DIR by hand in that situation.
PROJECT_DIR = _PROJECT_DIR_BY_CWD.get(cwd, 'manual')

os.chdir(PROJECT_DIR)

# Make the project's code directory importable. The membership check keeps the
# cell idempotent: re-running it no longer appends the same entry again.
code_dir = os.getcwd() + '/code'
if code_dir not in sys.path:
    sys.path.append(code_dir)

In [15]:
from helpers.python.utils import ct_format, ct_format_alt, create_dir

## Variables

In [16]:
#User

from glob_vars import GENOTYPES_PROCESSED_TSV, SNP_LOCS_BED, PRECOMPUTED_EQTLS_TSV, CT_MAP_JSON, CT_MAP_ID, EQTLS_DIR

In [17]:
# Read the cell-type mapping JSON. Downstream code iterates it as
# {grouped cell type: [annotated cell types, ...]}.
with open(CT_MAP_JSON, 'r') as ct_map_file:
    ct_map = json.loads(ct_map_file.read())

In [18]:
# Format

# Build both lookup directions with normalised labels:
#   ct_map_alt   : formatted grouped cell type -> list of formatted annotation labels
#   ct_map_i_alt : formatted annotation label  -> formatted grouped cell type
# Grouped names are passed through ct_format, annotation labels through
# ct_format_alt (not ct_format).
ct_map_alt = dict(
    (ct_format(new_ct), list(map(ct_format_alt, old_cts)))
    for new_ct, old_cts in ct_map.items()
)

# Invert the formatted mapping; an annotation label listed under several
# grouped cell types ends up with the last one encountered.
ct_map_i_alt = dict(
    (old_ct, new_ct)
    for new_ct, old_cts in ct_map_alt.items()
    for old_ct in old_cts
)

## Genotype tsv

In [19]:
# Load the processed genotype matrix: the first column ('snp' in the output
# shown below) becomes the row index, the remaining columns are sample IDs.
gt = pd.read_table(
    GENOTYPES_PROCESSED_TSV,
    header=0,
    index_col=0,
)
gt

Unnamed: 0_level_0,SAMEA2536416,SAMEA3854313,SAMEA3853161,SAMEA3966392,SAMEA2625592,SAMEA2698309,SAMEA3962624,SAMEA3965165,SAMEA2595426,SAMEA3962389,...,SAMEA2555017,SAMEA2420640,SAMEA4451109,SAMEA3448738,SAMEA2627141,SAMEA3485958,SAMEA2536413,SAMEA2464819,SAMEA2627567,SAMEA2645814
snp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr10_100000235_C_T,2.0,0.0,1.0,1.0,1.0,1.0,2.0,2.0,0.0,1.0,...,2.0,2.0,2.0,1.0,1.0,2.0,0.0,2.0,2.0,2.0
chr10_100000943_G_A,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0
chr10_100000979_T_C,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0
chr10_100002628_A_C,1.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,1.0,...,2.0,2.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0
chr10_100002875_A_G,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr9_9999539_A_G,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0
chr9_99997250_C_T,2.0,1.0,1.0,2.0,2.0,2.0,2.0,0.0,2.0,1.0,...,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,2.0
chr9_99998141_T_C,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0
chr9_99998283_G_C,2.0,1.0,1.0,2.0,2.0,2.0,2.0,0.0,2.0,1.0,...,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,2.0


### snp_locations.bed

In [20]:
# Build a BED file with one single-base interval per genotyped SNP.
# SNP IDs look like 'chr10_100000235_C_T': field 0 is the contig and field 1
# the 1-based position.
snps_bed = gt.index.to_frame().reset_index(drop=True)

# Split the ID once and reuse the pieces.
snp_fields = snps_bed['snp'].str.split('_')
snps_bed['contig'] = snp_fields.str[0].astype(str)
snp_pos = snp_fields.str[1].astype(int)

# Assumed only chroms in contigs: autosomes chr1-chr22 plus the sex chromosomes.
# All contig names carry the 'chr' prefix, so X/Y must be spelled 'chrX'/'chrY'
# (bare 'X'/'Y' can never match), and there is no chr23.
chroms = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY']
unexpected_contigs = sorted(set(snps_bed['contig']) - set(chroms))
assert not unexpected_contigs, f'Unexpected contigs in genotype matrix: {unexpected_contigs}'

# Format bed
snps_bed['chr'] = snps_bed['contig']
snps_bed['start'] = snp_pos - 1 # 1-based fully closed (GT matrix) to 0-based half open (bed tools and matrix-eQTL)
snps_bed['end'] = snp_pos       # kept as int so start/end share a dtype; written text is unchanged

snps_bed = snps_bed[['chr', 'start', 'end', 'snp']].sort_values(['chr', 'start'], ascending=[True, True])

# Save bed
snps_bed.to_csv(SNP_LOCS_BED, sep='\t', header=False, index=False)

## Pre-computed eQTLs

### Make eQTLs bed file

In [9]:
# Load the pre-computed eQTL table. Column 21 (0-based) is used as the row
# index; in the output shown below that index is named 'QTL'.
eqtls = pd.read_table(
    PRECOMPUTED_EQTLS_TSV,
    header=0,
    index_col=21,
)
eqtls

Unnamed: 0_level_0,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,gene_name,n_samples,...,snp_position,assessed_allele,call_rate,maf,hwe_p,feature_id,global_corrected_pValue,global_corrected_pValue_BH,global_corrected_pValue_BF,celltype
QTL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000269376-chr13_112983450_T_C,chr13_112983450_T_C,0.000066,-0.364427,0.092665,0.000000e+00,13,113009671,113010319,,277,...,112983450,C,1.0,0.044776,1.000000,ENSG00000269376,0.000000e+00,0.000000e+00,0.000000e+00,immature_EN
ENSG00000183463-chr13_27824764_C_T,chr13_27824764_C_T,0.047441,-0.089739,0.045430,0.000000e+00,13,27977717,27988693,URAD,277,...,27824764,C,1.0,0.089552,0.073438,ENSG00000183463,0.000000e+00,0.000000e+00,0.000000e+00,immature_EN
ENSG00000259182-chr15_101221494_A_C,chr15_101221494_A_C,0.202509,-0.088910,0.069866,0.000000e+00,15,101168530,101170821,,277,...,101221494,A,1.0,0.044776,0.110206,ENSG00000259182,0.000000e+00,0.000000e+00,0.000000e+00,immature_EN
ENSG00000286922-chr12_127567286_A_G,chr12_127567286_A_G,0.265404,-0.030918,0.027792,0.000000e+00,12,127486938,127533242,,277,...,127567286,G,1.0,0.074627,1.000000,ENSG00000286922,0.000000e+00,0.000000e+00,0.000000e+00,immature_EN
ENSG00000286246-chr12_129375924_C_G,chr12_129375924_C_G,0.949835,-0.000101,0.001608,4.101945e-163,12,129622929,129625366,,277,...,129375924,C,1.0,0.089552,0.073438,ENSG00000286246,1.930191e-161,2.223254e-161,1.111627e-160,immature_EN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000250842-chr5_145190494_A_T,chr5_145190494_A_T,0.000033,0.535958,0.130797,1.828702e-02,5,145337932,145381670,,310,...,145190494,A,1.0,0.316176,1.000000,ENSG00000250842,4.610052e-02,4.610052e-02,1.000000e+00,Differentiating_RG
ENSG00000267537-chr19_28873752_G_A,chr19_28873752_G_A,0.000285,-0.620958,0.172965,1.854836e-02,19,28602379,28648303,,310,...,28873752,G,1.0,0.066176,0.246147,ENSG00000267537,4.669823e-02,4.669823e-02,1.000000e+00,Differentiating_RG
ENSG00000287056-chr6_168100952_C_G,chr6_168100952_C_G,0.001926,-0.216988,0.070509,1.932144e-02,6,168217032,168220262,,310,...,168100952,C,1.0,0.102941,1.000000,ENSG00000287056,4.858106e-02,4.858106e-02,1.000000e+00,Differentiating_RG
ENSG00000144130-chr2_112830976_A_G,chr2_112830976_A_G,0.017801,-0.395662,0.167722,1.944629e-02,2,112721020,112742879,NT5DC4,310,...,112830976,G,1.0,0.073529,1.000000,ENSG00000144130,4.883123e-02,4.883123e-02,1.000000e+00,Differentiating_RG


In [10]:
# Distinct cell-type labels present in the eQTL table
eqtls.loc[:, 'celltype'].unique()

array(['immature_EN', 'Stressed_Prog', 'IPC', 'Glia', 'UL-EN',
       'Midbrain_EN', 'Dividing_Glia', 'DL-EN', 'Interneurons',
       'Stressed_Neurons', 'Differentiating_RG'], dtype=object)

In [11]:
# Write BED files of the pre-computed eQTL SNPs: one for all retained cell
# types together and one per grouped cell type.

# Format df
eqtls_bed = eqtls[['snp_chromosome', 'snp_position', 'snp_id', 'celltype']].copy()
eqtls_bed['chr'] = 'chr' + eqtls_bed['snp_chromosome'].astype(str)
eqtls_bed['start'] = eqtls_bed['snp_position'] - 1 # Make index 0-based open
# Map on the local copy rather than on `eqtls`, so the assignment does not
# depend on index alignment between two frames.
eqtls_bed['cell_type_custom'] = eqtls_bed['celltype'].map(ct_map_i_alt)
eqtls_bed = eqtls_bed.rename(columns={'snp_position': 'end', 'snp_id': 'id'})

# Labels absent from ct_map_i_alt map to NaN. Report them instead of letting
# them pass silently (previously they produced an empty '<...>/nan/eqtls.bed').
unmapped_cts = eqtls_bed.loc[eqtls_bed['cell_type_custom'].isna(), 'celltype'].unique()
if len(unmapped_cts) > 0:
    warnings.warn(
        'Cell types missing from ct_map_i_alt (kept in the all-cell-types bed, '
        f'skipped for per-cell-type beds): {list(unmapped_cts)}'
    )

# Filter out 'Discard' cell-types
eqtls_bed = eqtls_bed[eqtls_bed['cell_type_custom'] != 'Discard'].copy()

bed_cols = ['chr', 'start', 'end', 'id']

# Save bed for all cell-types
eqtls_bed_all_cts = eqtls_bed[bed_cols].sort_values(by=['chr', 'start'], ascending=[True, True])

eqtls_bed_all_cts_path = f'{EQTLS_DIR}/{CT_MAP_ID}/eqtls_all_cell-types.bed'
# create_dir is a project helper; presumably it creates the parent directory
# of the given file path -- confirm against helpers.python.utils.
create_dir(eqtls_bed_all_cts_path)
eqtls_bed_all_cts.to_csv(eqtls_bed_all_cts_path, sep='\t', header=False, index=False)

# Save bed for individual cell-types
# groupby drops NaN keys by default, so unmapped cell types are skipped here.
for ct, eqtls_bed_ct in eqtls_bed.groupby('cell_type_custom', sort=False):

    eqtls_bed_ct = eqtls_bed_ct[bed_cols].sort_values(by=['chr', 'start'], ascending=[True, True])

    eqtls_bed_ct_path = f'{EQTLS_DIR}/{CT_MAP_ID}/{ct}/eqtls.bed'
    create_dir(eqtls_bed_ct_path)
    eqtls_bed_ct.to_csv(eqtls_bed_ct_path, sep='\t', header=False, index=False)

eqtls_bed

Unnamed: 0_level_0,snp_chromosome,end,id,celltype,chr,start,cell_type_custom
QTL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000269376-chr13_112983450_T_C,13,112983450,chr13_112983450_T_C,immature_EN,chr13,112983449,Neural-progenitors
ENSG00000183463-chr13_27824764_C_T,13,27824764,chr13_27824764_C_T,immature_EN,chr13,27824763,Neural-progenitors
ENSG00000259182-chr15_101221494_A_C,15,101221494,chr15_101221494_A_C,immature_EN,chr15,101221493,Neural-progenitors
ENSG00000286922-chr12_127567286_A_G,12,127567286,chr12_127567286_A_G,immature_EN,chr12,127567285,Neural-progenitors
ENSG00000286246-chr12_129375924_C_G,12,129375924,chr12_129375924_C_G,immature_EN,chr12,129375923,Neural-progenitors
...,...,...,...,...,...,...,...
ENSG00000250842-chr5_145190494_A_T,5,145190494,chr5_145190494_A_T,Differentiating_RG,chr5,145190493,Glia
ENSG00000267537-chr19_28873752_G_A,19,28873752,chr19_28873752_G_A,Differentiating_RG,chr19,28873751,Glia
ENSG00000287056-chr6_168100952_C_G,6,168100952,chr6_168100952_C_G,Differentiating_RG,chr6,168100951,Glia
ENSG00000144130-chr2_112830976_A_G,2,112830976,chr2_112830976_A_G,Differentiating_RG,chr2,112830975,Glia
