# SGL Tracker

This tracker focuses on the rule **run_colocalization_eqtl_catalog**, which produces:

**output:**<br>
    final = protected('results/main/coloc/Results/eQTL_Catalogue/{gwas_source}/{eqtl_source}/{ge_source}/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed')

**log:**<br>
    'results/main/coloc/Results/eQTL_Catalogue/logs/run_colocalization_eqtl_catalog.{gwas_source}.{eqtl_source}.{ge_source}.log'

In [149]:
import os
import pandas as pd 
import numpy as np
# Move to the project root so the relative paths used in this notebook resolve.
# The location can be overridden with the DCHALLENGE_PROJECT_DIR environment
# variable; when it is unset the original hard-coded path is used.
PROJECT_DIR = os.environ.get('DCHALLENGE_PROJECT_DIR',
                             '/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')
os.chdir(PROJECT_DIR)

# raise the pandas display limits so long/wide tables are not truncated
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [150]:
# read the colocalization samplesheet (tab-separated; lines starting with '#' are skipped)
samplesheet_path = 'config/sgl_samplesheets/sgl.samplesheet.tsv'
samplesheet = pd.read_csv(samplesheet_path, sep='\t', comment='#')

In [151]:
# number of samplesheet rows per eQTL database
samplesheet['eqtl_db'].value_counts()

ImmuNexUT         96
Schmiedel_2018    60
Quach_2016        16
BLUEPRINT          8
GENCORD            4
Name: eqtl_db, dtype: int64

In [152]:
# display the samplesheet as loaded (one row per gwas / eqtl_db / eqtl_origin / loop_origin combination)
samplesheet

Unnamed: 0,gwas,eqtl_db,eqtl_origin,loop_origin,hichip_map
0,T1D_34012112_Gaulton,BLUEPRINT,monocyte,monocyte_naive,Y
1,T1D_34012112_Gaulton,Quach_2016,monocyte_IAV,monocyte_naive,M
2,T1D_34012112_Gaulton,Quach_2016,monocyte_LPS,monocyte_naive,M
3,T1D_34012112_Gaulton,Quach_2016,monocyte_Pam3CSK4,monocyte_naive,M
4,T1D_34012112_Gaulton,Quach_2016,monocyte_R848,monocyte_naive,M
5,T1D_34012112_Gaulton,Schmiedel_2018,monocyte_CD16_naive,monocyte_naive,M
6,T1D_34012112_Gaulton,Schmiedel_2018,monocyte_naive,monocyte_naive,Y
7,T1D_34012112_Gaulton,ImmuNexUT,CD16p_Mono,monocyte_naive,Y
8,T1D_34012112_Gaulton,ImmuNexUT,CL_Mono,monocyte_naive,Y
9,T1D_34012112_Gaulton,ImmuNexUT,Int_Mono,monocyte_naive,Y


In [153]:
# For every samplesheet row, build the expected input/output paths and record
# whether each of them exists on disk. One list per row is appended to
# new_data_cols: four 0/1 flags followed by the four paths they refer to.
new_data_cols = []

# templates for the input files
eqtl_tpl = 'results/main/GRCh37/sgls/{eqtl_db}/{gwas_source}/{eqtl_source}/{ge_source}/eqtls.coloc_filtered.tsv.gz'
coloc_tpl = 'results/main/GRCh37/coloc/eQTL_Catalogue/{gwas_source}/{eqtl_source}/{ge_source}/ldpairs/coloc_ld_snps.txt'
loops_tpl = 'results/main/h3k27ac_hichip/{loop_source}/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed'

# template for the output directory
outdir_tpl = 'results/main/GRCh37/sgls/ldpairs/{eqtl_db}/{gwas_source}/{eqtl_source}/{ge_source}/{loop_source}/script_version/'

for _, row in samplesheet.iterrows():

    # values substituted into the templates; 'main_source' labels the row as
    # ImmuNexUT or eQTL_Catalogue but none of the templates above reference it
    fields = dict(
        eqtl_db=row.eqtl_db,
        gwas_source=row.gwas,
        eqtl_source=row.eqtl_db,
        ge_source=row.eqtl_origin,
        loop_source=row.loop_origin,
        main_source='ImmuNexUT' if row.eqtl_db == 'ImmuNexUT' else 'eQTL_Catalogue',
    )

    # resolve the paths in the order: eqtl, coloc, loops, outdir
    paths = [tpl.format(**fields)
             for tpl in (eqtl_tpl, coloc_tpl, loops_tpl, outdir_tpl)]

    # presence of each path, stored as an integer rather than a boolean
    flags = [int(os.path.exists(path)) for path in paths]

    # TODO: log-based status is not ready yet; the intended labels are
    # 'Complete' (output exists), 'Zero SGLs' (only the log exists) and
    # 'Not Run' (neither exists)

    # flags first, then the paths, matching the column order used downstream
    new_data_cols.append(flags + paths)

In [154]:
# Turn the per-row flags/paths collected in new_data_cols into a DataFrame and
# attach them to the samplesheet as extra columns.
status_cols = ['has_eqtl', 'has_coloc', 'has_loops', 'has_outdir',
               'eqtl_path', 'coloc_path', 'loops_path', 'outdir_path']

# reuse the samplesheet index so the column-wise concat lines up row-for-row
status_df = pd.DataFrame(new_data_cols, columns=status_cols, index=samplesheet.index)

# drop status columns left over from an earlier run of this cell; without this,
# re-running the cell would append a second (duplicate) set of columns
samplesheet = pd.concat([samplesheet.drop(columns=status_cols, errors='ignore'),
                         status_df], axis=1)

In [161]:
# rows whose output directory already exists
samplesheet[samplesheet['has_outdir'] == 1]

Unnamed: 0,gwas,eqtl_db,eqtl_origin,loop_origin,hichip_map,has_eqtl,has_coloc,has_loops,has_outdir,eqtl_path,coloc_path,loops_path,outdir_path
20,T1D_25751624,ImmuNexUT,Int_Mono,monocyte_naive,Y,1,1,1,1,results/main/GRCh37/sgls/ImmuNexUT/T1D_2575162...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_2...,results/main/h3k27ac_hichip/monocyte_naive/Fit...,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...
41,T1D_32005708,ImmuNexUT,CL_Mono,monocyte_naive,Y,1,1,1,1,results/main/GRCh37/sgls/ImmuNexUT/T1D_3200570...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/monocyte_naive/Fit...,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...
42,T1D_32005708,ImmuNexUT,Int_Mono,monocyte_naive,Y,1,1,1,1,results/main/GRCh37/sgls/ImmuNexUT/T1D_3200570...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/monocyte_naive/Fit...,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...
43,T1D_32005708,ImmuNexUT,NC_Mono,monocyte_naive,M,1,1,1,1,results/main/GRCh37/sgls/ImmuNexUT/T1D_3200570...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/monocyte_naive/Fit...,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...
47,T1D_25751624,ImmuNexUT,NK,NK-cell_naive,Y,1,1,1,1,results/main/GRCh37/sgls/ImmuNexUT/T1D_2575162...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_2...,results/main/h3k27ac_hichip/NK-cell_naive/FitH...,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...
51,T1D_32005708,ImmuNexUT,NK,NK-cell_naive,Y,1,1,1,1,results/main/GRCh37/sgls/ImmuNexUT/T1D_3200570...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/NK-cell_naive/FitH...,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...
53,T1D_25751624,ImmuNexUT,Naive_B,B-cell_naive,Y,1,1,1,1,results/main/GRCh37/sgls/ImmuNexUT/T1D_2575162...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_2...,results/main/h3k27ac_hichip/B-cell_naive/FitHi...,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...
55,T1D_25751624,ImmuNexUT,SM_B,B-cell_naive,Y,1,1,1,1,results/main/GRCh37/sgls/ImmuNexUT/T1D_2575162...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_2...,results/main/h3k27ac_hichip/B-cell_naive/FitHi...,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...
56,T1D_25751624,ImmuNexUT,DN_B,B-cell_naive,Y,1,1,1,1,results/main/GRCh37/sgls/ImmuNexUT/T1D_2575162...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_2...,results/main/h3k27ac_hichip/B-cell_naive/FitHi...,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...
59,T1D_32005708,ImmuNexUT,Naive_B,B-cell_naive,Y,1,1,1,1,results/main/GRCh37/sgls/ImmuNexUT/T1D_3200570...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/B-cell_naive/FitHi...,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...


In [163]:
# rows whose output directory is missing, grouped by eQTL database
samplesheet[samplesheet['has_outdir'] == 0].sort_values(by='eqtl_db')

Unnamed: 0,gwas,eqtl_db,eqtl_origin,loop_origin,hichip_map,has_eqtl,has_coloc,has_loops,has_outdir,eqtl_path,coloc_path,loops_path,outdir_path
0,T1D_34012112_Gaulton,BLUEPRINT,monocyte,monocyte_naive,Y,0,0,1,0,results/main/GRCh37/sgls/BLUEPRINT/T1D_3401211...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/monocyte_naive/Fit...,results/main/GRCh37/sgls/ldpairs/BLUEPRINT/T1D...
22,T1D_34594039_GCST90018925,BLUEPRINT,monocyte,monocyte_naive,Y,0,1,1,0,results/main/GRCh37/sgls/BLUEPRINT/T1D_3459403...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/monocyte_naive/Fit...,results/main/GRCh37/sgls/ldpairs/BLUEPRINT/T1D...
33,T1D_32005708,BLUEPRINT,monocyte,monocyte_naive,Y,0,1,1,0,results/main/GRCh37/sgls/BLUEPRINT/T1D_3200570...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/monocyte_naive/Fit...,results/main/GRCh37/sgls/ldpairs/BLUEPRINT/T1D...
116,T1D_34012112_Gaulton,BLUEPRINT,T-cell,CD4_T-cell_naive,M,0,1,1,0,results/main/GRCh37/sgls/BLUEPRINT/T1D_3401211...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/CD4_T-cell_naive/F...,results/main/GRCh37/sgls/ldpairs/BLUEPRINT/T1D...
136,T1D_34594039_GCST90018925,BLUEPRINT,T-cell,CD4_T-cell_naive,M,0,1,1,0,results/main/GRCh37/sgls/BLUEPRINT/T1D_3459403...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/CD4_T-cell_naive/F...,results/main/GRCh37/sgls/ldpairs/BLUEPRINT/T1D...
11,T1D_25751624,BLUEPRINT,monocyte,monocyte_naive,Y,0,1,1,0,results/main/GRCh37/sgls/BLUEPRINT/T1D_2575162...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_2...,results/main/h3k27ac_hichip/monocyte_naive/Fit...,results/main/GRCh37/sgls/ldpairs/BLUEPRINT/T1D...
76,T1D_25751624,BLUEPRINT,T-cell,CD4_T-cell_naive,M,0,1,1,0,results/main/GRCh37/sgls/BLUEPRINT/T1D_2575162...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_2...,results/main/h3k27ac_hichip/CD4_T-cell_naive/F...,results/main/GRCh37/sgls/ldpairs/BLUEPRINT/T1D...
96,T1D_32005708,BLUEPRINT,T-cell,CD4_T-cell_naive,M,0,0,1,0,results/main/GRCh37/sgls/BLUEPRINT/T1D_3200570...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/CD4_T-cell_naive/F...,results/main/GRCh37/sgls/ldpairs/BLUEPRINT/T1D...
77,T1D_25751624,GENCORD,T-cell,CD4_T-cell_naive,M,0,1,1,0,results/main/GRCh37/sgls/GENCORD/T1D_25751624/...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_2...,results/main/h3k27ac_hichip/CD4_T-cell_naive/F...,results/main/GRCh37/sgls/ldpairs/GENCORD/T1D_2...
97,T1D_32005708,GENCORD,T-cell,CD4_T-cell_naive,M,0,1,1,0,results/main/GRCh37/sgls/GENCORD/T1D_32005708/...,results/main/GRCh37/coloc/eQTL_Catalogue/T1D_3...,results/main/h3k27ac_hichip/CD4_T-cell_naive/F...,results/main/GRCh37/sgls/ldpairs/GENCORD/T1D_3...
