# SGL Tracker

This tracker is focused on the rule **run_colocalization_eqtl_catalog** which produces:

**output:**<br>
    final = protected('results/main/coloc/Results/eQTL_Catalogue/{gwas_source}/{eqtl_source}/{ge_source}/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed')

**log:**<br>
    'results/main/coloc/Results/eQTL_Catalogue/logs/run_colocalization_eqtl_catalog.{gwas_source}.{eqtl_source}.{ge_source}.log'

In [32]:
import os
import pandas as pd 
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')
pd.set_option('display.max_rows', 500)

In [33]:
# load the colocalization samplesheet
samplesheet = pd.read_table('config/sgl_samplesheets/sgl.samplesheet.tsv', comment='#')

In [34]:
(samplesheet['eqtl_db']).value_counts()

ImmuNexUT         96
Schmiedel_2018    60
Quach_2016        16
BLUEPRINT          8
GENCORD            4
Name: eqtl_db, dtype: int64

In [35]:
samplesheet

Unnamed: 0,gwas,eqtl_db,eqtl_origin,loop_origin,hichip_map
0,T1D_34012112_Gaulton,BLUEPRINT,monocyte,monocyte_naive,Y
1,T1D_34012112_Gaulton,Quach_2016,monocyte_IAV,monocyte_naive,M
2,T1D_34012112_Gaulton,Quach_2016,monocyte_LPS,monocyte_naive,M
3,T1D_34012112_Gaulton,Quach_2016,monocyte_Pam3CSK4,monocyte_naive,M
4,T1D_34012112_Gaulton,Quach_2016,monocyte_R848,monocyte_naive,M
5,T1D_34012112_Gaulton,Schmiedel_2018,monocyte_CD16_naive,monocyte_naive,M
6,T1D_34012112_Gaulton,Schmiedel_2018,monocyte_naive,monocyte_naive,Y
7,T1D_34012112_Gaulton,ImmuNexUT,CD16p_Mono,monocyte_naive,Y
8,T1D_34012112_Gaulton,ImmuNexUT,CL_Mono,monocyte_naive,Y
9,T1D_34012112_Gaulton,ImmuNexUT,Int_Mono,monocyte_naive,Y


In [51]:
# check each file in the samplesheet for a log and final file
new_data_cols = []
coloc_tpl = 'results/main/coloc/Results/{main_source}/{gwas_source}/{eqtl_source}/{ge_source}/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'
output_tpl = 'results/main/GRCh37/sgls/ldpairs/eQTL_Catalogue/{gwas_source}/{eqtl_source}/{ge_source}/{loop_source}/script_version/'
log_tpl = 'results/main/sgls/logs/annotate_colocs.{gwas_source}.{eqtl_source}.{ge_source}.{loop_source}.log'
for i, sr in samplesheet.iterrows():
    
    # get the label for the main source 
    if sr.eqtl_db == 'ImmuNexUT':
        main_source = 'ImmuNexUT'
    else:
        main_source = 'eQTL_Catalogue'
        
    # extract source data location
    d = {'eqtl_db': sr.eqtl_db,
         'gwas_source': sr.gwas,
         'eqtl_source': sr.eqtl_db,
         'ge_source': sr.eqtl_origin,
         'loop_source': sr.loop_origin,
         'main_source': main_source}
    
    # identify the presence of coloc data
    coloc = coloc_tpl.format(**d)
    if os.path.exists(coloc):
        has_coloc = 1
    else:
        has_coloc = 0

    # identify the status using the presence of output ana log files
    output = output_tpl.format(**d)
    log = log_tpl.format(**d)
    
    if os.path.exists(output):
        status = 'Complete'
        
    elif os.path.exists(log):
        status = 'Zero SGLs'
        
    else:
        status = 'Not Run'
        
    # append the main source and status to the new data cols
    new_data_cols.append([status, has_coloc, main_source, output, log])
    

In [52]:
output

'results/main/GRCh37/sgls/ldpairs/eQTL_Catalogue/T1D_34594039_GCST90018925/Schmiedel_2018/monocyte_CD16_naive/monocyte_naive/script_version/'

In [53]:
samplesheet['status'], samplesheet['has_coloc'], samplesheet['main_source'], \
    samplesheet['output'], samplesheet['log'] = zip(*new_data_cols)
samplesheet.sort_values(['main_source', 'status', 'has_coloc', 'gwas', 'eqtl_db', 'eqtl_origin'], inplace=True)

In [54]:
samplesheet

Unnamed: 0,gwas,eqtl_db,eqtl_origin,loop_origin,hichip_map,status,has_coloc,main_source,output,log
94,T1D_25751624,ImmuNexUT,Naive_CD4,CD4_T-cell_naive,Y,Not Run,0,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/eQTL_Catalogu...,results/main/sgls/logs/annotate_colocs.T1D_257...
29,T1D_34594039_GCST90018925,ImmuNexUT,CD16p_Mono,monocyte_naive,Y,Not Run,0,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/eQTL_Catalogu...,results/main/sgls/logs/annotate_colocs.T1D_345...
181,T1D_34594039_GCST90018925,ImmuNexUT,CM_CD8,CD8_T-cell_naive,M,Not Run,0,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/eQTL_Catalogu...,results/main/sgls/logs/annotate_colocs.T1D_345...
74,T1D_34594039_GCST90018925,ImmuNexUT,DN_B,B-cell_naive,Y,Not Run,0,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/eQTL_Catalogu...,results/main/sgls/logs/annotate_colocs.T1D_345...
75,T1D_34594039_GCST90018925,ImmuNexUT,DN_B,Plasmablast,Y,Not Run,0,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/eQTL_Catalogu...,results/main/sgls/logs/annotate_colocs.T1D_345...
182,T1D_34594039_GCST90018925,ImmuNexUT,EM_CD8,CD8_T-cell_naive,M,Not Run,0,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/eQTL_Catalogu...,results/main/sgls/logs/annotate_colocs.T1D_345...
151,T1D_34594039_GCST90018925,ImmuNexUT,Fr_III_T,CD4_T-cell_naive,M,Not Run,0,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/eQTL_Catalogu...,results/main/sgls/logs/annotate_colocs.T1D_345...
152,T1D_34594039_GCST90018925,ImmuNexUT,Fr_II_eTreg,CD4_T-cell_naive,M,Not Run,0,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/eQTL_Catalogu...,results/main/sgls/logs/annotate_colocs.T1D_345...
153,T1D_34594039_GCST90018925,ImmuNexUT,Fr_I_nTreg,CD4_T-cell_naive,M,Not Run,0,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/eQTL_Catalogu...,results/main/sgls/logs/annotate_colocs.T1D_345...
31,T1D_34594039_GCST90018925,ImmuNexUT,Int_Mono,monocyte_naive,Y,Not Run,0,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/eQTL_Catalogu...,results/main/sgls/logs/annotate_colocs.T1D_345...


## Checking samples that completed correctly (at least one SGL)

In [57]:
good_sgls = samplesheet.loc[samplesheet.status == 'Complete']

In [59]:
good_sgls.shape

(43, 10)

## Identifying samples that need to be run

**Run output for eQTL Catalogue**

In [60]:
samplesheet.loc[(samplesheet.main_source == 'eQTL_Catalogue')].shape

(88, 10)

In [61]:
samplesheet.loc[(samplesheet.main_source == 'eQTL_Catalogue')].status.value_counts()

Complete     43
Zero SGLs    25
Not Run      20
Name: status, dtype: int64

In [8]:
run_samples = samplesheet.loc[(samplesheet.has_coloc == 1) & \
                              (samplesheet.status == 'Not Run') & \
                              (samplesheet.main_source == 'eQTL_Catalogue')]
run_output_list = run_samples.output.tolist()
run_output_str = ' '.join(run_output_list)
run_output_str

''

In [9]:
run_output_list[0]

IndexError: list index out of range

In [None]:
run_samples

**Run output for ImmuNexUT**

In [17]:
run_samples = samplesheet.loc[(samplesheet.has_coloc == 1) & \
                              (samplesheet.status == 'Not Run') & \
                              (samplesheet.main_source == 'ImmuNexUT')]
run_output_list = run_samples.output.tolist()
run_output_str = ' '.join(run_output_list)

In [18]:
run_output_list[0]

'results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D_25751624/ImmuNexUT/CD16p_Mono/monocyte_naive/script_version/'

In [19]:
run_output_str

'results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D_25751624/ImmuNexUT/CD16p_Mono/monocyte_naive/script_version/ results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D_25751624/ImmuNexUT/CL_Mono/monocyte_naive/script_version/ results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D_25751624/ImmuNexUT/CM_CD8/CD8_T-cell_naive/script_version/ results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D_25751624/ImmuNexUT/DN_B/B-cell_naive/script_version/ results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D_25751624/ImmuNexUT/DN_B/Plasmablast/script_version/ results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D_25751624/ImmuNexUT/EM_CD8/CD8_T-cell_naive/script_version/ results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D_25751624/ImmuNexUT/Fr_III_T/CD4_T-cell_naive/script_version/ results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D_25751624/ImmuNexUT/Fr_II_eTreg/CD4_T-cell_naive/script_version/ results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D_25751624/ImmuNexUT/Fr_I_nTreg/CD4_T-cell_naive/script_version/ results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D

In [20]:
run_samples

Unnamed: 0,gwas,eqtl_db,eqtl_origin,loop_origin,hichip_map,status,has_coloc,main_source,output,log
18,T1D_25751624,ImmuNexUT,CD16p_Mono,monocyte_naive,Y,Not Run,1,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...,results/main/sgls/logs/annotate_colocs.T1D_257...
19,T1D_25751624,ImmuNexUT,CL_Mono,monocyte_naive,Y,Not Run,1,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...,results/main/sgls/logs/annotate_colocs.T1D_257...
160,T1D_25751624,ImmuNexUT,CM_CD8,CD8_T-cell_naive,M,Not Run,1,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...,results/main/sgls/logs/annotate_colocs.T1D_257...
56,T1D_25751624,ImmuNexUT,DN_B,B-cell_naive,Y,Not Run,1,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...,results/main/sgls/logs/annotate_colocs.T1D_257...
57,T1D_25751624,ImmuNexUT,DN_B,Plasmablast,Y,Not Run,1,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...,results/main/sgls/logs/annotate_colocs.T1D_257...
161,T1D_25751624,ImmuNexUT,EM_CD8,CD8_T-cell_naive,M,Not Run,1,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...,results/main/sgls/logs/annotate_colocs.T1D_257...
91,T1D_25751624,ImmuNexUT,Fr_III_T,CD4_T-cell_naive,M,Not Run,1,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...,results/main/sgls/logs/annotate_colocs.T1D_257...
92,T1D_25751624,ImmuNexUT,Fr_II_eTreg,CD4_T-cell_naive,M,Not Run,1,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...,results/main/sgls/logs/annotate_colocs.T1D_257...
93,T1D_25751624,ImmuNexUT,Fr_I_nTreg,CD4_T-cell_naive,M,Not Run,1,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...,results/main/sgls/logs/annotate_colocs.T1D_257...
20,T1D_25751624,ImmuNexUT,Int_Mono,monocyte_naive,Y,Not Run,1,ImmuNexUT,results/main/GRCh37/sgls/ldpairs/ImmuNexUT/T1D...,results/main/sgls/logs/annotate_colocs.T1D_257...


In [21]:
loops_tpl = 'results/main/h3k27ac_hichip/{loop_source}/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed'

## Logs of Jobs

In [22]:
check = samplesheet.groupby(['status'])

### Failed

In [23]:
failed = check.get_group('Zero SGLs')
failed = '\n'.join(failed.log.tolist())

In [24]:
print(failed)

results/main/sgls/logs/annotate_colocs.T1D_34012112_Gaulton.Quach_2016.monocyte_LPS.monocyte_naive.log
results/main/sgls/logs/annotate_colocs.T1D_34012112_Gaulton.Schmiedel_2018.monocyte_CD16_naive.monocyte_naive.log
results/main/sgls/logs/annotate_colocs.T1D_34012112_Gaulton.Schmiedel_2018.monocyte_naive.monocyte_naive.log
results/main/sgls/logs/annotate_colocs.T1D_25751624.BLUEPRINT.T-cell.CD4_T-cell_naive.log
results/main/sgls/logs/annotate_colocs.T1D_25751624.BLUEPRINT.monocyte.monocyte_naive.log
results/main/sgls/logs/annotate_colocs.T1D_25751624.GENCORD.T-cell.CD4_T-cell_naive.log
results/main/sgls/logs/annotate_colocs.T1D_25751624.Quach_2016.monocyte_IAV.monocyte_naive.log
results/main/sgls/logs/annotate_colocs.T1D_25751624.Quach_2016.monocyte_LPS.monocyte_naive.log
results/main/sgls/logs/annotate_colocs.T1D_25751624.Quach_2016.monocyte_Pam3CSK4.monocyte_naive.log
results/main/sgls/logs/annotate_colocs.T1D_25751624.Quach_2016.monocyte_R848.monocyte_naive.log
results/main/sgls/lo

### Status by GWAS source

In [25]:
status_agg = samplesheet.groupby(['has_coloc', 'main_source', 'gwas']).status.value_counts().to_frame()
status_agg.columns = ['count']

In [26]:
status_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
has_coloc,main_source,gwas,status,Unnamed: 4_level_1
0,ImmuNexUT,T1D_25751624,Not Run,1
0,ImmuNexUT,T1D_34594039_GCST90018925,Not Run,23
0,eQTL_Catalogue,T1D_25751624,Not Run,1
0,eQTL_Catalogue,T1D_32005708,Not Run,5
0,eQTL_Catalogue,T1D_34012112_Gaulton,Zero SGLs,3
0,eQTL_Catalogue,T1D_34012112_Gaulton,Not Run,2
0,eQTL_Catalogue,T1D_34594039_GCST90018925,Not Run,12
1,ImmuNexUT,T1D_25751624,Not Run,23
1,ImmuNexUT,T1D_32005708,Not Run,24
1,ImmuNexUT,T1D_34012112_Gaulton,Not Run,24


In [27]:
status_agg.loc[1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
main_source,gwas,status,Unnamed: 3_level_1
ImmuNexUT,T1D_25751624,Not Run,23
ImmuNexUT,T1D_32005708,Not Run,24
ImmuNexUT,T1D_34012112_Gaulton,Not Run,24
ImmuNexUT,T1D_34594039_GCST90018925,Not Run,1
eQTL_Catalogue,T1D_25751624,Zero SGLs,21
eQTL_Catalogue,T1D_32005708,Zero SGLs,17
eQTL_Catalogue,T1D_34012112_Gaulton,Zero SGLs,17
eQTL_Catalogue,T1D_34594039_GCST90018925,Zero SGLs,10


In [28]:
main_source_summary =  samplesheet.groupby(['main_source']).status.value_counts().to_frame()
main_source_summary.columns = ['counts']

In [29]:
main_source_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
main_source,status,Unnamed: 2_level_1
ImmuNexUT,Not Run,96
eQTL_Catalogue,Zero SGLs,68
eQTL_Catalogue,Not Run,20


## Re-run a few


In [30]:
main_source_summary.sum()

counts    184
dtype: int64