# Colocalization Tracker

This tracker is focused on the rule **run_colocalization_eqtl_catalog** which produces:

**output:**<br>
    final = protected('results/main/coloc/Results/eQTL_Catalogue/{gwas_source}/{eqtl_source}/{ge_source}/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed')

**log:**<br>
    'results/main/coloc/Results/eQTL_Catalogue/logs/run_colocalization_eqtl_catalog.{gwas_source}.{eqtl_source}.{ge_source}.log'

In [13]:
import os
import pandas as pd 
# Work from the project directory so the relative paths used by later cells resolve.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
PROJECT_DIR = '/mnt/BioHome/jreyna/jreyna/projects/dchallenge/'
os.chdir(PROJECT_DIR)

# Let pandas render up to 500 rows before truncating displayed tables.
pd.options.display.max_rows = 500

In [14]:
# Read the tab-separated colocalization samplesheet
# (path is relative to the current working directory).
samplesheet = pd.read_csv('config/coloc_samplesheets/coloc.samplesheet.tsv', sep='\t')

In [15]:
# Display the samplesheet as loaded, before any status columns are added.
samplesheet

Unnamed: 0,gwas,eqtl_db,eqtl_origin
0,T1D_34012112_Gaulton,BLUEPRINT,monocyte
1,T1D_34012112_Gaulton,BLUEPRINT,neutrophil
2,T1D_34012112_Gaulton,BLUEPRINT,T-cell
3,T1D_34012112_Gaulton,GENCORD,LCL
4,T1D_34012112_Gaulton,GENCORD,T-cell
5,T1D_34012112_Gaulton,GTEx,blood
6,T1D_34012112_Gaulton,GTEx,LCL
7,T1D_34012112_Gaulton,GTEx,pancreas
8,T1D_34012112_Gaulton,Lepik_2017,blood
9,T1D_34012112_Gaulton,Quach_2016,monocyte_IAV


In [39]:
# For every samplesheet row, work out where its final output and log file
# should be and classify the job from which of the two exists on disk.
output_tpl = 'results/main/coloc/Results/{main_source}/{gwas_source}/{eqtl_source}/{ge_source}/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'
log_tpl = 'results/main/coloc/Results/{main_source}/logs/run_colocalization_eqtl_catalog.{gwas_source}.{eqtl_source}.{ge_source}.log'

# one [status, main_source, output, log] record per row, in samplesheet row order
new_data_cols = []
for gwas, eqtl_db, eqtl_origin in zip(samplesheet['gwas'],
                                      samplesheet['eqtl_db'],
                                      samplesheet['eqtl_origin']):

    # ImmuNexUT results have their own subtree; every other eQTL database
    # is filed under eQTL_Catalogue
    main_source = 'ImmuNexUT' if eqtl_db == 'ImmuNexUT' else 'eQTL_Catalogue'

    # fill both path templates from the same set of fields
    fields = dict(gwas_source=gwas,
                  eqtl_source=eqtl_db,
                  ge_source=eqtl_origin,
                  main_source=main_source)
    output = output_tpl.format(**fields)
    log = log_tpl.format(**fields)

    # final file present -> 'Complete'; only the log present -> 'Zero Colocs';
    # neither -> 'Not Run'
    # NOTE(review): a job that wrote its log and then crashed would also be
    # labelled 'Zero Colocs' here -- confirm against the log contents.
    if not os.path.exists(output):
        status = 'Zero Colocs' if os.path.exists(log) else 'Not Run'
    else:
        status = 'Complete'

    new_data_cols.append([status, main_source, output, log])

In [40]:
# Transpose the per-row records into one tuple per new column.
# Assumes new_data_cols holds one record per samplesheet row, in the
# samplesheet's current row order (values are assigned by position).
statuses, main_sources, outputs, logs = zip(*new_data_cols)
samplesheet['status'] = statuses
samplesheet['main_source'] = main_sources
samplesheet['output'] = outputs
samplesheet['log'] = logs

# Sort in place by source, then status, then the dataset identifiers.
samplesheet.sort_values(by=['main_source', 'status', 'gwas', 'eqtl_db', 'eqtl_origin'],
                        inplace=True)

In [41]:
# Display the samplesheet with the added status, main_source, output and log columns.
samplesheet

Unnamed: 0,gwas,eqtl_db,eqtl_origin,status,main_source,output,log
148,T1D_25751624,ImmuNexUT,CD16p_Mono,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
149,T1D_25751624,ImmuNexUT,CL_Mono,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
150,T1D_25751624,ImmuNexUT,CM_CD8,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
151,T1D_25751624,ImmuNexUT,DN_B,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
152,T1D_25751624,ImmuNexUT,EM_CD8,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
154,T1D_25751624,ImmuNexUT,Fr_III_T,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
153,T1D_25751624,ImmuNexUT,Fr_II_eTreg,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
155,T1D_25751624,ImmuNexUT,Fr_I_nTreg,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
156,T1D_25751624,ImmuNexUT,Int_Mono,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
157,T1D_25751624,ImmuNexUT,LDG,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...


## Logs of Jobs

In [60]:
# Group the annotated samplesheet by job status so the rows for a single
# status can be pulled out with get_group().
# Grouping by the bare column name (not a one-element list) keeps plain string
# keys such as 'Zero Colocs' valid: with a length-1 list, pandas 2.x warns on
# scalar keys and newer versions expect a 1-tuple instead.
check = samplesheet.groupby('status')

### Failed (status `Zero Colocs`: log present, no final output)

In [57]:
# Rows whose status is 'Zero Colocs' (log file exists, final output does not).
zero_coloc_jobs = check.get_group('Zero Colocs')

# Newline-separated log paths of those rows, ready to print.
failed = '\n'.join(zero_coloc_jobs['log'])

In [58]:
# print() rather than a bare expression so each log path lands on its own line.
print(failed)

results/main/coloc/Results/ImmuNexUT/logs/run_colocalization_eqtl_catalog.T1D_25751624.ImmuNexUT.Naive_CD4.log
results/main/coloc/Results/ImmuNexUT/logs/run_colocalization_eqtl_catalog.T1D_25751624.ImmuNexUT.mDC.log
results/main/coloc/Results/ImmuNexUT/logs/run_colocalization_eqtl_catalog.T1D_34594039_GCST90018925.ImmuNexUT.CD16p_Mono.log
results/main/coloc/Results/ImmuNexUT/logs/run_colocalization_eqtl_catalog.T1D_34594039_GCST90018925.ImmuNexUT.CM_CD8.log
results/main/coloc/Results/ImmuNexUT/logs/run_colocalization_eqtl_catalog.T1D_34594039_GCST90018925.ImmuNexUT.DN_B.log
results/main/coloc/Results/ImmuNexUT/logs/run_colocalization_eqtl_catalog.T1D_34594039_GCST90018925.ImmuNexUT.EM_CD8.log
results/main/coloc/Results/ImmuNexUT/logs/run_colocalization_eqtl_catalog.T1D_34594039_GCST90018925.ImmuNexUT.Fr_III_T.log
results/main/coloc/Results/ImmuNexUT/logs/run_colocalization_eqtl_catalog.T1D_34594039_GCST90018925.ImmuNexUT.Fr_II_eTreg.log
results/main/coloc/Results/ImmuNexUT/logs/run_col

### Status by GWAS source

In [32]:
# Count jobs per status within each (main_source, gwas) pair.
status_agg = (
    samplesheet
    .groupby(['main_source', 'gwas'])['status']
    .value_counts()
    .to_frame()
)

In [33]:
# Display the per-source, per-GWAS status counts.
status_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,status
main_source,gwas,status,Unnamed: 3_level_1
ImmuNexUT,T1D_25751624,Complete,26
ImmuNexUT,T1D_25751624,Zero Colocs,2
ImmuNexUT,T1D_32005708,Complete,28
ImmuNexUT,T1D_34012112_Gaulton,Complete,28
ImmuNexUT,T1D_34594039_GCST90018925,Zero Colocs,27
ImmuNexUT,T1D_34594039_GCST90018925,Complete,1
eQTL_Catalogue,T1D_25751624,Complete,28
eQTL_Catalogue,T1D_25751624,Zero Colocs,2
eQTL_Catalogue,T1D_32005708,Complete,24
eQTL_Catalogue,T1D_32005708,Zero Colocs,6


In [34]:
 # Job-status counts per main eQTL source, pooled over all GWAS.
 (
     samplesheet
     .groupby(['main_source'])['status']
     .value_counts()
     .to_frame()
 )

Unnamed: 0_level_0,Unnamed: 1_level_0,status
main_source,status,Unnamed: 2_level_1
ImmuNexUT,Complete,83
ImmuNexUT,Zero Colocs,29
eQTL_Catalogue,Complete,82
eQTL_Catalogue,Zero Colocs,38


## Re-run a few


In [59]:
# Log paths of the jobs picked for a re-run, written as
# (gwas, eqtl_db, eqtl_origin) triplets and expanded through the
# eQTL_Catalogue log-path pattern.
# NOTE(review): the variable name suggests these jobs ran out of memory -- confirm.
memory_failed_log_tpl = 'results/main/coloc/Results/eQTL_Catalogue/logs/run_colocalization_eqtl_catalog.{}.{}.{}.log'
memory_failed_jobs = [
    ('T1D_34012112_Gaulton', 'Quach_2016', 'monocyte_IAV'),
    ('T1D_34012112_Gaulton', 'Schmiedel_2018', 'CD8_T-cell_anti-CD3-CD28'),
    ('T1D_34594039_GCST90018925', 'GTEx', 'blood'),
    ('T1D_34594039_GCST90018925', 'Quach_2016', 'monocyte_LPS'),
    ('T1D_34594039_GCST90018925', 'Schmiedel_2018', 'CD4_T-cell_anti-CD3-CD28'),
    ('T1D_34594039_GCST90018925', 'Schmiedel_2018', 'Treg_naive'),
]
memory_failed_logs = [memory_failed_log_tpl.format(*job) for job in memory_failed_jobs]

In [61]:
# Keep only the samplesheet rows whose log path is in the re-run list.
needs_rerun = samplesheet['log'].isin(memory_failed_logs)
rerun = samplesheet[needs_rerun]

In [67]:
# Space-separated output paths of the jobs to re-run
# (presumably for pasting as workflow targets on a command line -- confirm).
rerun_list = ' '.join(rerun['output'])

In [68]:
# Show the space-separated output paths as a single string.
rerun_list

'results/main/coloc/Results/eQTL_Catalogue/T1D_34012112_Gaulton/Quach_2016/monocyte_IAV/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed results/main/coloc/Results/eQTL_Catalogue/T1D_34012112_Gaulton/Schmiedel_2018/CD8_T-cell_anti-CD3-CD28/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed results/main/coloc/Results/eQTL_Catalogue/T1D_34594039_GCST90018925/GTEx/blood/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed results/main/coloc/Results/eQTL_Catalogue/T1D_34594039_GCST90018925/Quach_2016/monocyte_LPS/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed results/main/coloc/Results/eQTL_Catalogue/T1D_34594039_GCST90018925/Schmiedel_2018/CD4_T-cell_anti-CD3-CD28/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed results/main/coloc/Results/eQTL_Catalogue/T1D_34594039_GCST90018925/Schmiedel_2018/Treg_naive/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'

## TEMP Getting ImmuNexUT cell type names

In [71]:
# Display the annotated samplesheet again; the eqtl_origin column holds the
# ImmuNexUT labels collected in the next cell.
samplesheet

Unnamed: 0,gwas,eqtl_db,eqtl_origin,status,main_source,output,log
148,T1D_25751624,ImmuNexUT,CD16p_Mono,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
149,T1D_25751624,ImmuNexUT,CL_Mono,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
150,T1D_25751624,ImmuNexUT,CM_CD8,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
151,T1D_25751624,ImmuNexUT,DN_B,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
152,T1D_25751624,ImmuNexUT,EM_CD8,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
154,T1D_25751624,ImmuNexUT,Fr_III_T,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
153,T1D_25751624,ImmuNexUT,Fr_II_eTreg,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
155,T1D_25751624,ImmuNexUT,Fr_I_nTreg,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
156,T1D_25751624,ImmuNexUT,Int_Mono,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...
157,T1D_25751624,ImmuNexUT,LDG,Complete,ImmuNexUT,results/main/coloc/Results/ImmuNexUT/T1D_25751...,results/main/coloc/Results/ImmuNexUT/logs/run_...


In [94]:
# ImmuNexUT labels, one line per run of related labels.
# Order matters: a later cell assigns the higher-order group by position.
imm_labels = [
    'CD16p_Mono', 'CL_Mono', 'Int_Mono', 'NC_Mono',
    'Naive_B', 'USM_B', 'SM_B', 'DN_B', 'Plasmablast',
    'Tfh', 'Th1', 'Th17', 'Th2', 'Fr_III_T', 'Fr_II_eTreg', 'Fr_I_nTreg', 'Naive_CD4', 'Mem_CD4',
    'Naive_CD8', 'Mem_CD8', 'CM_CD8', 'EM_CD8', 'TEMRA_CD8',
    'pDC', 'mDC', 'Neu', 'LDG', 'NK',
]

# Single-column frame holding the labels under 'lower_order'.
imm = pd.DataFrame({'lower_order': imm_labels})

In [95]:
# Higher-order group for each row, given as (label, number of consecutive rows).
# The sizes must add up to the number of rows in imm (4 + 5 + 9 + 5 + 5 = 28).
higher_order_sizes = [('Monocytes', 4), ('Bcells', 5), ('CD4 Tcells', 9),
                      ('CD8 Tcells', 5), ('Other', 5)]
imm['higher_order'] = [group for group, size in higher_order_sizes for _ in range(size)]

In [96]:
# Display the label table with its lower_order and higher_order columns.
imm

Unnamed: 0,lower_order,higher_order
0,CD16p_Mono,Monocytes
1,CL_Mono,Monocytes
2,Int_Mono,Monocytes
3,NC_Mono,Monocytes
4,Naive_B,Bcells
5,USM_B,Bcells
6,SM_B,Bcells
7,DN_B,Bcells
8,Plasmablast,Bcells
9,Tfh,CD4 Tcells


In [100]:
# One lower-order label per line, in table order.
imm_str = '\n'.join(imm['lower_order'])

In [102]:
# print() so that each label is shown on its own line.
print(imm_str)

CD16p_Mono
CL_Mono
Int_Mono
NC_Mono
Naive_B
USM_B
SM_B
DN_B
Plasmablast
Tfh
Th1
Th17
Th2
Fr_III_T
Fr_II_eTreg
Fr_I_nTreg
Naive_CD4
Mem_CD4
Naive_CD8
Mem_CD8
CM_CD8
EM_CD8
TEMRA_CD8
pDC
mDC
Neu
LDG
NK
