In [19]:
import os 
import subprocess as sp
import pandas as pd
import glob 
import seaborn as sns 
import matplotlib.pyplot as plt 

pd.set_option('display.max_rows', 500)

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

outdir = 'results/notebooks/eqtl_report/'
os.makedirs(outdir, exist_ok=True)

## Number of eQTLs (FDR < 0.05)

## Number of eQTL before filtering (eQTL Catalog)

In [20]:
before_data = []
before_glob = 'results/main/eqtl/*/ge/*_ge_*.all.prefilter.num_eqtls.txt'
before_glob = glob.glob(before_glob)

In [21]:
before_data = []
for fn in before_glob:
    with open(fn) as fr:
        
        eqtl, ge = os.path.basename(fn).split('_ge_')
        ge = ge.split('.')[0]
        
        num_eqtls = int(fr.read().strip())
        
        before_data.append([eqtl, ge, num_eqtls])

In [22]:
before_df = pd.DataFrame(before_data)
before_df.columns = ['eqtl', 'ge', 'num_eqtls_pre']

## Number of eQTL post filtering (eQTL Catalog)

In [23]:
post_data = []
post_glob = 'results/main/eqtl/*/ge/*_ge_*.all.postfilter.num_eqtls.txt'
post_glob = glob.glob(post_glob)

In [24]:
post_data = []
for fn in post_glob:
    with open(fn) as fr:
        
        eqtl, ge = os.path.basename(fn).split('_ge_')
        ge = ge.split('.')[0]
        
        num_eqtls = int(fr.read().strip())
        
        post_data.append([eqtl, ge, num_eqtls])

In [25]:
post_df = pd.DataFrame(post_data)
post_df.columns = ['eqtl', 'ge', 'num_eqtls_post']

In [26]:
post_df

Unnamed: 0,eqtl,ge,num_eqtls_post
0,GTEx,brain_hypothalamus,423929
1,GTEx,brain_putamen,559636
2,GTEx,artery_coronary,551959
3,GTEx,kidney_cortex,123311
4,GTEx,heart_left_ventricle,1091506
5,GTEx,esophagus_gej,1093510
6,GTEx,pancreas,1063897
7,GTEx,skin_not_sun_exposed,1664368
8,GTEx,pituitary,894226
9,GTEx,brain_anterior_cingulate_cortex,458124


## Compare the eQTL data  before and after FDR filtering

In [27]:
all_df = pd.merge(before_df, post_df, on=['eqtl', 'ge'], how='outer')
all_df['%Post'] = all_df['num_eqtls_post'] / all_df['num_eqtls_pre'] * 100

In [28]:
pd.set_option('display.max_rows', 100) 
display(all_df)
pd.set_option('display.max_rows', 20) 

Unnamed: 0,eqtl,ge,num_eqtls_pre,num_eqtls_post,%Post
0,GTEx,stomach,164513595,869131,0.528303
1,GTEx,brain_caudate,175054708,696488,0.397869
2,GTEx,muscle,134303457,1672825,1.245556
3,GTEx,brain_spinal_cord,168668568,396491,0.235071
4,GTEx,brain_cerebellum,167769402,1347204,0.803009
5,GTEx,artery_coronary,162448099,551959,0.339776
6,GTEx,breast,172228883,1122669,0.651847
7,GTEx,ovary,163412345,449356,0.274983
8,GTEx,colon_transverse,168613256,1147373,0.680476
9,GTEx,vagina,167971169,303041,0.180413


In [30]:
t1d_related = pd.read_table('config/eqtl.samplesheet.tsv')
t1d_related_eqtls = [['BLUEPRINT', 'monocyte'],
     ['BLUEPRINT', 'neutrophil'],
     ['BLUEPRINT', 'T-cell'],
     ['GENCORD', 'LCL'],
     ['GENCORD', 'T-cell'],
     ['GTEx', 'blood'],
     ['GTEx', 'LCL'],
     ['GTEx', 'liver'],
     ['GTEx', 'pancreas'],
     ['Lepik_2017', 'blood'],
     ['Quach_2016', 'monocyte_IAV'],
     ['Quach_2016', 'monocyte_LPS'],
     ['Quach_2016', 'monocyte_naive'],
     ['Quach_2016', 'monocyte_Pam3CSK4'],
     ['Quach_2016', 'monocyte_R848'],
     ['Schmiedel_2018', 'B-cell_naive'],
     ['Schmiedel_2018', 'CD4_T-cell_anti-CD3-CD28'],
     ['Schmiedel_2018', 'CD4_T-cell_naive'],
     ['Schmiedel_2018', 'CD8_T-cell_anti-CD3-CD28'],
     ['Schmiedel_2018', 'CD8_T-cell_naive'],
     ['Schmiedel_2018', 'monocyte_CD16_naive'],
     ['Schmiedel_2018', 'monocyte_naive'],
     ['Schmiedel_2018', 'NK-cell_naive'],
     ['Schmiedel_2018', 'Tfh_memory'],
     ['Schmiedel_2018', 'Th1-17_memory'],
     ['Schmiedel_2018', 'Th17_memory'],
     ['Schmiedel_2018', 'Th1_memory'],
     ['Schmiedel_2018', 'Th2_memory'],
     ['Schmiedel_2018', 'Treg_memory'],
     ['Schmiedel_2018', 'Treg_naive'],
     ['van_de_Bunt_2015', 'pancreatic_islet'],
     ['ImmuNexUT', 'CD16p_Mono'],
     ['ImmuNexUT', 'CL_Mono'],
     ['ImmuNexUT', 'CM_CD8'],
     ['ImmuNexUT', 'DN_B'],
     ['ImmuNexUT', 'EM_CD8'],
     ['ImmuNexUT', 'Fr_II_eTreg'],
     ['ImmuNexUT', 'Fr_III_T'],
     ['ImmuNexUT', 'Fr_I_nTreg'],
     ['ImmuNexUT', 'Int_Mono'],
     ['ImmuNexUT', 'LDG'],
     ['ImmuNexUT', 'mDC'],
     ['ImmuNexUT', 'Mem_CD4'],
     ['ImmuNexUT', 'Mem_CD8'],
     ['ImmuNexUT', 'Naive_B'],
     ['ImmuNexUT', 'Naive_CD4'],
     ['ImmuNexUT', 'Naive_CD8'],
     ['ImmuNexUT', 'NC_Mono'],
     ['ImmuNexUT', 'Neu'],
     ['ImmuNexUT', 'NK'],
     ['ImmuNexUT', 'pDC'],
     ['ImmuNexUT', 'Plasmablast'],
     ['ImmuNexUT', 'SM_B'],
     ['ImmuNexUT', 'TEMRA_CD8'],
     ['ImmuNexUT', 'Tfh'],
     ['ImmuNexUT', 'Th17'],
     ['ImmuNexUT', 'Th1'],
     ['ImmuNexUT', 'Th2'],
     ['ImmuNexUT', 'USM_B']]

In [56]:
num_immunexut = len([x[0] for x in t1d_related_eqtls if x[0] == 'ImmuNexUT'])
curr_poss_eqtls = len(t1d_related_eqtls) - num_immunexut
print(curr_poss_eqtls)

31


In [57]:
bools = []
for sr in all_df.iloc[:, [0,1]].values:
    if sr.tolist() in t1d_related_eqtls:
        bools.append(True)
    else:
        bools.append(False)

In [58]:
all_df = all_df.loc[bools]

In [59]:
all_df.shape

(31, 5)

In [65]:
post_eqtl_means = all_df.num_eqtls_post.mean()
print('Mean number of significant eQTLs: {:.1f}'.format(post_eqtl_means))

Mean number of significant eQTLs: 928107.6


## Write the report

In [25]:
report_fn = os.path.join(outdir, 'eqtl_report.tsv')
all_df.to_csv(report_fn, sep='\t', index=False)