In [1]:
import os 
import subprocess as sp
import pandas as pd
import glob 
import seaborn as sns 
import matplotlib.pyplot as plt 

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

## Number of eQTL before filtering (eQTL Catalog)

In [2]:
before_data = []
before_glob = 'results/main/eqtl/*/ge/*_ge_*.all.prefilter.num_eqtls.txt'
before_glob = glob.glob(before_glob)

In [3]:
before_data = []
for fn in before_glob:
    with open(fn) as fr:
        
        eqtl, ge = os.path.basename(fn).split('_ge_')
        ge = ge.split('.')[0]
        
        num_eqtls = int(fr.read().strip())
        
        before_data.append([eqtl, ge, num_eqtls])

In [4]:
before_df = pd.DataFrame(before_data)
before_df.columns = ['eqtl', 'ge', 'num_eqtls_pre']

## Number of eQTL post filtering (eQTL Catalog)

In [5]:
post_data = []
post_glob = 'results/main/eqtl/*/ge/*_ge_*.all.postfilter.num_eqtls.txt'
post_glob = glob.glob(post_glob)

In [6]:
post_data = []
for fn in post_glob:
    with open(fn) as fr:
        
        eqtl, ge = os.path.basename(fn).split('_ge_')
        ge = ge.split('.')[0]
        
        num_eqtls = int(fr.read().strip())
        
        post_data.append([eqtl, ge, num_eqtls])

In [7]:
post_df = pd.DataFrame(post_data)
post_df.columns = ['eqtl', 'ge', 'num_eqtls_post']

In [8]:
post_df

Unnamed: 0,eqtl,ge,num_eqtls_post
0,GTEx,brain_hypothalamus,423929
1,GTEx,brain_putamen,559636
2,GTEx,artery_coronary,551959
3,GTEx,kidney_cortex,123311
4,GTEx,heart_left_ventricle,1091506
...,...,...,...
72,Schmiedel_2018,NK-cell_naive,472002
73,GENCORD,LCL,823148
74,GENCORD,fibroblast,575290
75,GENCORD,T-cell,716949


## Number of eQTL post filtering (Mu et al., 2021)

In [9]:
mu_data = []
mu_glob = 'results/main/2021_Nikhil_eQTL/Data/eqtl_sqtl_summ_stats/*_eQTL/*.input.txt'
mu_glob = glob.glob(mu_glob)

In [10]:
mu_data = []
for fn in mu_glob:
    with open(fn) as fr:
                
        eqtl, ge = fn.split('/')[-2:]
        eqtl = eqtl.split('_')[0]
        ge = ge.split('.')[0]
       
        num_eqtls = len(fr.readlines())
        
        mu_data.append([eqtl, ge, num_eqtls])

In [11]:
mu_df = pd.DataFrame(mu_data)
mu_df.columns = ['eqtl', 'ge', 'num_eqtls_mu']

# initializing with the original name 
cell_dict = {k:v for k, v in zip(mu_df['ge'], mu_df['ge'])}

# change the cell line name for those cell lines I care about 
replace_dict = {'Neutrophil': 'neutrophil',
             'Monocyte': 'monocyte',
             'TREG_NAIVE': 'Treg_naive',
             'TREG_MEMORY': 'Treg_memory',
             'B_NAIVE': 'B-cell_naive',
             'TFH': 'Tfh_memory',
             'TH1-17': 'Th1-17_memory',
             'CD4_N_STIM': 'CD4_N_STIM',
             'NK_CD16POS': 'NK-cell_naive',
             'NONCLASSICAL_MONOCYTES': 'monocyte_CD16_naive',
             'CD8_NAIVE': 'CD8_T-cell_naive',
             'TH17': 'Th17_memory',
             'CD8_N_STIM': 'CD8_T-cell_anti-CD3-CD28',
             'CLASSICAL_MONOCYTES': 'monocyte_naive',
             'CD4_NAIVE': 'CD4_T-cell_naive	',
             'TH1': 'Th1_memory',
             'TH2': 'Th2_memory'}
cell_dict.update(replace_dict)
mu_df['ge'] = mu_df['ge'].map(cell_dict, )

In [30]:
# replace the eqtl source name when necessary
eqtl_sources = {k: v for k, v in zip(mu_df['eqtl'], mu_df['eqtl'])}
eqtl_sources['DICE'] = 'Schmiedel_2018'
mu_df['eqtl'] = mu_df['eqtl'].map(eqtl_sources)

In [31]:
mu_df.head()

Unnamed: 0,eqtl,ge,num_eqtls_mu
0,GEUVADIS,CEU,9294
1,GEUVADIS,YRI,2474
2,Schmiedel_2018,Treg_naive,2106
3,Schmiedel_2018,Treg_memory,1972
4,Schmiedel_2018,B-cell_naive,1827


## Compare the eQTL Catalog before and after FDR filtering

In [42]:
all_df = pd.merge(before_df, post_df, on=['eqtl', 'ge'], how='outer')
all_df['%Post'] = all_df['num_eqtls_post'] / all_df['num_eqtls_pre'] * 100

In [43]:
pd.set_option('display.max_rows', 100) 
display(all_df)
pd.set_option('display.max_rows', 20) 

Unnamed: 0,eqtl,ge,num_eqtls_pre,num_eqtls_post,%Post
0,GTEx,stomach,164513595,869131,0.528303
1,GTEx,brain_caudate,175054708,696488,0.397869
2,GTEx,muscle,134303457,1672825,1.245556
3,GTEx,brain_spinal_cord,168668568,396491,0.235071
4,GTEx,brain_cerebellum,167769402,1347204,0.803009
5,GTEx,artery_coronary,162448099,551959,0.339776
6,GTEx,breast,172228883,1122669,0.651847
7,GTEx,ovary,163412345,449356,0.274983
8,GTEx,colon_transverse,168613256,1147373,0.680476
9,GTEx,vagina,167971169,303041,0.180413


In [45]:
summary = all_df.drop('%Post', axis=1).describe()

In [46]:
summary

Unnamed: 0,num_eqtls_pre,num_eqtls_post
count,77.0,77.0
mean,157868700.0,995474.5
std,19585840.0,619473.5
min,86606710.0,123311.0
25%,151397500.0,548608.0
50%,161593000.0,782838.0
75%,169669500.0,1299706.0
max,217040900.0,2946020.0


## Add Mu et al eQTL's to compare

In [47]:
all_plus_mu = all_df.merge(mu_df, on=['eqtl', 'ge'])

In [51]:
all_plus_mu['%Mu'] = all_plus_mu['num_eqtls_mu'] / all_plus_mu['num_eqtls_pre'] * 100

In [52]:
all_plus_mu

Unnamed: 0,eqtl,ge,num_eqtls_pre,num_eqtls_post,%Post,num_eqtls_mu,%Mu
0,BLUEPRINT,monocyte,107504217,2213353,2.058852,7717,0.007178
1,BLUEPRINT,T-cell,114006812,1818085,1.594716,7212,0.006326
2,BLUEPRINT,neutrophil,86606708,1838013,2.122252,6400,0.00739
3,Schmiedel_2018,Treg_memory,155297785,588584,0.379003,1972,0.00127
4,Schmiedel_2018,monocyte_naive,138656372,643412,0.464033,2253,0.001625
5,Schmiedel_2018,Treg_naive,156779364,623104,0.39744,2106,0.001343
6,Schmiedel_2018,monocyte_CD16_naive,139733495,614987,0.440114,2105,0.001506
7,Schmiedel_2018,Tfh_memory,155294143,603596,0.388679,2054,0.001323
8,Schmiedel_2018,NK-cell_naive,156385911,472002,0.301819,1883,0.001204
9,Schmiedel_2018,CD8_T-cell_naive,155723260,706670,0.453799,2374,0.001524


What is causing the difference between the Mu et al 2021 data and the eQTL Catalogue? In the next statements I'm considering BLUEPRINT monocytes. The eQTL Catalog we see 107,504,217 eQTL tests whereas for Mu et al we see 7,717 tests. Even after filtering the eQTL Catalogue for FDR < 0.05 there are 2,213,353 eQTL tests.   

#### Questions
- Difference in the FDR?

#### Answers
- Mu et al uses GRCh37, eQTL catalogue uses GRCh38
- Mu et al uses GENCODE v19, eQTL catalog uses GENCODE v30
- The Additional File 1 Table 8 says that the Zenodo files are: "List of significant eQTLs and sQTLs"