# Make the master table

In [1]:
import os 
import sys
import pybedtools as pbt
import pandas as pd
import numpy as np
import subprocess as sp
import json
# Run from the project root so that the relative 'results/...' paths resolve.
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

# Tell pybedtools where the bedtools binaries live.
pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')

# Locations of the tabix-suite executables.
tabix = '/mnt/BioApps/tabix/tabix-0.2.6/tabix'
bgzip = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip'

# Column names for paired-interval (BEDPE-like) tables: the six coordinate
# columns alone, and the same six followed by four annotation columns.
bedpe_6cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']
bedpe_10cols = bedpe_6cols + ['name', 'score', 'strand1', 'strand2']

In [2]:
## default values for the command line
# Slot 0 is a placeholder; slots 1-7 are read by the argument-parsing cell.
sys.argv = [
    0,
    # [1] coloc_fn
    'results/main/2021_Nikhil_eQTL/Results/Colocalization/T1D_34012112_Gaulton/'
    'DICE_eQTL_CD4_NAIVE/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed',
    # [2] genes_fn
    'results/refs/ensembl/gencode.v19.annotation.bed',
    # [3] loop_fn
    'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_L/FitHiChIP.interactions_FitHiC_Q0.01.bed',
    # [4] spp_fn
    'results/refs/spp/SPP_D-Challenge_networks.xlsx',
    # [5] gs_fn
    'results/refs/hg19/hg19.chrom.sizes',
    # [6] eqtl_fn
    'results/main/2021_Nikhil_eQTL/Data/eqtl_sqtl_summ_stats/DICE_eQTL/CD4_NAIVE.txt.gz',
    # [7] outdir
    'results/main/loop_analysis/washU/',
]

In [3]:
# Unpack the positional command-line arguments (argv[1] .. argv[7]).
coloc_fn, genes_fn, loop_fn, spp_fn, gs_fn, eqtl_fn, outdir = (
    sys.argv[i] for i in range(1, 8)
)

# Create the output directory if it does not exist yet.
os.makedirs(outdir, exist_ok=True)

## Load the colocalization data

In [4]:
# load the colocalization data
coloc = pd.read_table(coloc_fn)

# Keep the rows whose pp_H4_Coloc_Summary exceeds 0.75. The explicit copy
# makes this an independent frame; it is kept as-is (original 'pos' column,
# all columns) for later use.
coloc_sig_full = coloc.loc[coloc['pp_H4_Coloc_Summary'] > 0.75].copy(deep=True)

# Derive a BED-style SNP table without mutating a filtered slice in place
# (the previous rename(inplace=True) / .loc assignment on the slice raised
# SettingWithCopyWarning):
#   - 'pos' becomes 'end' and 'start' is end - 1
#     (presumably 'pos' is 1-based, giving a 0-based half-open interval; confirm)
#   - only the coordinate and identifier columns are kept
#   - the first row per rs_id is kept, the original index is preserved
coloc_sig_df = (
    coloc_sig_full
    .rename(columns={'pos': 'end'})
    .assign(start=lambda df: df['end'] - 1)
    .loc[:, ['chr', 'start', 'end', 'rs_id', 'variant_id']]
    .drop_duplicates(subset='rs_id')
    .copy()
)

# Sorted BedTool built from the first four columns (chr, start, end, rs_id).
coloc_sig_pbt = pbt.BedTool.from_dataframe(coloc_sig_df.iloc[:, 0:4]).sort()

In [5]:
# Read the eQTL summary statistics and replace the header with working names.
# NOTE(review): the names resemble FastQTL permutation-pass fields; confirm
# against the input file before relying on their meaning.
eqtls = pd.read_table(eqtl_fn)
eqtls.columns = [
    'eqtl_gname',
    'nvar', 'shape1', 'shape2', 'dummy',
    'sid', 'dist',
    'npval', 'slope', 'ppval', 'bpval', 'qval',
]

In [6]:
# Show the eQTL table after the column rename.
eqtls

Unnamed: 0,eqtl_gname,nvar,shape1,shape2,dummy,sid,dist,npval,slope,ppval,bpval,qval
0,OR4G4P,314,1.004730,31.1310,60.1836,1:889158,836684,1.581000e-05,-0.408730,0.003996,1.903580e-03,0.025280
1,CDK11B,1886,1.064640,156.2080,59.3496,1:1649639,79035,3.484220e-07,-1.293760,0.000999,2.528920e-04,0.004851
2,SLC35E2B,1886,1.064460,151.1660,59.4453,1:1520725,-72215,1.883180e-07,-0.706011,0.000999,1.371590e-04,0.002890
3,SLC35E2,1886,1.046260,180.7480,61.8379,1:1520725,-135553,5.242940e-08,0.869440,0.000999,3.567280e-05,0.000909
4,RP1-140A9.1,1990,1.063150,163.5990,60.6484,1:1704795,-118116,2.085140e-10,1.142010,0.000999,2.163240e-07,0.000010
...,...,...,...,...,...,...,...,...,...,...,...,...
2499,LCN8,1832,1.016980,225.3640,64.4219,9:139648298,-541,1.094600e-08,0.688693,0.000999,7.762060e-06,0.000240
2500,MAN1B1,1844,0.978272,199.7390,65.0918,9:140004229,22849,5.197770e-07,-0.656218,0.001998,3.154600e-04,0.005862
2501,NSMF,1476,1.036940,160.1730,64.4355,9:140322640,-19383,5.903600e-07,0.637621,0.000999,1.936830e-04,0.003867
2502,PNPLA7,1476,1.045070,155.1850,63.4648,9:140576488,222083,2.779540e-10,0.809760,0.000999,1.462770e-07,0.000007


In [11]:
# Build a '<chrom>:<end>' key (chromosome without the 'chr' text) in the same
# form as the eQTL table's 'sid' column, e.g. '1:889158'.
coloc_sig_df['sid'] = (
    coloc_sig_df['chr']
    .str.replace('chr', '')
    .str.cat(coloc_sig_df['end'].astype(str), sep=':')
)

In [16]:
# Left-join the eQTL rows onto the colocalized SNPs by their shared 'sid' key;
# SNPs with no eQTL match are kept with missing eQTL columns.
check = pd.merge(coloc_sig_df, eqtls, how='left', on='sid')

In [18]:
# (rows, columns) of the left-merged table.
check.shape

(9, 17)

In [19]:
# Display the eQTL table again for reference.
eqtls

Unnamed: 0,eqtl_gname,nvar,shape1,shape2,dummy,sid,dist,npval,slope,ppval,bpval,qval
0,OR4G4P,314,1.004730,31.1310,60.1836,1:889158,836684,1.581000e-05,-0.408730,0.003996,1.903580e-03,0.025280
1,CDK11B,1886,1.064640,156.2080,59.3496,1:1649639,79035,3.484220e-07,-1.293760,0.000999,2.528920e-04,0.004851
2,SLC35E2B,1886,1.064460,151.1660,59.4453,1:1520725,-72215,1.883180e-07,-0.706011,0.000999,1.371590e-04,0.002890
3,SLC35E2,1886,1.046260,180.7480,61.8379,1:1520725,-135553,5.242940e-08,0.869440,0.000999,3.567280e-05,0.000909
4,RP1-140A9.1,1990,1.063150,163.5990,60.6484,1:1704795,-118116,2.085140e-10,1.142010,0.000999,2.163240e-07,0.000010
...,...,...,...,...,...,...,...,...,...,...,...,...
2499,LCN8,1832,1.016980,225.3640,64.4219,9:139648298,-541,1.094600e-08,0.688693,0.000999,7.762060e-06,0.000240
2500,MAN1B1,1844,0.978272,199.7390,65.0918,9:140004229,22849,5.197770e-07,-0.656218,0.001998,3.154600e-04,0.005862
2501,NSMF,1476,1.036940,160.1730,64.4355,9:140322640,-19383,5.903600e-07,0.637621,0.000999,1.936830e-04,0.003867
2502,PNPLA7,1476,1.045070,155.1850,63.4648,9:140576488,222083,2.779540e-10,0.809760,0.000999,1.462770e-07,0.000007


In [20]:
# Re-check the (rows, columns) of the merged table.
check.shape

(9, 17)

In [24]:
# Reduce the eQTL table to the SNP key and the gene name.
eqtls = eqtls[['sid', 'eqtl_gname']]

In [29]:
# Count how many rows each 'sid' appears in; any count above 1 means 'sid'
# is not a unique key in the eQTL table.
eqtls['sid'].value_counts()

17:44068492    3
1:110229787    3
1:247612596    3
7:99954393     3
18:3247256     3
              ..
1:64148077     1
19:17904166    1
1:151759104    1
1:7970092      1
17:44242606    1
Name: sid, Length: 2434, dtype: int64

In [30]:
# Rows of the eQTL table for one SNP that occurs more than once.
eqtls.loc[eqtls['sid'] == '17:44068492']

Unnamed: 0,sid,eqtl_gname
1009,17:44068492,RP11-798G7.8
1010,17:44068492,RP11-798G7.4
1016,17:44068492,RP11-995C19.2


In [31]:
# Rows of the eQTL table for a second SNP that occurs more than once.
eqtls.loc[eqtls['sid'] == '1:110229787']

Unnamed: 0,sid,eqtl_gname
111,1:110229787,GSTM2
112,1:110229787,GSTM1
113,1:110229787,GSTM5


In [31]:
# NOTE(review): same lookup as the preceding cell (SNP 1:110229787).
eqtls.loc[eqtls['sid'] == '1:110229787']

Unnamed: 0,sid,eqtl_gname
111,1:110229787,GSTM2
112,1:110229787,GSTM1
113,1:110229787,GSTM5
