# Make the master table

In [1]:
import os 
import sys
import pybedtools as pbt
import pandas as pd
import numpy as np
import subprocess as sp
import json
# Work from the project root so the relative `results/...` paths resolve.
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

# Locations of the external command-line tools.
pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')
bgzip = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip'
tabix = '/mnt/BioApps/tabix/tabix-0.2.6/tabix'

# Column names for paired-interval (BEDPE-style) tables: the six coordinate
# fields alone, and the same six followed by name/score/strand fields.
bedpe_6cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']
bedpe_10cols = bedpe_6cols + ['name', 'score', 'strand1', 'strand2']

In [2]:
## default values for the command line
# Emulate a command-line call from inside the notebook: slot 0 is a
# placeholder and slots 1-7 hold the positional arguments.
sys.argv = [
    0,
    # 1: colocalization summary (gene-SNP pairs)
    ('results/main/2021_Nikhil_eQTL/Results/Colocalization/T1D_34012112_Gaulton/'
     + 'DICE_eQTL_CD4_NAIVE/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'),
    # 2: gene annotation
    'results/refs/ensembl/gencode.v19.annotation.bed',
    # 3: FitHiChIP loop calls
    'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_L/FitHiChIP.interactions_FitHiC_Q0.01.bed',
    # 4: SPP networks workbook
    'results/refs/spp/SPP_D-Challenge_networks.xlsx',
    # 5: chromosome sizes
    'results/refs/hg19/hg19.chrom.sizes',
    # 6: eQTL summary statistics
    'results/main/2021_Nikhil_eQTL/Data/eqtl_sqtl_summ_stats/DICE_eQTL/CD4_NAIVE.txt.gz',
    # 7: output directory
    'results/main/loop_analysis/washU2/',
]

In [3]:
# Unpack the seven positional command-line arguments into named paths.
(coloc_fn,   # colocalization results table
 genes_fn,   # gene annotation table
 loop_fn,    # loop calls table
 spp_fn,     # SPP networks workbook
 gs_fn,      # genome (chromosome sizes) file
 eqtl_fn,    # eQTL summary statistics
 outdir,     # directory that receives the outputs
 ) = (sys.argv[pos] for pos in range(1, 8))

# make sure the output directory exists before anything is written to it
os.makedirs(outdir, exist_ok=True)

## Load the colocalization data

In [4]:
# load the colocalization data
coloc = pd.read_table(coloc_fn)

print(coloc.shape)

# keep only the rows whose H4 posterior probability is above 0.75;
# take an explicit copy so later edits never act on a view of `coloc`
coloc_sig_full = coloc.loc[coloc['pp_H4_Coloc_Summary'] > 0.75].copy()
print(coloc_sig_full.shape)

# Build a one-row-per-SNP table with a 1-bp interval (start = pos - 1, end = pos).
# Done as a non-mutating chain: the original renamed/assigned in place on a
# filtered slice, which is the pattern pandas flags with SettingWithCopyWarning.
coloc_sig_df = (
    coloc_sig_full
    .rename(columns={'pos': 'end'})
    .assign(start=lambda df: df['end'] - 1)
    .loc[:, ['chr', 'start', 'end', 'rs_id', 'variant_id']]
    .drop_duplicates(subset='rs_id')   # keep the first row seen for each rs_id
)

# only the first four columns (chr, start, end, rs_id) go into the BedTool
coloc_sig_pbt = pbt.BedTool.from_dataframe(coloc_sig_df.iloc[:, 0:4]).sort()

(10, 24)
(10, 24)


## Load the gene data

In [5]:
# load the gencode coords (the file has no header row, so names are supplied)
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gname']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# Keep one row per gene and reorder to chrom, start, end, gname, gene_id.
# Written as a chain that returns new frames: the original assigned into a
# filtered slice with `.loc`, which pandas flags with SettingWithCopyWarning.
genes_df = (
    gencode
    .loc[gencode['type'].isin(['gene'])]
    .drop_duplicates(subset='gene_id')          # first occurrence of each gene_id
    .assign(chrom=lambda df: df['chrom'].astype(str))
    .iloc[:, [0, 1, 2, 6, 5]]
)
genes_pbt = pbt.BedTool.from_dataframe(genes_df).sort()

## Find all genes +/- 500kb

In [6]:
# get a list of gene names within +- 500kb of the SNPs
# (the variable keeps its historical `fivekb_` prefix; the window is 500 kb)
fivekb_genes = coloc_sig_pbt.slop(b=500000, g=gs_fn)
fivekb_genes = fivekb_genes.intersect(genes_pbt, wa=True, wb=True)

# reorder to: SNP interval, gene interval, then rs_id, gene name, gene id
fivekb_genes = fivekb_genes.to_dataframe().iloc[:, [0,1,2,4,5,6,3,7,8]]
fivekb_genes.columns = bedpe_6cols + ['rs_id', 'gname', 'gid']

# Restore the original SNP coordinates by rs_id instead of undoing the slop
# arithmetically: `bedtools slop` clamps windows at chromosome boundaries, so
# adding/subtracting 500000 yields wrong positions for SNPs near a chromosome
# end. `coloc_sig_df` holds one row per rs_id, so the lookup is unambiguous.
snp_coords = coloc_sig_df.set_index('rs_id')
fivekb_genes['startA'] = fivekb_genes['rs_id'].map(snp_coords['start'])
fivekb_genes['endA'] = fivekb_genes['rs_id'].map(snp_coords['end'])

# SNP identifier in "<chromosome without 'chr'>:<position>" form
fivekb_genes['sid'] = fivekb_genes['chrA'].str.replace('chr', '') + ':' + fivekb_genes['endA'].astype(str)

In [7]:
# Preview the first rows of the SNP-gene pair table.
fivekb_genes.head()

Unnamed: 0,chrA,startA,endA,chrB,startB,endB,rs_id,gname,gid,sid
0,chr11,64107476,64107477,chr11,63650059,63650157,rs479777,RNU6-1306P,ENSG00000202089.1,11:64107477
1,chr11,64107476,64107477,chr11,63678693,63684316,rs479777,RCOR2,ENSG00000167771.5,11:64107477
2,chr11,64107476,64107477,chr11,63737942,63738048,rs479777,RNU6-45P,ENSG00000207200.1,11:64107477
3,chr11,64107476,64107477,chr11,63742079,63744015,rs479777,COX8A,ENSG00000176340.3,11:64107477
4,chr11,64107476,64107477,chr11,63849162,63854647,rs479777,RP11-21A7A.4,ENSG00000256481.1,11:64107477


## Find the closest gene

In [8]:
# For every SNP, ask bedtools for the closest gene; `d=True` appends the
# distance as an extra last column.
closest_gene = (
    coloc_sig_pbt
    .closest(genes_pbt, d=True)
    .to_dataframe()
    .iloc[:, [0,1,2,4,5,6,3,7,8,9]]   # SNP interval, gene interval, rs_id, gene name, gene id, distance
)
closest_gene.columns = bedpe_6cols + ['rs_id', 'gname', 'gid', 'dist']

# index by (rs_id, gname) so pair membership can be looked up directly
closest_gene = closest_gene.set_index(['rs_id', 'gname'])

# get eQTL's and replace the file's own header with the names used downstream
eqtl_cols = ['eqtl_gname', 'nvar', 'shape1', 'shape2', 'dummy',
             'sid', 'dist', 'npval', 'slope', 'ppval', 'bpval', 'qval']
eqtls = pd.read_table(eqtl_fn)
eqtls.columns = eqtl_cols

In [9]:
# Display the eQTL table with its renamed columns (the notebook shows a truncated view).
eqtls

Unnamed: 0,eqtl_gname,nvar,shape1,shape2,dummy,sid,dist,npval,slope,ppval,bpval,qval
0,OR4G4P,314,1.004730,31.1310,60.1836,1:889158,836684,1.581000e-05,-0.408730,0.003996,1.903580e-03,0.025280
1,CDK11B,1886,1.064640,156.2080,59.3496,1:1649639,79035,3.484220e-07,-1.293760,0.000999,2.528920e-04,0.004851
2,SLC35E2B,1886,1.064460,151.1660,59.4453,1:1520725,-72215,1.883180e-07,-0.706011,0.000999,1.371590e-04,0.002890
3,SLC35E2,1886,1.046260,180.7480,61.8379,1:1520725,-135553,5.242940e-08,0.869440,0.000999,3.567280e-05,0.000909
4,RP1-140A9.1,1990,1.063150,163.5990,60.6484,1:1704795,-118116,2.085140e-10,1.142010,0.000999,2.163240e-07,0.000010
...,...,...,...,...,...,...,...,...,...,...,...,...
2499,LCN8,1832,1.016980,225.3640,64.4219,9:139648298,-541,1.094600e-08,0.688693,0.000999,7.762060e-06,0.000240
2500,MAN1B1,1844,0.978272,199.7390,65.0918,9:140004229,22849,5.197770e-07,-0.656218,0.001998,3.154600e-04,0.005862
2501,NSMF,1476,1.036940,160.1730,64.4355,9:140322640,-19383,5.903600e-07,0.637621,0.000999,1.936830e-04,0.003867
2502,PNPLA7,1476,1.045070,155.1850,63.4648,9:140576488,222083,2.779540e-10,0.809760,0.000999,1.462770e-07,0.000007


## Get the loops

In [10]:
# load the loop data
loops = pd.read_table(loop_fn)
tmp_loops = loops[['chr1', 's1', 'e1', 'chr2', 's2', 'e2']]
tmp_loops.rename(columns={'p': 'score'}, inplace=True)
tmp_loops.loc[:, 'name'] = '.'
tmp_loops.loc[:, 'score'] = loops['p']
tmp_loops.loc[:, 'strand1'] = '.'
tmp_loops.loc[:, 'strand2'] = '.'
loops = pbt.BedTool.from_dataframe(tmp_loops)
print('FitHiChIP found {} significant loops.'.format(tmp_loops.shape[0]))

# #### Find out SNP - 5kb gene pairs with loops\
# re-arranging to fit bedpe format
fivekb_gloops = fivekb_genes.copy()
fivekb_gloops['dummy'] = 'drop'

# loading into pbt
fivekb_gloops = pbt.BedTool.from_dataframe(fivekb_gloops)
fivekb_gloops = fivekb_gloops.pair_to_pair(loops, type='both',  **{'is':True})
fivekb_gloops = fivekb_gloops.to_dataframe(disable_auto_names=True, header=None)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.or

FitHiChIP found 577026 significant loops.


In [11]:
# Columns 9 and 8 of the pair_to_pair output, in that order; downstream these
# tuples are matched against (sid, gid) pairs.
fivekb_gloops_set = fivekb_gloops.iloc[:, [9,8]]

# unique pairs as hashable tuples for fast membership tests
fivekb_gloops_uniq = {tuple(pair) for pair in fivekb_gloops_set.values.tolist()}

## Construct master table

In [14]:
# This preview sits before the cell that creates `master`, so a top-to-bottom
# run would fail with NameError. Start the master table here (a copy of the
# SNP-gene pair table) so the cell is runnable in notebook order.
master = fivekb_genes.copy()
master.head()

Unnamed: 0,chrA,startA,endA,chrB,startB,endB,rs_id,gname,gid,sid
0,chr11,64107476,64107477,chr11,63650059,63650157,rs479777,RNU6-1306P,ENSG00000202089.1,11:64107477
1,chr11,64107476,64107477,chr11,63678693,63684316,rs479777,RCOR2,ENSG00000167771.5,11:64107477
2,chr11,64107476,64107477,chr11,63737942,63738048,rs479777,RNU6-45P,ENSG00000207200.1,11:64107477
3,chr11,64107476,64107477,chr11,63742079,63744015,rs479777,COX8A,ENSG00000176340.3,11:64107477
4,chr11,64107476,64107477,chr11,63849162,63854647,rs479777,RP11-21A7A.4,ENSG00000256481.1,11:64107477


In [13]:
# Start the master table from the SNP-gene pairs and (re)derive the SNP id as
# "<chromosome without 'chr'>:<SNP position>"; `assign` works on a copy, so
# `fivekb_genes` itself is left untouched.
master = fivekb_genes.assign(
    sid=lambda pairs: pairs['chrA'].str.replace('chr', '') + ':' + pairs['endA'].astype(str)
)

In [15]:
# Flag SNP-gene pairs whose gene is the closest gene to the SNP.
# Membership is tested against a set of (rs_id, gname) tuples and the flags are
# built in row order, so the result does not depend on the index labels
# (the original used them as list positions inside an iterrows loop).
closest_pairs = set(closest_gene.index)
master['is_closest_gene'] = [
    int(pair in closest_pairs)
    for pair in zip(master['rs_id'], master['gname'])
]

# Add eQTL results. The outer join also keeps eQTL pairs that have no matching
# SNP-gene pair in the master table; those rows carry NaN in the master columns.
master = master.merge(eqtls, left_on=['sid', 'gname'], right_on=['sid', 'eqtl_gname'], how='outer')

# The outer join leaves NaN in the flag for eQTL-only rows, which turns the
# column into floats; restore a clean 0/1 integer flag like the other flags.
master['is_closest_gene'] = master['is_closest_gene'].fillna(0).astype(int)

# add column to filter on eqtl snp status (a row is an eQTL pair when `ppval` is present)
master['is_eqtl_pair'] = master['ppval'].notna().astype(int)

# add gene names to entries with a missing name (rows that came only from the eQTL table)
master['gname'] = master['gname'].fillna(master['eqtl_gname'])

In [16]:
# Preview the master table after the eQTL columns and flags were added.
master.head()

Unnamed: 0,chrA,startA,endA,chrB,startB,endB,rs_id,gname,gid,sid,...,shape1,shape2,dummy,dist,npval,slope,ppval,bpval,qval,is_eqtl_pair
0,chr11,64107476.0,64107477.0,chr11,63650059.0,63650157.0,rs479777,RNU6-1306P,ENSG00000202089.1,11:64107477,...,,,,,,,,,,0
1,chr11,64107476.0,64107477.0,chr11,63678693.0,63684316.0,rs479777,RCOR2,ENSG00000167771.5,11:64107477,...,,,,,,,,,,0
2,chr11,64107476.0,64107477.0,chr11,63737942.0,63738048.0,rs479777,RNU6-45P,ENSG00000207200.1,11:64107477,...,,,,,,,,,,0
3,chr11,64107476.0,64107477.0,chr11,63742079.0,63744015.0,rs479777,COX8A,ENSG00000176340.3,11:64107477,...,,,,,,,,,,0
4,chr11,64107476.0,64107477.0,chr11,63849162.0,63854647.0,rs479777,RP11-21A7A.4,ENSG00000256481.1,11:64107477,...,,,,,,,,,,0


In [17]:
# add colocalization data for SNP and is_coloc_snp columns
tmp_coloc = coloc_sig_full[[
 'pp_H0_Coloc_Summary',
 'pp_H1_Coloc_Summary',
 'pp_H2_Coloc_Summary',
 'pp_H3_Coloc_Summary',
 'pp_H4_Coloc_Summary',
 'rs_id',
 'geneName',
 'ref',
 'alt',
 'AC',
 'AF',
 'AN',
 'slope_gwas',
 'slope_se_gwas',
 'pval_nominal',
 'SampleSize']]
tmp_coloc.rename(columns={'slope_gwas': 'gwas_slope',
                          'slope_se_gwas': 'gwas_slope_se',
                          'pval_nominal': 'gwas_pval_nominal',
                          'geneName': 'gname'}, inplace=True)
master = master.merge(tmp_coloc, on=['rs_id', 'gname'], how='left')

# add column to filter on coloc snp status
master['is_coloc_pair'] = (~master['pp_H4_Coloc_Summary'].isna()).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [18]:
# Display the master table after the colocalization columns were merged in (truncated view).
master

Unnamed: 0,chrA,startA,endA,chrB,startB,endB,rs_id,gname,gid,sid,...,ref,alt,AC,AF,AN,gwas_slope,gwas_slope_se,gwas_pval_nominal,SampleSize,is_coloc_pair
0,chr11,64107476.0,64107477.0,chr11,63650059.0,63650157.0,rs479777,RNU6-1306P,ENSG00000202089.1,11:64107477,...,,,,,,,,,,0
1,chr11,64107476.0,64107477.0,chr11,63678693.0,63684316.0,rs479777,RCOR2,ENSG00000167771.5,11:64107477,...,,,,,,,,,,0
2,chr11,64107476.0,64107477.0,chr11,63737942.0,63738048.0,rs479777,RNU6-45P,ENSG00000207200.1,11:64107477,...,,,,,,,,,,0
3,chr11,64107476.0,64107477.0,chr11,63742079.0,63744015.0,rs479777,COX8A,ENSG00000176340.3,11:64107477,...,,,,,,,,,,0
4,chr11,64107476.0,64107477.0,chr11,63849162.0,63854647.0,rs479777,RP11-21A7A.4,ENSG00000256481.1,11:64107477,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2852,,,,,,,,LCN8,,9:139648298,...,,,,,,,,,,0
2853,,,,,,,,MAN1B1,,9:140004229,...,,,,,,,,,,0
2854,,,,,,,,NSMF,,9:140322640,...,,,,,,,,,,0
2855,,,,,,,,PNPLA7,,9:140576488,...,,,,,,,,,,0


In [19]:
# check for the loop gene
loop_check = [0] * master.shape[0]
for i, sr in master.iterrows():

    # check closest gene
    rs_gene = (sr.sid, sr.gid)
    if rs_gene in fivekb_gloops_uniq:
        loop_check[i] = 1

master['has_fithichip_loop'] = loop_check

In [20]:
# Display the master table with the loop flag added (truncated view).
master

Unnamed: 0,chrA,startA,endA,chrB,startB,endB,rs_id,gname,gid,sid,...,alt,AC,AF,AN,gwas_slope,gwas_slope_se,gwas_pval_nominal,SampleSize,is_coloc_pair,has_fithichip_loop
0,chr11,64107476.0,64107477.0,chr11,63650059.0,63650157.0,rs479777,RNU6-1306P,ENSG00000202089.1,11:64107477,...,,,,,,,,,0,1
1,chr11,64107476.0,64107477.0,chr11,63678693.0,63684316.0,rs479777,RCOR2,ENSG00000167771.5,11:64107477,...,,,,,,,,,0,0
2,chr11,64107476.0,64107477.0,chr11,63737942.0,63738048.0,rs479777,RNU6-45P,ENSG00000207200.1,11:64107477,...,,,,,,,,,0,0
3,chr11,64107476.0,64107477.0,chr11,63742079.0,63744015.0,rs479777,COX8A,ENSG00000176340.3,11:64107477,...,,,,,,,,,0,1
4,chr11,64107476.0,64107477.0,chr11,63849162.0,63854647.0,rs479777,RP11-21A7A.4,ENSG00000256481.1,11:64107477,...,,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2852,,,,,,,,LCN8,,9:139648298,...,,,,,,,,,0,0
2853,,,,,,,,MAN1B1,,9:140004229,...,,,,,,,,,0,0
2854,,,,,,,,NSMF,,9:140322640,...,,,,,,,,,0,0
2855,,,,,,,,PNPLA7,,9:140576488,...,,,,,,,,,0,0


In [21]:
# Final column order: identifiers and coordinates, the four 0/1 flags,
# the eQTL statistics, then the colocalization / GWAS columns.
master_cols = [
    'sid',
    'rs_id',
    'gname',
    'gid',
    'chrA',
    'endA',
    'startB',
    'endB',
    'is_eqtl_pair',
    'is_coloc_pair',
    'is_closest_gene',
    'has_fithichip_loop',
    'nvar',
    'shape1',
    'shape2',
    'dist',
    'npval',
    'slope',
    'ppval',
    'bpval',
    'qval',
    'pp_H0_Coloc_Summary',
    'pp_H1_Coloc_Summary',
    'pp_H2_Coloc_Summary',
    'pp_H3_Coloc_Summary',
    'pp_H4_Coloc_Summary',
    'ref',
    'alt',
    'AC',
    'AF',
    'AN',
    'gwas_slope',
    'gwas_slope_se',
    'gwas_pval_nominal',
    'SampleSize',
]

# `.copy()` makes the selection an independent frame; without it the in-place
# rename and sort that follow act on a slice and raise SettingWithCopyWarning.
master = master[master_cols].copy()

In [22]:
# Give the coordinate and gene columns their final, descriptive names.
# Reassigning the result (instead of `inplace=True`) avoids mutating a column
# slice, which is what raised SettingWithCopyWarning here.
master = master.rename(columns={'chrA':'chrom', 'endA': 'snp_pos',
                                'startB': 'gene_start', 'endB': 'gene_end',
                                'gname': 'gene_name', 'gid': 'gene_id'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [23]:
# Order rows by chromosome, SNP position, gene start and rs_id.
# Reassigning the sorted frame (instead of `inplace=True`) avoids sorting a
# slice in place, which raised SettingWithCopyWarning here.
# NOTE(review): `chrom` holds strings such as 'chr11', so this ordering is
# lexicographic rather than numeric by chromosome.
master = master.sort_values(['chrom', 'snp_pos', 'gene_start', 'rs_id'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [2]:
# Show the pairs supported by all three lines of evidence:
# a FitHiChIP loop, a colocalization hit and an eQTL association.
has_all_evidence = (
    master.has_fithichip_loop.eq(1)
    & master.is_coloc_pair.eq(1)
    & master.is_eqtl_pair.eq(1)
)
master[has_all_evidence]

NameError: name 'master' is not defined