In [1]:
import os 
import sys
import pybedtools as pbt
import pandas as pd
import numpy as np
import subprocess as sp

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')
bgzip = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip'
tabix = '/mnt/BioApps/tabix/tabix-0.2.6/tabix'

bedpe_6cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']
bedpe_10cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'name', 'score', 'strand1', 'strand2']

In [2]:
ln_clines = ['B_NAIVE', 'CD4_NAIVE', 'CD4_N_STIM', 'CD8_NAIVE',
                 'CD8_N_STIM', 'CLASSICAL_MONOCYTES', 'NK_CD16POS',
                 'NONCLASSICAL_MONOCYTES', 'TFH', 'TH1', 'TH1-17',
                 'TH17', 'TH2', 'TREG_MEMORY','TREG_NAIVE']

sn_clines = ['NB', 'CD4N', 'CD4S', 'CD8N', 
              'CD8S', 'CM', 'NK',
              'NCM', 'TFH', 'TH1', 'TH1-17', 
              'TH17', 'TH2', 'MTREG', 'NTREG']

project_gwas_list = ['T1D_34012112_Gaulton']

In [5]:
summary = []
for gwas in project_gwas_list:
    for ln, sn in zip(ln_clines, sn_clines):
    
        # path to the colocalization results
        coloc_fn = 'results/main/2021_Nikhil_eQTL/Results/Colocalization/'
        coloc_fn += '{}/DICE_eQTL_{}/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'
        coloc_fn = coloc_fn.format(gwas, ln)
        
        # path to the sgloops
        sgloop_fn = 'results/main/loop_analysis/{}/sgloop_summary.xlsx'.format(sn)
        
        print(coloc_fn)
        print(sgloop_fn)
        print()
        
        if os.path.exists(coloc_fn) and os.path.exists(sgloop_fn):
            
            print('Processing data for {} {}'.format(gwas, sn))
            
            # load coloc snp information
            snp_info = pd.read_table(coloc_fn)

            # load the sgloops
            sgloop_data = pd.read_excel(sgloop_fn, index_col=[0,1])
            sgloop_data.reset_index(inplace=True)

            # add the coloc data
            sgloop_data = sgloop_data.merge(snp_info, on='rs_id')
            
            sgloop_data['cell'] = sn

            summary.append(sgloop_data)
        

results/main/2021_Nikhil_eQTL/Results/Colocalization/T1D_34012112_Gaulton/DICE_eQTL_B_NAIVE/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed
results/main/loop_analysis/NB/sgloop_summary.xlsx

Processing data for T1D_34012112_Gaulton NB
results/main/2021_Nikhil_eQTL/Results/Colocalization/T1D_34012112_Gaulton/DICE_eQTL_CD4_NAIVE/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed
results/main/loop_analysis/CD4N/sgloop_summary.xlsx

Processing data for T1D_34012112_Gaulton CD4N
results/main/2021_Nikhil_eQTL/Results/Colocalization/T1D_34012112_Gaulton/DICE_eQTL_CD4_N_STIM/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed
results/main/loop_analysis/CD4S/sgloop_summary.xlsx

results/main/2021_Nikhil_eQTL/Results/Colocalization/T1D_34012112_Gaulton/DICE_eQTL_CD8_NAIVE/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed
results/main/loop_analysis/CD8N/sgloop_summary.xlsx

Processing data for T1D_34012112_Gaulton CD8N
results/main/2021_Nikhil_eQTL/Results/Colocalization/T1D_34012112_Gaulton/DICE_eQTL_CD8_N_STIM/FINAL_Summary_Coloc_Gene_SNP

In [6]:
summary = pd.concat(summary)

In [7]:
summary.rename(columns={'coloc_egene': 'egene', 'sg_gene': 'near_gene'}, inplace=True)

In [8]:
summary.head(20)

Unnamed: 0,rs_id,egene,near_gene,coloc,fithichip,spp,chr,pos,pp_H0_Coloc_Summary,pp_H1_Coloc_Summary,...,alt,AC,AF,AN,slope_se_snp,slope_gwas,slope_se_gwas,pval_nominal,SampleSize,cell
0,rs11066156,ERP29,ERP29,1,0,1,chr12,112523970,0.0,0.0,...,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.54e-11,405537,NB
1,rs11066156,ERP29,AC003029.1,0,0,0,chr12,112523970,0.0,0.0,...,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.54e-11,405537,NB
2,rs11066156,ERP29,ACAD10,0,0,1,chr12,112523970,0.0,0.0,...,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.54e-11,405537,NB
3,rs11066156,ERP29,ADAM1A,0,0,0,chr12,112523970,0.0,0.0,...,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.54e-11,405537,NB
4,rs11066156,ERP29,ADAM1B,0,0,0,chr12,112523970,0.0,0.0,...,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.54e-11,405537,NB
5,rs11066156,ERP29,ALDH2,0,0,1,chr12,112523970,0.0,0.0,...,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.54e-11,405537,NB
6,rs11066156,ERP29,ATXN2,0,0,1,chr12,112523970,0.0,0.0,...,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.54e-11,405537,NB
7,rs11066156,ERP29,BRAP,0,0,1,chr12,112523970,0.0,0.0,...,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.54e-11,405537,NB
8,rs11066156,ERP29,ENSG00000200135.1,0,0,0,chr12,112523970,0.0,0.0,...,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.54e-11,405537,NB
9,rs11066156,ERP29,ENSG00000200688.1,0,0,0,chr12,112523970,0.0,0.0,...,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.54e-11,405537,NB


In [9]:
new_order = ['rs_id',
             'egene',
             'near_gene',
             'coloc',
             'fithichip',
             'spp',
             'cell',
             'chr',
             'pos',
             'pp_H0_Coloc_Summary',
             'pp_H1_Coloc_Summary',
             'pp_H2_Coloc_Summary',
             'pp_H3_Coloc_Summary',
             'pp_H4_Coloc_Summary',
             'variant_id',
             'geneName',
             'dist',
             'pvalue',
             'FDR',
             'slope_snp',
             'ref',
             'alt',
             'AC',
             'AF',
             'AN',
             'slope_se_snp',
             'slope_gwas',
             'slope_se_gwas',
             'pval_nominal',
             'SampleSize',
            ]

summary = summary.loc[:, new_order]

In [10]:
summary

Unnamed: 0,rs_id,egene,near_gene,coloc,fithichip,spp,cell,chr,pos,pp_H0_Coloc_Summary,...,ref,alt,AC,AF,AN,slope_se_snp,slope_gwas,slope_se_gwas,pval_nominal,SampleSize
0,rs11066156,ERP29,ERP29,1,0,1,NB,chr12,112523970,0.000000e+00,...,C,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.540000e-11,405537
1,rs11066156,ERP29,AC003029.1,0,0,0,NB,chr12,112523970,0.000000e+00,...,C,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.540000e-11,405537
2,rs11066156,ERP29,ACAD10,0,0,1,NB,chr12,112523970,0.000000e+00,...,C,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.540000e-11,405537
3,rs11066156,ERP29,ADAM1A,0,0,0,NB,chr12,112523970,0.000000e+00,...,C,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.540000e-11,405537
4,rs11066156,ERP29,ADAM1B,0,0,0,NB,chr12,112523970,0.000000e+00,...,C,T,1882,0.375799,5008,0.000881,-0.16883,0.025851,6.540000e-11,405537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,rs60254670,CTSH,RP11-650L12.1,0,0,0,TFH,chr15,79229959,6.557898e-297,...,TGGCCAGAATG,T,705,0.140775,5008,0.002740,-0.19388,0.021758,5.060000e-19,520580
336,rs60254670,CTSH,RP11-650L12.2,0,0,0,TFH,chr15,79229959,6.557898e-297,...,TGGCCAGAATG,T,705,0.140775,5008,0.002740,-0.19388,0.021758,5.060000e-19,520580
337,rs60254670,CTSH,RPL18P11,0,0,0,TFH,chr15,79229959,6.557898e-297,...,TGGCCAGAATG,T,705,0.140775,5008,0.002740,-0.19388,0.021758,5.060000e-19,520580
338,rs60254670,CTSH,RPL21P116,0,0,0,TFH,chr15,79229959,6.557898e-297,...,TGGCCAGAATG,T,705,0.140775,5008,0.002740,-0.19388,0.021758,5.060000e-19,520580


## List of cells shared by a colocalized SNP and fithichip loop

In [11]:
summary[summary.fithichip == 1]

Unnamed: 0,rs_id,egene,near_gene,coloc,fithichip,spp,cell,chr,pos,pp_H0_Coloc_Summary,...,ref,alt,AC,AF,AN,slope_se_snp,slope_gwas,slope_se_gwas,pval_nominal,SampleSize
34,rs11171739,RPS26,NABP2,0,1,1,NB,chr12,56470625,0.0,...,C,T,2781,0.555312,5008,0.000107,-0.221374,0.014087,1.19e-55,520580
35,rs11171739,RPS26,RNF41,0,1,1,NB,chr12,56470625,0.0,...,C,T,2781,0.555312,5008,0.000107,-0.221374,0.014087,1.19e-55,520580
111,rs1131936,BTN3A1,HIST1H2BD,0,1,1,NB,chr6,26394320,0.0,...,A,G,1477,0.294928,5008,0.000351,-0.122869,0.021271,7.63e-09,520580
112,rs1131936,BTN3A1,HIST1H2BD,0,1,1,NB,chr6,26394320,0.0,...,A,G,1477,0.294928,5008,0.000351,-0.122869,0.021271,7.63e-09,520580
113,rs1131936,BTN3A1,HIST1H3A,0,1,1,NB,chr6,26394320,0.0,...,A,G,1477,0.294928,5008,0.000351,-0.122869,0.021271,7.63e-09,520580
114,rs1131936,BTN3A1,HIST1H3A,0,1,1,NB,chr6,26394320,0.0,...,A,G,1477,0.294928,5008,0.000351,-0.122869,0.021271,7.63e-09,520580
115,rs1131936,BTN3A1,HIST1H4A,0,1,1,NB,chr6,26394320,0.0,...,A,G,1477,0.294928,5008,0.000351,-0.122869,0.021271,7.63e-09,520580
116,rs1131936,BTN3A1,HIST1H4A,0,1,1,NB,chr6,26394320,0.0,...,A,G,1477,0.294928,5008,0.000351,-0.122869,0.021271,7.63e-09,520580
245,rs12149160,RMI2,RMI2,1,1,1,NB,chr16,11439303,0.0,...,G,T,1806,0.360623,5008,0.000656,0.098787,0.014092,2.38e-12,520580
246,rs12149160,RMI2,MIR548H2,0,1,0,NB,chr16,11439303,0.0,...,G,T,1806,0.360623,5008,0.000656,0.098787,0.014092,2.38e-12,520580


In [12]:
shared_sgloops = summary[summary.fithichip == 1].groupby(['rs_id', 'near_gene'])
shared_sgloops = summary[summary.fithichip == 1].groupby(['rs_id', 'near_gene'])

shared_sgloops = shared_sgloops.cell.value_counts()

In [13]:
shared_sgloops

rs_id        near_gene      cell
rs10876864   CD63           NK      1
             RP11-644F5.11  NK      1
rs11171739   NABP2          NB      1
             RNF41          NB      1
rs1131936    HIST1H2BD      NB      2
             HIST1H3A       NB      2
             HIST1H4A       NB      2
rs12149160   MIR548H2       NB      1
             RMI2           NB      1
             SNN            NB      1
             ZC3H7A         NB      1
rs1310183    HIPK1          NB      1
             RP5-1073O3.7   NB      1
rs138612994  RP11-705C15.2  TFH     1
             RP11-705C15.4  TFH     1
             RP11-705C15.5  TFH     1
rs2816316    RP5-1011O1.2   NB      1
             TROVE2         NB      1
             UCHL5          NB      1
             snoU109        NB      1
rs2847258    PTPN2          NK      1
             SEH1L          NK      1
rs439558     CRHR1          CD8N    1
rs479777     FERMT3         CM      1
             PYGM           CD4N    1
             RASG

In [18]:
shared_sgloops = summary[summary.fithichip == 1].groupby(['rs_id', 'near_gene'])
shared_sgloops = summary[summary.fithichip == 1].groupby(['rs_id', 'near_gene'])

shared_sgloops = shared_sgloops.cell.unique()
shared_sgloops = shared_sgloops.to_frame().reset_index()
shared_sgloops = shared_sgloops.merge(snp_info, on='rs_id')
new_order = ['rs_id',
 'geneName',
 'near_gene',
 'cell',
 'chr',
 'pos',
 'pp_H0_Coloc_Summary',
 'pp_H1_Coloc_Summary',
 'pp_H2_Coloc_Summary',
 'pp_H3_Coloc_Summary',
 'pp_H4_Coloc_Summary',
 'variant_id',
 'dist',
 'pvalue',
 'FDR',
 'slope_snp',
 'ref',
 'alt',
 'AC',
 'AF',
 'AN',
 'slope_se_snp',
 'slope_gwas',
 'slope_se_gwas',
 'pval_nominal',
 'SampleSize']
shared_sgloops = shared_sgloops.loc[:, new_order]
shared_sgloops.rename(columns={'geneName': 'egene'}, inplace=True)

In [19]:
shared_sgloops.set_index(['rs_id', 'egene'])

Unnamed: 0_level_0,Unnamed: 1_level_0,near_gene,cell,chr,pos,pp_H0_Coloc_Summary,pp_H1_Coloc_Summary,pp_H2_Coloc_Summary,pp_H3_Coloc_Summary,pp_H4_Coloc_Summary,variant_id,...,ref,alt,AC,AF,AN,slope_se_snp,slope_gwas,slope_se_gwas,pval_nominal,SampleSize
rs_id,egene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
rs10876864,RPS26,CD63,[NK],chr12,56401085,0.0,0.0,6.651247e-28,0.0,1.0,rs10876864,...,G,A,2796,0.558307,5008,4.2e-05,-0.226848,0.014116,4.16e-58,520580
rs10876864,RPS26,RP11-644F5.11,[NK],chr12,56401085,0.0,0.0,6.651247e-28,0.0,1.0,rs10876864,...,G,A,2796,0.558307,5008,4.2e-05,-0.226848,0.014116,4.16e-58,520580
rs138612994,RP11-75L1.1,RP11-705C15.2,[TFH],chr12,9834209,0.0,0.0,0.01850354,0.0,0.981496,rs138612994:9834209:A:AT,...,A,AT,2429,0.485024,5008,0.000613,-0.085572,0.013969,9.03e-10,520580
rs138612994,RP11-75L1.1,RP11-705C15.4,[TFH],chr12,9834209,0.0,0.0,0.01850354,0.0,0.981496,rs138612994:9834209:A:AT,...,A,AT,2429,0.485024,5008,0.000613,-0.085572,0.013969,9.03e-10,520580
rs138612994,RP11-75L1.1,RP11-705C15.5,[TFH],chr12,9834209,0.0,0.0,0.01850354,0.0,0.981496,rs138612994:9834209:A:AT,...,A,AT,2429,0.485024,5008,0.000613,-0.085572,0.013969,9.03e-10,520580
rs479777,AP003774.1,FERMT3,[CM],chr11,64107477,0.0,0.0,0.0331612,0.0,0.966839,rs479777,...,T,C,1094,0.21845,5008,0.000629,-0.099132,0.015066,4.71e-11,520580
rs479777,AP003774.1,PYGM,[CD4N],chr11,64107477,0.0,0.0,0.0331612,0.0,0.966839,rs479777,...,T,C,1094,0.21845,5008,0.000629,-0.099132,0.015066,4.71e-11,520580
rs479777,AP003774.1,RASGRP2,[CD4N],chr11,64107477,0.0,0.0,0.0331612,0.0,0.966839,rs479777,...,T,C,1094,0.21845,5008,0.000629,-0.099132,0.015066,4.71e-11,520580
rs479777,AP003774.1,STIP1,[CM],chr11,64107477,0.0,0.0,0.0331612,0.0,0.966839,rs479777,...,T,C,1094,0.21845,5008,0.000629,-0.099132,0.015066,4.71e-11,520580


# Make the master table

In [388]:
import os 
import sys
import pybedtools as pbt
import pandas as pd
import numpy as np
import subprocess as sp
import json
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')
pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')
bgzip = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip'
tabix = '/mnt/BioApps/tabix/tabix-0.2.6/tabix'

bedpe_6cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']
bedpe_10cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'name', 'score', 'strand1', 'strand2']

# default values for the command line
sys.argv = [0] * 7
sys.argv[1] =  'results/main/2021_Nikhil_eQTL/Results/Colocalization/T1D_34012112_Gaulton/'
sys.argv[1] += 'BLUEPRINT_eQTL_Monocyte/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'
sys.argv[2] = 'results/refs/ensembl/gencode.v19.annotation.bed'
sys.argv[3] = 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CM/FitHiChIP_L/FitHiChIP.interactions_FitHiC_Q0.01.bed'
sys.argv[4] = 'results/refs/spp/SPP_D-Challenge_networks.xlsx'
sys.argv[5] = 'results/refs/hg19/hg19.chrom.sizes'
sys.argv[6] = 'results/main/loop_analysis/washU/'

# parsing the commandline arguments
coloc_fn = sys.argv[1]
genes_fn = sys.argv[2]
loop_fn = sys.argv[3]
spp_fn = sys.argv[4]
gs_fn = sys.argv[5]
outdir = sys.argv[6]

# setting the output file names
os.makedirs(outdir, exist_ok=True)
summary_fn = os.path.join(outdir, 'sgloop_summary.xlsx')
sg_pairs_fn = os.path.join(outdir, 'gs_pairs.longrange.bed')
sg_loops_fn = os.path.join(outdir, 'gs_loops.longrange.bed')

## Load the colocalization data

In [389]:
# load the colocalization data
coloc = pd.read_table(coloc_fn)

# extract the most significant according the H4 
coloc_sig_df = coloc[coloc['pp_H4_Coloc_Summary'] > 0.75]
coloc_sig_df.rename(columns={'pos': 'end'}, inplace=True)
coloc_sig_df.loc[:, 'start'] = coloc_sig_df.loc[:, 'end'] - 1
coloc_sig_full = coloc_sig_df.copy(deep=True)

In [390]:
coloc_sig_df = coloc_sig_df[['chr', 'start', 'end', 'rs_id', 'variant_id']]
coloc_sig_df = coloc_sig_df.loc[~coloc_sig_df.duplicated(subset='rs_id'),]
coloc_sig_pbt = pbt.BedTool.from_dataframe(coloc_sig_df.iloc[:, 0:4]).sort()

In [391]:
#csnp_slop_pbt = coloc_sig_pbt.slop(b=500000, g=gs_fn)

## Load the gene data

In [392]:
# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gene_name']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# extract just the genes
genes_df = gencode.loc[gencode.type.isin(['gene'])]
genes_df = genes_df.loc[~genes_df.duplicated(subset='gene_id'), :]
genes_df.loc[:, 'chrom'] = genes_df['chrom'].astype(str)
genes_df = genes_df.iloc[:, [0,1,2,6,5]]
genes_pbt = pbt.BedTool.from_dataframe(genes_df).sort()

In [393]:
genes_df.head()

Unnamed: 0,chrom,start,end,gene_name,gene_id
0,chr1,11869,14412,DDX11L1,ENSG00000223972.4
21,chr1,14363,29806,WASH7P,ENSG00000227232.4
82,chr1,29554,31109,MIR1302-11,ENSG00000243485.2
92,chr1,34554,36081,FAM138A,ENSG00000237613.2
100,chr1,52473,54936,OR4G4P,ENSG00000268020.2


## Find the closest gene

In [394]:
closest_gene = coloc_sig_pbt.closest(genes_pbt, d=True)
closest_gene = closest_gene.to_dataframe()
closest_gene = closest_gene.iloc[:, [3,7,8,9]]
closest_gene.columns = ['rs_id', 'cls_gname', 'cls_id', 'cls_dist']
closest_gene.head()

Unnamed: 0,rs_id,cls_gname,cls_id,cls_dist
0,rs11102694,AP4B1-AS1,ENSG00000226167.1,0
1,rs11102694,BCL2L15,ENSG00000188761.7,0
2,rs2760530,RP5-1011O1.2,ENSG00000232498.1,2149
3,rs479777,CCDC88B,ENSG00000168071.17,219
4,rs7956831,CLECL1,ENSG00000184293.3,3262


In [395]:
uniq_cls_gname = closest_gene.groupby(['rs_id']).cls_gname.apply(lambda x: ','.join(x))
uniq_cls_ids = closest_gene.groupby(['rs_id']).cls_id.apply(lambda x: ','.join(x))
uniq_cls_dist = closest_gene.groupby(['rs_id']).cls_dist.apply(lambda x: ','.join([str(i) for i in x]))
uniq_cls = pd.merge(uniq_cls_gname, uniq_cls_ids, left_index=True, right_index=True)
uniq_cls = pd.merge(uniq_cls, uniq_cls_dist, left_index=True, right_index=True)
uniq_cls.reset_index(inplace=True)

In [396]:
uniq_cls

Unnamed: 0,rs_id,cls_gname,cls_id,cls_dist
0,rs10085721,SKAP2,ENSG00000005020.8,0
1,rs11102694,"AP4B1-AS1,BCL2L15","ENSG00000226167.1,ENSG00000188761.7",0
2,rs112436750,MAPT-AS1,ENSG00000264589.1,0
3,rs112445263,PRKD2,ENSG00000105287.8,0
4,rs1131017,RPS26,ENSG00000197728.5,0
5,rs1790974,"CD226,DOK6","ENSG00000150637.4,ENSG00000206052.6",0
6,rs2289702,CTSH,ENSG00000103811.11,0
7,rs2760530,RP5-1011O1.2,ENSG00000232498.1,2149
8,rs3216621,BAK1,ENSG00000030110.8,71
9,rs35662477,MAPKAPK5,ENSG00000089022.9,0


## Find all genes +/- 500kb

In [397]:
# get a list of gene names within +- 500kb of the SNPs
fivekb_gnames = coloc_sig_pbt.slop(b=500000, g=gs_fn)
fivekb_gnames = fivekb_gnames.map(genes_pbt, c=4, o='collapse')
fivekb_gnames = fivekb_gnames.to_dataframe()
fivekb_gnames = fivekb_gnames.iloc[:, [3,4]]
fivekb_gnames.columns = ['rs_id', 'gene_name']

# get a list of gene ids within +- 500kb of the SNPs
fivekb_gids = coloc_sig_pbt.slop(b=500000, g=gs_fn)
fivekb_gids = fivekb_gids.map(genes_pbt, c=5, o='collapse')
fivekb_gids = fivekb_gids.to_dataframe()
fivekb_gids = fivekb_gids.iloc[:, [3,4]]
fivekb_gids.columns = ['rs_id', 'gene_id']

# merge the two above results
fivekb_genes = fivekb_gnames.merge(fivekb_gids, on='rs_id')
fivekb_genes.columns = ['rs_id', '5kb_gname', '5kb_gid']

In [398]:
# get eQTL's
eqtl_fn = 'results/main/2021_Nikhil_eQTL/Data/eqtl_sqtl_summ_stats/BLUEPRINT_eQTL/Monocyte.txt.gz'
eqtls = pd.read_table(eqtl_fn)
eqtls.columns = ['eqtl_gname', 'nvar', 'shape1', 'shape2', 'dummy',
                 'sid', 'dist', 'npval', 'slope', 'ppval', 'bpval', 'qval']

## Generate the master table

In [443]:
master = coloc_sig_full.copy()

# add sid which is the chr:position of the SNP
master['sid'] = master['chr'].str.replace('chr', '') + ':' +  master['end'].astype(str)

# add the closest gene
master = master.merge(uniq_cls, on='rs_id', how='left')

# add the +/- fivekb genes 
master = master.merge(fivekb_genes, on='rs_id', how='left')

# add the eQTL data
eqtl_genes = master.merge(eqtls[['sid', 'eqtl_gname']], on='sid')
eqtl_genes = eqtl_genes.groupby('rs_id').eqtl_gname.unique()
eqtl_genes = eqtl_genes.apply(lambda x: ','.join(x))
master = master.merge(eqtl_genes, on='rs_id', how='left')

new_colnames = [
 'rs_id',
 'variant_id',
 'chr',
 'start',
 'end',           
 'geneName',  
 'eqtl_gname',
 'cls_gname',

 '5kb_gname',

 'pp_H0_Coloc_Summary',
 'pp_H1_Coloc_Summary',
 'pp_H2_Coloc_Summary',
 'pp_H3_Coloc_Summary',
 'pp_H4_Coloc_Summary',           
 'dist',
 'pvalue',
 'FDR',
 'slope_snp',
 'ref',
 'alt',
 'AC',
 'AF',
 'AN',
 'slope_se_snp',
 'slope_gwas',
 'slope_se_gwas',
 'pval_nominal',
 'SampleSize',
 'sid',
 'cls_id',
 'cls_dist',
 '5kb_gid'
]
master = master.loc[:, new_colnames]
master.rename(columns={'geneName': 'coloc_gname',
                       'end': 'pos',
                       'eqtl_gname': 'eqtl_gnames', 
                       'cls_gname': 'cls_gnames', 
                       'cls_id': 'cls_ids'}, inplace=True)
master.drop(['start'], axis=1, inplace=True)

In [444]:
master

Unnamed: 0,rs_id,variant_id,chr,pos,coloc_gname,eqtl_gnames,cls_gnames,5kb_gname,pp_H0_Coloc_Summary,pp_H1_Coloc_Summary,...,AN,slope_se_snp,slope_gwas,slope_se_gwas,pval_nominal,SampleSize,sid,cls_ids,cls_dist,5kb_gid
0,rs11102694,rs11102694:114426001:G:A,chr1,114426001,AP4B1,AP4B1,"AP4B1-AS1,BCL2L15","MAGI3,RP11-512F24.1,MTND5P20,PHTF1,RP4-730K3.3...",1.1462089999999999e-195,1.0297180000000001e-106,...,5008,0.004482,0.379376,0.017082,2.7799999999999997e-109,520580,1:114426001,"ENSG00000226167.1,ENSG00000188761.7",0,"ENSG00000081026.14,ENSG00000232499.2,ENSG00000..."
1,rs2760530,rs2760530,chr1,192538496,RGS1,RGS1,RP5-1011O1.2,"RGS18,RP11-142L4.3,RP11-142L4.2,RGS21,AL136987...",0.0,0.0,...,5008,0.001806,0.087511,0.015961,4.19e-08,520580,1:192538496,ENSG00000232498.1,2149,"ENSG00000150681.5,ENSG00000226723.2,ENSG000002..."
2,rs11102694,rs11102694:114426001:G:A,chr1,114426001,AP4B1,AP4B1,"AP4B1-AS1,BCL2L15","MAGI3,RP11-512F24.1,MTND5P20,PHTF1,RP4-730K3.3...",2.717076e-149,1.152067e-80,...,5008,0.004482,0.379376,0.017082,2.7799999999999997e-109,520580,1:114426001,"ENSG00000226167.1,ENSG00000188761.7",0,"ENSG00000081026.14,ENSG00000232499.2,ENSG00000..."
3,rs9467740,rs9467740:26383250:A:T,chr6,26383250,BTN2A2,BTN2A2,BTN2A2,"SLC17A2,TRIM38,U91328.21,U91328.20,U91328.19,U...",0.0,0.0,...,5008,0.003806,0.114649,0.019235,2.51e-09,520580,6:26383250,ENSG00000124508.12,75,"ENSG00000112337.6,ENSG00000112343.8,ENSG000002..."
4,rs3216621,rs3216621:33548090:A:AG,chr6,33548090,BAK1,BAK1,BAK1,"HLA-DPA1,HLA-DPB1,HLA-DPA2,COL11A2P1,HLA-DPB2,...",0.0,0.0,...,5008,0.002417,0.145697,0.01575,2.23e-20,520580,6:33548090,ENSG00000030110.8,71,"ENSG00000231389.3,ENSG00000223865.6,ENSG000002..."
5,rs3216621,rs3216621:33548090:A:AG,chr6,33548090,BAK1,BAK1,BAK1,"HLA-DPA1,HLA-DPB1,HLA-DPA2,COL11A2P1,HLA-DPB2,...",0.0,0.0,...,5008,0.002417,0.145697,0.01575,2.23e-20,520580,6:33548090,ENSG00000030110.8,71,"ENSG00000231389.3,ENSG00000223865.6,ENSG000002..."
6,rs9467740,rs9467740:26383250:A:T,chr6,26383250,BTN2A2,BTN2A2,BTN2A2,"SLC17A2,TRIM38,U91328.21,U91328.20,U91328.19,U...",0.0,0.0,...,5008,0.003806,0.114649,0.019235,2.51e-09,520580,6:26383250,ENSG00000124508.12,75,"ENSG00000112337.6,ENSG00000112343.8,ENSG000002..."
7,rs10085721,rs10085721:26819917:T:A,chr7,26819917,SKAP2,SKAP2,SKAP2,"SNX10,AC004540.4,AC004540.5,KIAA0087,AC004947....",3.847042e-10,1.18121e-06,...,5008,0.012783,-0.099421,0.014881,2.37e-11,520580,7:26819917,ENSG00000005020.8,0,"ENSG00000086300.11,ENSG00000225792.1,ENSG00000..."
8,rs479777,rs479777,chr11,64107477,AP003774.1,AP003774.1,CCDC88B,"MARK2,RNU6-1306P,RCOR2,NAA40,RNU6-45P,COX8A,AP...",0.0,0.0,...,5008,0.002882,-0.099132,0.015066,4.71e-11,520580,11:64107477,ENSG00000168071.17,219,"ENSG00000072518.16,ENSG00000202089.1,ENSG00000..."
9,rs35662477,rs35662477:112313208:A:T,chr12,112313208,TMEM116,TMEM116,MAPKAPK5,"RP3-473L9.4,HSPA8P14,SH2B3,ATXN2,AC002395.1,U7...",0.0,0.0,...,5008,0.001181,-0.185236,0.026215,1.6e-12,405537,12:112313208,ENSG00000089022.9,0,"ENSG00000257595.2,ENSG00000257539.2,ENSG000001..."
