In [1]:
import os 
import pandas as pd
import subprocess
import glob
import pybedtools as pbt 
from IPython.display import HTML

# Point pybedtools at the bedtools binaries located in the hic_tls conda env's bin directory.
pbt.set_bedtools_path('/mnt/BioHome/jreyna/software/anaconda3/envs/hic_tls/bin/')

# hg19 chromosome sizes file (relative path: it only resolves after the os.chdir below)
# and the size, in bp, added to each loop anchor start to compute its end.
gsizes = 'results/refs/hg19/hg19.chrom.sizes'
res = 10000

# Every relative path in this notebook is resolved against this project root.
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

# make the directory to save our data
outdir = 'results/main/gwas_pieqtls/2021_chiou_et_al/2021_chandra_et_al/'
os.makedirs(outdir, exist_ok=True)
# Names of the six coordinate fields of a BEDPE-like record (anchor A, then anchor B);
# reused later to label the loop columns.
bedpe_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']

## Load Fine-Mapped GWAS

In [2]:
# Fine-mapped GWAS table; one row per marker.
gwas = 'results/main/gwas/2021_chiou_et_al/gwas.finemapping.supp_table3.tsv'
gwas_df = pd.read_table(gwas)

# Force integer chromosome/position columns so they can later be matched
# against the integer coordinates coming back from bedtools.
gwas_df['Chrom.'] = gwas_df['Chrom.'].astype(int)
gwas_df['Position (hg19)'] = gwas_df['Position (hg19)'].astype(int)
gwas_df['Position (hg38)'] = gwas_df['Position (hg38)'].astype(int)

# Build a three-column BED frame from columns 1 and 2 of the table, with
# column 2 used for both start and end. The explicit .copy() makes gwas_bed an
# independent frame, so the in-place edit of 'start' below no longer raises
# pandas' SettingWithCopyWarning.
gwas_bed = gwas_df.iloc[:, [1,2,2]].copy()
# Alternative that takes column 3 instead of column 2 for the position:
#gwas_bed = gwas_df.iloc[:, [1,3,3]]
gwas_bed.columns = ['chrom', 'start', 'end']
# Shift the start back by one so each SNP becomes the interval (pos - 1, pos).
gwas_bed['start'] = gwas_bed['start'] - 1
gwas_pbt = pbt.BedTool.from_dataframe(gwas_bed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gwas_bed['start'] = gwas_bed['start'] - 1


## Intersect Fine-Mapped GWAS SNPs with HiChIP Loops

In [3]:
def parse_seB(x):
    """Pull the start and end coordinates out of a combined anchor field.

    Parameters
    ----------
    x : str
        Text shaped like ``<chrom>:<start>-<end>`` optionally followed by a
        comma and further content, e.g. ``'chr1:100-200,5'``.

    Returns
    -------
    tuple of (str, str)
        ``(start, end)`` as strings; nothing is converted to int here.
        Anything after the first comma in the end part is discarded.
    """
    # second colon-separated piece holds "<start>-<end>[,...]"
    coords = x.split(':')[1]
    start, end = coords.split('-')
    # keep only what precedes the first comma
    end = end.partition(',')[0]
    return start, end

In [4]:
# Collect the FitHiChIP WashU-format loop file found under each
# sub-directory of FitHiChIP_Loops (the '*' matches the sub-directory name).
loops = glob.glob(
    'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/'
    '*/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz'
)

In [5]:
# Read every loop file into a BEDPE-like table tagged with the name of the
# directory it came from, and collect the tables in loop_data.
loop_data = []
for loop in loops:
    print(loop)

    # The label is the directory two levels above the file
    # (<label>/FitHiChIP_S/<file>). Deriving it from the path structure rather
    # than a fixed component index keeps this working if the base directory
    # gains or loses a level.
    cline = os.path.basename(os.path.dirname(os.path.dirname(loop)))
    df = pd.read_table(loop, header=None)
    df.columns = ['chrom', 'startA', 'endA', 'seB', 'e1', 'e2']

    # strip the literal 'chr' prefix (plain-text replace, not a regex)
    df['chrom'] = df['chrom'].str.replace('chr', '', regex=False)

    # anchor B coordinates are embedded in the 'seB' text field; parse_seB
    # returns them as strings, so startB is cast before doing arithmetic
    df['startB'], df['endB'] = zip(*df['seB'].apply(parse_seB))
    df['startB'] = df['startB'].astype(int)

    # NOTE(review): both anchors are rebuilt as [start + 1, start + 1 + res];
    # the end coordinates read from the file are discarded. Confirm this is
    # the intended anchor definition for this file format.
    df['startA'] = df['startA'] + 1
    df['endA'] = df['startA'] + res

    df['startB'] = df['startB'] + 1
    df['endB'] = df['startB'] + res

    # re-organize the data into bedpe-like (chrom is repeated for anchor B);
    # .copy() yields an independent frame so adding a column below does not
    # trigger pandas' SettingWithCopyWarning
    df = df.iloc[:, [0,1,2,0,6,7,3,4,5]].copy()

    # add cell type
    df['cline'] = cline

    loop_data.append(df)

results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TREGMEM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TH2/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NCM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TREGNAIVE/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TH1/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD8N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/THSTAR/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NB/Fit

In [6]:
# Stack the per-file loop tables into a single frame.
loop_df = pd.concat(loop_data, axis=0)

# Keep the six leading coordinate columns plus the last column (the
# 'cline' label appended during loading) and convert to a BedTool.
loop_bed = loop_df.iloc[:, list(range(6)) + [-1]]
loop_pbt = pbt.BedTool.from_dataframe(loop_bed)

In [7]:
# Intersect the paired loop records with the SNP intervals.
# type='either' is forwarded to bedtools pairToBed; per the bedtools docs this
# reports a pair when either of its ends overlaps a feature -- TODO confirm.
intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt, type='either')
# Disabled variant: same intersection after padding the SNP intervals via slop(b=100000).
#intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt.slop(b=100000, g=gsizes), type='either')
# No column names are passed, so the frame gets generic BED-style headers
# (chrom, start, end, name, ...) that do not describe these columns; they are
# relabelled in a later cell.
gwas_hichip = intersect_pbt.to_dataframe()

In [8]:
# Inspect the raw intersection: loop coordinates and cell-line label first,
# followed by the overlapping SNP interval (chrom, start, end).
gwas_hichip

Unnamed: 0,chrom,start,end,name,score,strand,thickStart,thickEnd,itemRgb,blockCount
0,1,25292500,25302500,1,25357500,25367500,TREGMEM,1,25296742,25296743
1,1,114372500,114382500,1,114447500,114457500,TREGMEM,1,114377567,114377568
2,1,114372500,114382500,1,114472500,114482500,TREGMEM,1,114377567,114377568
3,1,114377500,114387500,1,114447500,114457500,TREGMEM,1,114377567,114377568
4,1,114377500,114387500,1,114472500,114482500,TREGMEM,1,114377567,114377568
...,...,...,...,...,...,...,...,...,...,...
4293,8,141607500,141617500,8,141642500,141652500,TH17,8,141616182,141616183
4294,8,141607500,141617500,8,141647500,141657500,TH17,8,141616182,141616183
4295,8,141607500,141617500,8,141802500,141812500,TH17,8,141616182,141616183
4296,8,141607500,141617500,8,142137500,142147500,TH17,8,141616182,141616183


In [9]:
# Move the three SNP columns (positions 7-9) to the front, ahead of the
# six loop coordinate columns and the cell-line label.
snp_first = gwas_hichip.iloc[:, [7, 8, 9] + list(range(7))]

# Give every column a descriptive name: *_snp for the SNP interval,
# *_loop for the loop anchors and the cell line.
loop_cols = [col + '_loop' for col in bedpe_cols]
snp_first.columns = ['chr_snp', 'start_snp', 'end_snp'] + loop_cols + ['cline_loop']

# Attach the GWAS annotations by matching chromosome and hg19 position
# (the SNP interval end), then keep a single position column for the SNP.
gwas_hichip = (
    snp_first
    .merge(gwas_df, left_on=['chr_snp', 'end_snp'], right_on=['Chrom.', 'Position (hg19)'])
    .drop(columns='start_snp')
    .rename(columns={'end_snp': 'position_snp'})
)

In [10]:
# Preview the SNP-loop table after the GWAS annotation columns were merged in.
gwas_hichip.head()

Unnamed: 0,chr_snp,position_snp,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,Marker,Chrom.,Position (hg19),Position (hg38),Allele Ref.,Alt.,Signal name,Alt.AF,PPA,Previous studies Index (r2),PMID
0,1,25296743,1,25292500,25302500,1,25357500,25367500,TREGMEM,rs10751776,1,25296743,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,
1,1,25296743,1,25257500,25267500,1,25292500,25302500,TH2,rs10751776,1,25296743,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,
2,1,25296743,1,25292500,25302500,1,25347500,25357500,TH2,rs10751776,1,25296743,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,
3,1,25296743,1,25292500,25302500,1,25357500,25367500,TH2,rs10751776,1,25296743,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,
4,1,25296743,1,25292500,25302500,1,25357500,25367500,NCM,rs10751776,1,25296743,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,


In [11]:
# Count the rows of the SNP-loop table per cell-line label and keep the counts as a one-column frame.
summary = gwas_hichip['cline_loop'].value_counts().to_frame()

In [12]:
# The header holds a literal backslash + 'n' (two characters, not a newline);
# the display cell that follows swaps it for an HTML line break.
summary.columns = ['Number of HiChIP loops\\nOverlapping T1D GWAS']

In [13]:
# Render the summary as an HTML table, replacing the literal backslash-n
# marker in the text with a <br> tag.
display(HTML(summary.to_html().replace("\\n","<br>")))

Unnamed: 0,Number of HiChIP loops Overlapping T1D GWAS
NB,673
NK,494
CD4N,460
TH17,374
NCM,324
CD8N,319
TH2,270
CM,266
TH1,254
TREGMEM,249
