In [1]:
import os 
import pandas as pd
import subprocess
import glob
import pybedtools as pbt 
from IPython.display import HTML

pbt.set_bedtools_path('/mnt/BioHome/jreyna/software/anaconda3/envs/hic_tls/bin/')

gsizes = 'results/refs/hg19/hg19.chrom.sizes'
res = 10000

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

# make the directory to save our data
outdir = 'results/main/gwas_pieqtls/2021_chiou_et_al/2021_chandra_et_al/'
os.makedirs(outdir, exist_ok=True)
bedpe_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']

## Load Fine Mapped GWAS

In [2]:
gwas = 'results/main/gwas/2021_chiou_et_al/gwas.finemapping.supp_table3.tsv'
gwas_df = pd.read_table(gwas)

gwas_df['Chrom.'] = gwas_df['Chrom.'].astype(int)
gwas_df['Position (hg19)'] = gwas_df['Position (hg19)'].astype(int)
gwas_df['Position (hg38)'] = gwas_df['Position (hg38)'].astype(int)

gwas_bed = gwas_df.iloc[:, [1,2,2]]
#gwas_bed = gwas_df.iloc[:, [1,3,3]]
gwas_bed.columns = ['chrom', 'start', 'end']
gwas_bed['start'] = gwas_bed['start'] - 1
gwas_pbt = pbt.BedTool.from_dataframe(gwas_bed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gwas_bed['start'] = gwas_bed['start'] - 1


## Intersect Fine Mapped GWAS and loops

In [3]:
def parse_seB(x):
    
    s,e = x.split(':')[1].split('-')
    e = e.split(',')[0]
    return((s,e))

In [4]:
loops = 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/'
loops += '*/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz'
loops = glob.glob(loops)

In [5]:
loop_data = []
for loop in loops:
    print(loop)
    
    cline = loop.split('/')[5]
    df = pd.read_table(loop, header=None)    
    df.columns = ['chrom', 'startA', 'endA', 'seB', 'e1', 'e2']
    
    df['chrom'] = df['chrom'].str.replace('chr', '')
    
    df['startB'], df['endB'] = zip(*df['seB'].apply(parse_seB))
    df['startB'] = df['startB'].astype(int)
    
    df['startA'] = df['startA'] + 1 
    df['endA'] = df['startA'] + res

    df['startB'] = df['startB'] + 1 
    df['endB'] = df['startB'] + res
    
    # re-organize the data into bedpe-like
    df = df.iloc[:, [0,1,2,0,6,7,3,4,5]]

    # add cell type
    df['cline'] = cline

    loop_data.append(df) 

results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TREGMEM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TH2/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NCM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TREGNAIVE/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TH1/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD8N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/THSTAR/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NB/Fit

In [6]:
loop_df = pd.concat(loop_data)
loop_bed = loop_df.iloc[:, [0,1,2,3,4,5,-1]]
loop_pbt = pbt.BedTool.from_dataframe(loop_bed)

In [7]:
intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt, type='either')
#intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt.slop(b=100000, g=gsizes), type='either')
gwas_hichip = intersect_pbt.to_dataframe()

In [8]:
gwas_hichip

Unnamed: 0,chrom,start,end,name,score,strand,thickStart,thickEnd,itemRgb,blockCount
0,1,25292500,25302500,1,25357500,25367500,TREGMEM,1,25296742,25296743
1,1,114372500,114382500,1,114447500,114457500,TREGMEM,1,114377567,114377568
2,1,114372500,114382500,1,114472500,114482500,TREGMEM,1,114377567,114377568
3,1,114377500,114387500,1,114447500,114457500,TREGMEM,1,114377567,114377568
4,1,114377500,114387500,1,114472500,114482500,TREGMEM,1,114377567,114377568
...,...,...,...,...,...,...,...,...,...,...
4293,8,141607500,141617500,8,141642500,141652500,TH17,8,141616182,141616183
4294,8,141607500,141617500,8,141647500,141657500,TH17,8,141616182,141616183
4295,8,141607500,141617500,8,141802500,141812500,TH17,8,141616182,141616183
4296,8,141607500,141617500,8,142137500,142147500,TH17,8,141616182,141616183


In [9]:
gwas_hichip = gwas_hichip.iloc[:, [7,8,9,0,1,2,3,4,5,6]]
loop_cols = ['{}_loop'.format(x) for x in bedpe_cols]
gwas_hichip.columns = ['chr_snp', 'start_snp', 'end_snp'] + loop_cols + ['cline_loop']
gwas_hichip = gwas_hichip.merge(gwas_df, left_on=['chr_snp', 'end_snp'], right_on=['Chrom.', 'Position (hg19)'])
gwas_hichip.drop('start_snp', axis=1, inplace=True)
gwas_hichip.rename(columns={'end_snp': 'position_snp'}, inplace=True)
gwas_hichip['sid'] = 'chr' +  gwas_hichip['chr_snp'].astype(str) + ':' + gwas_hichip['position_snp'].astype(str)

# add loop ids
def make_lid(sr, cols):
    lid = sr[cols].tolist()
    lid = [str(x) for x in lid]
    lid = ':'.join(lid)
    return(lid)

lid_cols = [2,3,4,5,6,7]
lids = []
for sr in gwas_hichip.values: 
    new_lid = make_lid(sr, lid_cols)
    lids.append(new_lid)
gwas_hichip['loop_id'] = lids

In [10]:
gwas_hichip.head()

Unnamed: 0,chr_snp,position_snp,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,Marker,...,Position (hg38),Allele Ref.,Alt.,Signal name,Alt.AF,PPA,Previous studies Index (r2),PMID,sid,loop_id
0,1,25296743,1,25292500,25302500,1,25357500,25367500,TREGMEM,rs10751776,...,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,,chr1:25296743,1:25292500:25302500:1:25357500:25367500
1,1,25296743,1,25257500,25267500,1,25292500,25302500,TH2,rs10751776,...,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,,chr1:25296743,1:25257500:25267500:1:25292500:25302500
2,1,25296743,1,25292500,25302500,1,25347500,25357500,TH2,rs10751776,...,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,,chr1:25296743,1:25292500:25302500:1:25347500:25357500
3,1,25296743,1,25292500,25302500,1,25357500,25367500,TH2,rs10751776,...,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,,chr1:25296743,1:25292500:25302500:1:25357500:25367500
4,1,25296743,1,25292500,25302500,1,25357500,25367500,NCM,rs10751776,...,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,,chr1:25296743,1:25292500:25302500:1:25357500:25367500


## Summarizing

In [11]:
total_gwas = gwas_df.shape[0]
# # summarize the total number of GWAS loops per cell # DOESN't MAKE SENSE
# cell_summary['total_gwas'] = gwas_hichip.groupby('cline_loop').nunique('sid')['chr_snp']
# cell_summary['total_gwas'] = cell_summary['total_gwas'].to_frame()
# cell_summary['total_gwas'].columns = ['Total GWAS SNPs']
# cell_summary['total_gwas']

In [12]:
cell_summary = {}

In [13]:
#summarize the total number of loops per cell
cell_summary['total_loops'] = loop_df.groupby('cline').count()['startA'].to_frame()
cell_summary['total_loops'].columns = ['total_hichip']
cell_summary['total_loops']

Unnamed: 0_level_0,total_hichip
cline,Unnamed: 1_level_1
CD4N,114421
CD8N,84599
CM,84298
NB,128288
NCM,103342
NK,129890
TFH,46172
TH1,63241
TH17,76270
TH2,58115


In [14]:
gwas_hichip['cline_loop']

0       TREGMEM
1           TH2
2           TH2
3           TH2
4           NCM
         ...   
4293         NK
4294         NK
4295       TH17
4296       TH17
4297         NK
Name: cline_loop, Length: 4298, dtype: object

In [15]:
# summarize the number of gl pairs per cell
cell_summary['gl_pairs'] = gwas_hichip['cline_loop'].value_counts().to_frame()
cell_summary['gl_pairs'].columns = ['glpairs']
cell_summary['gl_pairs']

Unnamed: 0,glpairs
NB,673
NK,494
CD4N,460
TH17,374
NCM,324
CD8N,319
TH2,270
CM,266
TH1,254
TREGMEM,249


In [16]:
gwas_hichip

Unnamed: 0,chr_snp,position_snp,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,Marker,...,Position (hg38),Allele Ref.,Alt.,Signal name,Alt.AF,PPA,Previous studies Index (r2),PMID,sid,loop_id
0,1,25296743,1,25292500,25302500,1,25357500,25367500,TREGMEM,rs10751776,...,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,,chr1:25296743,1:25292500:25302500:1:25357500:25367500
1,1,25296743,1,25257500,25267500,1,25292500,25302500,TH2,rs10751776,...,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,,chr1:25296743,1:25257500:25267500:1:25292500:25302500
2,1,25296743,1,25292500,25302500,1,25347500,25357500,TH2,rs10751776,...,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,,chr1:25296743,1:25292500:25302500:1:25347500:25357500
3,1,25296743,1,25292500,25302500,1,25357500,25367500,TH2,rs10751776,...,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,,chr1:25296743,1:25292500:25302500:1:25357500:25367500
4,1,25296743,1,25292500,25302500,1,25357500,25367500,NCM,rs10751776,...,24970252,A,C,RUNX3 (1:25296743:A:C),0.509915,0.040594,,,chr1:25296743,1:25292500:25302500:1:25357500:25367500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4293,22,37535948,22,37517500,37527500,22,37532500,37542500,NK,rs228963,...,37139908,A,G,C1QTNF6 (22:37535948:A:G),0.438758,0.264508,,,chr22:37535948,22:37517500:37527500:22:37532500:37542500
4294,22,37535948,22,37532500,37542500,22,39487500,39497500,NK,rs228963,...,37139908,A,G,C1QTNF6 (22:37535948:A:G),0.438758,0.264508,,,chr22:37535948,22:37532500:37542500:22:39487500:39497500
4295,22,37535948,22,37517500,37527500,22,37532500,37542500,TH17,rs228963,...,37139908,A,G,C1QTNF6 (22:37535948:A:G),0.438758,0.264508,,,chr22:37535948,22:37517500:37527500:22:37532500:37542500
4296,22,37535948,22,37532500,37542500,22,37882500,37892500,TH17,rs228963,...,37139908,A,G,C1QTNF6 (22:37535948:A:G),0.438758,0.264508,,,chr22:37535948,22:37532500:37542500:22:37882500:37892500


In [17]:
# summarize the number of unique GWAS SNPs which overlap a HiChIP loop cell)
cell_summary['uniq_gwas'] = gwas_hichip.groupby('cline_loop')['sid'].nunique().to_frame()
cell_summary['uniq_gwas'].columns = ['uniq_gwas_in_glpairs']
cell_summary['uniq_gwas']

Unnamed: 0_level_0,uniq_gwas_in_glpairs
cline_loop,Unnamed: 1_level_1
CD4N,55
CD8N,46
CM,43
NB,62
NCM,68
NK,62
TFH,38
TH1,54
TH17,57
TH2,45


In [18]:
# summarize the number of loops with GWAS overlaps (per cell)
loop_cols = ['chrA_loop', 'startA_loop', 'endA_loop', 'chrB_loop', 'startB_loop', 'endB_loop']
cell_summary['uniq_loops'] = gwas_hichip.groupby('cline_loop')['loop_id'].nunique().to_frame()
cell_summary['uniq_loops'].columns = ['uniq_loops_in_glpairs']
cell_summary['uniq_loops']

Unnamed: 0_level_0,uniq_loops_in_glpairs
cline_loop,Unnamed: 1_level_1
CD4N,440
CD8N,305
CM,264
NB,650
NCM,313
NK,476
TFH,174
TH1,239
TH17,346
TH2,251


In [19]:
concat_list = [cell_summary['total_loops'], cell_summary['gl_pairs'],
               cell_summary['uniq_gwas'], cell_summary['uniq_loops']]
summary = pd.concat(concat_list, axis=1)
summary['pct_uniq_gwas_in_glpairs'] = summary['uniq_gwas_in_glpairs'] / total_gwas * 100
summary['pct_uniq_loops_in_glpairs'] = summary['uniq_loops_in_glpairs'] / summary['total_hichip'] * 100

In [20]:
summary

Unnamed: 0,total_hichip,glpairs,uniq_gwas_in_glpairs,uniq_loops_in_glpairs,pct_uniq_gwas_in_glpairs,pct_uniq_loops_in_glpairs
CD4N,114421,460,55,440,40.441176,0.384545
CD8N,84599,319,46,305,33.823529,0.360524
CM,84298,266,43,264,31.617647,0.313175
NB,128288,673,62,650,45.588235,0.506672
NCM,103342,324,68,313,50.0,0.302878
NK,129890,494,62,476,45.588235,0.366464
TFH,46172,182,38,174,27.941176,0.376852
TH1,63241,254,54,239,39.705882,0.377919
TH17,76270,374,57,346,41.911765,0.453652
TH2,58115,270,45,251,33.088235,0.431902


In [21]:
final_summary = summary.copy()

In [22]:
final_colnames = ['Total\\nHiChIP Loops', 
                  'Number of\\nGWAS-Loop Pairs',
                  'Number of\\nUnique GWAS SNPs in GL Pairs', 
                  'Number of\\nUnique loops in GL Pairs',
                  'Percentage of\\nUnique GWAS SNPs in GL Pairs', 
                  'Percentage of\\nUnique loops in GL Pairs']
final_colnames = ['Total HiChIP Loops', 
                  'Number of GWAS-Loop Pairs',
                  'Number of Unique GWAS SNPs in GL Pairs', 
                  'Number of Unique loops in GL Pairs',
                  'Percentage of Unique GWAS SNPs in GL Pairs', 
                  'Percentage of Unique loops in GL Pairs']
final_summary.columns = final_colnames

In [23]:
display(HTML(final_summary.to_html().replace("\\n","<br>")))

Unnamed: 0,Total HiChIP Loops,Number of GWAS-Loop Pairs,Number of Unique GWAS SNPs in GL Pairs,Number of Unique loops in GL Pairs,Percentage of Unique GWAS SNPs in GL Pairs,Percentage of Unique loops in GL Pairs
CD4N,114421,460,55,440,40.441176,0.384545
CD8N,84599,319,46,305,33.823529,0.360524
CM,84298,266,43,264,31.617647,0.313175
NB,128288,673,62,650,45.588235,0.506672
NCM,103342,324,68,313,50.0,0.302878
NK,129890,494,62,476,45.588235,0.366464
TFH,46172,182,38,174,27.941176,0.376852
TH1,63241,254,54,239,39.705882,0.377919
TH17,76270,374,57,346,41.911765,0.453652
TH2,58115,270,45,251,33.088235,0.431902
