# Annotate Peaks with Other Peaks

Intersect each peak set with other (reference) peak sets and report the overlap.

**TODO**: Organize reference beds used and keep a copy of them in `resources`.

In [1]:
# Project locations used by the cells below.
BASE=~/kundajelab/scATAC-reprog/
ANALYSIS=$BASE/src/analysis/20200331_gc_stratify
BED_DIR=$ANALYSIS/beds/20200403_D0_lt_0.2_quantile_gc10/
# Names (without the .bed suffix) of the BED files in BED_DIR whose file name
# contains "gc". A glob is used instead of parsing `ls` output so that only
# real *.bed files are picked up and unusual file names are handled safely.
# The -e test skips the unexpanded pattern when nothing matches.
BED_NAMES=$(for BED in "$BED_DIR"/*gc*.bed; do
                if [ -e "$BED" ]; then basename -s ".bed" "$BED"; fi
            done)
RES=$BASE/resources/

In [2]:
# List the peak-set names on a single line; the expansion is left unquoted on
# purpose so that the newline-separated names are joined with spaces.
echo ${BED_NAMES}

gc_0.22_0.345 gc_0.345_0.375 gc_0.375_0.4 gc_0.4_0.425 gc_0.425_0.445 gc_0.445_0.465 gc_0.465_0.485 gc_0.485_0.515 gc_0.515_0.555 gc_0.555_0.89


In [3]:
# Report, for every peak set named in BED_NAMES, how many of its peaks overlap
# a reference peak set.
#
# Arguments:
#   $1  path to an uncompressed, tab-delimited reference file; only columns
#       1-3 are used. The file is read once for the total and once per peak
#       set, so it must be a regular file (not a pipe / process substitution).
# Globals read:
#   BED_DIR    directory containing <name>.bed for each peak set
#   BED_NAMES  whitespace-separated peak-set names (without the .bed suffix)
# Globals written:
#   TOT        number of lines in the reference file (kept global as before)
# Output (stdout):
#   "TOTAL PEAKS : <n>" for the reference, a blank line, then an aligned table
#   with the columns Cluster, Intersect, Total and Percent.
intersection_stats () {
    local ref=$1
    local name hits total pct

    TOT=$(wc -l < "$ref")
    # Data goes through %s so that a '%' or '\' in it is never treated as a
    # format directive.
    printf 'TOTAL PEAKS : %s\n\n' "$TOT"

    (
        printf 'Cluster Intersect Total Percent\n'
        for name in $BED_NAMES; do
            # Each peak is replaced by a 1 kb window centred on its midpoint.
            # int() keeps the coordinates integral for odd-width peaks and the
            # start is clamped at 0 so windows near a chromosome start stay
            # valid BED intervals. With -c, bedtools appends the number of
            # overlapping reference intervals as column 4.
            hits=$(bedtools intersect -c \
                     -a <(awk -v OFS='\t' '{ mid = int(($2 + $3) / 2); start = (mid > 500) ? mid - 500 : 0; print $1, start, mid + 500 }' "$BED_DIR/${name}.bed") \
                     -b <(cut -f1-3 "$ref") \
                   | awk '$4 > 0 { n++ } END { print n + 0 }')   # n + 0: print 0, not "", when nothing overlaps
            total=$(wc -l < "$BED_DIR/${name}.bed")
            # An empty peak set is reported as 0.0000 instead of dividing by zero.
            pct=$(awk -v h="$hits" -v t="$total" 'BEGIN { printf "%.4f", (t > 0) ? 100 * h / t : 0 }')
            printf '%s %s %s %s\n' "$name" "$hits" "$total" "$pct"
        done
    ) | column -t
}

In [94]:
# Intersect peaks in each cluster with H1-hESC ChIP peaks from GSE17917 (Lister et al 2009)
# The KLF4, SOX2 and OCT4 peak files are concatenated as-is into one reference
# set, so TOTAL PEAKS is the sum of the three files' line counts.
# A private temporary file is used so an existing tmp.bed in the working
# directory is never overwritten or deleted.
TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
cat "$RES/GSE17917/KLF4.hg38.bed" "$RES/GSE17917/SOX2.hg38.bed" "$RES/GSE17917/OCT4.hg38.bed" > "$TMP_BED"

intersection_stats "$TMP_BED"

rm -f "$TMP_BED"

TOTAL PEAKS : 13360

Cluster         Intersect  Total  Percent
gc_0.22_0.345   458        9186   4.9858
gc_0.345_0.375  603        9186   6.5643
gc_0.375_0.4    691        9186   7.5223
gc_0.4_0.425    642        9185   6.9897
gc_0.425_0.445  598        9186   6.5099
gc_0.445_0.465  522        9186   5.6826
gc_0.465_0.485  414        9185   4.5073
gc_0.485_0.515  337        9186   3.6686
gc_0.515_0.555  279        9186   3.0372
gc_0.555_0.89   185        9185   2.0142


In [37]:
# Intersect peaks in each cluster with Histone ChIP peaks from GSE62777 (Cacchiarelli et al 2015)
# The reference is the 5dd_DOX_plus_H3K4me1 peak set (per the file path below).
# It is decompressed into a private temporary file so an existing tmp.bed in
# the working directory is never overwritten or deleted.
TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
zcat /oak/stanford/groups/akundaje/surag/GEO/GSE62777/croo/5dd_DOX_plus_H3K4me1/peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz > "$TMP_BED"

intersection_stats "$TMP_BED"

rm -f "$TMP_BED"

TOTAL PEAKS : 74899

Cluster         Intersect  Total  Percent
gc_0.22_0.345   4061       9186   44.2086
gc_0.345_0.375  3186       9186   34.6832
gc_0.375_0.4    2583       9186   28.1189
gc_0.4_0.425    1919       9185   20.8928
gc_0.425_0.445  1559       9186   16.9715
gc_0.445_0.465  1370       9186   14.9140
gc_0.465_0.485  1125       9185   12.2482
gc_0.485_0.515  977        9186   10.6358
gc_0.515_0.555  731        9186   7.9578
gc_0.555_0.89   365        9185   3.9739


In [90]:
# Intersect peaks in each cluster with H1-hESC ATAC peaks.
# The reference is already an uncompressed BED file, so it is passed directly.
REF=~/kundajelab/heterokaryon-v2/src/analysis/20191002_surag_heterokaryon_clustering/data
REF=$REF/H1.hESC.atac.idr.peaks.liftover.to.hg38.bed

intersection_stats ${REF}

TOTAL PEAKS : 149765

Cluster         Intersect  Total  Percent
gc_0.22_0.345   1192       9186   12.9763
gc_0.345_0.375  1913       9186   20.8252
gc_0.375_0.4    2515       9186   27.3786
gc_0.4_0.425    2801       9185   30.4954
gc_0.425_0.445  2886       9186   31.4174
gc_0.445_0.465  2822       9186   30.7207
gc_0.465_0.485  2770       9185   30.1579
gc_0.485_0.515  2506       9186   27.2806
gc_0.515_0.555  2330       9186   25.3647
gc_0.555_0.89   2024       9185   22.0359


In [80]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012)
# All records of the reference file are used together here.
# The reference is decompressed into a private temporary file so an existing
# tmp.bed in the working directory is never overwritten or deleted.

REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz
TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
zcat "$REF" > "$TMP_BED"

intersection_stats "$TMP_BED"

rm -f "$TMP_BED"

TOTAL PEAKS : 188521

Cluster         Intersect  Total  Percent
gc_0.22_0.345   3426       9186   37.2959
gc_0.345_0.375  2720       9186   29.6103
gc_0.375_0.4    2239       9186   24.3740
gc_0.4_0.425    1704       9185   18.5520
gc_0.425_0.445  1520       9186   16.5469
gc_0.445_0.465  1391       9186   15.1426
gc_0.465_0.485  1274       9185   13.8704
gc_0.485_0.515  1096       9186   11.9312
gc_0.515_0.555  907        9186   9.8737
gc_0.555_0.89   679        9185   7.3925


In [81]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012)
# separately for O, S, K, M
#
# Each entry below is <printed label>:<grep pattern>. The pattern is matched
# anywhere on the line; this assumes the factor name only occurs in the
# record's annotation field -- TODO confirm against the file.
# A private temporary file is used so an existing tmp.bed in the working
# directory is never overwritten or deleted.

REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz
TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
SEP=""
for TF in OCT4:OCT SOX2:SOX KLF4:KLF MYC:MYC; do
    # Label line; every section after the first is preceded by a blank line.
    printf '%s%s\n' "$SEP" "${TF%%:*}"
    zcat "$REF" | grep "${TF##*:}" > "$TMP_BED"
    intersection_stats "$TMP_BED"
    SEP=$'\n'
done
rm -f "$TMP_BED"

OCT4
TOTAL PEAKS : 58148

Cluster         Intersect  Total  Percent
gc_0.22_0.345   1409       9186   15.3386
gc_0.345_0.375  1294       9186   14.0867
gc_0.375_0.4    1184       9186   12.8892
gc_0.4_0.425    971        9185   10.5716
gc_0.425_0.445  879        9186   9.5689
gc_0.445_0.465  817        9186   8.8940
gc_0.465_0.485  710        9185   7.7300
gc_0.485_0.515  565        9186   6.1507
gc_0.515_0.555  400        9186   4.3545
gc_0.555_0.89   216        9185   2.3517

SOX2
TOTAL PEAKS : 64527

Cluster         Intersect  Total  Percent
gc_0.22_0.345   2654       9186   28.8918
gc_0.345_0.375  1816       9186   19.7692
gc_0.375_0.4    1246       9186   13.5641
gc_0.4_0.425    771        9185   8.3941
gc_0.425_0.445  538        9186   5.8567
gc_0.445_0.465  384        9186   4.1803
gc_0.465_0.485  261        9185   2.8416
gc_0.485_0.515  155        9186   1.6874
gc_0.515_0.555  94         9186   1.0233
gc_0.555_0.89   13         9185   0.1415

KLF4
TOTAL PEAKS : 61034

Cluster  

In [75]:
# Intersect peaks in each cluster with Heterokaryon peaks for the MRC5, CC,
# 3hr, 16hr and 48hr samples.
#
# Each entry below is <sample>:<cromwell run id>; the sample name is both the
# printed label and the prefix of the <sample>_hg38 output directory.
# A private temporary file is used so an existing tmp.bed in the working
# directory is never overwritten or deleted.
HET_ROOT=/oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs
TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
SEP=""
for RUN in \
    MRC5:15e37be8-c0e9-4b85-ba0e-8b5c54103fcc \
    CC:8a3ac2dd-002d-41c5-88d6-a8c0787309dd \
    3hr:47748a96-e66e-4f7a-b76f-6d2505e8e879 \
    16hr:171d7ae4-52db-4f56-8d6e-2270f3a6f827 \
    48hr:38d68228-cc1c-44fb-b771-6d33128a7f91
do
    SAMPLE=${RUN%%:*}
    REF=$HET_ROOT/${SAMPLE}_hg38/cromwell-executions/atac/${RUN##*:}/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
    # Label line; every section after the first is preceded by a blank line.
    printf '%s%s \n' "$SEP" "$SAMPLE"
    zcat "$REF" > "$TMP_BED"
    intersection_stats "$TMP_BED"
    SEP=$'\n'
done
rm -f "$TMP_BED"

MRC5 
TOTAL PEAKS : 250320

Cluster         Intersect  Total  Percent
gc_0.22_0.345   493        9186   5.3669
gc_0.345_0.375  351        9186   3.8210
gc_0.375_0.4    302        9186   3.2876
gc_0.4_0.425    328        9185   3.5710
gc_0.425_0.445  247        9186   2.6889
gc_0.445_0.465  242        9186   2.6344
gc_0.465_0.485  214        9185   2.3299
gc_0.485_0.515  191        9186   2.0793
gc_0.515_0.555  171        9186   1.8615
gc_0.555_0.89   165        9185   1.7964

CC 
TOTAL PEAKS : 169066

Cluster         Intersect  Total  Percent
gc_0.22_0.345   250        9186   2.7215
gc_0.345_0.375  176        9186   1.9160
gc_0.375_0.4    131        9186   1.4261
gc_0.4_0.425    121        9185   1.3174
gc_0.425_0.445  99         9186   1.0777
gc_0.445_0.465  94         9186   1.0233
gc_0.465_0.485  84         9185   0.9145
gc_0.485_0.515  67         9186   0.7294
gc_0.515_0.555  67         9186   0.7294
gc_0.555_0.89   42         9185   0.4573

3hr 
TOTAL PEAKS : 194478

Cluster      

In [77]:
# Intersect peaks in each cluster with day-wise data to compare with hets 
# One table is printed per bulk sample, preceded by a blank line and its label.
# A private temporary file is used so an existing tmp.bed in the working
# directory is never overwritten or deleted.

TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
for x in D0 D2 D4 D6 D10 D12 D14 iPSC
    do 
        # The label is passed through %s rather than being part of the format string.
        printf '\n%s \n' "$x"
        REF=/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/bulk/croo/$x/peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz
        zcat "$REF" > "$TMP_BED"
        intersection_stats "$TMP_BED"
    done
rm -f "$TMP_BED"


D0 
TOTAL PEAKS : 286556

Cluster         Intersect  Total  Percent
gc_0.22_0.345   115        9186   1.2519
gc_0.345_0.375  72         9186   0.7838
gc_0.375_0.4    63         9186   0.6858
gc_0.4_0.425    53         9185   0.5770
gc_0.425_0.445  40         9186   0.4354
gc_0.445_0.465  43         9186   0.4681
gc_0.465_0.485  35         9185   0.3811
gc_0.485_0.515  30         9186   0.3266
gc_0.515_0.555  41         9186   0.4463
gc_0.555_0.89   40         9185   0.4355

D2 
TOTAL PEAKS : 281304

Cluster         Intersect  Total  Percent
gc_0.22_0.345   5308       9186   57.7836
gc_0.345_0.375  4043       9186   44.0126
gc_0.375_0.4    2987       9186   32.5169
gc_0.4_0.425    2147       9185   23.3751
gc_0.425_0.445  1718       9186   18.7024
gc_0.445_0.465  1430       9186   15.5672
gc_0.465_0.485  1117       9185   12.1611
gc_0.485_0.515  912        9186   9.9282
gc_0.515_0.555  703        9186   7.6530
gc_0.555_0.89   465        9185   5.0626

D4 
TOTAL PEAKS : 282608

Cluster 