# Annotate Peaks with Other Peaks

Intersect each peak set with other (reference) peak sets and report the overlap.

**TODO**: Organize the reference BED files used here and keep a copy of them in `resources`.

In [8]:
# Base directory and the resources/ directory underneath it.
BASE=~/kundajelab/scATAC-reprog/
RES=$BASE/resources/

# Directory holding the peak sets to annotate, one <name>.bed file per set.
BED_DIR=/oak/stanford/groups/akundaje/surag/GEO/GSE36570/bpnet/

# Peak-set names (file names with the ".bed" suffix removed), one per line.
# The files are globbed directly rather than parsed out of `ls` output, and the
# pattern is "*.bed" so that only files that really carry the suffix are listed.
BED_NAMES=$(for bed in "$BED_DIR"/*.bed; do
    [ -e "$bed" ] || continue    # the glob matched nothing: leave the list empty
    basename "$bed" .bed
done)

In [5]:
# Left unquoted on purpose: word splitting collapses the newline-separated
# names in BED_NAMES onto a single space-separated line.
echo $BED_NAMES

KLF4.idr MYC.idr OCT4.rep1.10k SOX2.idr


In [51]:
# intersection_stats REF_BED
#
# For every peak set named in BED_NAMES, report how many of its peaks overlap
# the reference peak set REF_BED.
#
# Each peak in $BED_DIR/<name>.bed is replaced by a fixed window around the
# midpoint of its interval (midpoint - 500 .. midpoint + 500) and counts as
# intersecting when `bedtools intersect -c` reports at least one overlapping
# reference interval for that window.
#
# Globals read:
#   BED_DIR    directory containing the <name>.bed files
#   BED_NAMES  whitespace-separated list of names (without the ".bed" suffix)
# Arguments:
#   $1  REF_BED: path to an uncompressed, tab-separated BED-like file; only
#       columns 1-3 are used
# Output (stdout):
#   "TOTAL PEAKS : <line count of REF_BED>", a blank line, then an aligned table
#   with columns Cluster, Intersect, Total and Percent (100 * Intersect / Total,
#   four decimals).
# Returns:
#   1 (with a message on stderr) when REF_BED is missing or unreadable.
intersection_stats () {
    local ref_bed=${1:-}
    local ref_total name n_hit n_peaks pct

    if [ ! -r "$ref_bed" ]; then
        printf 'intersection_stats: cannot read reference BED: %s\n' "$ref_bed" >&2
        return 1
    fi

    # tr strips the padding some wc implementations put around the count
    ref_total=$(wc -l < "$ref_bed" | tr -d '[:space:]')
    printf 'TOTAL PEAKS : %s\n\n' "$ref_total"

    {
        printf 'Cluster Intersect Total Percent\n'
        for name in $BED_NAMES; do
            # -a: midpoint-centred windows, start clamped at 0 because a
            #     negative start is not a valid BED coordinate
            # -b: first three columns of the reference
            # END prints s+0 so that "no overlaps" yields 0 rather than an
            # empty string
            n_hit=$(bedtools intersect -c \
                        -a <(awk -v OFS='\t' '{mid=int(($2+$3)/2); start=mid-500; if (start<0) start=0; print $1,start,mid+500}' "$BED_DIR/${name}.bed") \
                        -b <(cut -f1-3 "$ref_bed") \
                    | awk '$4>0{s+=1} END{print s+0}')
            n_peaks=$(wc -l < "$BED_DIR/${name}.bed" | tr -d '[:space:]')
            # an empty peak file reports 0 instead of dividing by zero
            pct=$(awk -v hit="$n_hit" -v tot="$n_peaks" 'BEGIN{printf "%.4f", (tot>0 ? 100*hit/tot : 0)}')
            printf '%s %s %s %s\n' "$name" "$n_hit" "$n_peaks" "$pct"
        done
    } | column -t
}

In [52]:
# Intersect peaks in each cluster with H1-hESC ChIP peaks from GSE17917 (Lister et al 2009)
# The KLF4, SOX2 and OCT4 reference peaks are pooled into one temporary file.
# mktemp gives a unique path, so an existing ./tmp.bed is never overwritten.
TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")

# Only compute statistics when the pooled file was written completely.
cat "$RES/GSE17917/KLF4.hg38.bed" "$RES/GSE17917/SOX2.hg38.bed" "$RES/GSE17917/OCT4.hg38.bed" > "$TMP_BED" \
    && intersection_stats "$TMP_BED"

rm -f "$TMP_BED"

TOTAL PEAKS : 13360

Cluster        Intersect  Total  Percent
KLF4.idr       514        10752  4.7805
MYC.idr        1222       14908  8.1969
OCT4.rep1.10k  246        10000  2.4600
SOX2.idr       223        12949  1.7221


In [43]:
# Intersect peaks in each cluster with Histone ChIP peaks from GSE62777 (Cacchiarelli et al 2015)
# The gzipped narrowPeak file is decompressed to a temporary file first.
# mktemp gives a unique path, so an existing ./tmp.bed is never overwritten.
TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")

# Only compute statistics when decompression succeeded.
zcat /oak/stanford/groups/akundaje/surag/GEO/GSE62777/croo/5dd_DOX_plus_H3K4me2/peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz > "$TMP_BED" \
    && intersection_stats "$TMP_BED"

rm -f "$TMP_BED"

TOTAL PEAKS : 67553

Cluster        Intersect  Total  Percent
KLF4.idr       4233       10752  39.3694
MYC.idr        8942       14908  59.9812
OCT4.rep1.10k  2543       10000  25.4300
SOX2.idr       2407       12949  18.5883


In [44]:
### Intersect peaks in each cluster with H1-hESC ATAC peaks (lifted over to hg38)
REF=~/kundajelab/heterokaryon-v2/src/analysis/20191002_surag_heterokaryon_clustering/data/H1.hESC.atac.idr.peaks.liftover.to.hg38.bed

# Quoted so the path reaches the function as a single argument.
intersection_stats "$REF"

TOTAL PEAKS : 149765

Cluster        Intersect  Total  Percent
KLF4.idr       2581       10752  24.0048
MYC.idr        6865       14908  46.0491
OCT4.rep1.10k  1112       10000  11.1200
SOX2.idr       671        12949  5.1819


In [45]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012)

REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz

# The gzipped reference is decompressed to a temporary file first.
# mktemp gives a unique path, so an existing ./tmp.bed is never overwritten.
TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")

# Only compute statistics when decompression succeeded.
zcat "$REF" > "$TMP_BED" && intersection_stats "$TMP_BED"

rm -f "$TMP_BED"

TOTAL PEAKS : 188521

Cluster        Intersect  Total  Percent
KLF4.idr       10632      10752  98.8839
MYC.idr        12700      14908  85.1892
OCT4.rep1.10k  9772       10000  97.7200
SOX2.idr       12818      12949  98.9883


In [46]:
# Intersect peaks in each cluster with Heterokaryon peaks at MRC5/CC/3hr/16hr/48hr
HET_ROOT=/oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs

# One "<label>:<cromwell execution id>" entry per sample, in reporting order.
HET_RUNS="
MRC5:15e37be8-c0e9-4b85-ba0e-8b5c54103fcc
CC:8a3ac2dd-002d-41c5-88d6-a8c0787309dd
3hr:47748a96-e66e-4f7a-b76f-6d2505e8e879
16hr:171d7ae4-52db-4f56-8d6e-2270f3a6f827
48hr:38d68228-cc1c-44fb-b771-6d33128a7f91
"

# Blank line printed before every label except the first.
SEP=""
for run in $HET_RUNS; do
    label=${run%%:*}     # text before the first ':'
    run_id=${run#*:}     # text after the first ':'

    printf '%s%s \n' "$SEP" "$label"
    SEP=$'\n'

    REF=$HET_ROOT/${label}_hg38/cromwell-executions/atac/$run_id/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz

    # Decompress to a unique temporary file (never clobbers ./tmp.bed) and only
    # compute statistics when decompression succeeded.
    TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
    zcat "$REF" > "$TMP_BED" && intersection_stats "$TMP_BED"
    rm -f "$TMP_BED"
done

MRC5 
TOTAL PEAKS : 250320

Cluster        Intersect  Total  Percent
KLF4.idr       4021       10752  37.3977
MYC.idr        11832      14908  79.3668
OCT4.rep1.10k  2757       10000  27.5700
SOX2.idr       2596       12949  20.0479

CC 
TOTAL PEAKS : 169066

Cluster        Intersect  Total  Percent
KLF4.idr       3451       10752  32.0964
MYC.idr        10975      14908  73.6182
OCT4.rep1.10k  2177       10000  21.7700
SOX2.idr       1899       12949  14.6652

3hr 
TOTAL PEAKS : 194478

Cluster        Intersect  Total  Percent
KLF4.idr       3767       10752  35.0353
MYC.idr        10995      14908  73.7523
OCT4.rep1.10k  2556       10000  25.5600
SOX2.idr       2178       12949  16.8198

16hr 
TOTAL PEAKS : 188309

Cluster        Intersect  Total  Percent
KLF4.idr       4055       10752  37.7139
MYC.idr        11184      14908  75.0201
OCT4.rep1.10k  2407       10000  24.0700
SOX2.idr       1702       12949  13.1439

48hr 
TOTAL PEAKS : 207973

Cluster        Intersect  Total  Percen

In [47]:
# Intersect peaks in each cluster with day-wise data to compare with hets
BULK_ROOT=/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/bulk/croo

for x in D0 D2 D4 D6 D10 D12 D14 iPSC
    do
        # The label is passed as data, not as part of the printf format string.
        printf '\n%s \n' "$x"
        REF=$BULK_ROOT/$x/peak/idr_reproducibility/idr.optimal_peak.narrowPeak.gz

        # Decompress to a unique temporary file (never clobbers ./tmp.bed) and
        # only compute statistics when decompression succeeded.
        TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
        zcat "$REF" > "$TMP_BED" && intersection_stats "$TMP_BED"
        rm -f "$TMP_BED"
    done


D0 
TOTAL PEAKS : 234319

Cluster        Intersect  Total  Percent
KLF4.idr       3344       10752  31.1012
MYC.idr        11039      14908  74.0475
OCT4.rep1.10k  1740       10000  17.4000
SOX2.idr       1154       12949  8.9119

D2 
TOTAL PEAKS : 200667

Cluster        Intersect  Total  Percent
KLF4.idr       4979       10752  46.3077
MYC.idr        10622      14908  71.2503
OCT4.rep1.10k  2597       10000  25.9700
SOX2.idr       1779       12949  13.7385

D4 
TOTAL PEAKS : 211534

Cluster        Intersect  Total  Percent
KLF4.idr       4677       10752  43.4989
MYC.idr        11014      14908  73.8798
OCT4.rep1.10k  2573       10000  25.7300
SOX2.idr       1812       12949  13.9934

D6 
TOTAL PEAKS : 213146

Cluster        Intersect  Total  Percent
KLF4.idr       4719       10752  43.8895
MYC.idr        10644      14908  71.3979
OCT4.rep1.10k  2471       10000  24.7100
SOX2.idr       1437       12949  11.0974

D10 
TOTAL PEAKS : 214189

Cluster        Intersect  Total  Percent
KLF4