# Annotate Peaks with Other Peaks

Perform intersections of peak sets with other peak sets.

**TODO**: Organize reference beds used and keep a copy of them in `resources`.

In [6]:
# Paths and constants shared by the cells below.
# NOTE: BASE and RES end in '/', so paths built from them contain '//'.

# Root directory of the scATAC-reprog project
BASE=~/kundajelab/scATAC-reprog/
# Directory of the transient-peaks analysis under $BASE
ANALYSIS=$BASE/src/analysis/20200304_transient_peaks
# Directory passed to intersection_stats as the BEDS directory
# (should contain idx1.bed ... idx{NUM_BEDS}.bed)
BEDS=$ANALYSIS/beds/20200305_heatmap_bulk_n15
# Number of cluster peak sets to iterate over
NUM_BEDS=15
# Directory holding the reference peak sets (e.g. GSE17917, GSE36570)
RES=$BASE/resources/

In [22]:
# Report, for each cluster peak set, how many of its peaks overlap a reference
# peak set.
#
# Arguments:
#   $1  REF_BED   path to the reference peaks (plain-text BED-like file; only
#                 columns 1-3 are used for the intersection)
#   $2  NUM_BEDS  number of cluster peak sets
#   $3  BEDS      directory which should have idx1.bed,...idx{NUM_BEDS}.bed
#
# Output (stdout): "TOTAL PEAKS : <number of lines in REF_BED>", a blank line,
# then a column-aligned table with one row per cluster:
#   Cluster    cluster index
#   Intersect  number of cluster peaks overlapping at least one reference peak
#   Total      number of lines in the cluster BED
#   Percent    100 * Intersect / Total, 4 decimals ("NA" if Total is 0)
intersection_stats () {
    local ref_bed=$1 num_beds=$2 beds_dir=$3
    local i bed n_ref n_hit n_peaks pct

    n_ref=$(wc -l < "$ref_bed")
    printf "TOTAL PEAKS : %s\n\n" "$n_ref"

    {
        printf "Cluster Intersect Total Percent\n"
        for i in $(seq 1 "$num_beds")
        do
            bed=$beds_dir/idx${i}.bed

            # -c appends the overlap count as the LAST column of every -a
            # record, so test $NF rather than a fixed column number.
            # "n + 0" makes awk print 0 (not an empty string) when no peak
            # overlaps the reference.
            n_hit=$(bedtools intersect -a "$bed" -b <(cut -f1-3 "$ref_bed") -c \
                    | awk '$NF > 0 {n++} END {print n + 0}')

            n_peaks=$(wc -l < "$bed")

            # Guard against an empty cluster file (division by zero).
            pct=$(awk -v hit="$n_hit" -v tot="$n_peaks" \
                  'BEGIN { if (tot > 0) printf "%.4f", 100 * hit / tot; else printf "NA" }')

            printf "%s %s %s %s\n" "$i" "$n_hit" "$n_peaks" "$pct"
        done
    } | column -t
}

In [20]:
# H1-hESC ChIP peaks from GSE17917 (Lister et al 2009): pool the KLF4, SOX2 and
# OCT4 peak files (in that order) into one reference set, intersect every
# cluster with it, then remove the temporary file.
cat $RES/GSE17917/{KLF4,SOX2,OCT4}.hg38.bed > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

TOTAL PEAKS : 13360

Cluster  Intersect  Total  Percent
1        975        17800  5.4775
2        223        26351  0.8463
3        47         23204  0.2026
4        151        38885  0.3883
5        1876       32857  5.7096
6        1170       44660  2.6198
7        290        40611  0.7141
8        135        46774  0.2886
9        42         27910  0.1505
10       2346       17408  13.4766
11       778        35921  2.1659
12       487        31220  1.5599
13       257        13697  1.8763
14       378        39886  0.9477
15       801        20293  3.9472


In [23]:
# H1-hESC ATAC peaks: the reference is a plain .bed file, so it is handed to
# intersection_stats directly without a temporary copy.
H1_DATA=~/kundajelab/heterokaryon-v2/src/analysis/20191002_surag_heterokaryon_clustering/data
REF=$H1_DATA/H1.hESC.atac.idr.peaks.liftover.to.hg38.bed
intersection_stats $REF $NUM_BEDS $BEDS

TOTAL PEAKS : 149765

Cluster  Intersect  Total  Percent
1        3323       17800  18.6685
2        869        26351  3.2978
3        128        23204  0.5516
4        464        38885  1.1933
5        14389      32857  43.7928
6        10410      44660  23.3094
7        1804       40611  4.4421
8        983        46774  2.1016
9        277        27910  0.9925
10       14833      17408  85.2080
11       7506       35921  20.8959
12       3717       31220  11.9058
13       1731       13697  12.6378
14       3627       39886  9.0934
15       4245       20293  20.9185


In [27]:
# Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012), all records:
# decompress the reference to a temporary BED, intersect every cluster with
# it, then remove the temporary file.
REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz
gzip -dc $REF > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

TOTAL PEAKS : 188521

Cluster  Intersect  Total  Percent
1        5833       17800  32.7697
2        8512       26351  32.3024
3        5585       23204  24.0691
4        10024      38885  25.7786
5        12131      32857  36.9206
6        10042      44660  22.4854
7        6680       40611  16.4487
8        4973       46774  10.6320
9        1357       27910  4.8621
10       11066      17408  63.5685
11       3323       35921  9.2509
12       842        31220  2.6970
13       147        13697  1.0732
14       2572       39886  6.4484
15       1280       20293  6.3076


In [29]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012)
# separately for O, S, K, M.
# Each table is preceded by the pattern used to select the records, so the four
# outputs can be told apart.

REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz
SEP=""
for TF in OCT SOX KLF MYC
do
    # blank line between consecutive tables, none before the first
    printf "${SEP}%s \n" "$TF"

    # Keep only the records whose line contains the pattern.
    # NOTE(review): grep matches anywhere on the line; presumably the factor
    # name only occurs in an annotation column -- confirm against the file.
    zcat $REF | grep $TF > tmp.bed
    intersection_stats tmp.bed $NUM_BEDS $BEDS
    rm tmp.bed

    SEP="\n"
done

TOTAL PEAKS : 58148

Cluster  Intersect  Total  Percent
1        2778       17800  15.6067
2        4009       26351  15.2138
3        1989       23204  8.5718
4        4470       38885  11.4954
5        3193       32857  9.7179
6        3152       44660  7.0578
7        2443       40611  6.0156
8        1828       46774  3.9082
9        423        27910  1.5156
10       1658       17408  9.5244
11       1098       35921  3.0567
12       339        31220  1.0858
13       65         13697  0.4746
14       1032       39886  2.5874
15       552        20293  2.7201
TOTAL PEAKS : 64527

Cluster  Intersect  Total  Percent
1        1427       17800  8.0169
2        3204       26351  12.1589
3        3425       23204  14.7604
4        4414       38885  11.3514
5        1086       32857  3.3052
6        1624       44660  3.6364
7        1859       40611  4.5776
8        1652       46774  3.5319
9        684        27910  2.4507
10       155        17408  0.8904
11       386        35921  1.074

In [30]:
# Heterokaryon peaks at 3hr and 16hr: one table per time point, each preceded
# by its label; a blank line separates the second table from the first.
LABELS=("3hr" "16hr")
REFS=(
    /oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/3hr_hg38/cromwell-executions/atac/47748a96-e66e-4f7a-b76f-6d2505e8e879/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
    /oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/16hr_hg38/cromwell-executions/atac/171d7ae4-52db-4f56-8d6e-2270f3a6f827/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
)
for k in 0 1
do
    if [ $k -gt 0 ]; then printf "\n"; fi
    printf "${LABELS[$k]} \n"

    # decompress to a temporary BED, report, clean up
    REF=${REFS[$k]}
    zcat $REF > tmp.bed
    intersection_stats tmp.bed $NUM_BEDS $BEDS
    rm tmp.bed
done

3hr 
TOTAL PEAKS : 194478

Cluster  Intersect  Total  Percent
1        2660       17800  14.9438
2        2185       26351  8.2919
3        330        23204  1.4222
4        1534       38885  3.9450
5        23964      32857  72.9342
6        26529      44660  59.4021
7        26315      40611  64.7977
8        22507      46774  48.1186
9        10388      27910  37.2196
10       16104      17408  92.5092
11       5935       35921  16.5224
12       340        31220  1.0890
13       29         13697  0.2117
14       3669       39886  9.1987
15       345        20293  1.7001

16hr 
TOTAL PEAKS : 188309

Cluster  Intersect  Total  Percent
1        3881       17800  21.8034
2        1781       26351  6.7588
3        218        23204  0.9395
4        1001       38885  2.5743
5        26794      32857  81.5473
6        26978      44660  60.4075
7        23115      40611  56.9181
8        17945      46774  38.3653
9        7199       27910  25.7936
10       16998      17408  97.6448
11       

In [32]:
# Day-wise bulk peaks (D2, D4) for comparison with the heterokaryon results:
# one table per day, each preceded by its label; a blank line separates the
# second table from the first.
CROO=/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/bulk/croo
for DAY in D2 D4
do
    if [ $DAY != D2 ]; then printf "\n"; fi
    printf "$DAY \n"

    # decompress to a temporary BED, report, clean up
    REF=$CROO/$DAY/peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz
    zcat $REF > tmp.bed
    intersection_stats tmp.bed $NUM_BEDS $BEDS
    rm tmp.bed
done

D2 
TOTAL PEAKS : 281304

Cluster  Intersect  Total  Percent
1        8993       17800  50.5225
2        11791      26351  44.7459
3        2967       23204  12.7866
4        8901       38885  22.8906
5        31195      32857  94.9417
6        31248      44660  69.9687
7        17325      40611  42.6609
8        6994       46774  14.9528
9        882        27910  3.1602
10       17394      17408  99.9196
11       4955       35921  13.7942
12       119        31220  0.3812
13       37         13697  0.2701
14       1313       39886  3.2919
15       288        20293  1.4192

D4 
TOTAL PEAKS : 282608

Cluster  Intersect  Total  Percent
1        13248      17800  74.4270
2        14537      26351  55.1668
3        856        23204  3.6890
4        4606       38885  11.8452
5        31797      32857  96.7739
6        34893      44660  78.1303
7        23168      40611  57.0486
8        8365       46774  17.8839
9        1010       27910  3.6188
10       17399      17408  99.9483
11       