# Annotate Peaks with Other Peaks

Intersect each cluster's peak set with other (reference) peak sets and report the overlap.

**TODO**: Organize reference beds used and keep a copy of them in `resources`.

In [9]:
# Locations used by every cell below.
# BASE and RES keep their trailing slash, so derived paths contain "//";
# that is harmless for path resolution.
BASE=~/kundajelab/scATAC-reprog/
RES="${BASE}/resources/"
ANALYSIS="${BASE}/src/analysis/20200307_fine_clustering"
# Directory holding one peak file per cluster, and how many clusters there are.
BEDS="${ANALYSIS}/beds/20200307_gridmap_naive_n15"
NUM_BEDS=15

In [26]:
# intersection_stats REF_BED NUM_BEDS BEDS_DIR
#
# For each cluster peak set, count how many of its peaks overlap at least one
# peak of a reference set.
#
# Arguments:
#   $1  REF_BED   reference peaks, uncompressed and tab-separated; only
#                 columns 1-3 are passed to bedtools. It is read once per
#                 cluster, so it must be a regular file, not a pipe.
#   $2  NUM_BEDS  number of cluster peak files
#   $3  BEDS_DIR  directory containing idx1.1000bp.bed ... idx{NUM_BEDS}.1000bp.bed
#
# Output (stdout): the number of lines in REF_BED, followed by an aligned table
# with one row per cluster: cluster index, cluster peaks with >=1 overlap,
# total cluster peaks, and the percentage to 4 decimals. A cluster file that is
# missing or empty gets "NA" instead of a malformed row.
#
# Returns 1 (message on stderr) on wrong argument count or unreadable REF_BED.
# Requires bedtools, awk, seq and column on PATH.
intersection_stats () {
    if [ "$#" -ne 3 ]; then
        echo "usage: intersection_stats REF_BED NUM_BEDS BEDS_DIR" >&2
        return 1
    fi
    if [ ! -r "$1" ]; then
        echo "intersection_stats: cannot read reference bed: $1" >&2
        return 1
    fi

    # Keep working variables out of the caller's (notebook session) scope.
    local ref=$1 num_beds=$2 beds_dir=$3
    local i bed hits tot frac

    # NR also counts a final line that lacks a trailing newline.
    tot=$(awk 'END{print NR}' "$ref")
    printf 'TOTAL PEAKS : %s\n\n' "$tot"

    (printf 'Cluster Intersect Total Percent\n'
    for i in $(seq 1 "$num_beds")
    do
        bed=$beds_dir/idx${i}.1000bp.bed
        if [ ! -r "$bed" ]; then
            echo "intersection_stats: cannot read $bed" >&2
            printf '%s NA NA NA\n' "$i"
            continue
        fi

        # `bedtools intersect -c` appends the overlap count as the LAST column,
        # so read $NF rather than assuming a 3-column input. `s+0` forces a
        # numeric 0 when no peak overlaps (a bare `s` would print an empty string).
        hits=$(bedtools intersect -b <(cut -f1-3 "$ref") -a "$bed" -c \
            | awk '$NF>0{s+=1} END{print s+0}')
        tot=$(awk 'END{print NR}' "$bed")

        # Guard against division by zero for an empty cluster file.
        frac=$(awk -v h="$hits" -v t="$tot" \
            'BEGIN{ if (t > 0) printf "%.4f", 100*h/t; else printf "NA" }')

        printf '%s %s %s %s\n' "$i" "$hits" "$tot" "$frac"
    done) | column -t
}

In [28]:
# H1-hESC ChIP peaks from GSE17917 (Lister et al 2009): the KLF4, SOX2 and OCT4
# peak files are concatenated (not merged) into one reference set, which is
# then intersected with the peaks of every cluster.
cat "$RES"/GSE17917/{KLF4,SOX2,OCT4}.hg38.bed > tmp.bed

intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"

# tmp.bed is only a scratch copy of the pooled reference.
rm tmp.bed

TOTAL PEAKS : 13360

Cluster  Intersect  Total  Percent
1        1872       35001  5.3484
2        2285       26613  8.5860
3        292        35705  0.8178
4        2412       16773  14.3803
5        663        39769  1.6671
6        838        33689  2.4875
7        98         30293  0.3235
8        330        26967  1.2237
9        1027       38812  2.6461
10       1514       24146  6.2702
11       124        27916  0.4442
12       158        42792  0.3692
13       1368       33538  4.0790
14       89         15205  0.5853
15       122        30258  0.4032


In [29]:
# Reference set: H1-hESC ATAC peaks (IDR peaks lifted over to hg38, per the
# file name), intersected with the peaks of every cluster.
REF=~/kundajelab/heterokaryon-v2/src/analysis/20191002_surag_heterokaryon_clustering/data/H1.hESC.atac.idr.peaks.liftover.to.hg38.bed

intersection_stats "$REF" "$NUM_BEDS" "$BEDS"

TOTAL PEAKS : 149765

Cluster  Intersect  Total  Percent
1        14425      35001  41.2131
2        15716      26613  59.0538
3        1222       35705  3.4225
4        14733      16773  87.8376
5        5719       39769  14.3805
6        5359       33689  15.9073
7        686        30293  2.2645
8        1413       26967  5.2397
9        8154       38812  21.0090
10       6429       24146  26.6255
11       1163       27916  4.1661
12       1298       42792  3.0333
13       7763       33538  23.1469
14       890        15205  5.8533
15       525        30258  1.7351


In [47]:
# Intersect peaks in each cluster with the peak set in ./55911_peaks.bed.
# NOTE(review): this cell's header used to read "H1-hESC ATAC peaks", which
# looks copied from the previous cell; the provenance of 55911_peaks.bed is not
# recorded here -- confirm and document it.
REF=./55911_peaks.bed

intersection_stats "$REF" "$NUM_BEDS" "$BEDS"

TOTAL PEAKS : 34107

Cluster  Intersect  Total  Percent
1        3881       35001  11.0883
2        4508       26613  16.9391
3        1811       35705  5.0721
4        5803       16773  34.5973
5        864        39769  2.1725
6        360        33689  1.0686
7        160        30293  0.5282
8        7197       26967  26.6882
9        2281       38812  5.8770
10       5452       24146  22.5793
11       775        27916  2.7762
12       560        42792  1.3087
13       1442       33538  4.2996
14       824        15205  5.4193
15       2439       30258  8.0607


In [30]:
# Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012), all factors pooled.
# The reference is gzipped, so it is unpacked to a scratch file first.
REF="$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz"
zcat "$REF" > tmp.bed

intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"

rm tmp.bed

TOTAL PEAKS : 188521

Cluster  Intersect  Total  Percent
1        11186      35001  31.9591
2        12176      26613  45.7521
3        10228      35705  28.6458
4        11250      16773  67.0721
5        5663       39769  14.2397
6        1469       33689  4.3605
7        3118       30293  10.2928
8        10162      26967  37.6831
9        9051       38812  23.3201
10       6757       24146  27.9839
11       6882       27916  24.6525
12       7156       42792  16.7228
13       3400       33538  10.1378
14       5766       15205  37.9217
15       11147      30258  36.8398


In [31]:
# Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012), one table per factor.
# Tables are printed without a label, in the order OCT, SOX, KLF, MYC.
# Rows are selected with a plain substring grep, so the factor name may match
# anywhere on the line.
REF="$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz"

for FACTOR in OCT SOX KLF MYC
do
    zcat "$REF" | grep "$FACTOR" > tmp.bed
    intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"
    rm tmp.bed
done

TOTAL PEAKS : 58148

Cluster  Intersect  Total  Percent
1        3334       35001  9.5254
2        2869       26613  10.7804
3        4827       35705  13.5191
4        1646       16773  9.8134
5        2157       39769  5.4238
6        585        33689  1.7365
7        1095       30293  3.6147
8        5108       26967  18.9417
9        3238       38812  8.3428
10       3080       24146  12.7557
11       2391       27916  8.5650
12       2705       42792  6.3213
13       1462       33538  4.3592
14       1938       15205  12.7458
15       4514       30258  14.9184
TOTAL PEAKS : 64527

Cluster  Intersect  Total  Percent
1        1552       35001  4.4342
2        822        26613  3.0887
3        3602       35705  10.0882
4        141        16773  0.8406
5        1202       39769  3.0225
6        314        33689  0.9321
7        1545       30293  5.1002
8        4550       26967  16.8725
9        1920       38812  4.9469
10       1426       24146  5.9057
11       1894       27916  6.7

In [34]:
# Intersect peaks in each cluster with heterokaryon ATAC peaks for the
# MRC5, CC, 3hr, 16hr and 48hr samples (one labelled table per sample).
# Each entry below is SAMPLE:RUN_ID; RUN_ID is the cromwell execution directory
# that holds that sample's optimal_peak.narrowPeak.gz.
HET_OUT=/oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs

# No blank line before the first label, one before every later label.
LEAD=""
for RUN in \
    MRC5:15e37be8-c0e9-4b85-ba0e-8b5c54103fcc \
    CC:8a3ac2dd-002d-41c5-88d6-a8c0787309dd \
    3hr:47748a96-e66e-4f7a-b76f-6d2505e8e879 \
    16hr:171d7ae4-52db-4f56-8d6e-2270f3a6f827 \
    48hr:38d68228-cc1c-44fb-b771-6d33128a7f91
do
    SAMPLE=${RUN%%:*}
    RUN_ID=${RUN#*:}

    printf "${LEAD}${SAMPLE} \n"
    REF=$HET_OUT/${SAMPLE}_hg38/cromwell-executions/atac/$RUN_ID/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
    zcat "$REF" > tmp.bed
    intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"
    rm tmp.bed

    LEAD="\n"
done

MRC5 
TOTAL PEAKS : 250320

Cluster  Intersect  Total  Percent
1        24292      35001  69.4037
2        23403      26613  87.9382
3        3631       35705  10.1694
4        16615      16773  99.0580
5        11343      39769  28.5222
6        950        33689  2.8199
7        18113      30293  59.7927
8        2446       26967  9.0703
9        20311      38812  52.3318
10       2991       24146  12.3871
11       25154      27916  90.1060
12       33108      42792  77.3696
13       2191       33538  6.5329
14       14799      15205  97.3298
15       1843       30258  6.0910

CC 
TOTAL PEAKS : 169066

Cluster  Intersect  Total  Percent
1        16724      35001  47.7815
2        20244      26613  76.0681
3        2130       35705  5.9656
4        16365      16773  97.5675
5        5864       39769  14.7452
6        390        33689  1.1576
7        12815      30293  42.3035
8        1390       26967  5.1544
9        12499      38812  32.2040
10       1354       24146  5.6076
11      

In [35]:
# Intersect peaks in each cluster with bulk day-wise peak sets (D0 ... D14),
# for comparison with the heterokaryon tables. One labelled table per day.
BULK=/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/bulk/croo

for DAY in D0 D2 D4 D6 D10 D12 D14; do
    printf "\n$DAY \n"
    REF="$BULK/$DAY/peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz"
    # narrowPeak is gzipped; unpack to a scratch file for intersection_stats.
    zcat "$REF" > tmp.bed
    intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"
    rm tmp.bed
done


D0 
TOTAL PEAKS : 286556

Cluster  Intersect  Total  Percent
1        27073      35001  77.3492
2        25096      26613  94.2998
3        1435       35705  4.0190
4        16742      16773  99.8152
5        7591       39769  19.0877
6        478        33689  1.4189
7        12532      30293  41.3693
8        888        26967  3.2929
9        19602      38812  50.5050
10       1486       24146  6.1542
11       25960      27916  92.9933
12       29998      42792  70.1019
13       1097       33538  3.2709
14       15051      15205  98.9872
15       770        30258  2.5448

D2 
TOTAL PEAKS : 281304

Cluster  Intersect  Total  Percent
1        26528      35001  75.7921
2        25718      26613  96.6370
3        7444       35705  20.8486
4        16760      16773  99.9225
5        6741       39769  16.9504
6        506        33689  1.5020
7        2415       30293  7.9721
8        12793      26967  47.4395
9        16300      38812  41.9973
10       8991       24146  37.2360
11       