# Annotate Peaks with Other Peaks

Intersect each cluster's peak set with other (reference) peak sets.

**TODO**: Organize reference beds used and keep a copy of them in `resources`.

In [1]:
# Number of per-cluster peak files (idx1 ... idx${NUM_BEDS}) to process.
NUM_BEDS=20

# Project root; the tilde is left unquoted so the shell expands it.
BASE=~/kundajelab/scATAC-reprog/

# Directory with reference peak sets.
RES="${BASE}/resources/"

# Analysis directory and the per-cluster peak beds underneath it.
ANALYSIS="${BASE}/src/analysis/20200307_fine_clustering"
BEDS="${ANALYSIS}/beds/20200330_gridmap_peakwidthnorm_n20"

In [2]:
# intersection_stats REF_BED NUM_BEDS BEDS
#
# For each cluster peak set, report how many of its peaks overlap a reference
# peak set.
#
# Arguments:
#   $1  REF_BED   path to the reference peaks as plain (uncompressed) text;
#                 only columns 1-3 are passed to bedtools
#   $2  NUM_BEDS  number of clusters; clusters 1..NUM_BEDS are processed
#   $3  BEDS      directory containing idx1.1000bp.bed ... idx{NUM_BEDS}.1000bp.bed
#
# Output (stdout):
#   "TOTAL PEAKS : <number of lines in REF_BED>", a blank line, then a
#   column-aligned table with one row per cluster:
#     Cluster    cluster index
#     Intersect  cluster peaks with at least one overlapping reference peak
#     Total      number of lines in the cluster bed
#     Percent    100 * Intersect / Total (4 decimals), or NA if Total is 0
#
# Requires bedtools, awk and column on PATH.
intersection_stats () {
    local ref_bed=$1 num_beds=$2 beds_dir=$3
    local ref_total i cluster_bed n_hit n_total pct

    ref_total=$(wc -l < "$ref_bed")
    printf 'TOTAL PEAKS : %s\n\n' "$ref_total"

    {
        printf 'Cluster Intersect Total Percent\n'
        for i in $(seq 1 "$num_beds"); do
            cluster_bed=$beds_dir/idx${i}.1000bp.bed

            # `bedtools intersect -c` appends the overlap count as the LAST
            # column, so test $NF rather than a fixed column; `n + 0` makes
            # awk print 0 (not an empty string) when nothing overlaps.
            n_hit=$(bedtools intersect -a "$cluster_bed" \
                        -b <(cut -f1-3 "$ref_bed") -c \
                    | awk -F'\t' '$NF > 0 { n++ } END { print n + 0 }')

            n_total=$(wc -l < "$cluster_bed")

            # Guard against an empty cluster bed instead of dividing by zero.
            pct=$(awk -v hit="$n_hit" -v tot="$n_total" \
                  'BEGIN { if (tot > 0) printf "%.4f", 100 * hit / tot; else printf "NA" }')

            printf '%s %s %s %s\n' "$i" "$n_hit" "$n_total" "$pct"
        done
    } | column -t
}

In [3]:
# Intersect peaks in each cluster with H1-hESC ChIP peaks from GSE17917 (Lister et al 2009)
# The KLF4, SOX2 and OCT4 peak files are concatenated (not merged) into a
# single reference set, so TOTAL PEAKS is the sum of their line counts.

# Use a unique scratch file so an existing ./tmp.bed is never overwritten or deleted.
TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
cat "$RES/GSE17917/KLF4.hg38.bed" "$RES/GSE17917/SOX2.hg38.bed" "$RES/GSE17917/OCT4.hg38.bed" > "$TMP_BED"

intersection_stats "$TMP_BED" "$NUM_BEDS" "$BEDS"

rm -f "$TMP_BED"

TOTAL PEAKS : 13360

Cluster  Intersect  Total  Percent
1        595        21314  2.7916
2        1023       12378  8.2647
3        2175       19126  11.3720
4        1719       23985  7.1670
5        79         23803  0.3319
6        469        22042  2.1278
7        1306       24286  5.3776
8        1521       10926  13.9209
9        934        28638  3.2614
10       116        15245  0.7609
11       161        22607  0.7122
12       1074       18420  5.8306
13       112        35581  0.3148
14       64         28186  0.2271
15       400        22258  1.7971
16       650        34012  1.9111
17       68         20077  0.3387
18       190        25440  0.7469
19       202        15281  1.3219
20       334        33872  0.9861


In [4]:
# Intersect peaks in each cluster with H1-hESC ATAC peaks
# (IDR peaks lifted over to hg38, per the file name; the file is plain text, so it is passed directly)
REF=~/kundajelab/heterokaryon-v2/src/analysis/20191002_surag_heterokaryon_clustering/data/H1.hESC.atac.idr.peaks.liftover.to.hg38.bed

# Quote the expansions so paths containing whitespace or glob characters are not split.
intersection_stats "$REF" "$NUM_BEDS" "$BEDS"

TOTAL PEAKS : 149765

Cluster  Intersect  Total  Percent
1        3624       21314  17.0029
2        3235       12378  26.1351
3        13615      19126  71.1858
4        12679      23985  52.8622
5        411        23803  1.7267
6        2251       22042  10.2123
7        10446      24286  43.0124
8        10072      10926  92.1838
9        8041       28638  28.0781
10       1183       15245  7.7599
11       622        22607  2.7514
12       4918       18420  26.6992
13       992        35581  2.7880
14       503        28186  1.7846
15       3674       22258  16.5064
16       4996       34012  14.6889
17       585        20077  2.9138
18       830        25440  3.2626
19       884        15281  5.7850
20       1934       33872  5.7097


In [5]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012)

REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz

# intersection_stats reads its reference several times, so decompress to a
# file; a unique scratch file avoids overwriting or deleting an existing ./tmp.bed.
TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
zcat "$REF" > "$TMP_BED"

intersection_stats "$TMP_BED" "$NUM_BEDS" "$BEDS"

rm -f "$TMP_BED"

TOTAL PEAKS : 188521

Cluster  Intersect  Total  Percent
1        741        21314  3.4766
2        3721       12378  30.0614
3        10513      19126  54.9671
4        8684       23985  36.2060
5        9985       23803  41.9485
6        4314       22042  19.5717
7        5553       24286  22.8650
8        7763       10926  71.0507
9        3748       28638  13.0875
10       6395       15245  41.9482
11       6764       22607  29.9199
12       1707       18420  9.2671
13       6070       35581  17.0597
14       3008       28186  10.6720
15       1953       22258  8.7744
16       7717       34012  22.6891
17       5381       20077  26.8018
18       8096       25440  31.8239
19       6368       15281  41.6727
20       6930       33872  20.4594


In [6]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012)
# separately for O, S, K, M

REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz

# One table per factor, printed in the order OCT, SOX, KLF, MYC.
# NOTE(review): grep matches the pattern anywhere on the line; this presumably
# relies on a column naming the bound factor(s) - confirm against the file.
for FACTOR in OCT SOX KLF MYC
    do
        # Unique scratch file so an existing ./tmp.bed is never overwritten or deleted.
        TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
        zcat "$REF" | grep "$FACTOR" > "$TMP_BED"
        intersection_stats "$TMP_BED" "$NUM_BEDS" "$BEDS"
        rm -f "$TMP_BED"
    done

TOTAL PEAKS : 58148

Cluster  Intersect  Total  Percent
1        292        21314  1.3700
2        1832       12378  14.8005
3        1826       19126  9.5472
4        1986       23985  8.2802
5        4095       23803  17.2037
6        2090       22042  9.4819
7        1448       24286  5.9623
8        1104       10926  10.1043
9        1240       28638  4.3299
10       2079       15245  13.6373
11       3340       22607  14.7742
12       734        18420  3.9848
13       2317       35581  6.5119
14       1092       28186  3.8743
15       797        22258  3.5807
16       2905       34012  8.5411
17       1787       20077  8.9007
18       3851       25440  15.1376
19       3299       15281  21.5889
20       2835       33872  8.3697
TOTAL PEAKS : 64527

Cluster  Intersect  Total  Percent
1        152        21314  0.7131
2        680        12378  5.4936
3        334        19126  1.7463
4        645        23985  2.6892
5        6967       23803  29.2694
6        1386       22042  6.2

In [7]:
# Intersect peaks in each cluster with Heterokaryon peaks at MRC5/CC/3hr/16hr/48hr

# Root of the heterokaryon ATAC-seq outputs; each sample has its own
# cromwell run directory below it.
HET_ROOT=/oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs

# Each entry is <sample>:<cromwell run id>. The sample name is both the label
# printed above its table and the prefix of the "<sample>_hg38" directory.
SEP=""
for ENTRY in \
    MRC5:15e37be8-c0e9-4b85-ba0e-8b5c54103fcc \
    CC:8a3ac2dd-002d-41c5-88d6-a8c0787309dd \
    3hr:47748a96-e66e-4f7a-b76f-6d2505e8e879 \
    16hr:171d7ae4-52db-4f56-8d6e-2270f3a6f827 \
    48hr:38d68228-cc1c-44fb-b771-6d33128a7f91
    do
        SAMPLE=${ENTRY%%:*}
        RUN_ID=${ENTRY#*:}

        # Every label except the first is preceded by a blank line.
        printf '%s%s \n' "$SEP" "$SAMPLE"
        SEP=$'\n'

        REF=$HET_ROOT/${SAMPLE}_hg38/cromwell-executions/atac/$RUN_ID/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz

        # Unique scratch file so an existing ./tmp.bed is never overwritten or deleted.
        TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
        zcat "$REF" > "$TMP_BED"
        intersection_stats "$TMP_BED" "$NUM_BEDS" "$BEDS"
        rm -f "$TMP_BED"
    done

MRC5 
TOTAL PEAKS : 250320

Cluster  Intersect  Total  Percent
1        292        21314  1.3700
2        1075       12378  8.6848
3        17636      19126  92.2096
4        18964      23985  79.0661
5        1430       23803  6.0076
6        1333       22042  6.0475
7        12414      24286  51.1159
8        10829      10926  99.1122
9        7355       28638  25.6827
10       14355      15245  94.1620
11       2472       22607  10.9347
12       665        18420  3.6102
13       29358      35581  82.5103
14       19667      28186  69.7758
15       2273       22258  10.2121
16       23260      34012  68.3876
17       18970      20077  94.4862
18       2090       25440  8.2154
19       3537       15281  23.1464
20       13215      33872  39.0145

CC 
TOTAL PEAKS : 169066

Cluster  Intersect  Total  Percent
1        126        21314  0.5912
2        478        12378  3.8617
3        16284      19126  85.1406
4        15253      23985  63.5939
5        773        23803  3.2475
6        

In [8]:
# Intersect peaks in each cluster with day-wise data to compare with hets 

# One table per bulk sample, each preceded by a blank line and the sample label.
for x in D0 D2 D4 D6 D10 D12 D14 iPSC
    do 
        # Pass the label as an argument so it is never interpreted as a printf format.
        printf '\n%s \n' "$x"
        REF=/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/bulk/croo/$x/peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz

        # Unique scratch file so an existing ./tmp.bed is never overwritten or deleted.
        TMP_BED=$(mktemp "${TMPDIR:-/tmp}/ref_peaks.XXXXXX")
        zcat "$REF" > "$TMP_BED"
        intersection_stats "$TMP_BED" "$NUM_BEDS" "$BEDS"
        rm -f "$TMP_BED"
    done


D0 
TOTAL PEAKS : 286556

Cluster  Intersect  Total  Percent
1        139        21314  0.6522
2        461        12378  3.7243
3        18527      19126  96.8681
4        21091      23985  87.9341
5        667        23803  2.8022
6        380        22042  1.7240
7        14195      24286  58.4493
8        10911      10926  99.8627
9        5723       28638  19.9839
10       14699      15245  96.4185
11       1089       22607  4.8171
12       204        18420  1.1075
13       27639      35581  77.6791
14       15123      28186  53.6543
15       1084       22258  4.8702
16       24224      34012  71.2219
17       19266      20077  95.9606
18       683        25440  2.6847
19       1562       15281  10.2218
20       8132       33872  24.0080

D2 
TOTAL PEAKS : 281304

Cluster  Intersect  Total  Percent
1        147        21314  0.6897
2        5080       12378  41.0406
3        18495      19126  96.7008
4        20777      23985  86.6250
5        6821       23803  28.6561
6        3