# Annotate Peaks with Other Peaks

Perform intersections of peak sets with other peak sets.

**TODO**: Organize reference beds used and keep a copy of them in `resources`.

In [1]:
# Project root, the directory of this analysis, and its inputs.
BASE=~/kundajelab/scATAC-reprog/
ANALYSIS=${BASE}/src/analysis/20200311_heterokaryon_comparison
# Directory of cluster peak sets (idx1.bed ... idx${NUM_BEDS}.bed).
BEDS=${ANALYSIS}/beds/20200409_heatmap_peakwidthnorm_qn_batch1_D0_CC_D2_16hr_sorted_n15
# Number of cluster peak sets in $BEDS.
NUM_BEDS=15
# Directory of reference peak sets.
RES=${BASE}/resources/

In [2]:
# intersection_stats REF_BED NUM_BEDS BEDS_DIR
#
# For every cluster peak set BEDS_DIR/idx1.bed ... BEDS_DIR/idx{NUM_BEDS}.bed,
# count the peaks that overlap at least one interval of the reference peak set
# REF_BED, and print one table row per cluster.
#
# Arguments:
#   $1  REF_BED   reference peaks, uncompressed; only columns 1-3 are used
#   $2  NUM_BEDS  number of cluster peak sets
#   $3  BEDS_DIR  directory holding idx1.bed ... idx{NUM_BEDS}.bed
#
# Output (stdout): "TOTAL PEAKS : <line count of REF_BED>", a blank line, then
# a column-aligned table with the header "Cluster Intersect Total Percent",
# where Percent = 100 * Intersect / Total printed to four decimals.
#
# Returns 1 (with a message on stderr) if REF_BED is not readable.
#
# Requires bash (process substitution), bedtools, awk and column.
intersection_stats () {
    local ref=$1 num_beds=$2 beds_dir=$3
    local i bed hits total pct

    if [[ ! -r $ref ]]; then
        printf 'intersection_stats: cannot read reference peaks: %s\n' "$ref" >&2
        return 1
    fi

    # Values are passed as printf arguments, never as the format string, so a
    # stray '%' or '\' in them cannot corrupt the output.
    printf 'TOTAL PEAKS : %s\n\n' "$(( $(wc -l < "$ref") + 0 ))"

    {
        printf 'Cluster Intersect Total Percent\n'
        for i in $(seq 1 "$num_beds"); do
            bed=$beds_dir/idx${i}.bed

            # `bedtools intersect -c` appends the overlap count as the LAST
            # column of each -a record, so test $NF instead of assuming a
            # 3-column input. `s + 0` prints 0 rather than an empty string
            # when no peak overlaps.
            hits=$(bedtools intersect -c -a "$bed" -b <(cut -f1-3 "$ref") \
                   | awk '$NF > 0 { s += 1 } END { print s + 0 }')

            # `+ 0` normalises wc output (padded on some platforms, empty if
            # the file could not be opened).
            total=$(( $(wc -l < "$bed") + 0 ))

            # Guard the division so an empty cluster file reports 0.0000.
            pct=$(awk -v h="$hits" -v t="$total" \
                  'BEGIN { printf "%.4f", (t > 0 ? 100 * h / t : 0) }')

            printf '%s %s %s %s\n' "$i" "$hits" "$total" "$pct"
        done
    } | column -t
}

In [3]:
# H1-hESC KLF4 / SOX2 / OCT4 ChIP peaks from GSE17917 (Lister et al 2009):
# pool the three peak sets into one reference BED, intersect the peaks of
# each cluster with it, then remove the scratch file.
cat "$RES"/GSE17917/{KLF4,SOX2,OCT4}.hg38.bed > tmp.bed

intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"

rm tmp.bed

TOTAL PEAKS : 13360

Cluster  Intersect  Total  Percent
1        814        34335  2.3708
2        676        30583  2.2104
3        689        29627  2.3256
4        2165       34803  6.2207
5        1441       51208  2.8140
6        458        22881  2.0017
7        840        32350  2.5966
8        1817       26117  6.9572
9        458        35716  1.2823
10       442        40567  1.0896
11       160        37854  0.4227
12       187        42631  0.4386
13       213        33960  0.6272
14       1607       39007  4.1198
15       648        29244  2.2158


In [4]:
# H1-hESC ATAC peaks (file name: IDR peaks lifted over to hg38) as the
# reference set; intersect the peaks of each cluster with them.
REF=~/kundajelab/heterokaryon-v2/src/analysis/20191002_surag_heterokaryon_clustering/data/H1.hESC.atac.idr.peaks.liftover.to.hg38.bed

intersection_stats "${REF}" "${NUM_BEDS}" "${BEDS}"

TOTAL PEAKS : 149765

Cluster  Intersect  Total  Percent
1        6417       34335  18.6894
2        5664       30583  18.5201
3        2276       29627  7.6822
4        15636      34803  44.9272
5        11359      51208  22.1821
6        3375       22881  14.7502
7        5964       32350  18.4359
8        13857      26117  53.0574
9        3505       35716  9.8135
10       3379       40567  8.3294
11       1298       37854  3.4290
12       1551       42631  3.6382
13       2137       33960  6.2927
14       9710       39007  24.8930
15       4762       29244  16.2837


In [5]:
# Fibroblast JUN ChIP-seq peaks (collected by Glenn) as the reference set.
REF=/oak/stanford/groups/akundaje/projects/heterokaryon/ChIP-seq/outputs/JUN/peak/overlap_reproducibility/overlap.optimal_peak.regionPeak.gz

# Decompress to a scratch BED, intersect each cluster with it, then clean up.
zcat "${REF}" > tmp.bed
intersection_stats tmp.bed "${NUM_BEDS}" "${BEDS}"
rm tmp.bed

TOTAL PEAKS : 18297

Cluster  Intersect  Total  Percent
1        104        34335  0.3029
2        28         30583  0.0916
3        14         29627  0.0473
4        1519       34803  4.3646
5        1148       51208  2.2418
6        53         22881  0.2316
7        2          32350  0.0062
8        5996       26117  22.9582
9        93         35716  0.2604
10       443        40567  1.0920
11       548        37854  1.4477
12       2163       42631  5.0738
13       6967       33960  20.5153
14       7          39007  0.0179
15       21         29244  0.0718


In [6]:
# Fibroblast D2 OSKM ChIP peaks from GSE36570 (Soufi et al 2012), all factors
# together, as the reference set.
REF=${RES}/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz

# Decompress to a scratch BED, intersect each cluster with it, then clean up.
zcat "${REF}" > tmp.bed
intersection_stats tmp.bed "${NUM_BEDS}" "${BEDS}"
rm tmp.bed

TOTAL PEAKS : 188521

Cluster  Intersect  Total  Percent
1        6779       34335  19.7437
2        2167       30583  7.0856
3        13039      29627  44.0105
4        11698      34803  33.6120
5        8853       51208  17.2883
6        2766       22881  12.0886
7        1002       32350  3.0974
8        13619      26117  52.1461
9        1032       35716  2.8895
10       2793       40567  6.8849
11       1304       37854  3.4448
12       3240       42631  7.6001
13       6516       33960  19.1873
14       499        39007  1.2793
15       730        29244  2.4962


In [7]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012)
# separately for O, S, K, M

REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz

for tf in OCT SOX KLF MYC
do
    # Label each table so the four results can be told apart in the output.
    printf '\n%s \n' "$tf"

    # grep keeps every record whose line contains the factor name anywhere.
    # NOTE(review): presumably the name only occurs in a label column of the
    # BED file -- confirm against the file.
    zcat "$REF" | grep "$tf" > tmp.bed
    intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"
    rm tmp.bed
done

TOTAL PEAKS : 58148

Cluster  Intersect  Total  Percent
1        3010       34335  8.7666
2        916        30583  2.9951
3        6436       29627  21.7234
4        2586       34803  7.4304
5        2812       51208  5.4913
6        917        22881  4.0077
7        397        32350  1.2272
8        2795       26117  10.7018
9        368        35716  1.0304
10       971        40567  2.3936
11       354        37854  0.9352
12       1045       42631  2.4513
13       2236       33960  6.5842
14       185        39007  0.4743
15       283        29244  0.9677
TOTAL PEAKS : 64527

Cluster  Intersect  Total  Percent
1        1501       34335  4.3716
2        420        30583  1.3733
3        5418       29627  18.2874
4        872        34803  2.5055
5        1589       51208  3.1030
6        467        22881  2.0410
7        245        32350  0.7573
8        774        26117  2.9636
9        439        35716  1.2291
10       1053       40567  2.5957
11       628        37854  1.6590
1

In [8]:
# Intersect peaks in each cluster with Heterokaryon ATAC-Seq peaks at
# MRC5/3hr/16hr/48hr. Each entry below is "<label> <path to peak file>".
het_peak_sets=(
    "MRC5 /oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/MRC5_hg38/cromwell-executions/atac/15e37be8-c0e9-4b85-ba0e-8b5c54103fcc/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz"
    "3hr /oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/3hr_hg38/cromwell-executions/atac/47748a96-e66e-4f7a-b76f-6d2505e8e879/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz"
    "16hr /oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/16hr_hg38/cromwell-executions/atac/171d7ae4-52db-4f56-8d6e-2270f3a6f827/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz"
    "48hr /oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/48hr_hg38/cromwell-executions/atac/38d68228-cc1c-44fb-b771-6d33128a7f91/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz"
)

# The first label is printed without a leading blank line; later ones get one.
lead=""
for entry in "${het_peak_sets[@]}"
do
    REF=${entry#* }
    printf '%s%s \n' "$lead" "${entry%% *}"
    zcat "$REF" > tmp.bed
    intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"
    rm tmp.bed
    lead=$'\n'
done

# Drop the loop helpers so only REF remains set, as before.
unset het_peak_sets lead entry

MRC5 
TOTAL PEAKS : 250320

Cluster  Intersect  Total  Percent
1        9595       34335  27.9452
2        3857       30583  12.6116
3        716        29627  2.4167
4        29195      34803  83.8864
5        32171      51208  62.8242
6        3204       22881  14.0029
7        2449       32350  7.5703
8        25981      26117  99.4793
9        12271      35716  34.3572
10       19496      40567  48.0588
11       21850      37854  57.7218
12       31653      42631  74.2488
13       30936      33960  91.0954
14       2002       39007  5.1324
15       4704       29244  16.0854

3hr 
TOTAL PEAKS : 194478

Cluster  Intersect  Total  Percent
1        9197       34335  26.7861
2        2029       30583  6.6344
3        742        29627  2.5045
4        24694      34803  70.9537
5        23191      51208  45.2878
6        894        22881  3.9072
7        392        32350  1.2117
8        25477      26117  97.5495
9        2948       35716  8.2540
10       8869       40567  21.8626
11     

In [11]:
# Day-wise peak sets (D0 ... iPSC) as reference sets, to compare with the
# heterokaryon intersections: one labelled table per sample.

for x in D0 D2 D4 D6 D10 D12 D14 iPSC; do
    REF=/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/bulk/croo/${x}/peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz
    printf '\n%s \n' "${x}"
    # Decompress to a scratch BED, intersect each cluster with it, clean up.
    zcat "${REF}" > tmp.bed
    intersection_stats tmp.bed "${NUM_BEDS}" "${BEDS}"
    rm tmp.bed
done


D0 
TOTAL PEAKS : 286556

Cluster  Intersect  Total  Percent
1        5202       34335  15.1507
2        1703       30583  5.5685
3        40         29627  0.1350
4        31979      34803  91.8858
5        38167      51208  74.5333
6        10829      22881  47.3275
7        341        32350  1.0541
8        26001      26117  99.5558
9        1013       35716  2.8363
10       16263      40567  40.0892
11       1768       37854  4.6706
12       14365      42631  33.6961
13       26796      33960  78.9046
14       1          39007  0.0026
15       24         29244  0.0821

D2 
TOTAL PEAKS : 281304

Cluster  Intersect  Total  Percent
1        11416      34335  33.2489
2        2057       30583  6.7260
3        25796      29627  87.0692
4        31005      34803  89.0871
5        28960      51208  56.5537
6        9277       22881  40.5446
7        292        32350  0.9026
8        25639      26117  98.1698
9        239        35716  0.6692
10       4837       40567  11.9235
11       15