# Annotate Peaks with Other Peaks

Perform intersections of each cluster's peak set with other (reference) peak sets.

**TODO**: Organize reference beds used and keep a copy of them in `resources`.

In [10]:
# Project root and the analysis directory this notebook reads from.
BASE=~/kundajelab/scATAC-reprog/
ANALYSIS="${BASE}/src/analysis/20200311_heterokaryon_comparison"

# Number of per-cluster peak sets and the directory that holds them
# (idx1.bed ... idx${NUM_BEDS}.bed).
NUM_BEDS=15
BEDS="${ANALYSIS}/beds/20200313_heatmap_qn_D0_CC_D2_16hr_sorted_n15"

# Directory with reference peak sets (the GSE17917 and GSE36570 files are read from here).
RES="${BASE}/resources/"

In [11]:
# intersection_stats REF_BED NUM_BEDS BEDS_DIR
#
# For every cluster peak set BEDS_DIR/idx1.bed ... BEDS_DIR/idx{NUM_BEDS}.bed,
# count the peaks that overlap at least one interval of REF_BED.
#
# Arguments:
#   $1  REF_BED   reference peaks as a plain-text file on disk. It is read once
#                 for the total and once per cluster, so a pipe or process
#                 substitution will not work. Only columns 1-3 are used.
#   $2  NUM_BEDS  number of cluster files to process
#   $3  BEDS_DIR  directory containing idx1.bed ... idx{NUM_BEDS}.bed
#
# Output (stdout): "TOTAL PEAKS : <line count of REF_BED>", a blank line, then a
# column-aligned table with one row per cluster:
#   Cluster  Intersect  Total  Percent      (Percent = 100 * Intersect / Total)
# Percent is printed as NA when a cluster file has no lines.
#
# Returns 1 (message on stderr) if REF_BED is not readable.
# Requires bedtools, awk and column on PATH.
intersection_stats () {
    local ref=$1 num_beds=$2 beds_dir=$3
    local idx bed hits total pct

    if [ ! -r "$ref" ]; then
        printf 'intersection_stats: cannot read reference bed: %s\n' "$ref" >&2
        return 1
    fi

    # TOT is deliberately left global (as it always was) in case later cells read it.
    # The arithmetic expansion strips the padding some wc implementations emit.
    TOT=$(( $(wc -l < "$ref") ))
    printf 'TOTAL PEAKS : %s\n\n' "$TOT"

    {
        printf 'Cluster Intersect Total Percent\n'
        for idx in $(seq 1 "$num_beds"); do
            bed=$beds_dir/idx${idx}.bed

            # `-c` appends the overlap count as the LAST column of each -a record,
            # so test $NF rather than a fixed column index. `n + 0` makes awk
            # print 0 (not an empty string) when no peak overlaps.
            hits=$(bedtools intersect -a "$bed" -b <(cut -f1-3 "$ref") -c \
                   | awk '$NF > 0 { n++ } END { print n + 0 }')

            total=$(( $(wc -l < "$bed") ))

            # Guard against an empty cluster file instead of dividing by zero.
            pct=$(awk -v n="$hits" -v d="$total" \
                  'BEGIN { if (d > 0) printf "%.4f", 100 * n / d; else printf "NA" }')

            printf '%s %s %s %s\n' "$idx" "$hits" "$total" "$pct"
        done
    } | column -t
}

In [12]:
# H1-hESC ChIP peaks from GSE17917 (Lister et al 2009): pool the KLF4, SOX2 and
# OCT4 peak files (in that order) into one reference set, intersect every
# cluster with it, then discard the pooled file.
cat "${RES}/GSE17917/"{KLF4,SOX2,OCT4}.hg38.bed > tmp.bed
intersection_stats tmp.bed "${NUM_BEDS}" "${BEDS}"
rm tmp.bed

TOTAL PEAKS : 13360

Cluster  Intersect  Total  Percent
1        1643       38993  4.2136
2        845        27625  3.0588
3        728        25174  2.8919
4        1790       33174  5.3958
5        421        24532  1.7161
6        429        33168  1.2934
7        796        22273  3.5738
8        433        21074  2.0547
9        402        38940  1.0324
10       1921       21997  8.7330
11       673        30253  2.2246
12       108        27911  0.3869
13       131        25184  0.5202
14       1824       38195  4.7755
15       1081       28059  3.8526


In [13]:
# H1-hESC ATAC peaks: the reference is already an uncompressed bed file, so it
# is handed to intersection_stats directly (no temporary copy needed).
REF=~/kundajelab/heterokaryon-v2/src/analysis/20191002_surag_heterokaryon_clustering/data/H1.hESC.atac.idr.peaks.liftover.to.hg38.bed
intersection_stats "${REF}" "${NUM_BEDS}" "${BEDS}"

TOTAL PEAKS : 149765

Cluster  Intersect  Total  Percent
1        13772      38993  35.3192
2        6511       27625  23.5692
3        5295       25174  21.0336
4        15193      33174  45.7979
5        3314       24532  13.5089
6        4335       33168  13.0698
7        3984       22273  17.8871
8        1523       21074  7.2269
9        4088       38940  10.4982
10       13419      21997  61.0038
11       6414       30253  21.2012
12       779        27911  2.7910
13       1301       25184  5.1660
14       11508      38195  30.1296
15       8006       28059  28.5327


In [14]:
# Fibroblast JUN ChIP-seq peaks (collected by Glenn): decompress the peak file
# to a scratch bed, intersect every cluster with it, then remove the scratch file.
REF=/oak/stanford/groups/akundaje/projects/heterokaryon/ChIP-seq/outputs/JUN/peak/overlap_reproducibility/overlap.optimal_peak.regionPeak.gz
gzip -dc "${REF}" > tmp.bed
intersection_stats tmp.bed "${NUM_BEDS}" "${BEDS}"
rm tmp.bed

TOTAL PEAKS : 18297

Cluster  Intersect  Total  Percent
1        987        38993  2.5312
2        217        27625  0.7855
3        4          25174  0.0159
4        3800       33174  11.4548
5        103        24532  0.4199
6        787        33168  2.3728
7        18         22273  0.0808
8        8          21074  0.0380
9        1949       38940  5.0051
10       4754       21997  21.6120
11       74         30253  0.2446
12       961        27911  3.4431
13       5555       25184  22.0577
14       1          38195  0.0026
15       4          28059  0.0143


In [15]:
# Fibroblast D2 OSKM ChIP peaks from GSE36570 (Soufi et al 2012), all factors
# together: decompress to a scratch bed, intersect every cluster with it, then
# remove the scratch file.
REF="${RES}/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz"
gzip -dc "${REF}" > tmp.bed
intersection_stats tmp.bed "${NUM_BEDS}" "${BEDS}"
rm tmp.bed

TOTAL PEAKS : 188521

Cluster  Intersect  Total  Percent
1        9203       38993  23.6017
2        5029       27625  18.2045
3        738        25174  2.9316
4        11492      33174  34.6416
5        1535       24532  6.2571
6        2278       33168  6.8681
7        8057       22273  36.1738
8        8399       21074  39.8548
9        4975       38940  12.7761
10       12242      21997  55.6530
11       1366       30253  4.5153
12       1224       27911  4.3854
13       4177       25184  16.5859
14       521        38195  1.3641
15       931        28059  3.3180


In [16]:
# Fibroblast D2 OSKM ChIP peaks from GSE36570 (Soufi et al 2012), one table per
# factor in the order O, S, K, M.
# Rows are selected with a plain substring match anywhere on the line
# (OCT / SOX / KLF / MYC); the tables are printed back to back without labels.
REF="${RES}/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz"
for factor in OCT SOX KLF MYC; do
    gzip -dc "${REF}" | grep "${factor}" > tmp.bed
    intersection_stats tmp.bed "${NUM_BEDS}" "${BEDS}"
    rm tmp.bed
done

TOTAL PEAKS : 58148

Cluster  Intersect  Total  Percent
1        2596       38993  6.6576
2        1567       27625  5.6724
3        279        25174  1.1083
4        2728       33174  8.2233
5        493        24532  2.0096
6        768        33168  2.3155
7        3854       22273  17.3035
8        3924       21074  18.6201
9        1853       38940  4.7586
10       2376       21997  10.8015
11       553        30253  1.8279
12       340        27911  1.2182
13       1236       25184  4.9079
14       215        38195  0.5629
15       416        28059  1.4826
TOTAL PEAKS : 64527

Cluster  Intersect  Total  Percent
1        1209       38993  3.1006
2        797        27625  2.8851
3        138        25174  0.5482
4        1090       33174  3.2857
5        337        24532  1.3737
6        665        33168  2.0049
7        2282       22273  10.2456
8        3553       21074  16.8596
9        1393       38940  3.5773
10       618        21997  2.8095
11       330        30253  1.0908

In [17]:
# Heterokaryon ATAC peaks at MRC5 / 3hr / 16hr / 48hr, one labelled table each.
# Every entry is "<label>:<cromwell run id>"; the label also names the
# <label>_hg38 output directory the peak file is read from.
lead=''
for run in \
    MRC5:15e37be8-c0e9-4b85-ba0e-8b5c54103fcc \
    3hr:47748a96-e66e-4f7a-b76f-6d2505e8e879 \
    16hr:171d7ae4-52db-4f56-8d6e-2270f3a6f827 \
    48hr:38d68228-cc1c-44fb-b771-6d33128a7f91
do
    label=${run%%:*}
    run_id=${run#*:}

    # The first label has no leading blank line; later ones are separated by one.
    printf '%s%s \n' "${lead}" "${label}"
    lead=$'\n'

    REF=/oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/${label}_hg38/cromwell-executions/atac/${run_id}/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
    gzip -dc "${REF}" > tmp.bed
    intersection_stats tmp.bed "${NUM_BEDS}" "${BEDS}"
    rm tmp.bed
done

MRC5 
TOTAL PEAKS : 250320

Cluster  Intersect  Total  Percent
1        30492      38993  78.1987
2        12120      27625  43.8733
3        530        25174  2.1053
4        32190      33174  97.0338
5        4697       24532  19.1464
6        18971      33168  57.1967
7        1774       22273  7.9648
8        229        21074  1.0866
9        29105      38940  74.7432
10       21974      21997  99.8954
11       5721       30253  18.9105
12       19730      27911  70.6890
13       24327      25184  96.5970
14       135        38195  0.3534
15       948        28059  3.3786

3hr 
TOTAL PEAKS : 194478

Cluster  Intersect  Total  Percent
1        23305      38993  59.7671
2        6650       27625  24.0724
3        129        25174  0.5124
4        28665      33174  86.4080
5        1597       24532  6.5099
6        11617      33168  35.0247
7        2293       22273  10.2950
8        199        21074  0.9443
9        28871      38940  74.1423
10       21543      21997  97.9361
11     

In [18]:
# Per-day peak sets (D0 ... D14), for comparison with the heterokaryon tables:
# print the day label, then the intersection table for that day's peak file.
days=(D0 D2 D4 D6 D10 D12 D14)
for day in "${days[@]}"; do
    printf '\n%s \n' "${day}"
    REF="/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/bulk/croo/${day}/peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz"
    gzip -dc "${REF}" > tmp.bed
    intersection_stats tmp.bed "${NUM_BEDS}" "${BEDS}"
    rm tmp.bed
done


D0 
TOTAL PEAKS : 286556

Cluster  Intersect  Total  Percent
1        34940      38993  89.6058
2        19694      27625  71.2905
3        916        25174  3.6387
4        32605      33174  98.2848
5        11550      24532  47.0814
6        21275      33168  64.1432
7        1657       22273  7.4395
8        80         21074  0.3796
9        20132      38940  51.7001
10       21982      21997  99.9318
11       3001       30253  9.9197
12       4415       27911  15.8181
13       21622      25184  85.8561
14       2          38195  0.0052
15       299        28059  1.0656

D2 
TOTAL PEAKS : 281304

Cluster  Intersect  Total  Percent
1        32906      38993  84.3895
2        18532      27625  67.0842
3        433        25174  1.7200
4        32159      33174  96.9404
5        5463       24532  22.2689
6        6451       33168  19.4495
7        15409      22273  69.1824
8        18402      21074  87.3209
9        9978       38940  25.6240
10       21920      21997  99.6500
11      