# Annotate Peaks with Other Peaks

Perform intersections of per-cluster peak sets with other (reference) peak sets.

**TODO**: Organize reference beds used and keep a copy of them in `resources`.

In [4]:
# Project root; every other path in this cell is derived from it.
BASE=~/kundajelab/scATAC-reprog/
# Number of per-cluster BED files (idx1.bed ... idx${NUM_BEDS}.bed).
NUM_BEDS=15
# Directory of the fine-clustering analysis.
ANALYSIS="${BASE}/src/analysis/20200307_fine_clustering"
# Directory holding the per-cluster BED files.
BEDS="${ANALYSIS}/beds/20200307_gridmap_naive_n15"
# Directory holding the reference peak sets.
RES="${BASE}/resources/"

In [5]:
# Report how many peaks of each cluster peak set overlap a reference peak set.
#
# Arguments:
#   $1  REF_BED   reference peaks file (only its first three columns are used)
#   $2  NUM_BEDS  number of cluster peak sets
#   $3  BEDS      directory that should contain idx1.bed ... idx{NUM_BEDS}.bed
#
# Prints the line count of REF_BED, then one table row per cluster: cluster
# index, cluster peaks overlapping at least one reference peak, total cluster
# peaks, and the percentage overlapping ("NA" for an empty cluster file).
# Returns 1 if REF_BED cannot be read; unreadable cluster files are skipped
# with a warning on stderr.
intersection_stats () {
    local ref_bed=$1 num_beds=$2 beds_dir=$3
    local ref_total idx cluster_bed n_hit n_peaks pct

    if [ ! -r "$ref_bed" ]; then
        printf 'intersection_stats: cannot read reference peaks: %s\n' "$ref_bed" >&2
        return 1
    fi

    ref_total=$(wc -l < "$ref_bed")
    printf 'TOTAL PEAKS : %s\n\n' "$ref_total"

    {
        printf 'Cluster Intersect Total Percent\n'
        for idx in $(seq 1 "$num_beds"); do
            cluster_bed=$beds_dir/idx${idx}.bed

            if [ ! -r "$cluster_bed" ]; then
                printf 'intersection_stats: skipping unreadable %s\n' "$cluster_bed" >&2
                continue
            fi

            # -c appends the per-peak overlap count as the last column, so
            # $NF is read rather than a fixed column number. "s+0" forces a
            # numeric 0 when no peak overlaps (a bare "s" prints an empty
            # string, which breaks the percentage and the table layout).
            n_hit=$(bedtools intersect -a "$cluster_bed" \
                        -b <(cut -f1-3 "$ref_bed") -c \
                    | awk '$NF>0{s+=1} END{print s+0}')
            n_peaks=$(wc -l < "$cluster_bed")

            # Guard against an empty cluster file to avoid dividing by zero.
            pct=$(awk -v hit="$n_hit" -v tot="$n_peaks" \
                  'BEGIN{ if (tot>0) printf "%.4f\n", 100*hit/tot; else print "NA" }')

            printf '%s %s %s %s\n' "$idx" "$n_hit" "$n_peaks" "$pct"
        done
    } | column -t
}

In [6]:
# Intersect peaks in each cluster with H1-hESC ChIP peaks from GSE17917 (Lister et al 2009).
# The KLF4, SOX2 and OCT4 peak files are concatenated into one temporary reference set.
cat "$RES"/GSE17917/{KLF4,SOX2,OCT4}.hg38.bed > tmp.bed

intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"

# Remove the temporary reference set.
rm tmp.bed

TOTAL PEAKS : 13360

Cluster  Intersect  Total  Percent
1        1256       35001  3.5885
2        1844       26613  6.9289
3        171        35705  0.4789
4        2314       16773  13.7960
5        368        39769  0.9253
6        569        33689  1.6890
7        44         30293  0.1452
8        245        26967  0.9085
9        612        38812  1.5768
10       1323       24146  5.4792
11       68         27916  0.2436
12       61         42792  0.1426
13       958        33538  2.8565
14       59         15205  0.3880
15       64         30258  0.2115


In [7]:
# Intersect peaks in each cluster with H1-hESC ATAC peaks.
# The reference file is passed to intersection_stats directly, so no temporary copy is made.
REF=~/kundajelab/heterokaryon-v2/src/analysis/20191002_surag_heterokaryon_clustering/data/H1.hESC.atac.idr.peaks.liftover.to.hg38.bed

intersection_stats "${REF}" "${NUM_BEDS}" "${BEDS}"

TOTAL PEAKS : 149765

Cluster  Intersect  Total  Percent
1        10839      35001  30.9677
2        14192      26613  53.3273
3        634        35705  1.7757
4        14604      16773  87.0685
5        3621       39769  9.1051
6        4083       33689  12.1197
7        304        30293  1.0035
8        946        26967  3.5080
9        5283       38812  13.6118
10       5660       24146  23.4407
11       589        27916  2.1099
12       527        42792  1.2315
13       6203       33538  18.4954
14       632        15205  4.1565
15       189        30258  0.6246


In [8]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012).

REF="${RES}/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz"

# The reference is gzip-compressed; decompress it to a temporary plain-text file first.
zcat "${REF}" > tmp.bed

intersection_stats tmp.bed "${NUM_BEDS}" "${BEDS}"

# Remove the temporary decompressed copy.
rm tmp.bed

TOTAL PEAKS : 188521

Cluster  Intersect  Total  Percent
1        7638       35001  21.8222
2        9638       26613  36.2154
3        7567       35705  21.1931
4        10702      16773  63.8049
5        3176       39769  7.9861
6        643        33689  1.9086
7        1485       30293  4.9021
8        8514       26967  31.5719
9        5740       38812  14.7892
10       5662       24146  23.4490
11       4574       27916  16.3849
12       3967       42792  9.2704
13       1895       33538  5.6503
14       5010       15205  32.9497
15       8156       30258  26.9549


In [9]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012),
# separately for O, S, K, M: the combined peak file is reduced to the lines
# containing OCT, SOX, KLF or MYC in turn, and each subset is intersected on its own.

REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz

for factor in OCT SOX KLF MYC
do
    zcat "$REF" | grep "$factor" > tmp.bed
    intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"
    rm tmp.bed
done

TOTAL PEAKS : 58148

Cluster  Intersect  Total  Percent
1        2450       35001  6.9998
2        2352       26613  8.8378
3        3467       35705  9.7101
4        1551       16773  9.2470
5        1199       39769  3.0149
6        270        33689  0.8014
7        471        30293  1.5548
8        4140       26967  15.3521
9        2171       38812  5.5936
10       2577       24146  10.6726
11       1536       27916  5.5022
12       1370       42792  3.2015
13       792        33538  2.3615
14       1624       15205  10.6807
15       3059       30258  10.1097
TOTAL PEAKS : 64527

Cluster  Intersect  Total  Percent
1        1099       35001  3.1399
2        663        26613  2.4913
3        2441       35705  6.8366
4        125        16773  0.7452
5        564        39769  1.4182
6        85         33689  0.2523
7        715        30293  2.3603
8        3651       26967  13.5388
9        1199       38812  3.0893
10       1140       24146  4.7213
11       1256       27916  4.4992

In [15]:
# Intersect peaks in each cluster with Heterokaryon peaks at MRC5/3hr/16hr.
# LABELS[k] is the heading printed before the table for the peak file REFS[k].
LABELS=(
    "MRC5 \n"
    "\n3hr \n"
    "\n16hr \n"
)
REFS=(
    /oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/MRC5_hg38/cromwell-executions/atac/15e37be8-c0e9-4b85-ba0e-8b5c54103fcc/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
    /oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/3hr_hg38/cromwell-executions/atac/47748a96-e66e-4f7a-b76f-6d2505e8e879/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
    /oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/16hr_hg38/cromwell-executions/atac/171d7ae4-52db-4f56-8d6e-2270f3a6f827/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
)

for k in 0 1 2
do
    printf "${LABELS[$k]}"
    REF=${REFS[$k]}
    # Decompress to a temporary file, run the intersection, then clean up.
    zcat "$REF" > tmp.bed
    intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"
    rm tmp.bed
done

MRC5 
TOTAL PEAKS : 250320

Cluster  Intersect  Total  Percent
1        20223      35001  57.7783
2        22450      26613  84.3573
3        1537       35705  4.3047
4        16593      16773  98.9268
5        6767       39769  17.0158
6        252        33689  0.7480
7        15668      30293  51.7215
8        1142       26967  4.2348
9        15104      38812  38.9158
10       1707       24146  7.0695
11       23827      27916  85.3525
12       29552      42792  69.0596
13       699        33538  2.0842
14       14605      15205  96.0539
15       570        30258  1.8838
3hr 
TOTAL PEAKS : 194478

Cluster  Intersect  Total  Percent
1        15988      35001  45.6787
2        18282      26613  68.6958
3        1560       35705  4.3691
4        15564      16773  92.7920
5        4873       39769  12.2533
6        131        33689  0.3889
7        10459      30293  34.5261
8        1375       26967  5.0988
9        11127      38812  28.6690
10       2065       24146  8.5521
11       2

In [17]:
# Intersect peaks in each cluster with day-wise data to compare with hets.
# One table is printed per day, preceded by the day label.

for day in D0 D2 D4 D6 D10; do
    printf "\n%s \n" "$day"
    REF=/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/bulk/croo/${day}/peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz
    # Decompress to a temporary file, run the intersection, then clean up.
    zcat "$REF" > tmp.bed
    intersection_stats tmp.bed "$NUM_BEDS" "$BEDS"
    rm tmp.bed
done


D0 
TOTAL PEAKS : 286556

Cluster  Intersect  Total  Percent
1        23699      35001  67.7095
2        24673      26613  92.7103
3        185        35705  0.5181
4        16742      16773  99.8152
5        3183       39769  8.0037
6        103        33689  0.3057
7        10281      30293  33.9385
8        102        26967  0.3782
9        14597      38812  37.6095
10       382        24146  1.5820
11       25154      27916  90.1060
12       27037      42792  63.1824
13       90         33538  0.2684
14       14982      15205  98.5334
15       157        30258  0.5189

D2 
TOTAL PEAKS : 281304

Cluster  Intersect  Total  Percent
1        22499      35001  64.2810
2        25157      26613  94.5290
3        5885       35705  16.4823
4        16763      16773  99.9404
5        2947       39769  7.4103
6        115        33689  0.3414
7        943        30293  3.1129
8        11665      26967  43.2566
9        11140      38812  28.7025
10       7629       24146  31.5953
11       13