# Annotate Peaks with Other Peaks

Perform intersections of peak sets with other peaks sets.

**TODO**: Organize reference beds used and keep a copy of them in `resources`.

In [1]:
BASE=~/kundajelab/scATAC-reprog/
ANALYSIS=$BASE/src/analysis/20200319_cell_state_dissection
BEDS=$ANALYSIS/beds/20200328_tmp_n20
NUM_BEDS=20
RES=$BASE/resources/

In [2]:
# function definition that performs intersection of each cluster peak set 
# with reference peak set

# expects reference peaks to be in ./tmp.bed
# arguments are REF_BED, NUM_BEDS, BEDS directory (which should have idx1.bed,...idx{NUM_BEDS}.bed)
intersection_stats () {
    TOT=$(cat $1 | wc -l)
    printf "TOTAL PEAKS : $TOT\n\n"

    (printf "Cluster Intersect Total Percent\n" ;
    for i in $(seq 1 $2)
     do INT=$(bedtools intersect -b <(cat $1 |  cut -f1-3) \
       -a $3/idx${i}.1000bp.bed -c \
       | awk '$4>0{s+=1} END{print s}') ; 
       TOT=$(cat  $3/idx${i}.1000bp.bed | wc -l ) ; 
       FRAC=$(echo "100*$INT/$TOT" | bc -l | awk '{printf "%.4f\n", $0}') ; 
       printf "$i $INT $TOT $FRAC\n" 
    done)  | column -t
}

In [59]:
REF=../20200307_fine_clustering/beds/20200307_gridmap_naive_n15/idx3.1000bp.bed
intersection_stats $REF $NUM_BEDS $BEDS

TOTAL PEAKS : 35705

Cluster  Intersect  Total  Percent
1        516        34092  1.5136
2        449        19069  2.3546
3        142        23336  0.6085
4        4961       29829  16.6315
5        414        25973  1.5940
6        719        12165  5.9104
7        9          12659  0.0711
8        519        9759   5.3182
9        6497       27171  23.9115
10       7991       24117  33.1343
11       1542       22395  6.8855
12       1127       31960  3.5263
13       2497       14540  17.1733
14       7258       23151  31.3507
15       7558       27353  27.6313
16       5201       36492  14.2524
17       1300       18391  7.0687
18       229        24005  0.9540
19       424        12382  3.4243
20       512        28638  1.7878


In [5]:
REF=../20200331_gc_stratify/beds/20200403_D0_lt_0.2_quantile_gc10/all_peaks.bed 
intersection_stats $REF $NUM_BEDS $BEDS

TOTAL PEAKS : 91857

(standard_in) 1: syntax error
Cluster  Intersect  Total  Percent
1        38         34092  0.1115
2        164        19069  0.8600
3        22         23336  0.0943
4        7328       29829  24.5667
5        69         25973  0.2657
6        4142       12165  34.0485
7        12659
8        3405       9759   34.8909
9        8027       27171  29.5425
10       7782       24117  32.2677
11       840        22395  3.7508
12       391        31960  1.2234
13       5626       14540  38.6933
14       9409       23151  40.6419
15       5299       27353  19.3726
16       2469       36492  6.7659
17       228        18391  1.2397
18       8          24005  0.0333
19       5676       12382  45.8407
20       129        28638  0.4505


In [60]:
# Intersect peaks in each cluster with H1-hESC ChIP peaks from GSE17917 (Lister et al 2009)
cat $RES/GSE17917/KLF4.hg38.bed $RES/GSE17917/SOX2.hg38.bed $RES/GSE17917/OCT4.hg38.bed > tmp.bed

intersection_stats tmp.bed $NUM_BEDS $BEDS

rm tmp.bed

TOTAL PEAKS : 13360

Cluster  Intersect  Total  Percent
1        549        34092  1.6103
2        128        19069  0.6712
3        1864       23336  7.9877
4        638        29829  2.1389
5        170        25973  0.6545
6        511        12165  4.2006
7        1924       12659  15.1987
8        220        9759   2.2543
9        639        27171  2.3518
10       536        24117  2.2225
11       623        22395  2.7819
12       778        31960  2.4343
13       384        14540  2.6410
14       289        23151  1.2483
15       261        27353  0.9542
16       722        36492  1.9785
17       254        18391  1.3811
18       1017       24005  4.2366
19       377        12382  3.0447
20       1308       28638  4.5674


In [61]:
# Intersect peaks in each cluster with H1-hESC ATAC peaks
REF=~/kundajelab/heterokaryon-v2/src/analysis/20191002_surag_heterokaryon_clustering/data/H1.hESC.atac.idr.peaks.liftover.to.hg38.bed

intersection_stats $REF $NUM_BEDS $BEDS

TOTAL PEAKS : 149765

Cluster  Intersect  Total  Percent
1        4570       34092  13.4049
2        1108       19069  5.8105
3        13043      23336  55.8922
4        3972       29829  13.3159
5        1523       25973  5.8638
6        1589       12165  13.0621
7        11225      12659  88.6721
8        1337       9759   13.7002
9        2753       27171  10.1321
10       2728       24117  11.3115
11       4215       22395  18.8212
12       6059       31960  18.9581
13       2177       14540  14.9725
14       1230       23151  5.3129
15       1102       27353  4.0288
16       5018       36492  13.7510
17       2198       18391  11.9515
18       7532       24005  31.3768
19       2106       12382  17.0086
20       10010      28638  34.9536


In [50]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012)

REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz
zcat $REF > tmp.bed

intersection_stats tmp.bed $NUM_BEDS $BEDS

rm tmp.bed

TOTAL PEAKS : 188521

Cluster  Intersect  Total  Percent
1        6468       17800  36.3371
2        10145      26351  38.4995
3        7984       23204  34.4079
4        13028      38885  33.5039
5        14838      32857  45.1593
6        14257      44660  31.9234
7        10274      40611  25.2986
8        8653       46774  18.4996
9        2915       27910  10.4443
10       11619      17408  66.7452
11       6045       35921  16.8286
12       1892       31220  6.0602
13       358        13697  2.6137
14       4788       39886  12.0042
15       2147       20293  10.5800


In [51]:
# Intersect peaks in each cluster with Fibroblast D2 OSKM ChIP from GSE36570 (Soufi et al 2012)
# separately for O, S, K, M

printf "OCT4\n"
REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz
zcat $REF | grep OCT > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

printf "\nSOX2\n"
REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz
zcat $REF | grep SOX > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

printf "\nKLF4\n"
REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz
zcat $REF | grep KLF > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

printf "\nMYC\n"
REF=$RES/GSE36570/All_48hrs_MTFBRs.hg38.bed.gz
zcat $REF | grep MYC > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

OCT4
TOTAL PEAKS : 58148

Cluster  Intersect  Total  Percent
1        3117       17800  17.5112
2        4937       26351  18.7355
3        3113       23204  13.4158
4        6076       38885  15.6256
5        3805       32857  11.5805
6        4314       44660  9.6597
7        3744       40611  9.2192
8        3377       46774  7.2198
9        1024       27910  3.6689
10       1757       17408  10.0931
11       1918       35921  5.3395
12       775        31220  2.4824
13       137        13697  1.0002
14       1908       39886  4.7836
15       947        20293  4.6666

SOX2
TOTAL PEAKS : 64527

Cluster  Intersect  Total  Percent
1        1658       17800  9.3146
2        3972       26351  15.0734
3        5080       23204  21.8928
4        6015       38885  15.4687
5        1314       32857  3.9991
6        2276       44660  5.0963
7        2839       40611  6.9907
8        3053       46774  6.5271
9        1481       27910  5.3063
10       171        17408  0.9823
11       840      

In [52]:
# Intersect peaks in each cluster with Heterokaryon peaks at MRC5/3hr/16hr
printf "MRC5 \n"
REF=/oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/MRC5_hg38/cromwell-executions/atac/15e37be8-c0e9-4b85-ba0e-8b5c54103fcc/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
zcat $REF > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

printf "\nCC \n"
REF=/oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/CC_hg38/cromwell-executions/atac/8a3ac2dd-002d-41c5-88d6-a8c0787309dd/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
zcat $REF > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

printf "\n3hr \n"
REF=/oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/3hr_hg38/cromwell-executions/atac/47748a96-e66e-4f7a-b76f-6d2505e8e879/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
zcat $REF > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

printf "\n16hr \n"
REF=/oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/16hr_hg38/cromwell-executions/atac/171d7ae4-52db-4f56-8d6e-2270f3a6f827/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
zcat $REF > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

printf "\n48hr \n"
REF=/oak/stanford/groups/akundaje/projects/heterokaryon/ATAC-Seq/Batch1/outputs/48hr_hg38/cromwell-executions/atac/38d68228-cc1c-44fb-b771-6d33128a7f91/call-reproducibility_overlap/execution/optimal_peak.narrowPeak.gz
zcat $REF > tmp.bed
intersection_stats tmp.bed $NUM_BEDS $BEDS
rm tmp.bed

MRC5 
TOTAL PEAKS : 250320

Cluster  Intersect  Total  Percent
1        2368       17800  13.3034
2        2875       26351  10.9104
3        1341       23204  5.7792
4        3447       38885  8.8646
5        29114      32857  88.6082
6        35426      44660  79.3238
7        33169      40611  81.6749
8        32411      46774  69.2928
9        17835      27910  63.9018
10       17210      17408  98.8626
11       14025      35921  39.0440
12       1749       31220  5.6022
13       162        13697  1.1827
14       8977       39886  22.5066
15       1081       20293  5.3270

CC 
TOTAL PEAKS : 169066

Cluster  Intersect  Total  Percent
1        1353       17800  7.6011
2        1844       26351  6.9978
3        798        23204  3.4391
4        2170       38885  5.5806
5        25768      32857  78.4247
6        27804      44660  62.2571
7        27734      40611  68.2918
8        25014      46774  53.4784
9        12702      27910  45.5106
10       16931      17408  97.2599
11       

In [53]:
# Intersect peaks in each cluster with day-wise data to compare with hets 

for x in D0 D2 D4 D6 D10 D12 D14
    do 
        printf "\n$x \n"
        REF=/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/bulk/croo/$x/peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz
        zcat $REF > tmp.bed
        intersection_stats tmp.bed $NUM_BEDS $BEDS
        rm tmp.bed
    done


D0 
TOTAL PEAKS : 286556

Cluster  Intersect  Total  Percent
1        738        17800  4.1461
2        943        26351  3.5786
3        706        23204  3.0426
4        1211       38885  3.1143
5        30777      32857  93.6695
6        38700      44660  86.6547
7        34551      40611  85.0779
8        29521      46774  63.1141
9        13358      27910  47.8610
10       17345      17408  99.6381
11       12286      35921  34.2028
12       652        31220  2.0884
13       101        13697  0.7374
14       4532       39886  11.3624
15       378        20293  1.8627

D2 
TOTAL PEAKS : 281304

Cluster  Intersect  Total  Percent
1        9577       17800  53.8034
2        12918      26351  49.0228
3        3691       23204  15.9067
4        10474      38885  26.9358
5        31747      32857  96.6217
6        35411      44660  79.2902
7        21451      40611  52.8207
8        11885      46774  25.4094
9        2406       27910  8.6206
10       17393      17408  99.9138
11       