# ENCODE Subset Peaks

Given a subset of peaks, find which ENCODE samples (cell types/tissues) have the highest and the lowest fraction of reads in those peaks.

In [1]:
library(GenomicRanges)

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min


Loading required packag

In [4]:
# Load the ENCODE DNase counts matrix (tab-separated): one row per peak, with
# chrom/start/end coordinate columns plus one integer count column per
# experiment accession.
cts <- read.csv("/oak/stanford/groups/akundaje/projects/atlas/counts_matrices/atlas.dnase.overlap.counts.txt", sep = "\t")

# Key each row by its peak coordinates ("chrom<TAB>start<TAB>end"). The
# coordinate column is referenced by its full name `chrom` rather than relying
# on `$` partial matching of a shorter prefix.
rownames(cts) <- paste(cts$chrom, cts$start, cts$end, sep = "\t")

# Drop the coordinate columns so only the per-experiment counts remain.
cts$chrom <- NULL
cts$start <- NULL
cts$end <- NULL

head(cts, 2)
dim(cts)

Unnamed: 0_level_0,ENCSR000EID,ENCSR000EIE,ENCSR000EIF,ENCSR000EIG,ENCSR000EII,ENCSR000EIL,ENCSR000EIN,ENCSR000EIV,ENCSR000EIW,ENCSR000EIY,⋯,ENCSR974TXT,ENCSR976XOY,ENCSR978QUT,ENCSR979ZJS,ENCSR986HEN,ENCSR986XLW,ENCSR988YKR,ENCSR989YIV,ENCSR990XXC,ENCSR999TSD
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1	10370	10641,28,9,17,8,49,28,7,11,20,33,⋯,2,1,0,3,2,12,0,15,3,1
chr1	10711	11232,12,9,5,9,44,39,2,7,13,27,⋯,0,1,0,1,1,2,0,9,1,0


In [5]:
# Rewrite the "chrom<TAB>start<TAB>end" row names as "chrom:start-end" strings
# and hand them to GRanges() to build the ENCODE peak ranges.
encode_peaks_gr <- local({
  region <- sub("\t", ":", rownames(cts))  # first tab  -> ":"
  region <- sub("\t", "-", region)         # second tab -> "-"
  GRanges(region)
})

In [6]:
# Preview the first six ENCODE peak ranges.
head(encode_peaks_gr, n = 6L)

GRanges object with 6 ranges and 0 metadata columns:
      seqnames      ranges strand
         <Rle>   <IRanges>  <Rle>
  [1]     chr1 10370-10641      *
  [2]     chr1 10711-11232      *
  [3]     chr1 11308-11533      *
  [4]     chr1 14499-14700      *
  [5]     chr1 16142-16349      *
  [6]     chr1 25958-26158      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [7]:
# NOTE(review): this notebook does not assign `metadata` until the following
# cell, so when this cell ran it presumably resolved to some other `metadata`
# object already in scope (the recorded output is NULL). Re-run it after the
# metadata table has been loaded to list that table's columns.
colnames(metadata)

NULL

In [8]:
# ENCODE experiment metadata, keyed by experiment accession.
# `skip = 1` discards the first line of the file before the header row is read;
# `fill = TRUE` pads rows that have fewer fields than the header.
# TRUE is spelled out because the `T` shorthand is an ordinary variable that
# can be reassigned.
metadata <- read.delim(
  "/oak/stanford/groups/akundaje/projects/atlas/dnase_experiments_metadata.tsv",
  sep = "\t",
  skip = 1,
  header = TRUE,
  fill = TRUE
)

# Keep only the accession and the two biosample description columns.
metadata <- metadata[, c("Accession", "Biosample.summary", "Biosample.term.name")]

# Use the accession as the row name so rows can be looked up by experiment ID,
# then drop the now-redundant column.
rownames(metadata) <- metadata$Accession
metadata$Accession <- NULL

head(metadata)

Unnamed: 0_level_0,Biosample.summary,Biosample.term.name
Unnamed: 0_level_1,<fct>,<fct>
ENCSR728BAD,adrenal gland male embryo (108 days),adrenal gland
ENCSR724CND,foreskin keratinocyte male newborn,foreskin keratinocyte
ENCSR770DEN,fibroblast of skin of scalp male embryo (97 days),fibroblast of skin of scalp
ENCSR594OWA,small intestine male embryo (91 day),small intestine
ENCSR325LYJ,fibroblast of skin of upper back male embryo (97 days),fibroblast of skin of upper back
ENCSR257CIZ,kidney tubule cell female adult (80 years) treated with 5 μM cisplatin,kidney tubule cell


In [10]:
# Reference peak set: read the whitespace-separated BED file (no header row)
# and name its columns chr / start / end so that makeGRangesFromDataFrame()
# can pick them up. FALSE is spelled out because the `F` shorthand is an
# ordinary variable that can be reassigned.
peak_set <- read.table("./beds/20200307_gridmap_naive_n15/agg.idx3.8.15.1000bp.bed",
                       header = FALSE, sep = "")
colnames(peak_set) <- c("chr", "start", "end")

# NOTE(review): the resulting ranges print as 1001 bp wide for a file named
# "1000bp". If the BED starts are 0-based, `starts.in.df.are.0based = TRUE`
# may be wanted here -- confirm, and keep the ENCODE peak ranges consistent.
peak_set <- makeGRangesFromDataFrame(peak_set)
head(peak_set)

GRanges object with 6 ranges and 0 metadata columns:
      seqnames          ranges strand
         <Rle>       <IRanges>  <Rle>
  [1]     chr1   842500-843500      *
  [2]     chr1   981625-982625      *
  [3]     chr1 1450258-1451258      *
  [4]     chr1 1450475-1451475      *
  [5]     chr1 1453062-1454062      *
  [6]     chr1 1489562-1490562      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [11]:
# Pair each ENCODE peak (query) with every reference peak (subject) it overlaps.
overlaps <- findOverlaps(query = encode_peaks_gr, subject = peak_set)
overlaps

Hits object with 125905 hits and 0 metadata columns:
           queryHits subjectHits
           <integer>   <integer>
       [1]        80       62673
       [2]       109       62674
       [3]       134       62675
       [4]       145           1
       [5]       242           2
       ...       ...         ...
  [125901]   2025822       35704
  [125902]   2025822       62671
  [125903]   2025837       35705
  [125904]   2026008       62672
  [125905]   2026009       62672
  -------
  queryLength: 2027128 / subjectLength: 92930

In [12]:
# Number of distinct ENCODE peaks that overlap at least one reference peak.
sum(!duplicated(queryHits(overlaps)))

In [13]:
# Per experiment: counts summed over the ENCODE peaks that overlap the
# reference set, divided by counts summed over all ENCODE peaks.
frac_reads_in_peak_set <- local({
  peak_rows <- unique(queryHits(overlaps))
  reads_in_set <- colSums(cts[peak_rows, ])
  reads_total <- colSums(cts)
  reads_in_set / reads_total
})

In [14]:
# Percentage of each experiment's counts that fall in peaks overlapping the
# reference set, joined to the experiment metadata by accession.
df <- data.frame(
  percentage = 100 * frac_reads_in_peak_set,
  metadata[names(frac_reads_in_peak_set), ]
)

# Rank from highest to lowest percentage. order(..., decreasing = TRUE) keeps
# NA/NaN percentages at the bottom, whereas reversing an ascending order would
# put them at the top of the "highest" list.
df <- df[order(df$percentage, decreasing = TRUE),
         c("percentage", "Biosample.term.name")]
head(df, 100)

Unnamed: 0_level_0,percentage,Biosample.term.name
Unnamed: 0_level_1,<dbl>,<fct>
ENCSR620QNS,8.186784,HAP-1
ENCSR420NOA,6.722068,hematopoietic multipotent progenitor cell
ENCSR318PRQ,6.679637,middle frontal gyrus
ENCSR706IDL,6.593538,midbrain
ENCSR015BGH,6.578093,caudate nucleus
ENCSR000EPX,6.461754,bronchial epithelial cell
ENCSR224IYD,6.438872,medulla oblongata
ENCSR859CZM,6.403644,occipital lobe
ENCSR937UWI,6.394680,hematopoietic multipotent progenitor cell
ENCSR000EPY,6.358615,SK-N-MC


In [48]:
# Bottom of the ranking: the 20 experiments with the smallest percentage.
head(df[order(df$percentage, decreasing = FALSE), ], n = 20L)

Unnamed: 0_level_0,percentage,Biosample.term.name
Unnamed: 0_level_1,<dbl>,<fct>
ENCSR000EQN,1.71077,WI38
ENCSR000EQM,1.864732,WI38
ENCSR562ACY,1.920067,fibroblast of skin of left biceps
ENCSR000EMG,1.932068,HS-5
ENCSR000EOI,1.935005,fibroblast of peridontal ligament
ENCSR000EMH,1.93955,stromal cell of bone marrow
ENCSR000EMA,1.945034,AG09319
ENCSR000ELY,1.950024,AG04450
ENCSR555TFE,1.957594,fibroblast of skin of left quadriceps
ENCSR251UPG,1.959053,foreskin fibroblast
