# ENCODE Subset Peaks

Given a subset of peaks, find out which cells in ENCODE have the highest/lowest fraction of their reads (counted over all ENCODE peaks) in those peaks.

In [29]:
library(GenomicRanges)

In [2]:
# Load the ENCODE DNase counts matrix: one row per peak, one column per
# experiment accession, plus chrom/start/end coordinate columns.
cts <- read.csv(
  "/oak/stanford/groups/akundaje/projects/atlas/counts_matrices/atlas.dnase.overlap.counts.txt",
  sep = "\t"
)
# Use the peak coordinates as row names. The chromosome column is named
# `chrom`; spelling it `cts$chr` only worked through `$` partial matching.
# The tab separator is kept because the GRanges conversion later in the
# notebook rewrites these tabs into "chrom:start-end".
rownames(cts) <- paste(cts$chrom, cts$start, cts$end, sep = "\t")
# Drop the coordinate columns so only per-experiment counts remain.
cts$chrom <- NULL
cts$start <- NULL
cts$end <- NULL
head(cts, 2)
dim(cts)

Unnamed: 0_level_0,ENCSR000EID,ENCSR000EIE,ENCSR000EIF,ENCSR000EIG,ENCSR000EII,ENCSR000EIL,ENCSR000EIN,ENCSR000EIV,ENCSR000EIW,ENCSR000EIY,⋯,ENCSR974TXT,ENCSR976XOY,ENCSR978QUT,ENCSR979ZJS,ENCSR986HEN,ENCSR986XLW,ENCSR988YKR,ENCSR989YIV,ENCSR990XXC,ENCSR999TSD
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1	10370	10641,28,9,17,8,49,28,7,11,20,33,⋯,2,1,0,3,2,12,0,15,3,1
chr1	10711	11232,12,9,5,9,44,39,2,7,13,27,⋯,0,1,0,1,1,2,0,9,1,0


In [37]:
# Turn each "chrom<TAB>start<TAB>end" row name into "chrom:start-end" (first
# tab -> ":", second tab -> "-") and hand the strings to GRanges().
encode_peaks_gr <- GRanges(
  sub(
    "\t", "-",
    sub("\t", ":", rownames(cts), fixed = TRUE),
    fixed = TRUE
  )
)

In [38]:
# Preview the first few ranges built from the counts-matrix row names.
head(encode_peaks_gr)

GRanges object with 6 ranges and 0 metadata columns:
      seqnames      ranges strand
         <Rle>   <IRanges>  <Rle>
  [1]     chr1 10370-10641      *
  [2]     chr1 10711-11232      *
  [3]     chr1 11308-11533      *
  [4]     chr1 14499-14700      *
  [5]     chr1 16142-16349      *
  [6]     chr1 25958-26158      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [48]:
# NOTE(review): `metadata` is assigned by the read.delim() cell further down
# in this notebook; that cell has to be run first for this call to list the
# metadata table's columns.
colnames(metadata)

In [83]:
# ENCODE experiment metadata, re-keyed by experiment accession so rows can be
# looked up by the accession IDs used as column names in the counts matrix.
metadata <- read.delim(
  "/oak/stanford/groups/akundaje/projects/atlas/dnase_experiments_metadata.tsv",
  sep = "\t",
  skip = 1,       # the first line of the file is skipped before the header
  header = TRUE,  # spelled out: `T` is an ordinary, reassignable variable
  fill = TRUE
)
# Keep only the accession and the two biosample description columns.
metadata <- metadata[
  ,
  c("Accession", "Biosample.summary", "Biosample.term.name")
]
# Move the accession into the row names and drop it as a column.
rownames(metadata) <- metadata$Accession
metadata$Accession <- NULL
head(metadata)

Unnamed: 0_level_0,Biosample.summary,Biosample.term.name
Unnamed: 0_level_1,<fct>,<fct>
ENCSR728BAD,adrenal gland male embryo (108 days),adrenal gland
ENCSR724CND,foreskin keratinocyte male newborn,foreskin keratinocyte
ENCSR770DEN,fibroblast of skin of scalp male embryo (97 days),fibroblast of skin of scalp
ENCSR594OWA,small intestine male embryo (91 day),small intestine
ENCSR325LYJ,fibroblast of skin of upper back male embryo (97 days),fibroblast of skin of upper back
ENCSR257CIZ,kidney tubule cell female adult (80 years) treated with 5 μM cisplatin,kidney tubule cell


In [144]:
# Reference peak set: read a BED file and convert it to a GRanges object.
peak_set <- read.table(
  "./beds/20200227_heatmap_ecto_9_vs_repro_3_11_naive_n10/idx6.bed",
  header = FALSE,  # spelled out: `F` is an ordinary, reassignable variable
  sep = ""
)
# Keep only the three coordinate columns so that a BED file carrying extra
# fields does not end up with NA column names after the rename below.
peak_set <- peak_set[, 1:3, drop = FALSE]
colnames(peak_set) <- c("chr", "start", "end")
# NOTE(review): BED starts are conventionally 0-based, but they are used
# as-is here -- confirm whether `starts.in.df.are.0based = TRUE` is wanted.
peak_set <- makeGRangesFromDataFrame(peak_set)
head(peak_set)

GRanges object with 6 ranges and 0 metadata columns:
      seqnames          ranges strand
         <Rle>       <IRanges>  <Rle>
  [1]     chr1   832405-832718      *
  [2]     chr1 1928279-1928479      *
  [3]     chr1 1969679-1969886      *
  [4]     chr1 2118601-2119174      *
  [5]     chr1 2261662-2261887      *
  [6]     chr1 2262080-2262540      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [145]:
# Pair every ENCODE peak (query) with each reference peak (subject) that it
# overlaps, then display the resulting Hits object.
overlaps <- findOverlaps(query = encode_peaks_gr, subject = peak_set)
overlaps

Hits object with 21804 hits and 0 metadata columns:
          queryHits subjectHits
          <integer>   <integer>
      [1]       134           1
      [2]       986           2
      [3]      1009           3
      [4]      1135           4
      [5]      1136           4
      ...       ...         ...
  [21800]   2024147       30753
  [21801]   2024354       30760
  [21802]   2024500       30762
  [21803]   2024799       30764
  [21804]   2025837       30771
  -------
  queryLength: 2027128 / subjectLength: 30772

In [146]:
# Number of distinct ENCODE peaks that overlap at least one reference peak
# (an ENCODE peak overlapping several reference peaks is counted once).
sum(!duplicated(queryHits(overlaps)))

In [147]:
# Per experiment: counts in ENCODE peaks that overlap the reference peak set,
# divided by that experiment's counts summed over all ENCODE peaks.
frac_reads_in_peak_set <- local({
  # Each overlapping ENCODE peak is used once, even with multiple hits.
  hit_rows <- unique(queryHits(overlaps))
  colSums(cts[hit_rows, ]) / colSums(cts)
})

In [148]:
# Attach biosample metadata to each experiment's percentage (metadata rows are
# looked up by experiment accession), then rank from highest to lowest.
df <- data.frame(
  percentage = 100 * frac_reads_in_peak_set,
  metadata[names(frac_reads_in_peak_set), ]
)
df <- df[
  rev(order(df[["percentage"]])),
  c("percentage", "Biosample.term.name")
]
# Show the top of the ranking (up to 100 rows).
head(df, n = 100)

Unnamed: 0_level_0,percentage,Biosample.term.name
Unnamed: 0_level_1,<dbl>,<fct>
ENCSR620QNS,1.540941,HAP-1
ENCSR318PRQ,1.472348,middle frontal gyrus
ENCSR706IDL,1.468569,midbrain
ENCSR420NOA,1.466536,hematopoietic multipotent progenitor cell
ENCSR015BGH,1.446106,caudate nucleus
ENCSR434OBM,1.421162,foreskin melanocyte
ENCSR224IYD,1.401550,medulla oblongata
ENCSR518JGY,1.399132,foreskin melanocyte
ENCSR008SDL,1.396558,SK-MEL-5
ENCSR859CZM,1.386740,occipital lobe


In [149]:
# Experiments with the lowest percentage: re-sort ascending, show up to 20.
head(df[order(df[["percentage"]]), ], n = 20)

Unnamed: 0_level_0,percentage,Biosample.term.name
Unnamed: 0_level_1,<dbl>,<fct>
ENCSR634YVQ,0.7058698,HK-2
ENCSR954AJK,0.7681722,Peyer's patch
ENCSR000ELJ,0.773598,osteoblast
ENCSR000EID,0.7748222,8988T
ENCSR000EMI,0.7874387,Caco-2
ENCSR000EJN,0.7893315,H1
ENCSR000EMK,0.7956902,hematopoietic multipotent progenitor cell
ENCSR251UPG,0.8132079,foreskin fibroblast
ENCSR000EIG,0.8172192,T-helper 1 cell
ENCSR431UEM,0.8241552,left leg bone
