# ENCODE Subset Peaks

Given a subset of peaks, find which ENCODE samples have the highest/lowest fraction of reads in those peaks (computed relative to each sample's total reads across all ENCODE peaks in the counts matrix).

In [1]:
library(GenomicRanges)

In [2]:
# Load the ENCODE DNase counts matrix: one row per peak, one column per
# experiment accession, plus leading coordinate columns.
cts <- read.delim("/oak/stanford/groups/akundaje/projects/atlas/counts_matrices/atlas.dnase.overlap.counts.txt", sep = "\t")
# Key every row by its tab-joined coordinates ("chrom<TAB>start<TAB>end").
# Use the exact column name `chrom`: the previous `cts$chr` only worked through
# `$` partial matching on data frames, which silently breaks if another column
# starting with "chr" ever appears.
rownames(cts) <- paste(cts$chrom, cts$start, cts$end, sep = "\t")
# Drop the coordinate columns so only per-experiment count columns remain
# (later cells call colSums() on the whole table).
cts$chrom <- NULL
cts$start <- NULL
cts$end <- NULL
head(cts, 2)
dim(cts)

Unnamed: 0_level_0,ENCSR000EID,ENCSR000EIE,ENCSR000EIF,ENCSR000EIG,ENCSR000EII,ENCSR000EIL,ENCSR000EIN,ENCSR000EIV,ENCSR000EIW,ENCSR000EIY,⋯,ENCSR974TXT,ENCSR976XOY,ENCSR978QUT,ENCSR979ZJS,ENCSR986HEN,ENCSR986XLW,ENCSR988YKR,ENCSR989YIV,ENCSR990XXC,ENCSR999TSD
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1	10370	10641,28,9,17,8,49,28,7,11,20,33,⋯,2,1,0,3,2,12,0,15,3,1
chr1	10711	11232,12,9,5,9,44,39,2,7,13,27,⋯,0,1,0,1,1,2,0,9,1,0


In [3]:
# Turn the counts-matrix row names into "chr:start-end" strings and parse them
# into a GRanges: the inner sub() replaces the first tab with ":", the outer
# sub() replaces the remaining tab with "-".
encode_peaks_gr <- GRanges(
  sub("\t", "-", sub("\t", ":", rownames(cts), fixed = TRUE), fixed = TRUE)
)

In [4]:
# Preview the first six ENCODE peak ranges.
head(encode_peaks_gr, n = 6L)

GRanges object with 6 ranges and 0 metadata columns:
      seqnames      ranges strand
         <Rle>   <IRanges>  <Rle>
  [1]     chr1 10370-10641      *
  [2]     chr1 10711-11232      *
  [3]     chr1 11308-11533      *
  [4]     chr1 14499-14700      *
  [5]     chr1 16142-16349      *
  [6]     chr1 25958-26158      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [5]:
# NOTE(review): this cell was executed before `metadata` is assigned by the
# metadata-loading cell (execution count 5 vs 6), so the NULL printed below does
# not describe the metadata table; presumably `metadata` resolved to a function
# from an attached package at that point — confirm. Re-run after the table is
# loaded to list its columns.
colnames(metadata)

NULL

In [6]:
# ENCODE metadata
# Read the DNase experiment metadata table. The first line of the file is
# skipped, so column names are taken from the second line; `fill = TRUE` pads
# rows that have fewer fields than the header.
metadata <- read.delim(
  "/oak/stanford/groups/akundaje/projects/atlas/dnase_experiments_metadata.tsv",
  sep = "\t",
  skip = 1,
  header = TRUE,
  fill = TRUE
)
# Keep only the accession and the two biosample description columns.
metadata <- metadata[, c("Accession", "Biosample.summary", "Biosample.term.name")]
# Index rows by experiment accession so samples can be looked up by ID, then
# drop the now-redundant Accession column.
rownames(metadata) <- metadata$Accession
metadata$Accession <- NULL
head(metadata)

Unnamed: 0_level_0,Biosample.summary,Biosample.term.name
Unnamed: 0_level_1,<fct>,<fct>
ENCSR728BAD,adrenal gland male embryo (108 days),adrenal gland
ENCSR724CND,foreskin keratinocyte male newborn,foreskin keratinocyte
ENCSR770DEN,fibroblast of skin of scalp male embryo (97 days),fibroblast of skin of scalp
ENCSR594OWA,small intestine male embryo (91 day),small intestine
ENCSR325LYJ,fibroblast of skin of upper back male embryo (97 days),fibroblast of skin of upper back
ENCSR257CIZ,kidney tubule cell female adult (80 years) treated with 5 μM cisplatin,kidney tubule cell


In [43]:
# ref peaks
# Read the reference peak set (no header row; whitespace-separated fields) and
# name its columns so makeGRangesFromDataFrame() can locate the coordinates.
peak_set <- read.table("./beds/20200305_heatmap_bulk_n15/idx11.bed",
                       header = FALSE, sep = "")
colnames(peak_set) <- c("chr", "start", "end")
# NOTE(review): starts are used as-is. If this .bed file is 0-based half-open,
# `starts.in.df.are.0based = TRUE` may be appropriate — confirm, and keep the
# convention consistent with how encode_peaks_gr is built.
peak_set <- makeGRangesFromDataFrame(peak_set)
head(peak_set)

GRanges object with 6 ranges and 0 metadata columns:
      seqnames          ranges strand
         <Rle>       <IRanges>  <Rle>
  [1]     chr1   180686-181260      *
  [2]     chr1   181268-181597      *
  [3]     chr1   629283-630067      *
  [4]     chr1   779776-780009      *
  [5]     chr1   876480-877139      *
  [6]     chr1 1014931-1015131      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [44]:
# Pair each ENCODE peak (query) with every reference peak (subject) it overlaps,
# then print the resulting Hits object.
overlaps <- findOverlaps(query = encode_peaks_gr, subject = peak_set)
overlaps

Hits object with 36153 hits and 0 metadata columns:
          queryHits subjectHits
          <integer>   <integer>
      [1]        47           1
      [2]        47           2
      [3]        80           3
      [4]        91           4
      [5]        92           4
      ...       ...         ...
  [36149]   2025758       35917
  [36150]   2025759       35917
  [36151]   2025805       35919
  [36152]   2025817       35920
  [36153]   2025933       35921
  -------
  queryLength: 2027128 / subjectLength: 35921

In [45]:
# Number of distinct ENCODE peaks that overlap at least one reference peak
# (an ENCODE peak hitting several reference peaks is counted once).
sum(!duplicated(queryHits(overlaps)))

In [46]:
# Per-experiment fraction of counts that fall in ENCODE peaks overlapping the
# reference peak set. Numerator: counts summed over the overlapping rows (each
# row once, via unique()). Denominator: counts summed over all rows of `cts`.
# `drop = FALSE` keeps the subset a data frame even if `cts` has a single
# sample column; without it the subset collapses to a vector and colSums() errors.
frac_reads_in_peak_set <-
  colSums(cts[unique(queryHits(overlaps)), , drop = FALSE]) / colSums(cts)

In [47]:
# Combine the per-experiment percentage with its metadata row, looked up by
# accession (the names of frac_reads_in_peak_set).
df <- data.frame(
  percentage = frac_reads_in_peak_set * 100,
  metadata[names(frac_reads_in_peak_set), ]
)
# Rank experiments from highest to lowest percentage and keep only the
# percentage and the biosample term name.
df <- df[
  rev(order(df[["percentage"]])),
  c("percentage", "Biosample.term.name")
]
head(df, n = 100)

Unnamed: 0_level_0,percentage,Biosample.term.name
Unnamed: 0_level_1,<dbl>,<fct>
ENCSR000EJN,11.294326,H1
ENCSR768OLL,9.969825,right kidney
ENCSR678ILN,9.776650,ELF-1
ENCSR550UWM,9.705937,renal cortex interstitium
ENCSR543YPH,9.704760,left kidney
ENCSR000EMK,9.522283,hematopoietic multipotent progenitor cell
ENCSR261SMF,9.453133,iPS DF 6.9
ENCSR757EPJ,9.421580,kidney
ENCSR792ZXA,9.417800,kidney
ENCSR873ANE,9.416776,kidney


In [32]:
# Show the 20 experiments with the lowest percentage of reads in the peak set.
head(df[order(df[["percentage"]]), ], n = 20)

Unnamed: 0_level_0,percentage,Biosample.term.name
Unnamed: 0_level_1,<dbl>,<fct>
ENCSR000EQN,0.9136016,WI38
ENCSR251UPG,0.9643307,foreskin fibroblast
ENCSR000EQM,1.0955804,WI38
ENCSR000EMP,1.1473194,GM04504
ENCSR000EMA,1.1657149,AG09319
ENCSR000EMG,1.2124468,HS-5
ENCSR000EMO,1.2377387,GM04503
ENCSR000EOI,1.2464393,fibroblast of peridontal ligament
ENCSR000EOJ,1.2577214,fibroblast of lung
ENCSR000EMH,1.2693816,stromal cell of bone marrow
