# ENCODE Subset Peaks

Given a subset of peaks, find which ENCODE experiments (cell types) have the highest and lowest fraction of reads in those peaks.

In [None]:
library(GenomicRanges)

In [2]:
# Load the ENCODE DNase counts matrix from a tab-separated file: one row per
# peak, one column per experiment accession, plus chrom/start/end columns.
cts <- read.delim(
  "/oak/stanford/groups/akundaje/projects/atlas/counts_matrices/atlas.dnase.overlap.counts.txt"
)
# Key every row by "chrom<TAB>start<TAB>end". The column is spelled out as
# `chrom` (the name dropped just below); `cts$chr` only worked through `$`
# partial matching, which silently yields NULL if the match stops being unique.
rownames(cts) <- paste(cts$chrom, cts$start, cts$end, sep = "\t")
# Drop the coordinate columns so that only the per-experiment counts remain.
cts$chrom <- NULL
cts$start <- NULL
cts$end <- NULL
head(cts, 2)
dim(cts)

Unnamed: 0_level_0,ENCSR000EID,ENCSR000EIE,ENCSR000EIF,ENCSR000EIG,ENCSR000EII,ENCSR000EIL,ENCSR000EIN,ENCSR000EIV,ENCSR000EIW,ENCSR000EIY,⋯,ENCSR974TXT,ENCSR976XOY,ENCSR978QUT,ENCSR979ZJS,ENCSR986HEN,ENCSR986XLW,ENCSR988YKR,ENCSR989YIV,ENCSR990XXC,ENCSR999TSD
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1	10370	10641,28,9,17,8,49,28,7,11,20,33,⋯,2,1,0,3,2,12,0,15,3,1
chr1	10711	11232,12,9,5,9,44,39,2,7,13,27,⋯,0,1,0,1,1,2,0,9,1,0


In [3]:
# Turn the "chrom<TAB>start<TAB>end" row names of the counts matrix into
# "chrom:start-end" strings and let GRanges() parse them into ranges.
encode_peaks_gr <- local({
  region_ids <- rownames(cts)
  region_ids <- sub("\t", ":", region_ids) # first tab  -> ":"
  region_ids <- sub("\t", "-", region_ids) # second tab -> "-"
  GRanges(region_ids)
})

In [4]:
# Preview the first six ENCODE peak ranges.
head(encode_peaks_gr, n = 6L)

GRanges object with 6 ranges and 0 metadata columns:
      seqnames      ranges strand
         <Rle>   <IRanges>  <Rle>
  [1]     chr1 10370-10641      *
  [2]     chr1 10711-11232      *
  [3]     chr1 11308-11533      *
  [4]     chr1 14499-14700      *
  [5]     chr1 16142-16349      *
  [6]     chr1 25958-26158      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [5]:
# NOTE(review): this cell runs before `metadata` is assigned by the
# metadata-loading cell further down, which is why it printed NULL. At this
# point `metadata` presumably resolves to a function exported by an attached
# package rather than the data frame -- confirm, and re-run after loading.
colnames(metadata)

NULL

In [6]:
# ENCODE experiment metadata, read from a tab-separated file. The first line
# of the file is skipped (skip = 1) and the following line is used as header.
# `TRUE` is used instead of `T`, which is an ordinary variable that can be
# reassigned.
metadata <- read.delim(
  "/oak/stanford/groups/akundaje/projects/atlas/dnase_experiments_metadata.tsv",
  sep = "\t",
  skip = 1,
  header = TRUE,
  fill = TRUE
)
# Keep only the accession and the two biosample description columns.
metadata <- metadata[, c("Accession", "Biosample.summary", "Biosample.term.name")]
# Index rows by experiment accession so they can be looked up by name later,
# then drop the now-redundant Accession column.
rownames(metadata) <- metadata$Accession
metadata$Accession <- NULL
head(metadata)

Unnamed: 0_level_0,Biosample.summary,Biosample.term.name
Unnamed: 0_level_1,<fct>,<fct>
ENCSR728BAD,adrenal gland male embryo (108 days),adrenal gland
ENCSR724CND,foreskin keratinocyte male newborn,foreskin keratinocyte
ENCSR770DEN,fibroblast of skin of scalp male embryo (97 days),fibroblast of skin of scalp
ENCSR594OWA,small intestine male embryo (91 day),small intestine
ENCSR325LYJ,fibroblast of skin of upper back male embryo (97 days),fibroblast of skin of upper back
ENCSR257CIZ,kidney tubule cell female adult (80 years) treated with 5 μM cisplatin,kidney tubule cell


In [43]:
# Reference peak set: a headerless, whitespace-delimited BED file whose
# columns are named chr/start/end below. `FALSE` is used instead of `F`,
# which is an ordinary variable that can be reassigned.
peak_set <- read.table(
  "./beds/20200307_gridmap_naive_n15/idx10.bed",
  header = FALSE,
  sep = ""
)
colnames(peak_set) <- c("chr", "start", "end")
# NOTE(review): BED starts are conventionally 0-based, but the coordinates are
# used as-is here (no starts.in.df.are.0based). The ENCODE peaks are also built
# from raw coordinates, so the two sets presumably share a convention -- confirm.
peak_set <- makeGRangesFromDataFrame(peak_set)
head(peak_set)

GRanges object with 6 ranges and 0 metadata columns:
      seqnames          ranges strand
         <Rle>       <IRanges>  <Rle>
  [1]     chr1   629283-630067      *
  [2]     chr1 1137677-1138147      *
  [3]     chr1 1149961-1150429      *
  [4]     chr1 1167429-1167630      *
  [5]     chr1 1256339-1256738      *
  [6]     chr1 1300526-1300726      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [44]:
# Pair each ENCODE peak (query) with every reference peak (subject) it
# overlaps; the resulting Hits object holds the index pairs.
overlaps <- findOverlaps(query = encode_peaks_gr, subject = peak_set)
overlaps

Hits object with 25274 hits and 0 metadata columns:
          queryHits subjectHits
          <integer>   <integer>
      [1]        80           1
      [2]       354           2
      [3]       362           3
      [4]       373           4
      [5]       475           6
      ...       ...         ...
  [25270]   2025075       24142
  [25271]   2025773       24143
  [25272]   2025794       24144
  [25273]   2025822       24145
  [25274]   2025824       24146
  -------
  queryLength: 2027128 / subjectLength: 24146

In [45]:
# Number of distinct ENCODE peaks that overlap at least one reference peak.
sum(!duplicated(queryHits(overlaps)))

In [46]:
# Per experiment: counts summed over the ENCODE peaks that overlap the
# reference set, divided by that experiment's counts summed over all peaks.
frac_reads_in_peak_set <- local({
  overlapping_rows <- unique(queryHits(overlaps))
  colSums(cts[overlapping_rows, ]) / colSums(cts)
})

In [47]:
# Attach each experiment's metadata row (looked up by accession, i.e. the
# names of frac_reads_in_peak_set) to its percentage of reads in the peak set.
df <- data.frame(
  percentage = 100 * frac_reads_in_peak_set,
  metadata[names(frac_reads_in_peak_set), ]
)
# Rank from highest to lowest percentage. order(decreasing = TRUE) keeps
# NA/NaN percentages at the bottom, whereas rev(order(...)) would float them
# to the top of the "highest" table.
df <- df[
  order(df$percentage, decreasing = TRUE),
  c("percentage", "Biosample.term.name")
]
head(df, 100)

Unnamed: 0_level_0,percentage,Biosample.term.name
Unnamed: 0_level_1,<dbl>,<fct>
ENCSR014FPY,5.203115,iPS DF 4.7
ENCSR440QMR,5.157532,iPS DF 19.7
ENCSR261SMF,5.063703,iPS DF 6.9
ENCSR000EMZ,4.932408,H7
ENCSR678ILN,4.894001,ELF-1
ENCSR000EMU,4.837074,H1
ENCSR000EPS,4.791350,NT2/D1
ENCSR620QNS,4.713085,HAP-1
ENCSR383SNM,4.624757,iPS DF 19.11
ENCSR000EJN,4.424979,H1


In [48]:
# The 20 experiments with the lowest percentage of reads in the peak set.
head(df[order(df$percentage, decreasing = FALSE), ], n = 20L)

Unnamed: 0_level_0,percentage,Biosample.term.name
Unnamed: 0_level_1,<dbl>,<fct>
ENCSR000EQN,1.71077,WI38
ENCSR000EQM,1.864732,WI38
ENCSR562ACY,1.920067,fibroblast of skin of left biceps
ENCSR000EMG,1.932068,HS-5
ENCSR000EOI,1.935005,fibroblast of peridontal ligament
ENCSR000EMH,1.93955,stromal cell of bone marrow
ENCSR000EMA,1.945034,AG09319
ENCSR000ELY,1.950024,AG04450
ENCSR555TFE,1.957594,fibroblast of skin of left quadriceps
ENCSR251UPG,1.959053,foreskin fibroblast
