Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

how well can we predict nova binding hits?

  • Loading branch information...
commit e2bbd0dde99248a3746805659ebcdf06b53b185c 1 parent c6b1b41
@lianos authored
Showing with 302 additions and 1 deletion.
  1. +7 −1 NAMESPACE
  2. +177 −0 R/ARE.utils.R
  3. +118 −0 inst/demos/predict-NOVA-HITS.R
View
8 NAMESPACE
@@ -13,6 +13,12 @@ export(
plotDecisionSurface,
makeCircle,
- spectrumFeatures
+ spectrumFeatures,
+
+ ## from ARE.utils
+ trim.data,
+ match.dim,
+ create.densities,
+ plot.densities
)
View
177 R/ARE.utils.R
@@ -0,0 +1,177 @@
+## This code is taken from my github.com/lianos/ARE.utils package
+
+trim.data <- function(x, qtile=0.01, trim.to=c("NA", "quantile", "remove"),
+ side=c('both', 'upper', 'lower')) {
+ trim.to <- match.arg(trim.to)
+ side <- match.arg(side)
+ if (side == 'both') {
+ side <- c('upper', 'lower')
+ }
+
+ qtile <- min(qtile, 1-qtile, na.rm=TRUE)
+ lo <- quantile(x, qtile, na.rm=TRUE)
+ hi <- quantile(x, 1-qtile, na.rm=TRUE)
+
+ if ('lower' %in% side) {
+ x[x < lo] <- if (trim.to == "quantile") lo else NA
+ }
+ if ('upper' %in% side) {
+ x[x > hi] <- if (trim.to == "quantile") hi else NA
+ }
+
+ if (trim.to == "remove") {
+ x <- x[!is.na(x)]
+ }
+
+ x
+}
+
+match.dim <- function(margin) {
+ # You can specify the margin by the index used to reference it, or by
+ # it's name. This function returns which dimension to "run-over":
+ # - 1 is rows
+ # - 2 is cols
+ # (This function is vectorized, tho I'm not sure why)
+ rows <- which(margin %in% c('r', 'row', 'rows', 1))
+ cols <- which(margin %in% c('c', 'col', 'cols', 'column', 'columns', 2))
+ result <- numeric(length(margin))
+ result[rows] <- 1
+ result[cols] <- 2
+ result[-c(rows,cols)] <- NA
+ result
+}
+
+create.densities <- function(..., along=1, density.params=list(), na.rm=TRUE) {
+ along <- match.dim(along)
+ stopifnot(along %in% c(1,2))
+ if (is.null(density.params$na.rm)) {
+ density.params$na.rm <- na.rm
+ }
+
+ data <- list(...)
+ if (length(data) == 1) {
+ data <- data[[1]]
+ }
+ if (is.data.frame(data)) {
+ data <- as.list(data)
+ }
+ if (is.list(data)) {
+ data <- lapply(data, function(item) {
+ if (is(item, 'density')) {
+ ans <- item
+ } else {
+ if (length(item) < 2) {
+ warning("Trying to create a density out of < 2 items")
+ ans <- NULL
+ } else {
+ ans <- do.call(density, c(list(item), density.params))
+ }
+ }
+ ans
+ })
+ } else if (is.matrix(data)) {
+ n.samples <- dim(data)[along]
+ if (along == 2) data <- t(data)
+ data <- lapply(seq(n.samples), function (idx) {
+ do.call(density, c(list(data[idx,]), density.params))
+ })
+ }
+ data
+}
+
+plot.densities <- function(..., along=2, density.params=list(),
+ plot.params=list(), legend=NULL,
+ na.rm=TRUE, main="Densities") {
+ # Plot the densities from many examples into one plot.
+ # Returns the densities used as a list.
+ #
+ # WARNING: If you don't explicitly pass in the `along` function, but rather
+ # call `plot.densiteis(data, 'cols')`, this gets hosed
+ #
+ # ... : The data to plot the densities for, this can be:
+ # : - items, to, create, densities, from
+ # : - a list of densities
+ # : - a list of numbers
+ # : - a Nx2 matrix of data, where each row/column is a
+ # : separate set of observations. Specify which dimension
+ # : to use by `along`
+ # along : The dimension that holds the data for 1 example.
+ # : 1 is rows, 2 is columns
+ # density.params : list of parameters to pass to the `density` function, if we
+ # : have to create densites on the fly. Popular values would be:
+ # : density.params=list(n=512, kernel='gaussian', na.rm=TRUE)
+ # : Note that passing nothing here will simply use the default
+ # : args
+ # plot.params : like density.params, but for the plot function, you might do
+ # : plot.params=list(col=rainbow(20), main='Some Title')
+ # : If no `col` is passed, it is autogenerated
+ data <- list(...)
+ if (all(sapply(data, is, 'density'))) {
+ data <- data
+ } else if (is.list(data[[1]]) && all(sapply(data[[1]], is, 'density'))) {
+ data <- data[[1]]
+ } else {
+ along <- match.dim(along)
+ data <- create.densities(..., along=along, na.rm=na.rm,
+ density.params=density.params)
+ }
+ stopifnot(is.list(data))
+ keep <- sapply(data, is, 'density')
+ if (!any(keep)) {
+ stop("All items passed in created empty densities")
+ }
+
+ data <- data[keep]
+
+ if (is.character(legend) && length(legend) != length(keep)) {
+ warning("Number of items in legend is different than number of data points")
+ legend <- NULL
+ } else {
+ legend <- legend[keep]
+ }
+
+ col <- rainbow(sum(keep))
+ if (is.character(plot.params$col)) {
+ if (length(col) != length(keep)) {
+ warning("Colors do not match data length, using rainbow")
+ } else {
+ col <- plot.params$col[keep]
+ }
+ }
+ plot.params$col <- col
+
+ if (is.null(plot.params$ylim)) {
+ y.max <- max(unlist(lapply(data, function (d) max(d$y))))
+ plot.params$ylim <- c(0, y.max)
+ }
+ if (is.null(plot.params$xlim)) {
+ x.min <- min(unlist(lapply(data, function (d) min(d$x))))
+ x.max <- max(unlist(lapply(data, function (d) max(d$x))))
+ plot.params$xlim <- c(x.min, x.max)
+ }
+ if (is.null(plot.params$main)) {
+ plot.params$main <- main
+ }
+ if (is.null(plot.params$ylab)) {
+ plot.params$ylab <- "Density"
+ }
+ if (is.null(plot.params$lwd)) {
+ plot.params$lwd <- 1.5
+ }
+
+ do.call(plot, c(list(data[[1]]), plot.params))
+ if (length(data) > 1) {
+ for (idx in 2:length(data)) {
+ rm.params <- c('main', 'xlab', 'ylab')
+ line.params <- plot.params[!names(plot.params) %in% rm.params]
+ line.params$col <- plot.params$col[idx]
+ do.call(lines, c(list(data[[idx]]), line.params))
+ }
+ }
+
+ if (is.character(legend)) {
+ legend('topright', legend=legend, text.col=col)
+ }
+
+ invisible(data)
+}
View
118 inst/demos/predict-NOVA-HITS.R
@@ -0,0 +1,118 @@
+library(BSgenome.Mmusculus.UCSC.mm9)
+library(GenomicRanges)
+library(data.table)
+library(ggplot2)
+library(shikken)
+
+################################################################################
+## Local setup
+if (FALSE) {
+ ag.mm9 <- readRDS('~/cBio/projects/TagSeq/inst/extdata/annotated.genome.mm9.rds')
+ ag.dm3 <- readRDS('/Users/stavros/cBio/projects/GenomicCache/GenomicCache.dm3.ensGene/cache/annotated.chromosomes/annotated.collapse-cover.up-500.down-5000.cds-cover-max.rds')
+
+ load('/Users/stavros/cBio/bioc/BiocSeqSVM/data/Dmel.DEXSeq.exons.rda')
+ load('/Users/stavros/cBio/bioc/BiocSeqSVM/data/NOVA.mm9.rda')
+ load('/Users/stavros/cBio/bioc/BiocSeqSVM/data/pasillaData.rda')
+
+ dex <- read.table('/Users/stavros/cBio/bioc/BiocSeqSVM/data/ps.diff-exons.txt',
+ stringsAsFactors=FALSE, header=TRUE, sep="\t")
+ library(GenomicCache)
+ library(SeqTools)
+ gcm <- GenomicCache("/Users/stavros/cBio/projects/GenomicCache/GenomicCache.mm9.knownGene")
+ gcd <- GenomicCache('/Users/stavros/cBio/projects/GenomicCache/GenomicCache.dm3.ensGene')
+ n1 <- GFGene("Nova1", gcm)
+ n2 <- GFGene("Nova2", gcm)
+
+ dl('seqstore')
+
+ bams <- c(wt1='/Users/stavros/cBio/bioc/data/pasilla/untreated-1/tophat_out/accepted_hits.bam',
+ wt2='/Users/stavros/cBio/bioc/data/pasilla/untreated-4/tophat_out/accepted_hits.bam',
+ ps1='/Users/stavros/cBio/bioc/data/pasilla/ps-si-1/tophat_out/accepted_hits.bam')
+ bams <- as.list(bams)
+ bams <- lapply(bams, BamFile)
+
+ ## Fly
+ ## http://www-huber.embl.de/pub/DEXSeq/psfb/testForDEU.html
+ tenm <- GFGene("Ten-m", gcd) ## exon 7: ~ 22362702
+ tranges <- ag.mm9[values(ag.mm9)$symbol == "Ten-m"]
+ br <- GFGene("br", gcd) ## exon 12 1537033 chrX [1531317, 1531367]
+}
+
+################################################################################
+## AMI setup
+if (FALSE) {
+ ag.mm9 <- readRDS('/home/steve/ml-data/annotated.genome.mm9.rds')
+ ag.dm3 <- readRDS('/home/steve/ml-data/annotated.genome.dm3.rds')
+ base <- 'http://cbio.mskcc.org/~lianos/files/bioc2012'
+ fetch <- c('Dmel.DEXSeq.exons.rda', 'NOVA.mm9.rda', 'pasillaData.rda')
+ for (get in fetch) {
+ download.file(paste(base, get, sep="/"), get)
+ load(get)
+ }
+
+ dex <- system.file("data", "ps.diff-exons.txt", package="BiocSeqSVM")
+}
+
+################################################################################
+## Use spectrum kernel to learn preferred binding landscape of NOVA in mousr
+head(nova.peaks)
+dt.mm9 <- as(ag.mm9, 'data.table')
+dt.nova <- as(nova.peaks, 'data.table')
+
+## Where does NOVA like to bind on the genome?
+nova.summary <- dt.nova[, list(score=sum(score)), by='exon.anno']
+
+gg.angle <- opts(axis.text.x=theme_text(angle=-45, hjust=0, vjust=1))
+g <- ggplot(nova.summary, aes(x=exon.anno, y=score, fill=exon.anno)) +
+ geom_bar(stat="identity") + theme_bw() + gg.angle +
+ opts(title="NOVA binding site regions in mm9")
+print(g)
+
+## Let's remove intergnic hits
+dt.nova <- subset(dt.nova, exon.anno != 'intergenic')
+dt.mm9 <- subset(dt.mm9, entrez.id %in% dt.nova$entrez.id)
+
+cbound <- subset(dt.nova, exon.anno == "cds")
+cbound <- cbound[order(cbound$score, decreasing=TRUE),]
+
+ibound <- subset(dt.nova, exon.anno == "intron")
+ibound <- ibound[order(ibound$score, decreasing=TRUE),]
+summary(ibound$width[1:500])
+summary(ibound$score[1:500])
+
+ubound <- subset(dt.nova, exon.anno == "utr3")
+ubound <- ubound[order(ubound$score, decreasing=TRUE),]
+summary(ubound$score[1:500])
+
+## I know I should be using ggplot2, but ...
+## Does binding in intron vs 3'utr look different?
+plot.densities(trim.data(cbound$width, 0.05),
+ trim.data(ibound$width, 0.05),
+ trim.data(ubound$width, 0.05),
+ legend=c('cds', 'intron', 'utr3'),
+ main="Binding widths")
+
+## How about their scores?
+plot.densities(trim.data(cbound$score, 0.05),
+ trim.data(ibound$score, 0.05),
+ trim.data(ubound$score, 0.05),
+ legend=c('cds', 'intron', 'utr3'),
+ main="Score distro")
+
+ipos <- subset(ibound, width > 20 & width < 100 & score > 10)
+cpos <- subset(cbound, width > 20 & width < 100 & score > 10)
+pos <- as(rbind(ipos, cpos), "GRanges")
+posdna <- getSeq(Mmusculus, pos)
+
+gexpr <- dt.nova$entrez.id
+ineg <-
+################################################################################
+## dm3
+dt.dm3 <- as(ag.dm3, 'data.table')
+flybase <- mget(ifelse(is.na(dt.dm3$entrez.id), 'XXX', dt.dm3$entrez.id),
+ org.Dm.egFLYBASE, ifnotfound=NA)
+dt.dm3$fb <- sapply(flybase, '[[', 1L)
+
+
+
+
Please sign in to comment.
Something went wrong with that request. Please try again.