Merge remote-tracking branch 'upstream/master'

katwre · Aug 13, 2015 · 7f0cb2c · 7f0cb2c
2 parents 2a7857d + 53e94b2
commit 7f0cb2c
Show file tree

Hide file tree

Showing 6 changed files with 76 additions and 34 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: genomation
 Type: Package
 Title: Summary, annotation and visualization of genomic data
-Version: 1.1.11
+Version: 1.1.12
 Author: Altuna Akalin [aut, cre], Vedran Franke [aut, cre], Katarzyna Wreczycka [aut], Liz Ing-Simmons [ctb]	
 Maintainer: Altuna Akalin <aakalin@gmail.com>, Vedran Franke <vedran.franke@gmail.com>
 Description: A package for summary and annotation of genomic intervals. Users can visualize and 
@@ -20,30 +20,30 @@ BugReports: https://github.com/BIMSBbioinfo/genomation/issues
 Depends:
     R (>= 3.0.0),grid
 Imports:
-	GenomicRanges,
+	data.table,
+  GenomicRanges,
 	GenomicAlignments,
-	IRanges,
-	Rsamtools,
-	readr,
-  data.table,
-	plyr,
-	reshape2,
-	ggplot2,
-	methods,
-	rtracklayer,
+  ggplot2,
 	gridBase,
-	impute,
+  impute,
+  IRanges,
+  matrixStats,
+	methods,
   parallel,
   plotrix,
-  matrixStats
+  plyr,
+  readr,
+  reshape2,
+  Rsamtools,
+  rtracklayer
 Suggests:
-  	RUnit,
-	knitr,
-	RColorBrewer,
-	genomationData,
   	BiocGenerics,
-  	rmarkdown,
-  	knitrBootstrap
+    genomationData,
+    knitr,
+    knitrBootstrap,
+    RColorBrewer,
+    rmarkdown,
+    RUnit  
 Collate:
     'documentData.R'
     'genomation-classes.R'
@@ -56,4 +56,4 @@ Collate:
     'scoreMatrix.R'
     'scoreMatrixBin.R'
     'scoreMatrixList.R'
-    'test_genomation_package.R'
+    'test_genomation_package.R'
diff --git a/NEWS b/NEWS
@@ -1,3 +1,10 @@
+genomation 1.1.12
+--------------
+
+IMPROVEMENTS AND BUG FIXES
+
+* gffToGRanges parses column 9 of the gff file correctly; added ensembl=TRUE to prepend chr to seqlevels
+
 genomation 1.1.11
 --------------
 

diff --git a/R/readData.R b/R/readData.R
@@ -466,12 +466,15 @@ setMethod("readTranscriptFeatures",
 #'                 The file can end in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip}
 #'                 and/or start with \code{http://} or \code{ftp://}. If the file is not compressed
 #'                 it can also start with \code{https://} or \code{ftps://}.
-#' @param track.line the number of track lines to skip, "auto" to detect them automatically
-#'                   or FALSE(default) if the bed file doesn't have track lines
+#' @param track.line Can be an integer specifying the number of track lines to skip, 
+#'                  "auto" to detect the header lines automatically
+#'                   or FALSE(default) if the bed file doesn't have track lines.
+#'                   "auto" detects both UCSC header lines and lines starting with #
 #' @param split.group boolean, whether to split the 9th column of the file
 #' @param split.char character that is used as a separator of the 9th column. ';' by default
 #' @param filter a character designating which elements to retain from the gff file (e.g. exon, CDS, ...)
 #' @param zero.based \code{boolean} whether the coordinates are 0 or 1 based. 0 is the default
+#' @param ensembl \code{boolean} if TRUE, add the chr prefix to seqlevels. FALSE by default
 #' @return returns a \code{GenomicRanges} object
 #' 
 #' @examples
@@ -482,7 +485,7 @@ setMethod("readTranscriptFeatures",
 #' @docType methods
 #' @export
 gffToGRanges = function(gff.file, track.line=FALSE, split.group=FALSE, split.char=';',filter=NULL, 
-                        zero.based=FALSE){
+                        zero.based=FALSE, ensembl=FALSE){
 
   gff = readGeneric(gff.file, 
                     chr=1,
@@ -500,15 +503,18 @@ gffToGRanges = function(gff.file, track.line=FALSE, split.group=FALSE, split.cha
   if(split.group){
     message('splitting the group.column...')
     group = strsplit(gff$group, '\\s+')
-    gnames = group[[1]][seq(1,length(group[[1]]),2)]
-    gids = seq(2,length(group[[1]]),2)
-    group = data.frame(do.call(rbind, 
-                              lapply(group, 
-                                     function(x)gsub(split.char,'',x[gids]))), 
-                       stringsAsFactors=FALSE)
-    colnames(group) = gnames
+    group = lapply(group, function(x){
+                              vals = x[seq(2,length(x),2)]
+                              vals = sub(split.char, '', vals)
+                              vals = sub('^"', '', vals)
+                              vals = sub('"$', '', vals)
+                              d = data.table(t(vals))
+                              data.table::setnames(d, x[seq(1,length(x),2)])
+                              d
+    })
+    group = data.table::rbindlist(group, fill=TRUE)
     gff$group = NULL
-    values(gff) = cbind(values(gff), group)
+    values(gff) = cbind(values(gff), as.data.frame(group))
   }
 
   if(!is.null(filter)){        
@@ -520,5 +526,8 @@ gffToGRanges = function(gff.file, track.line=FALSE, split.group=FALSE, split.cha
     }
   }
 
+  if(ensembl)
+    seqlevels(gff) = paste('chr',seqlevels(gff),sep='')
+
   return(gff)
 }
diff --git a/inst/unitTests/test.gtf b/inst/unitTests/test.gtf
@@ -0,0 +1,8 @@
+#!genome-build GRCh37.p13
+#!genome-version GRCh37
+#!genome-date 2009-02
+#!genome-build-accession NCBI:GCA_000001405.14
+#!genebuild-last-updated 2013-09
+1	pseudogene	gene	11869	14412	.	+	.	gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";
+1	processed_transcript	transcript	11869	14409	.	+	.	gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";
+1	processed_transcript	exon	11869	12227	.	+	.	gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "1"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana"; exon_id "ENSE00002234944";
diff --git a/inst/unitTests/test_readData.R b/inst/unitTests/test_readData.R
@@ -118,4 +118,18 @@ test_readGeneric = function()
   checkIdentical(g8, r10)
 }
 
+test_gffToGRanges = function()
+{
+  library(GenomicRanges)
+  tab.test = system.file('unitTests/test.gtf', package='genomation')
+  gff1 = gffToGRanges(tab.test, track.line='auto')
+  checkIdentical(length(gff1), 3L)
+  checkIdentical(ncol(values(gff1)), 5L)
+
+  gff2 = gffToGRanges(tab.test, track.line='auto', split.group=TRUE)
+  checkIdentical(ncol(values(gff2)), 13L)
+
+  gff3 = gffToGRanges(tab.test, track.line='auto',ensembl=TRUE)
+  checkIdentical(as.character(seqlevels(gff3)), 'chr1')
+}
 
diff --git a/man/gffToGRanges.Rd b/man/gffToGRanges.Rd
@@ -7,16 +7,18 @@
 The GenomicRanges object needs to be properly formated for the function to work.}
 \usage{
 gffToGRanges(gff.file, track.line = FALSE, split.group = FALSE,
-  split.char = ";", filter = NULL, zero.based = FALSE)
+  split.char = ";", filter = NULL, zero.based = FALSE, ensembl = FALSE)
 }
 \arguments{
 \item{gff.file}{path to a gff formatted file.
 The file can end in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip}
 and/or start with \code{http://} or \code{ftp://}. If the file is not compressed
 it can also start with \code{https://} or \code{ftps://}.}
 
-\item{track.line}{the number of track lines to skip, "auto" to detect them automatically
-or FALSE(default) if the bed file doesn't have track lines}
+\item{track.line}{Can be an integer specifying the number of track lines to skip,
+"auto" to detect the header lines automatically
+ or FALSE(default) if the bed file doesn't have track lines.
+ "auto" detects both UCSC header lines and lines starting with #}
 
 \item{split.group}{boolean, whether to split the 9th column of the file}
 
@@ -25,6 +27,8 @@ or FALSE(default) if the bed file doesn't have track lines}
 \item{filter}{a character designating which elements to retain from the gff file (e.g. exon, CDS, ...)}
 
 \item{zero.based}{\code{boolean} whether the coordinates are 0 or 1 based. 0 is the default}
+
+\item{ensembl}{\code{boolean} if TRUE, add the chr prefix to seqlevels. FALSE by default}
 }
 \value{
 returns a \code{GenomicRanges} object