Skip to content

Commit

Permalink
v1.11.5 -- also use retries for geo_info()
Browse files Browse the repository at this point in the history
  • Loading branch information
lcolladotor committed Jun 10, 2019
1 parent 37ab3f3 commit 8da982b
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 29 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: recount
Title: Explore and download data from the recount project
Version: 1.11.4
Version: 1.11.5
Date: 2019-06-10
Authors@R: c(person("Leonardo", "Collado-Torres", role = c("aut", "cre"),
email = "lcolladotor@gmail.com", comment = c(ORCID = "0000-0003-2140-308X")),
Expand Down
64 changes: 36 additions & 28 deletions R/geo_info.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
#' This function uses GEOquery to extract information for a given sample. The
#' GEO accession ids for the sample can be found in the study phenotype table.
#'
#' @return Returns a \link[S4Vectors]{DataFrame-class} with the information
#' @return Returns a \link[S4Vectors]{DataFrame-class} with the information
#' from GEO available for the given sample.
#'
#' @param geoid A character vector of length 1 with the GEO accession id for
#' a given sample.
#' @param verbose If \code{TRUE} the \code{geoid} will be shown.
#' @param verbose If \code{TRUE} the \code{geoid} will be shown.
#' @param sleep The number of seconds (or fraction) to wait before downloading
#' data using \link[GEOquery]{getGEO}. This is important if you are looking over
#' \code{geo_info()} given the constraints published at
Expand All @@ -30,41 +30,49 @@
geo_info <- function(geoid, verbose = FALSE, sleep = 1/2, getGPL = FALSE,
destdir = tempdir(), ...) {
if(is.na(geoid)) return(NULL)

## Check inputs
stopifnot(is.character(geoid) & length(geoid) == 1)

if(verbose) message(paste(Sys.time(),
'finding GEO information for GEO accession id', geoid))

if(!file.exists(file.path(destdir, paste0(geoid, '.soft')))) {
Sys.sleep(sleep)
}

## Get data from GEO
geo <- tryCatch(
GEOquery::getGEO(geoid, getGPL = getGPL, destdir = destdir, ...),
error = function(e) {
soft <- paste0(geoid, '.soft')
soft_file <- file.path(destdir, soft)
if(any(grepl('private', readLines(soft_file)))) {
message(paste(geoid, 'is currently private'))
return(NA)
} else if (any(grepl('blocked', readLines(soft_file)))) {
warning(paste('It seems like your IP access is blocked. Please check the file', soft_file, 'for more details.'))
return(NA)
} else {
stop(e)

## Get data from GEO, with 3 retries, waiting between 0 and 2 seconds in
## between retries
N.TRIES <- 3L
while(N.TRIES > 0L) {
geo <- tryCatch(
GEOquery::getGEO(geoid, getGPL = getGPL, destdir = destdir, ...),
error = function(e) {
soft <- paste0(geoid, '.soft')
soft_file <- file.path(destdir, soft)
if(any(grepl('private', readLines(soft_file)))) {
message(paste(geoid, 'is currently private'))
return(NA)
} else if (any(grepl('blocked', readLines(soft_file)))) {
warning(paste('It seems like your IP access is blocked. Please check the file', soft_file, 'for more details.'))
return(NA)
} else {
return(e)
}
}
}
)

)
if(!inherits(geo, 'error'))
break
Sys.sleep(runif(n = 1, min = 0, max = 2))
N.TRIES <- N.TRIES - 1L
}

## Return and empty DataFrame if there was an issue with getGEO()
if(!is(geo, 'GSM')) return(S4Vectors::DataFrame())

## Extract the header information
result <- geo@header

## Function for cleaning
clean_geo <- function(pattern, varname, res) {
charIndex <- grep(pattern, names(res))
Expand All @@ -76,7 +84,7 @@ geo_info <- function(geoid, verbose = FALSE, sleep = 1/2, getGPL = FALSE,
}
return(res)
}

## Clean up the header information
df <- data.frame(
pattern = c('characteristics_ch1', 'data_processing', 'contact_',
Expand All @@ -85,15 +93,15 @@ geo_info <- function(geoid, verbose = FALSE, sleep = 1/2, getGPL = FALSE,
varname = c('characteristics', 'data_processing', 'contact', 'extract',
'library', 'relation', 'series', 'supplementary_file'),
stringsAsFactors = FALSE
)
)
for(i in seq_len(nrow(df))) result <- clean_geo(df$pattern[i],
df$varname[i], result)

## Make sure they are all length 1
if(any(S4Vectors::elementNROWS(result) > 1)) {
for(i in which(S4Vectors::elementNROWS(result) > 1)) result[i] <- IRanges::CharacterList(unlist(unname(result[i])))
}

## Finish
return(S4Vectors::DataFrame(result))
}

0 comments on commit 8da982b

Please sign in to comment.