-
Notifications
You must be signed in to change notification settings - Fork 15
/
extendedMapSeqlevels.R
253 lines (237 loc) · 10.1 KB
/
extendedMapSeqlevels.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
#' Change naming style for a set of sequence names
#'
#' If available, use the information from GenomeInfoDb for your species of
#' interest to map the sequence names from the style currently used to another
#' valid style. For example, for Homo sapiens map '2' (NCBI style) to 'chr2'
#' (UCSC style). If the information from GenomeInfoDb is not available, the
#' original sequence names will be returned. To disable this functionality
#' specify set `chrsStyle` to `NULL`.
#'
#' @param seqnames A character vector with the sequence names.
#' @param style A single character vector specifying the naming style to use
#' for renaming the sequence names.
#' @param species A single character vector with the species of interest: it has
#' to match the valid species names supported in GenomeInfoDb. See
#' `names(GenomeInfoDb::genomeStyles())`. If `species = NULL`,
#' a guess will be made using the available information in GenomeInfoDb.
#' @param currentStyle A single character vector with the currently used
#' naming style. If `NULL`, a guess will be made from the naming styles
#' supported by `species`.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{ If `TRUE` basic status updates will be printed along
#' the way.}
#' \item{chrsStyle }{ The naming style of the chromosomes. By default,
#' `UCSC`. See [seqlevelsStyle][GenomeInfoDb::seqlevelsStyle]. Set to `NULL` to
#' disable this function. This is used when `style` is missing, which is
#' normally the case when `extendedMapSeqlevels` is called by other
#' functions.}
#' }
#'
#' @return A vector of sequence names using the specified naming `style`.
#'
#' @author L. Collado-Torres
#' @export
#'
#' @details This function is inspired from [mapSeqlevels][GenomeInfoDb::seqlevelsStyle]
#' with the difference that it will return the original sequence names if
#' the species, current naming style or target naming style are not supported
#' in GenomeInfoDb.
#'
#' If you want to disable this function, set `chrsStyle` to `NULL`.
#' From other functions in derfinder that pass the
#' `...` argument to this function, use `chrsStyle = NULL`.
#' This can be useful when working
#' with organisms that are absent from GenomeInfoDb as documented in
#' <https://support.bioconductor.org/p/95521/>.
#'
#' @examples
#'
#'
#' ## Without guessing any information
#' extendedMapSeqlevels("2", "UCSC", "Homo sapiens", "NCBI")
#'
#' ## Guessing the current naming style
#' extendedMapSeqlevels("2", "UCSC", "Homo sapiens")
#'
#' ## Guess species and current style
#' extendedMapSeqlevels("2", "NCBI")
#'
#' ## Guess species while supplying the current style.
#' ## Probably an uncommon use-case
#' extendedMapSeqlevels("2", "NCBI", currentStyle = "TAIR10")
#'
#' ## Sequence names are unchanged when using an unsupported species
#' extendedMapSeqlevels("seq2", "NCBI", "toyOrganism")
#'
#' ## Disable extendedMapSeqlevels. This can be useful when working with
#' ## organisms that are not supported by GenomeInfoDb
#' chrs <- c(
#' "I", "II", "III", "IV", "IX", "V", "VI", "VII", "VIII", "X",
#' "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII"
#' )
#' extendedMapSeqlevels(chrs, chrsStyle = NULL)
#' \dontrun{
#' ## Set global species and style option
#' options("chrsStyle" = "UCSC")
#' options("species" = "homo_sapiens")
#'
#' ## Run using global options
#' extendedMapSeqlevels("2")
#' }
#'
extendedMapSeqlevels <- function(
seqnames, style = getOption(
"chrsStyle",
"UCSC"
), species = getOption("species", "homo_sapiens"),
currentStyle = NULL, ...) {
# @param verbose If \code{TRUE} basic status updates will be printed along the
# way.
verbose <- .advanced_argument("verbose", TRUE, ...)
if (missing(style)) {
# @param chrsStyle The naming style of the chromosomes. By default, UCSC. See
# \link[GenomeInfoDb]{seqlevelsStyle}.
style <- .advanced_argument("chrsStyle", "UCSC", ...)
}
if (is.null(style)) {
## If style is NULL return seqnames un-changed
return(seqnames)
}
## Check inputs
if (!is.character(seqnames)) {
seqnames <- as.character(seqnames)
}
stopifnot(is.character(style) & length(style) == 1)
if (!is.null(species)) stopifnot(is.character(species) & length(species) == 1)
if (!is.null(currentStyle)) stopifnot(is.character(currentStyle) & length(currentStyle) == 1)
## Was the species and/or the currentStyle guessed?
guessedSpecies <- is.null(species)
guessedCurrent <- is.null(currentStyle)
guessed <- guessedSpecies | guessedCurrent
if (guessed) {
guesses <- GenomeInfoDb:::.guessSpeciesStyle(seqnames)
if (any(is.na(guesses))) {
if (verbose) {
message("extendedMapSeqlevels: the 'seqnames' you supplied are currently not supported in GenomeInfoDb. Consider adding your genome by following the information at http://www.bioconductor.org/packages/release/bioc/vignettes/GenomeInfoDb/inst/doc/Accept-organism-for-GenomeInfoDb.pdf")
}
return(seqnames)
}
}
## Extract valid mappings
supported <- GenomeInfoDb:::.supportedSeqnameMappings()
names(supported) <- tolower(names(supported))
## Guess species and/or currentStyle
if (guessedSpecies) {
if (guessedCurrent) {
## Guess current style
best <- .selectBestGuess(
seqnames, guesses$species, guesses$style,
supported
)
currentStyle <- best$style
} else {
## Check current style
idx <- grep(currentStyle, guesses$style)
if (length(idx) == 0) {
best$species <- NA
} else {
best < .selectBestGuess(
seqnames, guesses$species[idx],
guesses$style[idx], supported
)
}
}
species <- best$species
## Was able to guess the current species?
if (is.na(species)) {
if (verbose) {
message("extendedMapSeqlevels: the species could not be guessed. Consider supplying 'species'.")
}
return(seqnames)
}
} else {
## Format species name
species <- tolower(gsub(" ", "_", species))
## Check species is supported
if (length(which(names(supported) == species)) != 1) {
if (verbose) {
message(paste("extendedMapSeqlevels: the 'species'", species, "is currently not supported in GenomeInfoDb. Check valid 'species' by running names(GenomeInfoDb::genomeStyles()). If it's not present, consider adding your genome by following the information at http://www.bioconductor.org/packages/release/bioc/vignettes/GenomeInfoDb/inst/doc/Accept-organism-for-GenomeInfoDb.pdf"))
}
return(seqnames)
}
if (guessedCurrent) {
## Guess current style
idx <- which(tolower(gsub(" ", "_", guesses$species)) == species)
if (length(idx) == 0) {
currentStyle <- NA
} else {
currentStyle <- .selectBestGuess(
seqnames, guesses$species[idx],
guesses$style[idx], supported
)$style
}
}
}
## Was able to guesss the current style?
if (is.na(currentStyle) & guessedCurrent) {
if (verbose) {
message("extendedMapSeqlevels: the current naming style could not be guessed. Consider supplying 'currentStyle'.")
}
return(seqnames)
}
## Valid mapping for the species
mapping <- supported[[which(names(supported) == species)]]
## Check style is supported
j <- grep(tolower(style), tolower(colnames(mapping)))
if (length(j) != 1) {
if (verbose) {
message(paste("extendedMapSeqlevels: the 'style'", style, "is currently not supported for the 'species'", species, "in GenomeInfoDb. Check valid naming styles by running GenomeInfoDb::genomeStyles(species). If it's not present, consider adding your genome by following the information at http://www.bioconductor.org/packages/release/bioc/vignettes/GenomeInfoDb/inst/doc/Accept-organism-for-GenomeInfoDb.pdf"))
}
return(seqnames)
}
## Is the target style the same as the current style?
if (tolower(style) == tolower(currentStyle)) {
return(seqnames)
}
## Find current naming map
k <- grep(tolower(currentStyle), tolower(colnames(mapping)))
## Check currentStyle if it was supplied
if (!guessedCurrent) {
if (length(k) != 1) {
if (verbose) {
message(paste("extendedMapSeqlevels: the 'currentStyle'", currentStyle, "is currently not supported for the 'species'", species, "in GenomeInfoDb. Check valid naming styles by running GenomeInfoDb::genomeStyles(species). If it's not present, consider adding your genome by following the information at http://www.bioconductor.org/packages/release/bioc/vignettes/GenomeInfoDb/inst/doc/Accept-organism-for-GenomeInfoDb.pdf"))
}
return(seqnames)
}
}
## Make the map
seqnames <- mapping[match(seqnames, mapping[, k]), j]
if (verbose & guessed) {
message(paste("extendedMapSeqlevels: sequence names mapped from", currentStyle, "to", style, "for species", species))
}
return(seqnames)
}
.selectBestGuess <- function(seqnames, organisms, styles, supported) {
organisms <- tolower(gsub(" ", "_", organisms))
seqnames <- tolower(seqnames)
if (length(organisms) == 1 & length(styles) == 1) {
res <- list("species" = organisms, "style" = styles)
} else {
scores <- mapply(function(org, sty) {
i <- which(names(supported) == org)
if (length(i) != 1) {
return(0)
}
k <- grep(sty, colnames(supported[[i]]))
sum(seqnames %in% tolower(supported[[i]][, k[1]]))
}, organisms, styles)
if (max(scores) == 0) {
res <- list("species" = NA, "style" = NA)
} else {
res <- list("species" = organisms[which.max(scores)], "style" = styles[which.max(scores)])
}
}
return(res)
}