-
Notifications
You must be signed in to change notification settings - Fork 1
/
get-msigdb.R
187 lines (170 loc) · 7.42 KB
/
get-msigdb.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#' Fetches gene set collections from the moleular signature database (MSigDB)
#'
#' This provides versioned genesets from gene set collections defined in
#' [MSigDB](http://software.broadinstitute.org/gsea/msigdb). Collections can
#' be retrieved by their collection name, ie `c("H", "C2", "C7")`.
#'
#' @section Species and Identifier types:
#' This function utilizes the functionality from the `{msigdbr}` and
#' `{babelgene}` packages to retrieve gene set definitions from a variety of
#' organisms and identifier types.
#'
#' @section KEGG Gene Sets:
#' Due to the licensing restrictions over the KEGG collections, they are not
#' returned from this function unless they are explicitly asked for. You can
#' ask for them through this function by either (i) querying for the `"c2"`
#' collection while setting `with.kegg = TRUE`; or (ii) explicitly calling with
#' `collection = "kegg"`.
#'
#' @section Citing the Molecular Signatures Database:
#' To cite your use of the Molecular Signatures Database (MSigDB), please
#' reference Subramanian, Tamayo, et al. (2005, PNAS 102, 15545-15550) and one
#' or more of the following as appropriate:
#'
#' * Liberzon, et al. (2011, Bionformatics);
#' * Liberzon, et al. (2015, Cell Systems); and
#' * The source for the gene set as listed on the gene set page.
#'
#' @export
#'
#' @param collection character vector specifying the collections you want
#' (c1, c2, ..., c7, h). By default we load just the hallmark collecitons.
#' Setting this to `NULL` loads all collections. Alternative you can also
#' include named subsets of collections, like `"reactome"`. Refer to the
#' Details section for more information.
#' @param species `"human"` or `"mouse"`? Really, this is anything available
#' in the `alias` column of the `sparrow:::species_info()` table (except
#' cyno).
#' @param id.type do you want the feature id's used in the gene sets to be
#' `"ensembl"` (default), `"entrez"`, or `"symbol"`.
#' @param with.kegg The Broad distributes the latest versions of the KEGG
#' genesets as part of the c2 collection. These genesets come with a
#' restricted license, so by default we do not return them as part of the
#' GeneSetDb. To include the KEGG gene sets when asking for the c2
#' collection, set this flag to `TRUE`.
#' @param promote.subcategory.to.collection there are different sources of
#' genesets for a number of the collections in MSigDB. These are included
#' in the `gs_subcat` column of `geneSets(this)`. When this is set to `TRUE`,
#' the collection column for the genesets is appended with the subcatory.
#' So, instead of having a massive `"C2"` collection, you'll have bunch of
#' collections like `"C2_CGP"`, `"C2_CP:BIOCARTA"`, etc.
#' @param prefix.collection When `TRUE` (default: `FALSE`), the `"C1"`, `"C2"`,
#' etc. is prefixed with `"MSigDB_*"`
#' @param ... pass through parameters
#' @return a `BiocSet` of the MSigDB collections
#' @examples
#' \donttest{
#' # these take a while to load initially, so put them in dontrun blocks.
#' # you should run these interactively to understand what they return
#' bcs <- getMSigCollection("h", "human", "entrez")
#' bcs.h.entrez <- getMSigCollection(c("h", "c2"), "human", "entrez")
#' bcs.h.ens <- getMSigCollection(c("h", "c2"), "human", "ensembl")
#' bcs.m.entrez <- getMSigCollection(c("h", "c2"), "mouse", "entrez")
#'
#' gdb <- getMSigGeneSetDb("h", "human", "entrez")
#' }
getMSigCollection <- function(collection = NULL,
species = "human",
id.type = c("ensembl", "entrez", "symbol"),
with.kegg = FALSE,
promote.subcategory.to.collection = FALSE,
prefix.collection = FALSE, ...) {
id.type <- match.arg(id.type)
out <- getMSigGeneSetDb(
collection, species, id.type, with.kegg,
promote.subcategory.to.collection, prefix.collection, ...)
as(out, "BiocSet")
}
#' @describeIn getMSigCollection retrieval method for a GeneSetDb container
#' @export
getMSigGeneSetDb <- function(collection = NULL,
species = "human",
id.type = c("ensembl", "entrez", "symbol"),
with.kegg = FALSE,
promote.subcategory.to.collection = FALSE,
prefix.collection = FALSE, ...) {
if (!requireNamespace("msigdbr", quietly = TRUE)) {
stop("The msigdbr package is required for this functionality")
}
id.type <- match.arg(id.type)
species.info <- species_info(species)
valid.cols <- c("H", paste0("C", 1:8))
if (species.info$alias != "human") valid.cols <- setdiff(valid.cols, "C1")
if (!is.null(collection)) {
collection <- assert_subset(toupper(collection), valid.cols)
}
# handle non std eval NOTE in R CMD check when using `:=` mojo
# each of these variables are referenced in some data.table NSE mojo below
gs_cat <- gs_subcat <- gs_name <- symbol <- ensembl_id <- gs_id <- NULL
sigs.all <- data.table::copy(.pkgcache[["msigdb"]][[species.info$species]])
if (is.null(sigs.all)) {
sigs.all <- msigdbr::msigdbr(species.info$species)
sigs.all <- as.data.table(sigs.all)
axe.cols <- c("gs_pmid", "gs_geoid", "gs_url",
"gs_description", "species_name", "species_common_name",
"ortholog_sources", "num_ortholog_sources")
axe.cols <- intersect(axe.cols, colnames(sigs.all))
for (axe in axe.cols) sigs.all[, (axe) := NULL]
setkeyv(sigs.all, c("gs_cat", "gs_name"))
.pkgcache[["msigdb"]][[species.info$species]] <- data.table::copy(sigs.all)
}
if (!is.null(collection)) {
out <- sigs.all[gs_cat %in% collection]
} else {
out <- sigs.all
}
if (!with.kegg) {
out <- out[gs_subcat != "CP:KEGG"]
}
if (prefix.collection) {
out[, collection := paste0("MSigDB_", out$gs_cat)]
} else {
out[, collection := gs_cat]
}
if (promote.subcategory.to.collection) {
out[, collection := {
ifelse(nchar(out$gs_subcat) == 0L,
out$collection,
paste(out$collection, out$gs_subcat, sep = "_"))
}]
}
if (id.type == "ensembl") {
idtype <- GSEABase::ENSEMBLIdentifier()
idcol <- "ensembl_gene"
} else if (id.type == "entrez") {
idtype <- GSEABase::EntrezIdentifier()
idcol <- "entrez_gene"
} else {
idtype <- GSEABase::SymbolIdentifier()
idcol <- "gene_symbol"
}
ret <- out[, {
list(collection, name = gs_name, feature_id = as.character(.SD[[idcol]]),
subcategory = gs_subcat)
}, .SDcols = c("collection", "gs_name", idcol)]
if (id.type != "symbol") {
ret[, symbol := out[["gene_symbol"]]]
} else {
ret[, ensembl_id := out[["ensembl_gene"]]]
}
ret[, gs_id := {
ifelse(grepl("GO:", out$gs_exact_source), out$gs_exact_source, out$gs_id)
}]
ret <- ret[!is.na(feature_id)]
ret <- unique(ret, by = c("collection", "name", "feature_id"))
gdb <- GeneSetDb(ret)
for (col in unique(geneSets(gdb)$collection)) {
geneSetCollectionURLfunction(gdb, col) <- ".geneSetURL.msigdb"
featureIdType(gdb, col) <- idtype
gdb <- addCollectionMetadata(
gdb, col, 'source', as.character(packageVersion("msigdbr")))
}
# org(gdb) <- gsub(" ", "_", species.info[["species"]])
gdb@collectionMetadata <- gdb@collectionMetadata[name != "count"]
gdb
}
#' @noRd
.geneSetURL.msigdb <- function(collection, name, ...) {
url <- "http://www.broadinstitute.org/gsea/msigdb/cards/%s.html"
sprintf(url, name)
}