update to MSigDB 7.1

igordot · May 14, 2020 · 9ed2768 · 9ed2768
1 parent bdb19c8
commit 9ed2768
Show file tree

Hide file tree

Showing 11 changed files with 171 additions and 73 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: msigdbr
 Type: Package
 Title: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format
-Version: 7.0.1
+Version: 7.1.1
 Authors@R: person("Igor", "Dolgalev", email = "igor.dolgalev@nyumc.org", role = c("aut", "cre"))
 Description: Provides the 'Molecular Signatures Database' (MSigDB) gene sets
     typically used with the 'Gene Set Enrichment Analysis' (GSEA) software
@@ -27,5 +27,5 @@ Suggests:
     knitr,
     rmarkdown
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.0
 VignetteBuilder: knitr
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# msigdbr 7.1.1
+
+* Based on MSigDB v7.1 release.
+* Increased ortholog prediction stringency.
+
 # msigdbr 7.0.1
 
 * Based on MSigDB v7.0 release.

diff --git a/R/functions.R b/R/functions.R
@@ -1,70 +1,82 @@
 
 #' List the species available in the msigdbr package
 #'
-#' @return a vector of possible species
+#' @return A vector of possible species.
+#'
 #' @importFrom dplyr pull
 #' @export
+#'
+#' @examples
+#' msigdbr_show_species()
 msigdbr_show_species <- function() {
-
-  msigdbr_orthologs %>% pull(.data$species_name) %>% unique() %>% sort()
-
+  msigdbr_orthologs %>%
+    pull(.data$species_name) %>%
+    unique() %>%
+    sort()
 }
 
 #' Retrieve the msigdbr data frame
 #'
-#' @param species species name, such as Homo sapiens, Mus musculus, etc.
-#' @param category collection, such as H, C1, C2, C3, C4, C5, C6, C7.
-#' @param subcategory sub-collection, such as CGP, MIR, BP, etc.
+#' @param species Species name, such as Homo sapiens or Mus musculus. The available species can be retrieved with `msigdbr_show_species()`.
+#' @param category MSigDB collection abbreviation, such as H, C1, C2, C3, C4, C5, C6, C7.
+#' @param subcategory MSigDB sub-collection abbreviation, such as CGP or BP.
+#'
+#' @return A data frame of gene sets with one gene per row.
+#'
+#' @references \url{https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp}
 #'
-#' @return a data frame of gene sets with one gene per row
 #' @import tibble
 #' @importFrom dplyr filter inner_join arrange select
 #' @export
 #'
 #' @examples
-#' # all human gene sets
-#' m = msigdbr(species = "Homo sapiens")
+#' # get all human gene sets
+#' \donttest{msigdbr(species = "Homo sapiens")}
 #'
-#' # mouse C2 (curated) CGP (chemical and genetic perturbations) gene sets
-#' m = msigdbr(species = "Mus musculus", category = "C2", subcategory = "CGP")
+#' # get mouse C2 (curated) CGP (chemical and genetic perturbations) gene sets
+#' \donttest{msigdbr(species = "Mus musculus", category = "C2", subcategory = "CGP")}
+#'
+#' # check all the available categories and sub-categories
+#' \donttest{msigdbr() %>% dplyr::distinct(gs_cat, gs_subcat) %>% dplyr::arrange(gs_cat, gs_subcat)}
 msigdbr <- function(species = "Homo sapiens", category = NULL, subcategory = NULL) {
 
-  # filter by species
-  orthologs_subset = filter(msigdbr_orthologs, .data$species_name == species)
-
+  # confirm that only one species is specified
   if (length(species) > 1) {
     stop("please specify only one species at a time")
   }
 
+  # filter orthologs by species
+  orthologs_subset <- filter(msigdbr_orthologs, .data$species_name == species)
+
   # confirm that the species exists in the database
   if (nrow(orthologs_subset) == 0) {
     stop("species does not exist in the database: ", species)
   }
 
-  genesets_subset = msigdbr_genesets
+  genesets_subset <- msigdbr_genesets
 
   # filter by category
   if (is.character(category)) {
     if (length(category) > 1) {
       stop("please specify only one category at a time")
     }
-    genesets_subset = filter(genesets_subset, .data$gs_cat == category)
+    genesets_subset <- filter(genesets_subset, .data$gs_cat == category)
   }
 
   # filter by sub-category
   if (is.character(subcategory)) {
     if (length(subcategory) > 1) {
       stop("please specify only one subcategory at a time")
     }
-    genesets_subset = filter(genesets_subset, .data$gs_subcat == subcategory)
+    genesets_subset <- filter(genesets_subset, .data$gs_subcat == subcategory)
   }
 
+  # combine gene sets and genes
+  genesets_subset <- inner_join(genesets_subset, msigdbr_genes, by = "gs_id")
+
   # combine gene sets and orthologs
   genesets_subset %>%
     inner_join(orthologs_subset, by = "human_entrez_gene") %>%
     arrange(.data$gs_name, .data$human_gene_symbol) %>%
     select(-.data$human_entrez_gene, -.data$num_sources)
-
 }
-
-
diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/cran-comments.md b/cran-comments.md
@@ -0,0 +1,21 @@
+## Test environments
+* local R installation, R 4.0.0
+* ubuntu 16.04 (on travis-ci), R 4.0.0
+* win-builder (devel)
+
+## R CMD check results
+
+0 errors | 0 warnings | 1 note
+
+## revdepcheck results
+
+We checked 5 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package.
+
+* We saw 0 new problems
+* We failed to check 0 packages
+
+## Resubmission
+
+This is a resubmission. In this version I have:
+
+* Addressed the elapsed time notes.
diff --git a/data-raw/msigdbr-prepare.R b/data-raw/msigdbr-prepare.R
@@ -1,6 +1,7 @@
 
 library(dplyr)
 library(tidyr)
+library(purrr)
 library(readr)
 library(stringr)
 library(glue)
@@ -9,25 +10,27 @@ library(usethis)
 
 # Import MSigDB gene sets -------------------------------------------------
 
-# Download the MSigDB XML file
-msigdb_version = "7.0"
-msigdb_xml = glue("msigdb_v{msigdb_version}.xml")
-msigdb_url_base = "http://software.broadinstitute.org/gsea/msigdb/download_file.jsp?filePath=/resources/msigdb"
-msigdb_xml_url = glue("{msigdb_url_base}/{msigdb_version}/msigdb_v{msigdb_version}.xml")
-download.file(
-  url = msigdb_xml_url, destfile = msigdb_xml, quiet = TRUE,
-  method = "wget",
-  extra = "-c --header='Cookie: JSESSIONID=94C597E7CE4BCEF65ADE10782AD067AD'"
-)
+# Define MSigDB download variables
+msigdb_version = "7.1"
+msigdb_url_base = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release"
+msigdb_zip_url = glue("{msigdb_url_base}/{msigdb_version}/msigdb_v{msigdb_version}_files_to_download_locally.zip")
+msigdb_dir = glue("msigdb_v{msigdb_version}_files_to_download_locally")
+
+# Download the MSigDB zip file
+download.file(url = msigdb_zip_url, destfile = glue("{msigdb_dir}.zip"))
 
 # Check MSigDB XML file size in bytes
-utils:::format.object_size(file.size(msigdb_xml), units = "auto")
+utils:::format.object_size(file.size(glue("{msigdb_dir}.zip")), units = "auto")
+
+# Extract the MSigDB zip file
+unzip(glue("{msigdb_dir}.zip"))
 
 # Import the MSigDB XML file
-msigdb_doc = read_xml(msigdb_xml)
+msigdb_doc = read_xml(glue("{msigdb_dir}/msigdb_v{msigdb_version}.xml"))
 
-# Delete the MSigDB XML file since it is no longer needed
-file.remove(msigdb_xml)
+# Delete the MSigDB zip file and its contents since they are no longer needed
+file.remove(glue("{msigdb_dir}.zip"))
+unlink(msigdb_dir, recursive = TRUE)
 
 # Extract the attributes and convert into a tibble
 # GENESET record attributes:
@@ -50,16 +53,21 @@ msigdbr_genesets =
   ) %>%
   filter(gs_cat != "ARCHIVED")
 
+# Separate genes and gene sets
+msigdbr_genes = msigdbr_genesets %>% select(gs_id, gs_members)
+msigdbr_genesets = msigdbr_genesets %>% distinct(gs_id, gs_name, gs_cat, gs_subcat)
+msigdbr_genesets = msigdbr_genesets %>% arrange(gs_name, gs_id)
+
 # Check the number of gene sets per category
-msigdbr_genesets %>% count(gs_cat) %>% arrange(gs_cat)
+msigdbr_genesets %>% count(gs_cat, gs_subcat) %>% arrange(gs_cat)
 
 # Convert to "long" format (one gene per row)
-msigdbr_genesets =
-  msigdbr_genesets %>%
-  mutate(members = strsplit(gs_members, "\\|")) %>%
-  unnest(members) %>%
+msigdbr_genes = mutate(msigdbr_genes, gs_members_split = strsplit(gs_members, "|", fixed = TRUE))
+msigdbr_genes = unnest(msigdbr_genes, cols = gs_members_split, names_repair = "minimal")
+msigdbr_genes =
+  msigdbr_genes %>%
   separate(
-    col = members,
+    col = gs_members_split,
     into = c("source_gene", "human_gene_symbol", "human_entrez_gene"),
     sep = ","
   ) %>%
@@ -68,7 +76,7 @@ msigdbr_genesets =
 
 # Create a table of human genes based on MSigDB gene mappings
 human_tbl =
-  msigdbr_genesets %>%
+  msigdbr_genes %>%
   select(human_entrez_gene, human_gene_symbol) %>%
   distinct() %>%
   mutate(
@@ -78,21 +86,26 @@ human_tbl =
   )
 
 # Clean up
-msigdbr_genesets =
-  msigdbr_genesets %>%
-  select(gs_name, gs_id, gs_cat, gs_subcat, human_entrez_gene) %>%
-  arrange(gs_name, gs_id, human_entrez_gene) %>%
+msigdbr_genes =
+  msigdbr_genes %>%
+  select(gs_id, human_entrez_gene) %>%
+  arrange(gs_id, human_entrez_gene) %>%
   distinct()
 
 # Get a list of all MSigDB genes (Entrez IDs)
-msigdb_entrez_genes = msigdbr_genesets %>% pull(human_entrez_gene) %>% sort() %>% unique()
+msigdb_entrez_genes = msigdbr_genes %>% pull(human_entrez_gene) %>% sort() %>% unique()
 
 # Import HCOP orthologs ---------------------------------------------------
 
 # Import the human and ortholog data from all HCOP species
 hcop_txt_url = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_all_hcop_sixteen_column.txt.gz"
 hcop = read_tsv(hcop_txt_url)
 
+# Remove repeating databases (some are listed multiple times)
+hcop = mutate(hcop, support = strsplit(support, ",", fixed = TRUE))
+hcop = mutate(hcop, support = map(support, unique))
+hcop = mutate(hcop, support = map_chr(support, paste, collapse = ","))
+
 # Keep only the genes found in MSigDB and those in multiple ortholog/homolog databases
 msigdbr_orthologs =
   hcop %>%
@@ -116,9 +129,12 @@ msigdbr_orthologs =
   ) %>%
   filter(
     human_entrez_gene %in% msigdb_entrez_genes,
-    num_sources > 1
+    num_sources > 2
   )
 
+# List the number of supporting sources
+table(msigdbr_orthologs$num_sources, useNA = "ifany")
+
 # Names and IDs of common species
 species_tbl =
   tibble(species_id = integer(), species_name = character()) %>%
@@ -139,14 +155,21 @@ msigdbr_orthologs %>% pull(species_id) %>% unique() %>% sort()
 # Add species names
 msigdbr_orthologs = inner_join(species_tbl, msigdbr_orthologs, by = "species_id")
 
-# For each gene, only keep the best ortholog (found in the most databases)
+# For each human gene, only keep the best ortholog (found in the most databases)
 msigdbr_orthologs =
   msigdbr_orthologs %>%
   select(-species_id) %>%
   group_by(human_entrez_gene, species_name) %>%
   top_n(1, num_sources) %>%
   ungroup()
 
+# For each human gene, ignore ortholog pairs with many orthologs
+msigdbr_orthologs =
+  msigdbr_orthologs %>%
+  add_count(human_entrez_gene, species_name) %>%
+  filter(n <= 3) %>%
+  select(-n)
+
 # Add human genes from the original genesets table to the orthologs table
 msigdbr_orthologs =
   msigdbr_orthologs %>%
@@ -155,13 +178,28 @@ msigdbr_orthologs =
     human_entrez_gene, human_gene_symbol,
     species_name, entrez_gene, gene_symbol, sources, num_sources
   ) %>%
-  arrange(species_name, human_gene_symbol) %>%
+  arrange(human_gene_symbol, human_entrez_gene, species_name) %>%
   distinct()
 
+# Show the orthologs summary stats
+hcop %>%
+  inner_join(species_tbl, by = c("ortholog_species" = "species_id")) %>%
+  group_by(species_name) %>%
+  summarize(n_distinct(ortholog_species_symbol))
+msigdbr_orthologs %>%
+  group_by(species_name) %>%
+  summarize(n_distinct(human_gene_symbol), n_distinct(gene_symbol), max(num_sources))
+
 # Prepare package ---------------------------------------------------------
 
+# Check the size of final tables
+format(object.size(msigdbr_genes), units = "Mb")
+format(object.size(msigdbr_genesets), units = "Mb")
+format(object.size(msigdbr_orthologs), units = "Mb")
+
 # Create package data
 use_data(
+  msigdbr_genes,
   msigdbr_genesets,
   msigdbr_orthologs,
   internal = TRUE,

diff --git a/man/msigdbr.Rd b/man/msigdbr.Rd
diff --git a/man/msigdbr_show_species.Rd b/man/msigdbr_show_species.Rd