Skip to content

Commit

Permalink
update to MSigDB 7.1
Browse files Browse the repository at this point in the history
  • Loading branch information
igordot committed May 14, 2020
1 parent bdb19c8 commit 9ed2768
Show file tree
Hide file tree
Showing 11 changed files with 171 additions and 73 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: msigdbr
Type: Package
Title: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format
Version: 7.0.1
Version: 7.1.1
Authors@R: person("Igor", "Dolgalev", email = "igor.dolgalev@nyumc.org", role = c("aut", "cre"))
Description: Provides the 'Molecular Signatures Database' (MSigDB) gene sets
typically used with the 'Gene Set Enrichment Analysis' (GSEA) software
Expand All @@ -27,5 +27,5 @@ Suggests:
knitr,
rmarkdown
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.1
RoxygenNote: 7.1.0
VignetteBuilder: knitr
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# msigdbr 7.1.1

* Based on MSigDB v7.1 release.
* Increased ortholog prediction stringency.

# msigdbr 7.0.1

* Based on MSigDB v7.0 release.
Expand Down
54 changes: 33 additions & 21 deletions R/functions.R
Original file line number Diff line number Diff line change
@@ -1,70 +1,82 @@

#' List the species available in the msigdbr package
#'
#' @return a vector of possible species
#' @return A vector of possible species.
#'
#' @importFrom dplyr pull
#' @export
#'
#' @examples
#' msigdbr_show_species()
msigdbr_show_species <- function() {

msigdbr_orthologs %>% pull(.data$species_name) %>% unique() %>% sort()

msigdbr_orthologs %>%
pull(.data$species_name) %>%
unique() %>%
sort()
}

#' Retrieve the msigdbr data frame
#'
#' @param species species name, such as Homo sapiens, Mus musculus, etc.
#' @param category collection, such as H, C1, C2, C3, C4, C5, C6, C7.
#' @param subcategory sub-collection, such as CGP, MIR, BP, etc.
#' @param species Species name, such as Homo sapiens or Mus musculus. The available species can be retrieved with `msigdbr_show_species()`.
#' @param category MSigDB collection abbreviation, such as H, C1, C2, C3, C4, C5, C6, C7.
#' @param subcategory MSigDB sub-collection abbreviation, such as CGP or BP.
#'
#' @return A data frame of gene sets with one gene per row.
#'
#' @references \url{https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp}
#'
#' @return a data frame of gene sets with one gene per row
#' @import tibble
#' @importFrom dplyr filter inner_join arrange select
#' @export
#'
#' @examples
#' # all human gene sets
#' m = msigdbr(species = "Homo sapiens")
#' # get all human gene sets
#' \donttest{msigdbr(species = "Homo sapiens")}
#'
#' # mouse C2 (curated) CGP (chemical and genetic perturbations) gene sets
#' m = msigdbr(species = "Mus musculus", category = "C2", subcategory = "CGP")
#' # get mouse C2 (curated) CGP (chemical and genetic perturbations) gene sets
#' \donttest{msigdbr(species = "Mus musculus", category = "C2", subcategory = "CGP")}
#'
#' # check all the available categories and sub-categories
#' \donttest{msigdbr() %>% dplyr::distinct(gs_cat, gs_subcat) %>% dplyr::arrange(gs_cat, gs_subcat)}
msigdbr <- function(species = "Homo sapiens", category = NULL, subcategory = NULL) {

# filter by species
orthologs_subset = filter(msigdbr_orthologs, .data$species_name == species)

# confirm that only one species is specified
if (length(species) > 1) {
stop("please specify only one species at a time")
}

# filter orthologs by species
orthologs_subset <- filter(msigdbr_orthologs, .data$species_name == species)

# confirm that the species exists in the database
if (nrow(orthologs_subset) == 0) {
stop("species does not exist in the database: ", species)
}

genesets_subset = msigdbr_genesets
genesets_subset <- msigdbr_genesets

# filter by category
if (is.character(category)) {
if (length(category) > 1) {
stop("please specify only one category at a time")
}
genesets_subset = filter(genesets_subset, .data$gs_cat == category)
genesets_subset <- filter(genesets_subset, .data$gs_cat == category)
}

# filter by sub-category
if (is.character(subcategory)) {
if (length(subcategory) > 1) {
stop("please specify only one subcategory at a time")
}
genesets_subset = filter(genesets_subset, .data$gs_subcat == subcategory)
genesets_subset <- filter(genesets_subset, .data$gs_subcat == subcategory)
}

# combine gene sets and genes
genesets_subset <- inner_join(genesets_subset, msigdbr_genes, by = "gs_id")

# combine gene sets and orthologs
genesets_subset %>%
inner_join(orthologs_subset, by = "human_entrez_gene") %>%
arrange(.data$gs_name, .data$human_gene_symbol) %>%
select(-.data$human_entrez_gene, -.data$num_sources)

}


Binary file modified R/sysdata.rda
Binary file not shown.
21 changes: 21 additions & 0 deletions cran-comments.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
## Test environments
* local R installation, R 4.0.0
* ubuntu 16.04 (on travis-ci), R 4.0.0
* win-builder (devel)

## R CMD check results

0 errors | 0 warnings | 1 note

## revdepcheck results

We checked 5 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package.

* We saw 0 new problems
* We failed to check 0 packages

## Resubmission

This is a resubmission. In this version I have:

* Addressed the elapsed time notes.
96 changes: 67 additions & 29 deletions data-raw/msigdbr-prepare.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

library(dplyr)
library(tidyr)
library(purrr)
library(readr)
library(stringr)
library(glue)
Expand All @@ -9,25 +10,27 @@ library(usethis)

# Import MSigDB gene sets -------------------------------------------------

# Download the MSigDB XML file
msigdb_version = "7.0"
msigdb_xml = glue("msigdb_v{msigdb_version}.xml")
msigdb_url_base = "http://software.broadinstitute.org/gsea/msigdb/download_file.jsp?filePath=/resources/msigdb"
msigdb_xml_url = glue("{msigdb_url_base}/{msigdb_version}/msigdb_v{msigdb_version}.xml")
download.file(
url = msigdb_xml_url, destfile = msigdb_xml, quiet = TRUE,
method = "wget",
extra = "-c --header='Cookie: JSESSIONID=94C597E7CE4BCEF65ADE10782AD067AD'"
)
# Define MSigDB download variables
msigdb_version = "7.1"
msigdb_url_base = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release"
msigdb_zip_url = glue("{msigdb_url_base}/{msigdb_version}/msigdb_v{msigdb_version}_files_to_download_locally.zip")
msigdb_dir = glue("msigdb_v{msigdb_version}_files_to_download_locally")

# Download the MSigDB zip file
download.file(url = msigdb_zip_url, destfile = glue("{msigdb_dir}.zip"))

# Check MSigDB XML file size in bytes
utils:::format.object_size(file.size(msigdb_xml), units = "auto")
utils:::format.object_size(file.size(glue("{msigdb_dir}.zip")), units = "auto")

# Extract the MSigDB zip file
unzip(glue("{msigdb_dir}.zip"))

# Import the MSigDB XML file
msigdb_doc = read_xml(msigdb_xml)
msigdb_doc = read_xml(glue("{msigdb_dir}/msigdb_v{msigdb_version}.xml"))

# Delete the MSigDB XML file since it is no longer needed
file.remove(msigdb_xml)
# Delete the MSigDB zip file and its contents since they are no longer needed
file.remove(glue("{msigdb_dir}.zip"))
unlink(msigdb_dir, recursive = TRUE)

# Extract the attributes and convert into a tibble
# GENESET record attributes:
Expand All @@ -50,16 +53,21 @@ msigdbr_genesets =
) %>%
filter(gs_cat != "ARCHIVED")

# Separate genes and gene sets
msigdbr_genes = msigdbr_genesets %>% select(gs_id, gs_members)
msigdbr_genesets = msigdbr_genesets %>% distinct(gs_id, gs_name, gs_cat, gs_subcat)
msigdbr_genesets = msigdbr_genesets %>% arrange(gs_name, gs_id)

# Check the number of gene sets per category
msigdbr_genesets %>% count(gs_cat) %>% arrange(gs_cat)
msigdbr_genesets %>% count(gs_cat, gs_subcat) %>% arrange(gs_cat)

# Convert to "long" format (one gene per row)
msigdbr_genesets =
msigdbr_genesets %>%
mutate(members = strsplit(gs_members, "\\|")) %>%
unnest(members) %>%
msigdbr_genes = mutate(msigdbr_genes, gs_members_split = strsplit(gs_members, "|", fixed = TRUE))
msigdbr_genes = unnest(msigdbr_genes, cols = gs_members_split, names_repair = "minimal")
msigdbr_genes =
msigdbr_genes %>%
separate(
col = members,
col = gs_members_split,
into = c("source_gene", "human_gene_symbol", "human_entrez_gene"),
sep = ","
) %>%
Expand All @@ -68,7 +76,7 @@ msigdbr_genesets =

# Create a table of human genes based on MSigDB gene mappings
human_tbl =
msigdbr_genesets %>%
msigdbr_genes %>%
select(human_entrez_gene, human_gene_symbol) %>%
distinct() %>%
mutate(
Expand All @@ -78,21 +86,26 @@ human_tbl =
)

# Clean up
msigdbr_genesets =
msigdbr_genesets %>%
select(gs_name, gs_id, gs_cat, gs_subcat, human_entrez_gene) %>%
arrange(gs_name, gs_id, human_entrez_gene) %>%
msigdbr_genes =
msigdbr_genes %>%
select(gs_id, human_entrez_gene) %>%
arrange(gs_id, human_entrez_gene) %>%
distinct()

# Get a list of all MSigDB genes (Entrez IDs)
msigdb_entrez_genes = msigdbr_genesets %>% pull(human_entrez_gene) %>% sort() %>% unique()
msigdb_entrez_genes = msigdbr_genes %>% pull(human_entrez_gene) %>% sort() %>% unique()

# Import HCOP orthologs ---------------------------------------------------

# Import the human and ortholog data from all HCOP species
hcop_txt_url = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_all_hcop_sixteen_column.txt.gz"
hcop = read_tsv(hcop_txt_url)

# Remove repeating databases (some are listed multiple times)
hcop = mutate(hcop, support = strsplit(support, ",", fixed = TRUE))
hcop = mutate(hcop, support = map(support, unique))
hcop = mutate(hcop, support = map_chr(support, paste, collapse = ","))

# Keep only the genes found in MSigDB and those in multiple ortholog/homolog databases
msigdbr_orthologs =
hcop %>%
Expand All @@ -116,9 +129,12 @@ msigdbr_orthologs =
) %>%
filter(
human_entrez_gene %in% msigdb_entrez_genes,
num_sources > 1
num_sources > 2
)

# List the number of supporting sources
table(msigdbr_orthologs$num_sources, useNA = "ifany")

# Names and IDs of common species
species_tbl =
tibble(species_id = integer(), species_name = character()) %>%
Expand All @@ -139,14 +155,21 @@ msigdbr_orthologs %>% pull(species_id) %>% unique() %>% sort()
# Add species names
msigdbr_orthologs = inner_join(species_tbl, msigdbr_orthologs, by = "species_id")

# For each gene, only keep the best ortholog (found in the most databases)
# For each human gene, only keep the best ortholog (found in the most databases)
msigdbr_orthologs =
msigdbr_orthologs %>%
select(-species_id) %>%
group_by(human_entrez_gene, species_name) %>%
top_n(1, num_sources) %>%
ungroup()

# For each human gene, ignore ortholog pairs with many orthologs
msigdbr_orthologs =
msigdbr_orthologs %>%
add_count(human_entrez_gene, species_name) %>%
filter(n <= 3) %>%
select(-n)

# Add human genes from the original genesets table to the orthologs table
msigdbr_orthologs =
msigdbr_orthologs %>%
Expand All @@ -155,13 +178,28 @@ msigdbr_orthologs =
human_entrez_gene, human_gene_symbol,
species_name, entrez_gene, gene_symbol, sources, num_sources
) %>%
arrange(species_name, human_gene_symbol) %>%
arrange(human_gene_symbol, human_entrez_gene, species_name) %>%
distinct()

# Show the orthologs summary stats
hcop %>%
inner_join(species_tbl, by = c("ortholog_species" = "species_id")) %>%
group_by(species_name) %>%
summarize(n_distinct(ortholog_species_symbol))
msigdbr_orthologs %>%
group_by(species_name) %>%
summarize(n_distinct(human_gene_symbol), n_distinct(gene_symbol), max(num_sources))

# Prepare package ---------------------------------------------------------

# Check the size of final tables
format(object.size(msigdbr_genes), units = "Mb")
format(object.size(msigdbr_genesets), units = "Mb")
format(object.size(msigdbr_orthologs), units = "Mb")

# Create package data
use_data(
msigdbr_genes,
msigdbr_genesets,
msigdbr_orthologs,
internal = TRUE,
Expand Down
25 changes: 15 additions & 10 deletions man/msigdbr.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/msigdbr_show_species.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 9ed2768

Please sign in to comment.