In [None]:
library(plyr, quietly = TRUE)
library(tidyverse,  quietly = TRUE)
library(tidyjson,  quietly = TRUE)
library(readxl)

In [None]:
# Negated membership operator: `x %ni% table` is TRUE where x is NOT in table.
`%ni%` <- function(...) !`%in%`(...)

In [None]:
# For every gene set, collect: source, ID, name, gene list, and number of genes

In [None]:
# Schema of the combined gene-set table: an empty (zero-row) tibble with four
# character columns and an integer gene count.
genesets_colnames <- c("Source", "GeneSetID", "GeneSetName", "GeneList", "n_Genes")
genesets_MAT <- matrix(
  nrow = 0,
  ncol = length(genesets_colnames),
  dimnames = list(NULL, genesets_colnames)
)
genesets_DF <- genesets_MAT %>%
  as_tibble() %>%
  mutate(
    across(-n_Genes, as.character),
    n_Genes = as.integer(n_Genes)
  )

In [None]:
# Keep a copy of the table as it is at this point, before any source is appended.
genesets_empty_DF = genesets_DF

### MSigDB

In [None]:
# NOTE(review): msigdb_path is not referenced anywhere in the visible part of
# this file — confirm it is still needed.
msigdb_path = "data/human_geneSets/"

In [None]:
# read in the MSigDB .json file
# [[2]][[1]] selects by position inside the object returned by read_json();
# presumably this is the parsed content of the single JSON document — TODO confirm.
MSigDB_hallmark = read_json("data/h.all.v2023.1.Hs.json", format = "json")[[2]][[1]]

In [None]:
# The element names of the parsed object are used as the gene-set identifiers.
MSigDB_geneSets = names(MSigDB_hallmark)

In [None]:
# Append one row per MSigDB gene set to genesets_DF.
# The rows are assembled in a single tibble and bound once, instead of growing
# genesets_DF with add_row() on every loop iteration.
# Each gene set's symbols are flattened to a character vector; GeneList is the
# space-separated symbols and n_Genes is their count. The set name is used as
# both GeneSetID and GeneSetName.
MSigDB_geneLists <- lapply(
  MSigDB_geneSets,
  function(geneSet) unlist(MSigDB_hallmark[[geneSet]]$geneSymbols)
)

genesets_DF <- bind_rows(
  genesets_DF,
  tibble(
    Source = "MSigDB",
    GeneSetID = MSigDB_geneSets,
    GeneSetName = MSigDB_geneSets,
    GeneList = vapply(MSigDB_geneLists, str_c, character(1), collapse = " "),
    n_Genes = lengths(MSigDB_geneLists)
  )
)

In [None]:
# Number of gene sets with fewer than 200 genes.
genesets_DF %>%
  filter(n_Genes < 200) %>%
  nrow()

In [None]:
# Number of gene sets with at most 100 genes.
genesets_DF %>%
  filter(n_Genes <= 100) %>%
  nrow()

In [None]:
# Rows and columns of the table so far.
dim(genesets_DF)

In [None]:
# Size of the largest gene set.
max(genesets_DF$n_Genes)

### NeST

In [None]:
# IDs listed in the `NEST ID` column of the original NeST table.
NeST_systems_Orig <- read_delim("data/NeST_table.txt", delim = "\t")[["NEST ID"]]

In [None]:
# Full NeST table (comma-delimited).
NeST_All_DF <- read_delim(file = "data/NeST_table_All.csv", delim = ",")

In [None]:
# Rows of the full table whose ID is in the original list and whose size is
# at most 200.
NeST_Orig_DF <- NeST_All_DF %>%
  filter(
    `NEST ID` %in% NeST_systems_Orig,
    Size_All <= 200
  )

In [None]:
# Original IDs that did not survive the filter above.
setdiff(NeST_systems_Orig, NeST_Orig_DF[["NEST ID"]])

In [None]:
# Top up the NeST selection to a target number of systems by sampling
# additional systems that are not in the original list.
n_NeST_target <- 50
# Clamp at zero: if the original selection already has n_NeST_target rows or
# more, nothing should be sampled (a negative n must not reach slice_sample()).
n_toSample <- max(0, n_NeST_target - nrow(NeST_Orig_DF))

In [None]:
# Candidates: not in the original list, size at most 200, and a name that does
# not contain "NEST".
# NOTE(review): slice_sample() is random and no seed is set in the visible code,
# so re-running this cell picks a different sample.
NeST_added_DF <- NeST_All_DF %>%
  filter(
    !(`NEST ID` %in% NeST_systems_Orig),
    Size_All <= 200,
    str_detect(name_new, "NEST", negate = TRUE)
  ) %>%
  slice_sample(n = n_toSample)

In [None]:
# Save the sampled systems as a tab-delimited file.
write_delim(x = NeST_added_DF, file = "data/NeST_added_DF.txt", delim = "\t")

In [None]:
# Display the sampled systems.
NeST_added_DF

In [None]:
# Combine the original and the sampled NeST systems and reshape them to the
# genesets_DF columns (GeneSetID, GeneSetName, GeneList, n_Genes, Source).
# str_replace_all() is vectorised, so no rowwise() is needed; leaving it out
# also means the result is a plain tibble rather than a rowwise data frame.
NeST_toUse_DF <- bind_rows(NeST_Orig_DF, NeST_added_DF) %>%
  select(all_of(c("NEST ID", "name_new", "All_Genes", "Size_All"))) %>%
  mutate(
    Source = "NeST",
    # Commas in the gene list become spaces, as in the other sources.
    All_Genes = str_replace_all(string = All_Genes, pattern = ",", replacement = " ")
  ) %>%
  rename(
    GeneSetID = `NEST ID`,
    GeneSetName = name_new,
    GeneList = All_Genes,
    n_Genes = Size_All
  )

In [None]:
# Append the NeST rows to the combined table.
genesets_DF = genesets_DF %>% rbind(NeST_toUse_DF)

In [None]:
# Table size after the append.
dim(genesets_DF)

In [None]:
# Preview the first rows.
head(genesets_DF)

### Gene expression

In [None]:
# https://doi.org/10.1016/j.cell.2022.05.013

In [None]:
# Read the "gene expression clusters" sheet once and keep the raw table, so the
# workbook is not parsed a second time by the cell below.
geneExpression_raw_DF <- read_xlsx(
  path = "data/1-s2.0-S0092867422005979-mmc3.xlsx",
  sheet = "gene expression clusters"
)

In [None]:
# Keep annotated clusters only and reshape them to the genesets_DF columns.
# NOTE(review): n_Genes counts the pieces obtained by splitting on single
# spaces after commas are replaced by spaces; this assumes `members` has no
# whitespace after its commas — confirm against the sheet.
geneExpression_DF <- geneExpression_raw_DF %>%
  filter(!is.na(manual_annotation)) %>%
  select(all_of(c("members", "manual_annotation"))) %>%
  mutate(
    Source = "Gene Expression",
    members = str_replace_all(string = members, pattern = ",", replacement = " "),
    # lengths() gives the per-row count without needing rowwise().
    n_Genes = lengths(str_split(string = members, pattern = " "))
  ) %>%
  rename(
    GeneSetName = manual_annotation,
    GeneList = members
  ) %>%
  # The annotation doubles as the ID.
  mutate(GeneSetID = GeneSetName) %>%
  filter(n_Genes <= 200)


In [None]:
# Table size before the append.
# NOTE(review): the matching cell in the Perturbation section shows the new
# table's size at this point; dim(geneExpression_DF) may have been intended — confirm.
dim(genesets_DF)

In [None]:
# Append the gene-expression rows to the combined table.
genesets_DF = genesets_DF %>% rbind(geneExpression_DF)

In [None]:
# Table size after the append.
dim(genesets_DF)

### Perturbation

In [None]:
# Read the "perturbation clusters" sheet, keep annotated clusters only, and
# reshape them to the genesets_DF columns.
# NOTE(review): n_Genes counts the pieces obtained by splitting on single
# spaces after commas are replaced by spaces; this assumes `members` has no
# whitespace after its commas — confirm against the sheet.
perturbation_DF <- read_xlsx(
  path = "data/1-s2.0-S0092867422005979-mmc3.xlsx",
  sheet = "perturbation clusters"
) %>%
  filter(!is.na(manual_annotation)) %>%
  select(all_of(c("members", "manual_annotation"))) %>%
  mutate(
    Source = "Perturbation",
    members = str_replace_all(string = members, pattern = ",", replacement = " "),
    # lengths() gives the per-row count without needing rowwise().
    n_Genes = lengths(str_split(string = members, pattern = " "))
  ) %>%
  rename(
    GeneSetName = manual_annotation,
    GeneList = members
  ) %>%
  # The annotation doubles as the ID.
  mutate(GeneSetID = GeneSetName) %>%
  filter(n_Genes <= 200)

In [None]:
# Size of the perturbation table.
dim(perturbation_DF)

In [None]:
# Append the perturbation rows to the combined table.
genesets_DF = genesets_DF %>% rbind(perturbation_DF)

In [None]:
# Table size after the append.
dim(genesets_DF)

In [None]:
# Scratch arithmetic; presumably the number of gene sets still needed to reach
# a total of 250 — confirm.
250 - 173 

### Disease

In [None]:
library(disgenet2r)
library(getPass)

In [None]:
# Prompt for the account password interactively so it is not stored in the file.
pass = getPass::getPass("Enter the password: ")

In [None]:
# Obtain an API key from the email/password pair.
# NOTE(review): the account email is hard-coded here — consider reading it from
# an environment variable or prompting for it.
disgenet_api_key <- get_disgenet_api_key(
                  email = "salkhairy@ucsd.edu", 
                  password = pass )

In [None]:
# Store the key in the DISGENET_API_KEY environment variable for this session.
Sys.setenv(DISGENET_API_KEY= disgenet_api_key)

In [None]:
# Disease association table (tab-delimited).
diseaseId_All_DF <- read_delim("data/disease_associations.tsv", delim = "\t")


In [None]:
# Lookup vector: disease names, named by disease ID.
diseaseId_All_list_names <- setNames(
  object = diseaseId_All_DF$diseaseName,
  nm = diseaseId_All_DF$diseaseId
)

In [None]:
# Peek at the first four entries of the lookup.
diseaseId_All_list_names[seq_len(4)]

In [None]:
# Candidate disease IDs: rows of type "disease" with 3 to 100 genes and more
# than one PMID.
diseaseId_All_list <- diseaseId_All_DF %>%
  filter(
    diseaseType == "disease",
    between(x = NofGenes, left = 3, right = 100),
    NofPmids > 1
  ) %>%
  pull(diseaseId)

In [None]:
# Smallest and largest gene count among the candidates.
diseaseId_All_DF %>%
  filter(diseaseId %in% diseaseId_All_list) %>%
  pull("NofGenes") %>%
  range()

In [None]:
# Open the help page for disease2gene (interactive use only).
?disease2gene

In [None]:
# Note on an alternative `database` value (the query loop in this file passes "MGD"):
# 'GWASCAT' to use the NHGRI-EBI GWAS Catalog; 

In [None]:
# Randomly pick up to 500 candidate disease IDs to query; more are drawn than
# are needed because not every query succeeds in returning a gene list.
# size is capped at the number of candidates, since sampling without
# replacement fails when size exceeds the population.
# NOTE(review): no seed is set in the visible code, so the selection changes on
# every run.
diseaseId_list <- sample(
  x = diseaseId_All_list,
  size = min(500, length(diseaseId_All_list)),
  replace = FALSE
)

In [None]:
# Start the disease table from the saved copy of the gene-set table.
disease_DF = genesets_empty_DF
# Flag that the disease2gene query loop sets to TRUE when a query raises an error.
an.error.occured <- FALSE


In [None]:
# Open the help page for disease2gene (interactive use only).
?disease2gene

In [None]:
# Query disease2gene for every sampled disease ID and add one row per
# successful query to disease_DF.
#
# Fixes relative to the previous version:
#   * iterates over diseaseId_list (the old loop bound was an undefined name);
#   * a failed query no longer leaves dis_res holding the previous disease's
#     result, which would have stored a stale gene list under the wrong ID;
#   * failures are reported with message() instead of being swallowed.
#
# Rows are still appended one at a time on purpose: the loop makes remote calls
# and may be interrupted, and this keeps the rows collected so far.
for (diseaseId_ind in seq_along(diseaseId_list)) {
    print(diseaseId_ind)
    diseaseId <- diseaseId_list[diseaseId_ind]
    # Look the name up by ID; unname() drops the ID label carried by the lookup.
    diseaseName <- unname(diseaseId_All_list_names[diseaseId])

    # The handler returns NULL so that a failure can be told apart from a result.
    dis_res <- tryCatch(
        disease2gene(diseaseId, database = "MGD"), #CURATED
        error = function(e) {
            message("disease2gene failed for ", diseaseId, ": ", conditionMessage(e))
            NULL
        }
    )

    if (is.null(dis_res)) {
        # Same flag the old handler set with `<<-`; assigned here directly
        # because the loop body runs in the same environment as the flag.
        an.error.occured <- TRUE
        next
    }

    # A character result is skipped, as before; presumably disease2gene returns
    # a message string instead of a result object when nothing is found — TODO confirm.
    if (is.character(dis_res)) {
        next
    }

    geneList <- dis_res@qresult %>% pull(gene_symbol)

    disease_DF <- disease_DF %>%
        add_row(tibble_row(
            Source = "Disease",
            GeneSetID = diseaseId,
            GeneSetName = diseaseName,
            GeneList = str_c(geneList, collapse = " "),
            n_Genes = length(geneList)
        ))
}

In [None]:
# Spot-check the name lookup for one disease ID.
diseaseId_All_list_names['C0000727']

In [None]:
# Value left in diseaseId by the last loop iteration that ran.
diseaseId

In [None]:
# Number of diseases collected.
nrow(disease_DF)

In [None]:
# Keep diseases with 3 to 100 genes, then take the first 60 of them.
disease_toUse_DF <- disease_DF %>%
  filter(between(x = n_Genes, left = 3, right = 100)) %>%
  slice_head(n = 60)

In [None]:
# Display the selected disease gene sets.
disease_toUse_DF

In [None]:
# Save the selected disease gene sets as a tab-delimited file.
write_delim(x = disease_toUse_DF,file = "data/disease_toUse_DF.txt" , delim = "\t")

In [None]:
# Size of the disease table.
dim(disease_toUse_DF)

In [None]:
# Append the disease rows to the combined table.
genesets_DF = genesets_DF %>% rbind(disease_toUse_DF)

In [None]:
# Size of the disease table (same check as before the append).
dim(disease_toUse_DF)

In [None]:
# Size of the combined table after the append.
dim(genesets_DF)

In [None]:
# Save the combined table as a tab-delimited file.
write_delim(x = genesets_DF,file = "data/omics.txt" , delim = "\t")

## Fixing disease names

In [None]:
# Re-derive GeneSetName for the "Disease" rows of a saved gene-set table from
# the disease ID -> name lookup, and write the table back to the same file.
#
#   path         tab-delimited file with Source, GeneSetID and GeneSetName columns
#   name_lookup  character vector of disease names, named by disease ID
#
# Returns the corrected table as a plain tibble. The replacement is vectorised,
# so no rowwise() is needed. A row keeps its existing name when its ID is not
# found in the lookup, instead of having the name overwritten with NA.
fix_disease_names <- function(path, name_lookup = diseaseId_All_list_names) {
  table_DF <- read_delim(file = path, delim = "\t")

  looked_up <- unname(name_lookup[as.character(table_DF$GeneSetID)])
  use_lookup <- !is.na(table_DF$Source) &
    table_DF$Source == "Disease" &
    !is.na(looked_up)
  table_DF$GeneSetName[use_lookup] <- looked_up[use_lookup]

  write_delim(x = table_DF, file = path, delim = "\t")
  table_DF
}

In [None]:
# Fix the combined gene-set table in place.
genesets_fixed_DF <- fix_disease_names("data/omics.txt")

In [None]:
# Apply the same fix to the LLM table.
genesets_LLM_fixed_DF <- fix_disease_names("data/omics_LLM_DF.tsv")

### Add additional disease gene sets that are from animal models