In [2]:
library(plyr, verbose  = FALSE)
library(tidyverse, verbose  = FALSE)
library(tidyjson, verbose  = FALSE)
library(readxl, verbose  = FALSE)
#library(disgenet2r, verbose  = FALSE)
#library(getPass, verbose  = FALSE)

In [2]:
`%ni%` <- Negate(`%in%`)

In [3]:
genesets_colnames = c("Source", "GeneSetID", "GeneSetName", "GeneList", "n_Genes")
genesets_MAT = matrix(nrow = 0, ncol = length(genesets_colnames))
colnames(genesets_MAT) = genesets_colnames
genesets_DF = as_tibble(genesets_MAT) %>%
  mutate(across(everything(), as.character))
genesets_DF$n_Genes = as.integer(genesets_DF$n_Genes)
genesets_empty_DF = genesets_DF

### NeST

In [4]:
NeST_DF = read_delim("data/Omics_data/NeST__IAS_clixo_hidef_Nov17.edges", delim = "\t", col_names = FALSE)  %>%
subset(X3 == "gene") %>%
group_by(X1) %>% 
summarize(GeneList = paste0(X2, collapse = " ")) %>%
rowwise() %>%
mutate(n_Genes = length(str_split(string = GeneList, pattern = " ")[[1]])) %>%
subset(between(x = n_Genes, left = 3, right = 100)) %>%
mutate(Source = "NeST",
      GeneSetID = X1,
      GeneSetName = X1) %>%
select(-X1) %>%
select(genesets_colnames)

[1mRows: [22m[34m39463[39m [1mColumns: [22m[34m3[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): X1, X2, X3

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
“[1m[22mUsing an external vector in selections was deprecated in tidyselect 1.1.0.
[36mℹ[39m Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(genesets_colnames)

  # Now:
  data %>% select(all_of(genesets_colnames))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.”


### L1000

In [5]:
L1000_all_DF = read_delim("data/Omics_data/L1000__gene_attribute_edges.txt", delim = "\t", skip = 1) %>%
group_by(`Perturbation ID_Perturbagen_Cell Line_Time_Time Unit_Dose_Dose Unit`) %>%
summarize(GeneList = paste0(GeneSym, collapse = " ")) %>%
rowwise() %>%
mutate(n_Genes = length(str_split(string = GeneList, pattern = " ")[[1]]))  %>%
subset(between(x = n_Genes, left = 3, right = 100)) %>%
mutate(Source = "L1000",
       GeneSetID = `Perturbation ID_Perturbagen_Cell Line_Time_Time Unit_Dose_Dose Unit`,
       GeneSetName = `Perturbation ID_Perturbagen_Cell Line_Time_Time Unit_Dose_Dose Unit`) %>%
select(-"Perturbation ID_Perturbagen_Cell Line_Time_Time Unit_Dose_Dose Unit") %>%
separate(col =  "GeneSetName",
         into = c("Drug", "Reagent", "Cellline", "duration", "duration_unit", "dosage", "dosage_unit"),
         sep = "_",
        remove = FALSE) 

[1m[22mNew names:
[36m•[39m `NA` -> `NA...2`
[36m•[39m `NA` -> `NA...5`
[36m•[39m `NA` -> `NA...6`
[1mRows: [22m[34m4189677[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): GeneSym, NA...2, Perturbation ID_Perturbagen_Cell Line_Time_Time Un...
[32mdbl[39m (3): GeneID, NA...6, weight

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
“[1m[22mExpected 7 pieces. Additional pieces discarded in 85 rows [2543, 2544, 2545,
2546, 2547, 2548, 2549, 2550, 2551, 2552, 3579, 3580, 3581, 3582, 3583, 3584,
3585, 3586, 4223, 4224, ...].”


In [6]:
dim(L1000_all_DF)

In [7]:
L1000_sep_count_DF = L1000_all_DF %>%
group_by(Reagent, Cellline, duration,duration_unit, dosage, dosage_unit) %>%
summarise(n_genesets = n()) %>%
arrange(desc(n_genesets))

[1m[22m`summarise()` has grouped output by 'Reagent', 'Cellline', 'duration',
'duration_unit', 'dosage'. You can override using the `.groups` argument.


In [9]:
write_delim(x = L1000_sep_count_DF, file = "data/L1000_sep_count_DF.txt", delim = "\t")

In [10]:
dim(L1000_sep_count_DF)

In [11]:
L1000_DF = L1000_all_DF %>% 
subset(Cellline == "MCF7"& duration == "6.0"& dosage == "10.0")  %>%
select(genesets_colnames)

In [12]:
dim(L1000_DF)

### Viral Infections

In [14]:
viral_Infections_all_long_DF = read_delim(file = "data/Omics_data/Viral_Infections_gene_attribute_matrix_standardized.txt", delim = "\t", skip = 1) %>%
rename(GeneSym  = `#...1`) %>%
select(c(-2, -3)) %>%
slice(-1) %>%
pivot_longer(c(2:ncol(.))) %>%
subset(abs(value) >= 2) %>%
group_by(name) %>%
summarize(GeneList = paste0(GeneSym, collapse = " ")) %>%
rowwise() %>% 
mutate(n_Genes = length(str_split(string = GeneList, pattern = " ")[[1]]))  %>%
subset(between(x = n_Genes, left = 3, right = 100)) %>%
mutate(Source = "Viral_Infections",
       GeneSetID = name,
       GeneSetName = name) %>%
select(-name) %>%
separate(col = GeneSetID,
         into = c("virus", "duration"),
        sep = "_",
        remove = FALSE) %>%
select(genesets_colnames)

[1m[22mNew names:
[36m•[39m `#` -> `#...1`
[36m•[39m `#` -> `#...2`
[36m•[39m `icSARS CoV_24Hour` -> `icSARS CoV_24Hour...11`
[36m•[39m `icSARS CoV_60Hour` -> `icSARS CoV_60Hour...12`
[36m•[39m `icSARS CoV_54Hour` -> `icSARS CoV_54Hour...13`
[36m•[39m `icSARS CoV_48Hour` -> `icSARS CoV_48Hour...14`
[36m•[39m `icSARS CoV_72Hour` -> `icSARS CoV_72Hour...15`
[36m•[39m `icSARS CoV_36Hour` -> `icSARS CoV_36Hour...16`
[36m•[39m `icSARS CoV_30Hour` -> `icSARS CoV_30Hour...17`
[36m•[39m `icSARS CoV_12Hour` -> `icSARS CoV_12Hour...19`
[36m•[39m `icSARS CoV_7Hour` -> `icSARS CoV_7Hour...21`
[36m•[39m `SARS-CoV MA15_Day2-PFU-10^4` -> `SARS-CoV MA15_Day2-PFU-10^4...29`
[36m•[39m `SARS-CoV MA15_Day2` -> `SARS-CoV MA15_Day2...31`
[36m•[39m `SARS-CoV MA15_Day2` -> `SARS-CoV MA15_Day2...32`
[36m•[39m `SARS-CoV MA15_Day7-C57BL6` -> `SARS-CoV MA15_Day7-C57BL6...34`
[36m•[39m `SARS-CoV MA15_Day7-PFU-10^5` -> `SARS-CoV MA15_Day7-PFU-10^5...36`
[36m•[39m `SARS-CoV MA15_D

In [15]:
# viral_Infections_all_count_DF = viral_Infections_all_long_DF %>%
# group_by(duration) %>%
# summarise(n_genesets = n()) %>%
# arrange(desc(n_genesets))

# viral_Infections_all_count_DF

In [18]:
genesets_DF = genesets_empty_DF %>%
rbind(head(NeST_DF, 126)) %>%
rbind(head(L1000_DF, 126)) %>%
rbind(head(viral_Infections_all_long_DF, 48)) 

In [19]:
genesets_DF %>% pull(Source) %>% table()

.
           L1000             NeST Viral_Infections 
             126              126               48 

In [20]:
write_delim(x = genesets_DF,file = "data/omics_revamped.txt" , delim = "\t")