In [1]:
library(plyr, verbose  = FALSE)
library(tidyverse, verbose  = FALSE)
library(tidyjson, verbose  = FALSE)
library(readxl, verbose  = FALSE)
#library(disgenet2r, verbose  = FALSE)
#library(getPass, verbose  = FALSE)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32marrange()[39m   masks [34mplyr[39m::arrange()
[31m✖[39m [34mpurrr[39m::[32mcompact()[39m   masks [34mplyr[39m::compact()
[31m✖[39m [34mdplyr[39m::[32mcount()[39m     masks [34mplyr[39m::count()
[31m✖[39m [34mdplyr[39m::[32mdesc()[39m      masks [34mplyr[39m::desc()
[31m✖[39m [34mdplyr[39m::[32mfailwith()[39m  masks [34mplyr[39m::failwith()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39

In [2]:
# "Not in" operator: TRUE for every element of `x` that has no match in
# `table` (the elementwise negation of `%in%`).
`%ni%` <- function(x, table) {
  !(x %in% table)
}

In [3]:
# Column layout shared by every gene-set table assembled in this notebook.
genesets_colnames <- c("Source", "GeneSetID", "GeneSetName", "GeneList", "n_Genes")

# Zero-row matrix that only carries the column names.
genesets_MAT <- matrix(
  nrow = 0,
  ncol = length(genesets_colnames),
  dimnames = list(NULL, genesets_colnames)
)

# Empty, correctly typed template: every column is character except n_Genes,
# which is integer.
genesets_empty_DF <- as_tibble(genesets_MAT) %>%
  mutate(
    across(-n_Genes, as.character),
    n_Genes = as.integer(n_Genes)
  )

# Working copy; the template itself is kept pristine for later reuse.
genesets_DF <- genesets_empty_DF

### NeST

In [4]:
# Build the NeST gene-set table: one row per distinct value of the first edge
# column, with the matching second-column entries collapsed into a
# space-separated GeneList.
# The edge file is read without a header, so readr names the columns X1..X3.
# NOTE(review): X1 is presumably the NeST assembly ID and X2 the member gene
# symbol -- confirm against the NeST file description.
NeST_DF <- read_delim(
  "data/Omics_data/NeST__IAS_clixo_hidef_Nov17.edges",
  delim = "\t",
  col_names = FALSE
) %>%
  # Keep only rows whose third column is "gene".
  filter(X3 == "gene") %>%
  group_by(X1) %>%
  summarize(GeneList = paste0(X2, collapse = " ")) %>%
  # Number of space-separated entries = separators + 1. This is the vectorised
  # equivalent of splitting each row on " " and taking the length, and it
  # avoids leaving a rowwise tibble behind.
  mutate(n_Genes = str_count(GeneList, " ") + 1L) %>%
  # Keep sets with 3 to 100 entries (between() is inclusive on both ends).
  filter(between(n_Genes, 3, 100)) %>%
  mutate(
    Source = "NeST",
    GeneSetID = X1,
    GeneSetName = X1
  ) %>%
  # all_of() replaces the deprecated bare external vector in select().
  select(all_of(genesets_colnames))

[1mRows: [22m[34m39463[39m [1mColumns: [22m[34m3[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): X1, X2, X3

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
“[1m[22mUsing an external vector in selections was deprecated in tidyselect 1.1.0.
[36mℹ[39m Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(genesets_colnames)

  # Now:
  data %>% select(all_of(genesets_colnames))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.”


### L1000

In [5]:
# Build one gene set per L1000 perturbation signature.
# The signature column's header lists its seven underscore-separated fields:
# "Perturbation ID_Perturbagen_Cell Line_Time_Time Unit_Dose_Dose Unit".
# It is renamed to GeneSetID once so the long name is not repeated.
L1000_all_DF <- read_delim(
  "data/Omics_data/L1000__gene_attribute_edges.txt",
  delim = "\t",
  skip = 1
) %>%
  rename(
    GeneSetID = `Perturbation ID_Perturbagen_Cell Line_Time_Time Unit_Dose_Dose Unit`
  ) %>%
  group_by(GeneSetID) %>%
  summarize(GeneList = paste0(GeneSym, collapse = " ")) %>%
  # Number of space-separated entries = separators + 1 (vectorised; no
  # rowwise() needed, so the result stays a plain ungrouped tibble).
  mutate(n_Genes = str_count(GeneList, " ") + 1L) %>%
  # Keep sets with 3 to 100 entries (between() is inclusive on both ends).
  filter(between(n_Genes, 3, 100)) %>%
  mutate(
    Source = "L1000",
    GeneSetName = GeneSetID
  ) %>%
  # Same column order as before the rename was introduced.
  select(GeneList, n_Genes, Source, GeneSetID, GeneSetName) %>%
  # Split the signature ID into its fields; GeneSetName itself is kept.
  # NOTE(review): some IDs contain more than six underscores, so separate()
  # warns and discards the surplus pieces; for those rows the fields after the
  # extra underscore are shifted. Confirm which field holds the extra "_"
  # before relying on Cellline/duration/dosage for those rows.
  separate(
    col = "GeneSetName",
    into = c("Drug", "Reagent", "Cellline", "duration", "duration_unit", "dosage", "dosage_unit"),
    sep = "_",
    remove = FALSE
  )

[1m[22mNew names:
[36m•[39m `NA` -> `NA...2`
[36m•[39m `NA` -> `NA...5`
[36m•[39m `NA` -> `NA...6`
[1mRows: [22m[34m4189677[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): GeneSym, NA...2, Perturbation ID_Perturbagen_Cell Line_Time_Time Un...
[32mdbl[39m (3): GeneID, NA...6, weight

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
“[1m[22mExpected 7 pieces. Additional pieces discarded in 85 rows [2543, 2544, 2545,
2546, 2547, 2548, 2549, 2550, 2551, 2552, 3579, 3580, 3581, 3582, 3583, 3584,
3585, 3586, 4223, 4224, ...].”


In [6]:
# Show the dimensions (rows, columns) of the L1000 gene-set table.
dim(L1000_all_DF)

In [7]:
# Count L1000 gene sets per combination of the parsed ID fields, most frequent
# combination first.
L1000_sep_count_DF <- L1000_all_DF %>%
  group_by(Reagent, Cellline, duration, duration_unit, dosage, dosage_unit) %>%
  # .groups = "drop" returns an ungrouped tibble and silences the
  # "summarise() has grouped output by ..." message.
  summarise(n_genesets = n(), .groups = "drop") %>%
  arrange(desc(n_genesets))

[1m[22m`summarise()` has grouped output by 'Reagent', 'Cellline', 'duration',
'duration_unit', 'dosage'. You can override using the `.groups` argument.


In [8]:
# Display the gene-set counts per parsed-field combination.
L1000_sep_count_DF

Reagent,Cellline,duration,duration_unit,dosage,dosage_unit,n_genesets
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>
-666,MCF7,6.0,h,10.0,um,439
-666,A375,6.0,h,10.0,um,347
-666,MCF7,24.0,h,10.0,um,336
-666,HT29,6.0,h,10.0,um,324
-666,PC3,6.0,h,10.0,um,239
-666,VCAP,6.0,h,10.0,um,232
-666,PC3,24.0,h,10.0,um,227
-666,HEPG2,6.0,h,10.0,um,212
-666,VCAP,24.0,h,10.0,um,207
-666,ASC,24.0,h,10.0,um,139


In [9]:
# Save the count table as a tab-delimited text file.
write_delim(x = L1000_sep_count_DF, file = "data/L1000_sep_count_DF.txt", delim = "\t")

In [10]:
# Show the dimensions (rows, columns) of the count table.
dim(L1000_sep_count_DF)

In [11]:
# Restrict the L1000 gene sets to a single condition: MCF7 cells, duration
# "6.0", dosage "10.0". The parsed fields are character columns, so the
# comparisons are against strings.
L1000_DF <- L1000_all_DF %>%
  # Shed any row-wise/grouped structure inherited from the upstream table.
  ungroup() %>%
  filter(Cellline == "MCF7", duration == "6.0", dosage == "10.0") %>%
  # all_of() replaces the deprecated bare external vector in select().
  select(all_of(genesets_colnames))

In [12]:
# Show the dimensions (rows, columns) of the filtered L1000 table.
dim(L1000_DF)

In [13]:
# Preview the first rows of the filtered L1000 table.
head(L1000_DF)

Source,GeneSetID,GeneSetName,GeneList,n_Genes
<chr>,<chr>,<chr>,<chr>,<int>
L1000,BRD-A00546892_-666_MCF7_6.0_h_10.0_um,BRD-A00546892_-666_MCF7_6.0_h_10.0_um,CITED2 COL5A1 CRABP2 KCTD12 MDFIC MMP2 NRP1 ORC1 PEG10 RCBTB2 S100A8 TMEM45A TWIST1 VIM CLU COCH CST3 DNAJB1 FAM46A LMO2 MAOB NRN1 SLC27A2 ZBTB20,24
L1000,BRD-A00993607_ALPRENOLOL_MCF7_6.0_h_10.0_um,BRD-A00993607_ALPRENOLOL_MCF7_6.0_h_10.0_um,1060P11.3 ADM AHR AMIGO2 ARL4C ATP10D CAV2 CD44 CDC42EP3 CHST1 COLEC12 DENND1B EMP1 EXT1 FAM129A FZD6 GLIPR1 IER5 IGF2BP3 KLRB1 LPAR6 NT5E PLIN2 PLXNA1 POSTN PPP1R3C QPCT SLC5A3 SPON2 TMEM158 TNFSF10 TRIB2 AGR2 CCL19 COMP CRK CXCL13 FAM46A GOLGA8A GRN GSTM3 HPRT1 MYO5C NGFRAP1 PLA2G16 SERPINA3 SLC5A6,47
L1000,BRD-A00993607_Alprenolol hydrochloride_MCF7_6.0_h_10.0_um,BRD-A00993607_Alprenolol hydrochloride_MCF7_6.0_h_10.0_um,ABAT ASS1 CHI3L1 CHST2 CLDN3 EIF5B FRZB GAL HEBP2 MAN2A1 NELL2 PDE8B TBC1D1 TRD TRDV3 TYMP VAMP8 1060P11.3 AKR1B1 ARMCX1 ATF1 CAPN3 CAV1 CAV2 CNN3 CSTA DKK1 DLD EIF4A1 EPB41L2 FADS1 G3BP1 HLA-DPA1 HPRT1 LGALS3BP LHFP LYPLA1 MARCKS MN1 MTUS1 MYH10 NRIP1 OPN3 PIK3R1 PLLP POLR2K PTGER4 RBBP4 RDX THBS1 TMEM255A TNS1 TNS3 TOP2A TRAM1 TSPYL5 UFM1 WFDC2 WSB1,59
L1000,BRD-A01320529_Salmeterol_MCF7_6.0_h_10.0_um,BRD-A01320529_Salmeterol_MCF7_6.0_h_10.0_um,AMIGO2 AREG GAS6 GPR37 IFT57 PELI1 SQLE AKAP12 EXOSC4,9
L1000,BRD-A01346607_FLUMETHASONE_MCF7_6.0_h_10.0_um,BRD-A01346607_FLUMETHASONE_MCF7_6.0_h_10.0_um,1060P11.3 CPE EFNB2 HIST1H2AC IL1R2 INHBB LYPD1 NUP93 SOX9 AKAP12 ALCAM CDH3 CTSH GDF15 SUMO2,15
L1000,BRD-A01593789_CHLORMADINONE ACETATE_MCF7_6.0_h_10.0_um,BRD-A01593789_CHLORMADINONE ACETATE_MCF7_6.0_h_10.0_um,1060P11.3 ADH5 ADM ATF3 CASP1 CLEC2B CPA3 CRIM1 DKK3 FEZ1 MECOM MKNK1 NXN PCK1 RGCC S100A8 SPON1 SST TMEM41B CYP1B1 DNAJC12 ETFB FOLR1 GPX3 IGFBP4 MAN1C1 OXA1L PIK3R1 RAD23B RARRES1 RNF167 SORD SPDEF TACSTD2 TBC1D4 TFF1 TNC TSPYL5,38


### Viral Infections

In [14]:
# Build one gene set per column of the viral-infection gene/attribute matrix:
# a gene joins a column's set when the absolute value of its entry is >= 2.
viral_Infections_all_long_DF <- read_delim(
  file = "data//Viral_Infections_gene_attribute_matrix_standardized.txt",
  delim = "\t",
  skip = 1
) %>%
  # readr de-duplicates the repeated "#" header, so the first column arrives
  # as `#...1`.
  rename(GeneSym = `#...1`) %>%
  # Drop the second and third columns and the first remaining row.
  # NOTE(review): these are presumably extra annotation columns and a
  # secondary header row -- confirm against the file layout.
  select(c(-2, -3)) %>%
  slice(-1) %>%
  # Long format: one row per (gene, column); every column except GeneSym.
  pivot_longer(-GeneSym) %>%
  filter(abs(value) >= 2) %>%
  group_by(name) %>%
  summarize(GeneList = paste0(GeneSym, collapse = " ")) %>%
  # Number of space-separated entries = separators + 1 (vectorised; no
  # rowwise() needed, so the result stays a plain ungrouped tibble).
  mutate(n_Genes = str_count(GeneList, " ") + 1L) %>%
  # Keep sets with 3 to 100 entries (between() is inclusive on both ends).
  filter(between(n_Genes, 3, 100)) %>%
  mutate(
    Source = "Viral_Infections",
    GeneSetID = name,
    GeneSetName = name
  ) %>%
  select(-name) %>%
  # Split the ID into virus/duration; both columns are dropped again by the
  # final select(), so they only matter if that select() is removed.
  separate(
    col = GeneSetID,
    into = c("virus", "duration"),
    sep = "_",
    remove = FALSE
  ) %>%
  # all_of() replaces the deprecated bare external vector in select().
  select(all_of(genesets_colnames))

[1m[22mNew names:
[36m•[39m `#` -> `#...1`
[36m•[39m `#` -> `#...2`
[36m•[39m `icSARS CoV_24Hour` -> `icSARS CoV_24Hour...11`
[36m•[39m `icSARS CoV_60Hour` -> `icSARS CoV_60Hour...12`
[36m•[39m `icSARS CoV_54Hour` -> `icSARS CoV_54Hour...13`
[36m•[39m `icSARS CoV_48Hour` -> `icSARS CoV_48Hour...14`
[36m•[39m `icSARS CoV_72Hour` -> `icSARS CoV_72Hour...15`
[36m•[39m `icSARS CoV_36Hour` -> `icSARS CoV_36Hour...16`
[36m•[39m `icSARS CoV_30Hour` -> `icSARS CoV_30Hour...17`
[36m•[39m `icSARS CoV_12Hour` -> `icSARS CoV_12Hour...19`
[36m•[39m `icSARS CoV_7Hour` -> `icSARS CoV_7Hour...21`
[36m•[39m `SARS-CoV MA15_Day2-PFU-10^4` -> `SARS-CoV MA15_Day2-PFU-10^4...29`
[36m•[39m `SARS-CoV MA15_Day2` -> `SARS-CoV MA15_Day2...31`
[36m•[39m `SARS-CoV MA15_Day2` -> `SARS-CoV MA15_Day2...32`
[36m•[39m `SARS-CoV MA15_Day7-C57BL6` -> `SARS-CoV MA15_Day7-C57BL6...34`
[36m•[39m `SARS-CoV MA15_Day7-PFU-10^5` -> `SARS-CoV MA15_Day7-PFU-10^5...36`
[36m•[39m `SARS-CoV MA15_D

In [15]:
# viral_Infections_all_count_DF = viral_Infections_all_long_DF %>%
# group_by(duration) %>%
# summarise(n_genesets = n()) %>%
# arrange(desc(n_genesets))

# viral_Infections_all_count_DF

In [16]:
# Show the dimensions (rows, columns) of the viral-infection gene-set table.
dim(viral_Infections_all_long_DF)

In [17]:
# Total number of gene sets taken when the sources are combined:
# 126 NeST + 126 L1000 + 48 viral-infection sets.
(126 + 126 + 48)

In [18]:
# Assemble the combined gene-set table from the first 126 NeST sets, the first
# 126 L1000 sets and the first 48 viral-infection sets, starting from the
# empty typed template so the column types are fixed up front.
# bind_rows() matches columns by name and checks their types in one call,
# replacing the chain of pairwise rbind() calls.
genesets_DF <- bind_rows(
  genesets_empty_DF,
  head(NeST_DF, 126),
  head(L1000_DF, 126),
  head(viral_Infections_all_long_DF, 48)
)

In [19]:
# Tabulate how many gene sets each source contributes to the combined table.
genesets_DF %>% pull(Source) %>% table()

.
           L1000             NeST Viral_Infections 
             126              126               48 

In [20]:
# Save the combined gene-set table as a tab-delimited text file.
write_delim(x = genesets_DF,file = "data/omics_revamped.txt" , delim = "\t")