In [1]:
library(singscore)
library(tidyverse)
library(ggplot2)
library(msigdbr)
library(gprofiler2)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [2]:
# Pin the random number generator so this part of the notebook is reproducible.
# Seed source noted by the author: https://www.random.org/ (2023-08-09)
set.seed(seed = 3866)

In [3]:
# MSigDB gene sets for human ("Homo sapiens"), restricted to category "H".
# The scoring code below reads the `gs_name` and `ensembl_gene` columns of
# this table.
hallmark_gene_sets <- msigdbr(species = "Homo sapiens", category = "H")

In [4]:
# Hallmark enrichment scores for the TRAIN split.
#
# Reads the TPM expression table and the label table, ranks the expression
# table with singscore's rankGenes(), scores every gene set found in
# `hallmark_gene_sets` with simpleScore(), and writes the collected
# TotalScore values (rows named after the rows simpleScore() returns, one
# column per gene set) to a tab-separated file.

# First line is the header; first column supplies the row names.
tpm <- read.table("../../../../data/expression/processed/train/V4/IO-Atlas-NSCLC-TPM-TRAIN-2023-08-10-V4.tsv",
                  sep = "\t",
                  row.names = 1,
                  header = TRUE)

# NOTE(review): `labels` is not used by the scoring below; it is still loaded
# so the variable stays available to any later cell that relies on it.
labels <- read.table("../../../../data/expression/processed/train/V4/IO-Atlas-NSCLC-LABEL-TRAIN-2023-08-10-V4.tsv",
                     sep = "\t",
                     row.names = 1,
                     header = TRUE)

rnk <- rankGenes(tpm)

# Filled one column per gene set; rows are addressed by the row names of each
# simpleScore() result, so values are aligned by name rather than by position.
scores <- data.frame()

for (name in unique(hallmark_gene_sets$gs_name)) {
  gs <- hallmark_gene_sets %>% filter(gs_name == name)

  # Only an up-regulated set is supplied; duplicate Ensembl IDs are dropped.
  # The run recorded in this notebook printed "genes missing" warnings here,
  # presumably for IDs that are absent from the expression table.
  score <- simpleScore(rnk, upSet = unique(gs$ensembl_gene))

  scores[row.names(score), name] <- score$TotalScore
}

# Row names are written as well (write.table default), so the header line has
# one field fewer than the data lines.
write.table(scores,
            file = "../../../../data/enrichment/IO-Atlas-NSCLC-TPM-TRAIN-2023-08-10-V4-hallmark-enrichment.tsv",
            sep = "\t")

“15 genes missing: ENSG00000182035, ENSG00000181092, ENSG00000285070, ENSG00000242131, ENSG00000273607, ENSG00000176194, ENSG00000275003, ENSG00000261698, ENSG00000285482, ENSG00000282853, ENSG00000174697, ENSG00000283887, ENSG00000285121, ENSG00000285387, ENSG00000050628”
“150 genes missing: ENSG00000273686, ENSG00000204364, ENSG00000206372, ENSG00000226560, ENSG00000231543, ENSG00000235017, ENSG00000235696, ENSG00000164326, ENSG00000275824, ENSG00000277943, ENSG00000274233, ENSG00000108688, ENSG00000288145, ENSG00000197561, ENSG00000277571, ENSG00000206505, ENSG00000223980, ENSG00000224320, ENSG00000227715, ENSG00000229215, ENSG00000231834, ENSG00000235657, ENSG00000239463, ENSG00000241394, ENSG00000242361, ENSG00000242685, ENSG00000243189, ENSG00000243215, ENSG00000243719, ENSG00000226264, ENSG00000234154, ENSG00000239329, ENSG00000241296, ENSG00000241674, ENSG00000242092, ENSG00000242386, ENSG00000206292, ENSG00000230141, ENSG00000231558, ENSG00000232957, ENSG00000232962, ENSG00000

In [5]:
# Hallmark enrichment scores for the TEST split.
#
# Reads the TPM expression table and the label table, ranks the expression
# table with singscore's rankGenes(), scores every gene set found in
# `hallmark_gene_sets` with simpleScore(), and writes the collected
# TotalScore values (rows named after the rows simpleScore() returns, one
# column per gene set) to a tab-separated file.
# Note: this overwrites `tpm`, `labels`, `rnk` and `scores` in the session.

# First line is the header; first column supplies the row names.
tpm <- read.table("../../../../data/expression/processed/test/V4/IO-Atlas-NSCLC-TPM-TEST-2023-08-10-V4.tsv",
                  sep = "\t",
                  row.names = 1,
                  header = TRUE)

# NOTE(review): `labels` is not used by the scoring below; it is still loaded
# so the variable stays available to any later cell that relies on it.
labels <- read.table("../../../../data/expression/processed/test/V4/IO-Atlas-NSCLC-LABEL-TEST-2023-08-10-V4.tsv",
                     sep = "\t",
                     row.names = 1,
                     header = TRUE)

rnk <- rankGenes(tpm)

# Filled one column per gene set; rows are addressed by the row names of each
# simpleScore() result, so values are aligned by name rather than by position.
scores <- data.frame()

for (name in unique(hallmark_gene_sets$gs_name)) {
  gs <- hallmark_gene_sets %>% filter(gs_name == name)

  # Only an up-regulated set is supplied; duplicate Ensembl IDs are dropped.
  # The run recorded in this notebook printed "genes missing" warnings here,
  # presumably for IDs that are absent from the expression table.
  score <- simpleScore(rnk, upSet = unique(gs$ensembl_gene))

  scores[row.names(score), name] <- score$TotalScore
}

# Row names are written as well (write.table default), so the header line has
# one field fewer than the data lines.
write.table(scores,
            file = "../../../../data/enrichment/IO-Atlas-NSCLC-TPM-TEST-2023-08-10-V4-hallmark-enrichment.tsv",
            sep = "\t")

“15 genes missing: ENSG00000182035, ENSG00000181092, ENSG00000285070, ENSG00000242131, ENSG00000273607, ENSG00000176194, ENSG00000275003, ENSG00000261698, ENSG00000285482, ENSG00000282853, ENSG00000174697, ENSG00000283887, ENSG00000285121, ENSG00000285387, ENSG00000050628”
“150 genes missing: ENSG00000273686, ENSG00000204364, ENSG00000206372, ENSG00000226560, ENSG00000231543, ENSG00000235017, ENSG00000235696, ENSG00000164326, ENSG00000275824, ENSG00000277943, ENSG00000274233, ENSG00000108688, ENSG00000288145, ENSG00000197561, ENSG00000277571, ENSG00000206505, ENSG00000223980, ENSG00000224320, ENSG00000227715, ENSG00000229215, ENSG00000231834, ENSG00000235657, ENSG00000239463, ENSG00000241394, ENSG00000242361, ENSG00000242685, ENSG00000243189, ENSG00000243215, ENSG00000243719, ENSG00000226264, ENSG00000234154, ENSG00000239329, ENSG00000241296, ENSG00000241674, ENSG00000242092, ENSG00000242386, ENSG00000206292, ENSG00000230141, ENSG00000231558, ENSG00000232957, ENSG00000232962, ENSG00000