In [1]:
library(plyr, quietly = TRUE)
library(tidyverse, quietly = TRUE)
library(ggpubr)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::arrange()   masks plyr::arrange()
✖ purrr::compact()   masks plyr::compact()
✖ dplyr::count()     masks plyr::count()
✖ dplyr::desc()      masks plyr::desc()
✖ dplyr::failwith()  masks plyr::failwith()
✖ dplyr::filter()

In [2]:
# Cut-offs used by the cells below.
LLM_score_thresh <- 0.8          # minimum `Score` for an LLM success
enrichr_adj_pval_thresh <- 0.05  # maximum Enrichr adjusted p-value
enrichr_JI_thresh <- 0.1         # minimum Jaccard index

In [3]:
# Load the tab-separated table and copy `Adjusted P-value` into a column with
# a syntactic name (`enrichr_adj_pVal`) that the later cells refer to.
LLM_genes_DF <- read_delim(
  file = "data/omics_revamped_LLM_Enrichr_simVals_DF.tsv",
  delim = "\t"
) %>%
  mutate(enrichr_adj_pVal = `Adjusted P-value`)


[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m11310[39m [1mColumns: [22m[34m20[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (12): Source, GeneSetID, GeneSetName, GeneList, LLM Name, LLM Analysis, ...
[32mdbl[39m  (8): ...1, Unnamed: 0, n_Genes, Score, Rank, P-value, Adjusted P-value,...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [4]:
# Row and column counts of the loaded table (sanity check).
dim(LLM_genes_DF)

In [5]:
# Jaccard index between two space-separated gene lists.
#
# Args:
#   GeneList:      a single string of gene symbols separated by spaces.
#   enrichr_genes: a single string in the same format.
#
# Returns:
#   |A intersect B| / |A union B| as a number in [0, 1]. Returns 0 when both
#   lists are empty, where the ratio would otherwise be 0/0.
#
# Blank tokens (from empty strings or repeated/trailing spaces) and NA are
# dropped before comparing, so they are never counted as a shared "gene".
get_JI <- function(GeneList, enrichr_genes) {
  split_genes <- function(x) {
    # assume both lists use the same single-space separator
    tokens <- strsplit(as.character(x), " ", fixed = TRUE)[[1]]
    unique(tokens[!is.na(tokens) & nzchar(tokens)])
  }

  geneSetGenes <- split_genes(GeneList)
  enrichRGenes <- split_genes(enrichr_genes)

  n_union <- length(union(geneSetGenes, enrichRGenes))
  if (n_union == 0) {
    return(0)
  }

  length(intersect(geneSetGenes, enrichRGenes)) / n_union
}

In [6]:
## Unified function to filter and select the appropriate row
##
## Args:
##   df:     data frame with numeric columns `enrichr_JI` and
##           `enrichr_adj_pVal`.
##   method: "Best APV" (default) keeps rows with JI >= JI_thresh and adjusted
##           p-value <= adj_pval_thresh; "Best JI" keeps the single row with
##           the highest JI (used in original revised paper). Any other value
##           is rejected by match.arg().
##   JI_thresh, adj_pval_thresh: cut-offs for "Best APV"; they default to the
##           notebook-level `enrichr_JI_thresh` / `enrichr_adj_pval_thresh`
##           and are only evaluated when that method is used.
##
## Returns:
##   The candidate row with the smallest adjusted p-value (first one on ties).
##   For any method, if no row qualifies, the minimum-APV row of `df` is
##   returned instead. Zero rows are returned when the relevant p-values are
##   all NA.
filter_and_select <- function(df, method = c("Best APV", "Best JI"),
                              JI_thresh = enrichr_JI_thresh,
                              adj_pval_thresh = enrichr_adj_pval_thresh) {
  method <- match.arg(method)

  ji <- df[["enrichr_JI"]]
  apv <- df[["enrichr_adj_pVal"]]

  # Row indices of the candidates for the chosen method; which() drops NAs.
  candidates <- switch(method,
    "Best JI" = which.max(ji),
    "Best APV" = which(ji >= JI_thresh & apv <= adj_pval_thresh)
  )

  # Nothing qualified: fall back to every row and take the overall minimum.
  if (length(candidates) == 0) {
    candidates <- seq_len(nrow(df))
  }

  best <- candidates[which.min(apv[candidates])]
  df[best, , drop = FALSE]
}

In [7]:
## GET JI
## get_JI() handles one pair of strings at a time, so it is mapped over the
## GeneList / GO_term_genes columns. Using map2_dbl() instead of rowwise()
## leaves LLM_genes_DF as a regular tibble rather than a rowwise one.
LLM_genes_DF <- LLM_genes_DF %>%
  mutate(enrichr_JI = map2_dbl(GeneList, GO_term_genes, get_JI))

In [8]:
## Select term based on criteria
## Rows are grouped by (Source, GeneSetID, GeneSetName, GeneList); each group
## is handed to filter_and_select() with the "Best APV" rule, and the
## per-group results are combined and ungrouped.
LLM_genes_selectedTerms_DF <- LLM_genes_DF %>%
  group_by(Source, GeneSetID, GeneSetName, GeneList) %>%
  do(filter_and_select(.,"Best APV" )) %>%
  ungroup()


In [9]:
# Row and column counts after term selection (sanity check).
dim(LLM_genes_selectedTerms_DF)

In [10]:
## Apply "success" condition
## Both flags are plain column-wise comparisons, so no rowwise() is needed and
## the result is not left in rowwise mode for the cells that follow.
LLM_genes_selectedTerms_DF <- LLM_genes_selectedTerms_DF %>%
  mutate(
    # LLM succeeds when its score reaches the threshold.
    LLM_success_TF = Score >= LLM_score_thresh,
    # Enrichr fails if the adjusted p-value is too large or the JI too small.
    enrichr_success_TF = !((enrichr_adj_pVal > enrichr_adj_pval_thresh) |
      (enrichr_JI < enrichr_JI_thresh))
  )

# NOTE: the score will be selected in 6A based on curve

In [11]:
# Row and column counts after adding the success flags (sanity check).
dim(LLM_genes_selectedTerms_DF)

In [12]:
# Save the selected terms, including the success flags, as tab-separated text.
write_delim(
  x = LLM_genes_selectedTerms_DF,
  file = "data/omics_revamped_LLM_Enrichr_simVals_failure_240625_maxAPV_DF.tsv",
  delim = "\t"
)

In [13]:
# Enrichr success with the Jaccard cut-off fixed at 0.1: TRUE only when the
# adjusted p-value is within its threshold and the JI is at least 0.1.
LLM_genes_selectedTerms_DF <- LLM_genes_selectedTerms_DF %>%
  mutate(
    enrichr_success_TF_0.1 =
      (enrichr_adj_pVal <= enrichr_adj_pval_thresh) & (enrichr_JI >= 0.1)
  )

In [14]:
# Cross-tabulate Enrichr success (rows) against LLM success (columns).
LLM_genes_selectedTerms_DF %>%
  select(enrichr_success_TF_0.1, LLM_success_TF) %>%
  table()


                      LLM_success_TF
enrichr_success_TF_0.1 FALSE TRUE
                 FALSE   163   97
                 TRUE      4   36

In [None]:
LLM_success_TF      True  False  Total
enrichr_success_TF                    
True                  36      4     40
False                 97    163    260
Total                133    167    300