# Gene Set Enrichment Analysis (GSEA)
# Phase 1 vs. Phase 1

using [`GSEA`](https://rdrr.io/bioc/clusterProfiler/man/GSEA.html) and [`enrichr`](https://rdrr.io/bioc/clusterProfiler/man/enricher.html) commands in the [clusterProfiler](https://github.com/YuLab-SMU/clusterProfiler) R package

## 0. load libraries

In [35]:
library(tidyverse)
library(clusterProfiler)
library(enrichplot)
library(DOSE)

DOSE v3.28.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use DOSE in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609




## 1. read in CSVs

read in GO annotation data

In [9]:
# col1 = gene ID
# col2 = GO ID 
gene2go <- read.csv('/work/pi_sarah_gignouxwolfsohn_uml_edu/julia_mcdonough_student_uml_edu/ref_files/annotations/geneGO.txt', sep = '\t')
head(gene2go)

Unnamed: 0_level_0,gene,Gene.Ontology.IDs
Unnamed: 0_level_1,<chr>,<chr>
1,LOC111126949,GO:0005042; GO:0005737; GO:0008233; GO:0016020; GO:0043123; GO:0046330
2,LOC111112434,GO:0000981; GO:0003700; GO:0005634; GO:0016607; GO:0043565; GO:0045944
3,LOC111120752,GO:0004750; GO:0005829; GO:0005975; GO:0006098; GO:0009052; GO:0042802; GO:0042803; GO:0046872; GO:0070062
4,LOC111105685,GO:0003682; GO:0004518; GO:0005634; GO:0035098; GO:0035102; GO:0040029; GO:0046872
5,LOC111113860,GO:0004062; GO:0005737; GO:0006068; GO:0006805; GO:0008146; GO:0009812; GO:0030855; GO:0042403; GO:0050427; GO:0051923
6,LOC111109550,GO:0004062; GO:0005737; GO:0005764; GO:0008146; GO:0051923


re-format for correct input for `GSEA()` - two columns, one for GO term and one for gene ID

In [27]:
term2gene <- gene2go %>%
  mutate(GO_terms = strsplit(Gene.Ontology.IDs, ",\\s*|;\\s*|`")) %>%  # Split by comma, semicolon, or backtick
  unnest(GO_terms) %>%
  filter(grepl("^GO:", GO_terms)) %>%  # Keep only valid GO terms
  select(term = GO_terms, gene = gene)



class(term2gene)
str(term2gene)
head(term2gene)

tibble [223,103 × 2] (S3: tbl_df/tbl/data.frame)
 $ term: chr [1:223103] "GO:0005042" "GO:0005737" "GO:0008233" "GO:0016020" ...
 $ gene: chr [1:223103] "LOC111126949" "LOC111126949" "LOC111126949" "LOC111126949" ...


term,gene
<chr>,<chr>
GO:0005042,LOC111126949
GO:0005737,LOC111126949
GO:0008233,LOC111126949
GO:0016020,LOC111126949
GO:0043123,LOC111126949
GO:0046330,LOC111126949


read in *all* genes from DESeq output (not just DEGs)

In [63]:
##### comparisons vs. control
# warm vs. control
w.c <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/analysis/diff_expression/phase1_v_phase1/deseq_res_files/p1.warm_v_control.csv')

# both vs. control
b.c <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/analysis/diff_expression/phase1_v_phase1/deseq_res_files/p1.both_v_control.csv')

# hyp vs. control
h.c <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/analysis/diff_expression/phase1_v_phase1/deseq_res_files/p1.hyp_v_control.csv')

##### other comparisons
# hyp vs. both
h.b <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/analysis/diff_expression/phase1_v_phase1/deseq_res_files/p1.hyp_v_both.csv')

# warm vs. both
w.b <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/analysis/diff_expression/phase1_v_phase1/deseq_res_files/p1.warm_v_both.csv')

# warm vs. hyp
w.h <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/analysis/diff_expression/phase1_v_phase1/deseq_res_files/p1.warm_v_hyp.csv')

## 2. Formatting CSVs
input to `gsea()` is an order ranked geneList - I'm going to rank by log2FoldChange

In [64]:
# data needs to be a numeric vector
# create function for above cell block so can easily apply to other data
format_geneList <- function(df) {
    geneList <- df$log2FoldChange 
    # Set gene IDs as names
    names(geneList) <- df$X
    
    # Sort in decreasing order
    geneList <- sort(geneList, decreasing = TRUE)

    return(geneList)
    }

In [65]:
w.c_geneList <- format_geneList(w.c)
h.c_geneList <- format_geneList(h.c)
b.c_geneList <- format_geneList(b.c)
h.b_geneList <- format_geneList(h.b)
w.b_geneList <- format_geneList(w.b)
w.h_geneList <- format_geneList(w.h)

# check that it worked
class(w.c_geneList)
head(w.c_geneList)

## 3. Run GSEA

In [66]:
# warm vs control
gsea_res_w.c <- GSEA(
    geneList = w.c_geneList,
    TERM2GENE = term2gene, 
    pvalueCutoff = 0.05)
as.data.frame(gsea_res_w.c)

preparing geneSet collections...

GSEA analysis...

“There are ties in the preranked stats (1.24% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.”
no term enriched under specific pvalueCutoff...



ID,Description,setSize,enrichmentScore,NES,pvalue,p.adjust,qvalue
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>


In [67]:
# both vs control
gsea_res_b.c <- GSEA(
    geneList = b.c_geneList,
    TERM2GENE = term2gene, 
    pvalueCutoff = 0.05,
    verbose = TRUE)
as.data.frame(gsea_res_b.c)

preparing geneSet collections...

GSEA analysis...

“There are ties in the preranked stats (1.34% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.”
leading edge analysis...

done...



Unnamed: 0_level_0,ID,Description,setSize,enrichmentScore,NES,pvalue,p.adjust,qvalue,rank,leading_edge,core_enrichment
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
GO:0050770,GO:0050770,GO:0050770,19,-0.9982205,-1.441147,5.635311e-06,0.02061397,0.02061397,22,"tags=16%, list=0%, signal=16%",LOC111130881/LOC111125415/LOC111125416


In [68]:
# hypoxic vs control
gsea_res_h.c <- GSEA(
    geneList = h.c_geneList,
    TERM2GENE = term2gene, 
    pvalueCutoff = 0.05,
    verbose = TRUE)
as.data.frame(gsea_res_h.c)

preparing geneSet collections...

GSEA analysis...

“There are ties in the preranked stats (1.26% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.”
no term enriched under specific pvalueCutoff...



ID,Description,setSize,enrichmentScore,NES,pvalue,p.adjust,qvalue
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>


In [70]:
# hypoxic vs both
gsea_res_h.b <- GSEA(
    geneList = h.b_geneList,
    TERM2GENE = term2gene, 
    pvalueCutoff = 0.05,
    verbose = TRUE)
as.data.frame(gsea_res_h.b)

preparing geneSet collections...

GSEA analysis...

“There are ties in the preranked stats (0.77% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.”
leading edge analysis...

done...



Unnamed: 0_level_0,ID,Description,setSize,enrichmentScore,NES,pvalue,p.adjust,qvalue,rank,leading_edge,core_enrichment
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>
GO:0007274,GO:0007274,GO:0007274,29,0.9989389,1.396448,1.240006e-05,0.04535941,0.04535941,10,"tags=3%, list=0%, signal=3%",LOC111120179


In [71]:
# warm vs both
gsea_res_w.b <- GSEA(
    geneList = w.b_geneList,
    TERM2GENE = term2gene, 
    pvalueCutoff = 0.05,
    verbose = TRUE)
as.data.frame(gsea_res_w.b)

preparing geneSet collections...

GSEA analysis...

“There are ties in the preranked stats (0.72% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.”
no term enriched under specific pvalueCutoff...



ID,Description,setSize,enrichmentScore,NES,pvalue,p.adjust,qvalue
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>


In [72]:
# warm vs hypoxic
gsea_res_w.h <- GSEA(
    geneList = w.h_geneList,
    TERM2GENE = term2gene, 
    pvalueCutoff = 0.05,
    verbose = TRUE)
as.data.frame(gsea_res_w.h)

preparing geneSet collections...

GSEA analysis...

“There are ties in the preranked stats (1.48% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.”
leading edge analysis...

done...



Unnamed: 0_level_0,ID,Description,setSize,enrichmentScore,NES,pvalue,p.adjust,qvalue,rank,leading_edge,core_enrichment
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>
GO:0004748,GO:0004748,GO:0004748,75,0.9643563,2.184434,1.228681e-05,0.02247258,0.02247258,33,"tags=1%, list=0%, signal=1%",LOC111108941
GO:0009263,GO:0009263,GO:0009263,75,0.9643563,2.184434,1.228681e-05,0.02247258,0.02247258,33,"tags=1%, list=0%, signal=1%",LOC111108941
