# Gene Ontology Analysis
using output from [deseq_p1.v.p1.ipynb](https://github.com/jgmcdonough/CE24_RNA-seq/blob/main/analysis/diff_expression/phase1_v_phase1/deseq_p1.v.p1.ipynb)

following workflow described [here](https://robertslab.github.io/resources/bio-Annotation/) from the Roberts Lab

### 0. load libraries

In [2]:
library(GSEABase)
library(GO.db)
library(knitr)
library(tidyverse)

### 1. Map GO IDs to GOslims
code is taken directly from [Roberts Lab workflow](https://robertslab.github.io/resources/bio-Annotation/)

In [3]:
# Column names corresponding to gene name/ID and GO IDs
GO.ID.column <- "Gene.Ontology.IDs"
gene.ID.column <- "gene_id"

# Relative path or URL to input file
input.file <- "/path/to/file"

##### Official GO info - no need to change #####
goslims_obo <- "goslim_generic.obo"
goslims_url <- "http://current.geneontology.org/ontology/subsets/goslim_generic.obo"


In [4]:
# Find GSEAbase installation location
gseabase_location <- find.package("GSEABase")

# Load path to GOslim OBO file
goslim_obo_destintation <- file.path(gseabase_location, "extdata", goslims_obo, fsep = "/")

# Download the GOslim OBO file
download.file(url = goslims_url, destfile = goslim_obo_destintation)

# Loads package files
gseabase_files <- system.file("extdata", goslims_obo, package="GSEABase")


In [7]:
# read in gene file
full.gene.df <- read.csv(file = input.file, header = TRUE, sep = "\t")

str(full.gene.df)

'data.frame':	4114 obs. of  3 variables:
 $ gene_id          : chr  "g21712" "g21713" "g15181" "g15182" ...
 $ uniprot_accession: chr  "P54985" "Q8TDM6" NA NA ...
 $ Gene.Ontology.IDs: chr  "GO:0003755; GO:0005737; GO:0006457; GO:0043231" "GO:0001837; GO:0005737; GO:0005886; GO:0005912; GO:0007165; GO:0008013; GO:0008092; GO:0008285; GO:0014069; GO:"| __truncated__ NA NA ...


In [6]:
# Clean whitespace, filter NA/empty rows, select columns, and split GO terms using column name variables
gene.GO.df <- full.gene.df %>%
  mutate(!!GO.ID.column := str_replace_all(.data[[GO.ID.column]], "\\s*;\\s*", ";")) %>% # Clean up spaces around ";"
  filter(!is.na(.data[[gene.ID.column]]) & !is.na(.data[[GO.ID.column]]) & .data[[GO.ID.column]] != "") %>% 
  select(all_of(c(gene.ID.column, GO.ID.column)))


str(gene.GO.df)

'data.frame':	3295 obs. of  2 variables:
 $ gene_id          : chr  "g21712" "g21713" "g15183" "g7651" ...
 $ Gene.Ontology.IDs: chr  "GO:0003755;GO:0005737;GO:0006457;GO:0043231" "GO:0001837;GO:0005737;GO:0005886;GO:0005912;GO:0007165;GO:0008013;GO:0008092;GO:0008285;GO:0014069;GO:0030011;G"| __truncated__ "GO:0000785;GO:0000978;GO:0000981;GO:0001228;GO:0001657;GO:0001701;GO:0001706;GO:0001707;GO:0003140;GO:0003180;G"| __truncated__ "GO:0005543;GO:0005737;GO:0005768;GO:0005829;GO:0005886;GO:0006897;GO:0007010;GO:0007015;GO:0030100;GO:0030659;G"| __truncated__ ...


In [8]:
# flatten file
flat.gene.GO.df <- gene.GO.df %>% separate_rows(!!sym(GO.ID.column), sep = ";")

str(flat.gene.GO.df)

tibble [42,028 × 2] (S3: tbl_df/tbl/data.frame)
 $ gene_id          : chr [1:42028] "g21712" "g21712" "g21712" "g21712" ...
 $ Gene.Ontology.IDs: chr [1:42028] "GO:0003755" "GO:0005737" "GO:0006457" "GO:0043231" ...


In [9]:
# group genes by GO.ID
grouped.gene.GO.df <- flat.gene.GO.df %>%
  group_by(!!sym(GO.ID.column)) %>%
  summarise(!!gene.ID.column := paste(.data[[gene.ID.column]], collapse = ","))

str(grouped.gene.GO.df)

tibble [8,666 × 2] (S3: tbl_df/tbl/data.frame)
 $ Gene.Ontology.IDs: chr [1:8666] "GO:0000012" "GO:0000014" "GO:0000015" "GO:0000025" ...
 $ gene_id          : chr [1:8666] "g22030,g22031,g16480,g4241" "g13422,g21327" "g12040" "g9214" ...


In [10]:
# Vector of GO IDs
go_ids <- grouped.gene.GO.df[[GO.ID.column]]

str(go_ids)

 chr [1:8666] "GO:0000012" "GO:0000014" "GO:0000015" "GO:0000025" ...


In [11]:
# create new OBO collection object of just GOslims, based on provided GO IDs
# Create GSEAbase GOCollection using `go_ids`
myCollection <- GOCollection(go_ids)

# Retrieve GOslims from GO OBO file set
slim <- getOBOCollection(gseabase_files)

str(slim)

Formal class 'OBOCollection' [package "GSEABase"] with 7 slots
  ..@ .stanza     :'data.frame':	153 obs. of  1 variable:
  .. ..$ value: chr [1:153] "Root" "Term" "Term" "Term" ...
  ..@ .subset     :'data.frame':	22 obs. of  1 variable:
  .. ..$ value: chr [1:22] "Rhea list of ChEBI terms representing the major species at pH 7.3." "Term not to be used for direct annotation" "Terms planned for obsoletion" "AGR slim" ...
  ..@ .kv         :'data.frame':	2132 obs. of  3 variables:
  .. ..$ stanza_id: chr [1:2132] ".__Root__" ".__Root__" ".__Root__" ".__Root__" ...
  .. ..$ key      : chr [1:2132] "format-version" "data-version" "synonymtypedef" "synonymtypedef" ...
  .. ..$ value    : chr [1:2132] "1.2" "go/releases/2025-10-10/subsets/goslim_generic.owl" "syngo_official_label \"label approved by the SynGO project\"" "systematic_synonym \"Systematic synonym\" EXACT" ...
  ..@ evidenceCode: chr [1:26] "EXP" "IDA" "IPI" "IMP" ...
  ..@ ontology    : chr NA
  ..@ ids         : chr [1:141] "G

In [12]:
# Retrieve Biological Process (BP) GOslims
slimdf <- goSlim(myCollection, slim, "BP", verbose)
str(slimdf)

'data.frame':	72 obs. of  3 variables:
 $ Count  : int  45 12 8 330 37 51 13 3 61 38 ...
 $ Percent: num  0.788 0.21 0.14 5.777 0.648 ...
 $ Term   : chr  "mitotic cell cycle" "cytokinesis" "cytoplasmic translation" "immune system process" ...


### 2. Perform mapping

In [13]:
# List of GOslims and all GO IDs from `go_ids`
gomap <- as.list(GOBPOFFSPRING[rownames(slimdf)])

# Maps `go_ids` to matching GOslims
mapped <- lapply(gomap, intersect, ids(myCollection))

# Append all mapped GO IDs to `slimdf`
# `sapply` needed to apply paste() to create semi-colon delimited values
slimdf$GO.IDs <- sapply(lapply(gomap, intersect, ids(myCollection)), paste, collapse=";")

# Remove "character(0) string from "GO.IDs" column
slimdf$GO.IDs[slimdf$GO.IDs == "character(0)"] <- ""

# Add self-matching GOIDs to "GO.IDs" column, if not present
for (go_id in go_ids) {
  # Check if the go_id is present in the row names
  if (go_id %in% rownames(slimdf)) {
    # Check if the go_id is not present in the GO.IDs column
    # Also removes white space "trimws()" and converts all to upper case to handle
    # any weird, "invisible" formatting issues.
    if (!go_id %in% trimws(toupper(strsplit(slimdf[go_id, "GO.IDs"], ";")[[1]]))) {
      # Append the go_id to the GO.IDs column with a semi-colon separator
      if (length(slimdf$GO.IDs) > 0 && nchar(slimdf$GO.IDs[nrow(slimdf)]) > 0) {
        slimdf[go_id, "GO.IDs"] <- paste0(slimdf[go_id, "GO.IDs"], "; ", go_id)
      } else {
        slimdf[go_id, "GO.IDs"] <- go_id
      }
    }
  }
}

str(slimdf)

'data.frame':	72 obs. of  4 variables:
 $ Count  : int  45 12 8 330 37 51 13 3 61 38 ...
 $ Percent: num  0.788 0.21 0.14 5.777 0.648 ...
 $ Term   : chr  "mitotic cell cycle" "cytokinesis" "cytoplasmic translation" "immune system process" ...
 $ GO.IDs : chr  "GO:0000070;GO:0000082;GO:0000086;GO:0000132;GO:0000281;GO:0006977;GO:0007052;GO:0007076;GO:0007079;GO:0007080;G"| __truncated__ "GO:0000281;GO:0000915;GO:0000917;GO:0031991;GO:0032465;GO:0032467;GO:0036089;GO:0040038;GO:0061640;GO:0061952;G"| __truncated__ "GO:0001731;GO:0001732;GO:0002183;GO:1901194;GO:1903679;GO:2000765;GO:2000767; GO:0002181" "GO:0001771;GO:0001774;GO:0001776;GO:0001779;GO:0001782;GO:0001865;GO:0001867;GO:0001913;GO:0001916;GO:0001922;G"| __truncated__ ...


In [14]:
# "Flatten" file so each row is single GO ID with corresponding GOslim
# rownames_to_column needed to retain row name info
slimdf_separated <- as.data.frame(slimdf %>%
  rownames_to_column('GOslim') %>%
  separate_rows(GO.IDs, sep = ";"))

# Group by unique GO ID
grouped_slimdf <- slimdf_separated %>%
  filter(!is.na(GO.IDs) & GO.IDs != "") %>%
  group_by(GO.IDs) %>%
  summarize(GOslim = paste(GOslim, collapse = ";"),
            Term = paste(Term, collapse = ";"))


str(grouped_slimdf)

tibble [3,972 × 3] (S3: tbl_df/tbl/data.frame)
 $ GO.IDs: chr [1:3972] " GO:0000278" " GO:0002181" " GO:0002376" " GO:0003014" ...
 $ GOslim: chr [1:3972] "GO:0000278" "GO:0002181" "GO:0002376" "GO:0003014" ...
 $ Term  : chr [1:3972] "mitotic cell cycle" "cytoplasmic translation" "immune system process" "renal system process" ...


In [15]:
slimdf.sorted <- slimdf %>% arrange(desc(Count))

slim.count.df <- slimdf.sorted %>% 
  select(Term, Count)

str(slim.count.df)

'data.frame':	72 obs. of  2 variables:
 $ Term : chr  "anatomical structure development" "signaling" "cell differentiation" "immune system process" ...
 $ Count: int  1139 598 533 330 215 174 164 154 149 123 ...
