go_term_compare.Rmd

---
title: "Compare GO Terms"
output: github_document
---

```{r setup, include=FALSE}
# Set the default chunk option so that code is echoed in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
```

### Necessary packages:

```{r message=FALSE}
library(GO.db)
library(data.table)
library(dplyr)
library(stringr)
```

# Filter ontologies based on molecular function, biological process, or cellular component
```{r}
# Ontology used by the summary counts in the chunks below.
ontology <- "MF"

# Restrict a vector of GO ids to a single ontology.
#
# go_list:  vector of GO ids.
# ontology: the value to match against what Ontology() reports for each id,
#           or "all" to return go_list untouched.
#
# Ids for which Ontology() returns NA are dropped.
filterOntology <- function(go_list, ontology){
    # "all" disables filtering entirely; Ontology() is never called.
    if (ontology == "all") {
        return(go_list)
    }
    term_ontology <- Ontology(go_list)
    keep <- !is.na(term_ontology) & term_ontology == ontology
    go_list[keep]
}
```

# GO terms, direct from tools

## eggNOG
```{r}
# Read the eggNOG annotation table. The file is read without a header row,
# so the columns get the default names V1, V2, ...
eggnog <- read.delim(
    "data/tool_outputs/func_tools/eggnogmap_results/diamond_annotations.tabular",
    header = FALSE,
    stringsAsFactors = FALSE
)
# Split the comma-separated entries of column V6 into one flat vector
# (duplicates are kept at this point).
eggnog_gos <- eggnog$V6 %>%
    str_split(",") %>%
    unlist()
# Keep only entries starting with "GO"; this removes the empty entries.
eggnog_gos_clean <- eggnog_gos[str_sub(eggnog_gos, 1, 2) == "GO"]

# Summary counts shown in the knitted output.
length(eggnog$V2)                    # all values in V2
length(unique(eggnog$V2))            # distinct values in V2
length(eggnog_gos_clean)             # GO entries, with duplicates
length(unique(eggnog_gos_clean))     # distinct GO entries
length(filterOntology(eggnog_gos_clean, ontology))          # in the chosen ontology
length(unique(filterOntology(eggnog_gos_clean, ontology)))  # ... distinct
```

```{r}
# Load the filtered eggNOG results, add a log2 ratio of WS to NS counts
# (with a pseudocount of 1 on each side), and sort from highest to lowest ratio.
eggnog_filtered <- "results/mqome/output/eggnog/eggnog_func_filtered.tab" %>%
    read.delim(stringsAsFactors = FALSE) %>%
    mutate(log2ratio = log2((countWS + 1) / (countNS + 1))) %>%
    arrange(desc(log2ratio))
```


We are going to write the lists to file for later use in Python. First, I define a function `clean_write()`, which writes a vector to a file with one entry per line and without quotes, row names, or column names:
```{r}
# Write `object` to `file` as plain text: no quoting, no row names and no
# header line.
#
# object: the object to write (passed straight to write.table()).
# file:   destination path or connection.
clean_write <- function(object, file){
    write.table(
        object,
        file = file,
        col.names = FALSE,
        row.names = FALSE,
        quote = FALSE
    )
}
```

Write list to file for later use:
```{r}
# Destination of the eggNOG list; the ids of the filtered eggNOG table are
# written there, one per line.
eggnog_list_file <- "data/databases/gene_ontology/go_lists/eggnog.tab"
clean_write(object = eggnog_filtered$id, file = eggnog_list_file)
```


## MEGAN6

These GO terms come from querying the eggNOG database with the orthologous group ids, contained in `MEGAN_outputs/737NSvsWS_EGGNOGcount.csv`. The Python function to do this is in `MEGAN_outputs/get_gos_from_ogs.py`. 

```{r}
# Load the table of GO terms for the MEGAN results (file has a header row).
megan <- read.delim(
    "data/tool_outputs/func_tools/MEGAN_outputs/go_terms.txt",
    stringsAsFactors = FALSE,
    header = TRUE
)
megan_go_lists <- megan$gos
# Flatten the comma-separated lists and drop duplicates.
megan_gos <- megan_go_lists %>%
    str_split(pattern = ",") %>%
    unlist() %>%
    unique()
# Keep non-missing entries that start with "GO".
megan_gos_clean <- megan_gos[!is.na(megan_gos) & str_sub(megan_gos, 1, 2) == "GO"]
clean_write(megan_gos_clean, file = "data/databases/gene_ontology/go_lists/megan.tab")

# Show the table, then the summary counts.
megan
length(megan$og)                      # all values in the og column
length(unique(megan$og))              # distinct og values
length(megan_gos_clean)               # GO terms (already de-duplicated above)
length(unique(megan_gos_clean))
length(filterOntology(megan_gos_clean, ontology))          # in the chosen ontology
length(unique(filterOntology(megan_gos_clean, ontology)))
```

## MetaGOmics
```{r}
# Load the MetaGOmics comparison table; lines starting with "#" are skipped.
metagomics <- read.delim(
    "data/tool_outputs/func_tools/metaGOmics_results/go_compare_149_150.txt",
    comment.char = "#",
    stringsAsFactors = FALSE
)
metagomics_gos <- unique(metagomics$GO.acc)
# takes out "unknownprc", "unknowncmp", "unknownfun"
metagomics_gos_clean <- metagomics_gos[str_sub(metagomics_gos, 1, 2) == "GO"]
clean_write(metagomics_gos_clean, file = "data/databases/gene_ontology/go_lists/metagomics.tab")

# Counts over every accession in the table.
length(metagomics$GO.acc)            # all rows
length(unique(metagomics$GO.acc))    # distinct accessions
# Counts over rows with Laplace-corrected q-value below 0.05.
metagomics %>% filter(Laplace.corr..q.value < 0.05) %>% pull(GO.acc) %>% length()
metagomics %>% filter(Laplace.corr..q.value < 0.05) %>% pull(GO.acc) %>% unique() %>% length()
# Counts over the cleaned accession list.
length(metagomics_gos_clean)
length(unique(metagomics_gos_clean))
# Same, restricted to the chosen ontology.
length(filterOntology(metagomics_gos_clean, ontology)) # Number of translated GO terms
length(unique(filterOntology(metagomics_gos_clean, ontology)))
# significant at FDR < 5%, restricted to the chosen ontology
metagomics %>%
    filter(Laplace.corr..q.value < 0.05) %>%
    pull(GO.acc) %>%
    filterOntology(ontology) %>%
    length()
metagomics %>%
    filter(Laplace.corr..q.value < 0.05) %>%
    pull(GO.acc) %>%
    filterOntology(ontology) %>%
    unique() %>%
    length()
```

## MetaProteomeAnalyzer

Here, we get the Uniprot IDs from all of the files, then upload them to the UniProt "Retrieve/ID Mapping" service (https://www.uniprot.org/uploadlists/).  

```{r}
# Collect the protein accessions from every NS and WS result file.
ns_files <- list.files(
    'data/tool_outputs/func_tools/mpa_results/NS',
    pattern = "_proteins.csv",
    full.names = TRUE
)
protNS <- ns_files %>%
    lapply(read.delim, stringsAsFactors = FALSE) %>%
    bind_rows() %>%
    select("prot" = Protein.Accession)

ws_files <- list.files(
    'data/tool_outputs/func_tools/mpa_results/WS',
    pattern = "_proteins.csv",
    full.names = TRUE
)
protWS <- ws_files %>%
    lapply(read.delim, stringsAsFactors = FALSE) %>%
    bind_rows() %>%
    select("prot" = Protein.Accession)

protAll <- rbind(protNS, protWS)

# Write the distinct accessions to a file for upload to UniProt.
clean_write(unique(protAll), file = "data/tool_outputs/func_tools/mpa_results/all_proteins.tab")

# Read back the UniProt results obtained for the file written above.
mpa_uniprot <- read.delim(
    'data/tool_outputs/func_tools/mpa_results/uniprot_protein_results.tab',
    stringsAsFactors = FALSE
)
# GO ids are ";"-separated within each row; split, flatten and trim whitespace.
mpa_gos <- mpa_uniprot$Gene.ontology.IDs %>%
    str_split(";") %>%
    unlist() %>%
    str_trim()
# Keep only entries starting with "GO".
mpa_gos_clean <- mpa_gos[str_sub(mpa_gos, 1, 2) == "GO"]

# Summary counts.
length(mpa_uniprot$Entry)
length(unique(mpa_uniprot$Entry))
length(mpa_gos_clean)
length(unique(mpa_gos_clean))
length(filterOntology(mpa_gos_clean, ontology))
length(unique(filterOntology(mpa_gos_clean, ontology)))

clean_write(mpa_gos_clean, file = "data/databases/gene_ontology/go_lists/mpa.tab")
```

## Prophane

```{r}
# Load the tab-separated Prophane summaries for the WS and NS samples.
prophane_ws <- read.delim(
    "data/tool_outputs/func_tools/prophane_results/summary_WS.txt",
    sep = "\t",
    stringsAsFactors = FALSE
)
prophane_ns <- read.delim(
    "data/tool_outputs/func_tools/prophane_results/summary_NS.txt",
    sep = "\t",
    stringsAsFactors = FALSE
)
# Pool the three *2GO columns (TIGRFAMs, PFAMs, eggNOG) of both summaries.
prophane_gos <- c(
    prophane_ws$task_6..fun_from_TIGRFAMs_15_cut_tc..tigrfam2GO,
    prophane_ws$task_7..fun_from_PFAMs_32..pfam2GO,
    prophane_ws$task_8..fun_from_eggNog_4.5.1..og2GO,
    prophane_ns$task_6..fun_from_TIGRFAMs_15_cut_tc..tigrfam2GO,
    prophane_ns$task_7..fun_from_PFAMs_32..pfam2GO,
    prophane_ns$task_8..fun_from_eggNog_4.5.1..og2GO
)
# Remove "-" entries, then split the comma-separated lists and trim whitespace.
prophane_gos <- prophane_gos[prophane_gos != "-"]
prophane_gos <- prophane_gos %>%
    strsplit(",") %>%
    unlist() %>%
    trimws()
# Keep only entries starting with "GO".
prophane_gos_clean <- prophane_gos[str_sub(prophane_gos, 1, 2) == "GO"]
clean_write(prophane_gos_clean, file = "data/databases/gene_ontology/go_lists/prophane.tab")

# Member identifiers are ";"-separated within each row.
# NOTE(review): only the WS summary is counted here, while the GO terms above
# pool WS and NS -- confirm that leaving NS out is intended.
length(unlist(strsplit(prophane_ws$members_identifier, ";")))
length(unique(unlist(strsplit(prophane_ws$members_identifier, ";"))))
length(prophane_gos_clean)
length(unique(prophane_gos_clean))
length(filterOntology(prophane_gos_clean, ontology))
length(unique(filterOntology(prophane_gos_clean, ontology)))
```

## Unipept

For Unipept, the results are divided into BP, MF, and CC files, so I combine the three ontologies for WS and NS. 

```{r}
# Locate the per-sample Unipept result files. full.names = TRUE returns the
# complete paths directly and yields an empty vector when nothing matches
# (pasting the directory onto an empty listing would instead produce the bare
# directory string).
unipept_dir <- "data/tool_outputs/func_tools/unipept_results"
unipept_results_NS <- list.files(unipept_dir, pattern = "^737NS.*\\.csv", full.names = TRUE)
unipept_results_WS <- list.files(unipept_dir, pattern = "^737WS.*\\.csv", full.names = TRUE)

# Read a set of comma-separated Unipept files, stack them, drop the column
# named X and rename X.peptides to peptides.
read_unipept_counts <- function(files) {
    lapply(files, function(i) read.delim(i, sep = ',', as.is = TRUE)) %>%
        bind_rows() %>%
        select(-X) %>%
        rename(peptides = X.peptides)
}
unipeptNS <- read_unipept_counts(unipept_results_NS)
unipeptWS <- read_unipept_counts(unipept_results_WS)

# Keep the terms present in both samples and compute the WS/NS ratio.
# The ratio is taken in base 2 so that the value matches the column name and
# the log2 ratios computed for the other filtered tables in this document
# (the previous natural log did not).
unipept_all <- inner_join(unipeptNS, unipeptWS, by = c("GO.term", "Name")) %>%
    rename(countNS = peptides.x, countWS = peptides.y) %>%
    mutate(log2ratio = log2(countWS / countNS))

# Keep only entries starting with "GO" and write them out.
unipept_gos <- unipept_all$GO.term
unipept_gos_clean <- unipept_gos[str_sub(unipept_gos, 1, 2) == "GO"]
clean_write(unipept_gos_clean, file="data/databases/gene_ontology/go_lists/unipept.tab")

# Show the joined table, then the summary counts.
unipept_all
length(unipept_gos_clean)
length(unipept_gos_clean %>% unique())
length(filterOntology(unipept_gos_clean, ontology))
length(filterOntology(unipept_gos_clean, ontology) %>% unique())
```
```{r}
# Load the filtered Unipept results, add a log2 ratio of WS to NS counts
# (with a pseudocount of 1 on each side), and sort from highest to lowest ratio.
unipept_filtered <- "results/mqome/output/unipept/func_filtered.tab" %>%
    read.delim(stringsAsFactors = FALSE) %>%
    mutate(log2ratio = log2((countWS + 1) / (countNS + 1))) %>%
    arrange(desc(log2ratio))
```

```{r}
# Write the ids of the filtered Unipept table, one per line.
# NOTE: this is the same path the Unipept chunk above writes
# unipept_gos_clean to, so that earlier file is replaced here.
clean_write(
    object = unipept_filtered$id,
    file = "data/databases/gene_ontology/go_lists/unipept.tab"
)
```


## Euler Diagram of Results
```{r}
# One GO-term vector per tool, restricted to the ontology chosen at the top of
# the document. eggNOG and Unipept use the ids of their filtered tables; the
# other tools use the cleaned lists built in their own sections.
gos <- list(
    eggnog     = filterOntology(eggnog_filtered$id, ontology),
    megan      = filterOntology(megan_gos_clean, ontology),
    metagomics = filterOntology(metagomics_gos_clean, ontology),
    mpa        = filterOntology(mpa_gos_clean, ontology),
    prophane   = filterOntology(prophane_gos_clean, ontology),
    unipept    = filterOntology(unipept_filtered$id, ontology)
)
```

# GO terms: all parents

Each of the tools produces a list of GO terms, but we don't necessarily know how the tools assign terms: if a protein or peptide matches a single term, some tools might annotate that protein or peptide with the term and all of its ancestors, while others might annotate it with only the term itself. Thus, to reduce this kind of bias, we get all of the ancestors of all of the annotated terms, and produce another Venn diagram.  

Download the current GO ontology:
```{bash results='hide'}
#curl http://current.geneontology.org/ontology/go-basic.obo -o GO_files/go-basic.obo
#curl http://current.geneontology.org/ontology/subsets/goslim_generic.obo -o GO_files/go-slim.obo
```

Run python to get the list of all ancestors and slim GO terms
```{r}
# reticulate provides source_python(), used to run the Python helper from R.
library(reticulate)
# Run the GO converter script.
# NOTE(review): the script itself is not visible here; presumably it produces
# the *_parents.tab and *_slim.tab lists that later chunks read -- confirm.
source_python('data/databases/gene_ontology/go-converter.py')
```

Read ancestors into R. 

```{r}
# Tool names; each has a "<tool>_parents.tab" file in the go_lists directory.
methods <- c("eggnog", "megan", "metagomics", "mpa", "prophane", "unipept")
# Read the first column of every header-less parents file into a list.
gos_ancestors <- lapply(methods, function(f) {
    parents_file <- paste0("data/databases/gene_ontology/go_lists/", f, "_parents.tab")
    read.delim(parents_file, header = FALSE, stringsAsFactors = FALSE)$V1
})
# Label each list element with its tool name.
names(gos_ancestors) <- methods
```

# Results:

## Number of GO terms in each tool

Without ancestors
```{r}
# Number of GO terms per tool (named integer vector).
vapply(gos, length, integer(1))
```

With ancestors
```{r}
# Number of GO terms per tool once ancestors are included (named integer vector).
vapply(gos_ancestors, length, integer(1))
```

Unique to each tool
```{r}
# For each of the six tools, record how many terms it reports (tot) and how
# many of its terms are reported by no other tool (ex).
ex <- numeric(6)
tot <- numeric(6)
for (i in seq_len(6)) {
    # Pool the terms of every tool except tool i.
    others <- unique(unlist(gos[-i]))
    tot[i] <- length(gos[[i]])
    # setdiff() de-duplicates, so ex counts distinct exclusive terms.
    ex[i] <- length(setdiff(gos[[i]], others))
}
names(tot) <- names(gos)
names(ex) <- names(gos)
tot
ex
```

## GO terms in GOslim

```{r}
# Tool names; each has a "<tool>_slim.tab" file in the go_lists directory.
methods <- c("eggnog", "megan", "metagomics", "mpa", "prophane", "unipept")
# Read the first column of every header-less slim file into a list named by tool.
gos_slims <- lapply(methods,
    function(f) {
        read.delim(paste0("data/databases/gene_ontology/go_lists/", f, "_slim.tab"),
                   header = FALSE,
                   stringsAsFactors = FALSE)$V1
    }
)
names(gos_slims) <- methods

# For each tool, count its slim terms in the chosen ontology (tot) and how many
# of those are reported by no other tool (exslim). Both result vectors are
# sized from `methods`, so they always match the number of tools the loop
# visits (the previous `rep(0, 5)` only worked because R silently grows a
# vector on out-of-range assignment).
n_methods <- length(methods)
exslim <- rep(0, n_methods)
tot <- rep(0, n_methods)
for (i in seq_len(n_methods)) {
    # Terms of every other tool. These are not ontology-filtered, which does
    # not affect the set difference because gos_slim itself is filtered.
    others <- unique(unlist(gos_slims[-i]))
    gos_slim <- filterOntology(gos_slims[[i]], ontology)
    exslim[i] <- length(setdiff(gos_slim, others))
    tot[i] <- length(gos_slim)
}
# Name both results from `methods`, the vector that defines gos_slims.
names(tot) <- methods
tot
names(exslim) <- methods
exslim
```

## Overlap Plot 

```{r}
library(reshape2)
library(ggplot2)

# Draw and save a pairwise Jaccard-index heat map of the GO terms reported by
# each tool, with one facet for the full term lists and one for the slim lists.
#
# ontology: value passed on to filterOntology() ("all" disables filtering);
#           it is also used in the output file name.
#
# Reads the following objects from the enclosing environment: eggnog_filtered,
# megan_gos_clean, metagomics_gos_clean, mpa_gos_clean, prophane_gos_clean,
# unipept_filtered and gos_slims (a list named by tool).
#
# Side effect: writes "results/overlap_plots/jaccard-<ontology>.png".
# Returns the value of ggsave().
generate_overlap_plot <- function(ontology){

    gos <- list('eggnog' = filterOntology(eggnog_filtered$id, ontology),
            'megan' = filterOntology(megan_gos_clean, ontology),
            'metagomics' = filterOntology(metagomics_gos_clean, ontology),
            'mpa' = filterOntology(mpa_gos_clean, ontology),
            'prophane' = filterOntology(prophane_gos_clean, ontology),
            'unipept' = filterOntology(unipept_filtered$id, ontology))

    # Slim term lists for the same tools, in the same order, same ontology.
    gos_slim_sets <- lapply(gos_slims[names(gos)], filterOntology, ontology = ontology)

    # Jaccard index: |intersection| / |union|.
    # (Dividing by the size of the first set only, as before, gives an
    # asymmetric overlap fraction rather than the Jaccard index named in the
    # legend and file name.) The result is NaN when both sets are empty.
    jaccard <- function(vec1, vec2){
        int <- length(intersect(vec1, vec2))
        int / length(union(vec1, vec2))
    }

    # Square matrix of Jaccard indices between every pair of sets in a named list.
    pairwise_jaccard <- function(sets){
        sets <- lapply(sets, unique)
        n <- length(sets)
        mat <- matrix(0, nrow = n, ncol = n,
                      dimnames = list(names(sets), names(sets)))
        for (i in seq_len(n)){
            for (j in seq_len(n)){
                mat[i, j] <- jaccard(sets[[i]], sets[[j]])
            }
        }
        mat
    }

    jacmat <- pairwise_jaccard(gos)
    jacslim <- pairwise_jaccard(gos_slim_sets)

    # Long format (Var1, Var2, value), tagged with the facet label.
    jac_melt <- melt(jacmat)
    jac_melt$method <- "Full GO"

    jac_melt_slim <- melt(jacslim)
    jac_melt_slim$method <- "Slim GO"

    jac_melt_all <- rbind(jac_melt, jac_melt_slim)
    overlap_plot <- ggplot(jac_melt_all)+
        geom_tile(aes(x = Var1, y = Var2, fill = value), color = "black") +
        geom_text(aes(x = Var1, y = Var2, label = format(value, digits = 2))) +
        scale_fill_gradient(name="Jaccard\nIndex", low = "white", high = "dodgerblue", limits = c(0, 1),
                            guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")) +
        facet_grid(.~method) +
        theme_minimal(base_size = 14) +
        theme(aspect.ratio = 1) +
        labs(x = NULL, y = NULL)
    # Pass the plot explicitly instead of relying on ggsave()'s last-plot default.
    ggsave(paste0("results/overlap_plots/jaccard-", ontology, ".png"), plot = overlap_plot,
           height = 5, width = 12, units = "in", dpi = 700)
}

# Save one overlap plot for the unfiltered term lists and one per ontology.
invisible(lapply(c("all", "MF", "BP", "CC"), generate_overlap_plot))
```