In [1]:
# Imports

# NOTE: tidyr, fgsea, ggplot2, Rcpp, data.table, and stringi are not part of base R
# and must be installed on the user's machine before this script is run.

library(tidyr)
library(fgsea)
library(ggplot2)
library(Rcpp)
library(data.table)
library(stringi)

Loading required package: Rcpp


In [2]:
# Location of the cluster gene set TSV file (supplied by the user)
cluster_path <- "week_13_no_batch_correction/gene_for_gsea_list.tsv"

# Location of the combined gene set model, a .gmt file (supplied by the user).
# NOTE: Which file belongs here depends on the user's setup, e.g. whether
# only a single gene set was used in cluster_geneset_pre-analysis.
gene_set_path <- "gene_models/combined_genesets.gmt"

In [3]:
### Function ###
# Description: Load the tab-separated cluster file into an R data frame.
# :param cluster_path: A string that is the path to the cluster file. The first
# line of the file is read as the column header.
# return:
# cluster_table: A data.frame with one column per column of the file. Text
# columns are returned as character vectors (not factors) on every R version.
# execute_fgsea reads the columns cluster_number, gene and z_score from it.
generate_cluster_table <- function(cluster_path){
    # Only the double quote is treated as a quoting character. read.table's
    # default also treats the apostrophe as one, so a gene name containing a
    # prime (e.g. "5'-NT") would open a quoted field and merge or drop rows.
    cluster_table <- read.table(file = cluster_path,
                                sep = '\t',
                                header = TRUE,
                                quote = "\"",
                                stringsAsFactors = FALSE)
    cluster_table
}

In [4]:
### Function ###
# Description: Run fgsea on each cluster against the gene sets in the provided
# gene model file. For every cluster, write the plotGseaTable figure as a png
# file and the fgseaRes table as a tab-separated txt file. Output paths are
# built from output_label, so they are relative to the working directory
# unless output_label itself contains a directory.
# :param gene_set_path: A string that is the path to the user's gene model
# (.gmt) file
# :param cluster_table: A data frame with the columns cluster_number, gene and
# z_score (e.g. the value returned by generate_cluster_table)
# :param output_label: A string that will be the prefix of the output png and
# txt file names: <output_label>_<cluster>_gseaTable.png and
# <output_label>_<cluster>_fgseaRes.txt
# return: None (invisible NULL); the function is called for its file output

# Suppress file encoding warning (has no effect on gene set enrichment analysis)
# NOTE(review): this turns off ALL warnings for the rest of the R session, not
# only the encoding warning. It is kept unchanged to preserve the existing
# behaviour of the notebook.
options(warn=-1)
execute_fgsea <- function(gene_set_path, cluster_table, output_label){
    # Fail early with a clear message. Without this check a table that lacks
    # cluster_number yields an empty loop and silently produces no output.
    required_columns <- c("cluster_number", "gene", "z_score")
    missing_columns <- setdiff(required_columns, names(cluster_table))
    if (length(missing_columns) > 0) {
        stop("cluster_table is missing required column(s): ",
             paste(missing_columns, collapse = ", "),
             call. = FALSE)
    }

    # Distinct cluster identifiers present in the table (NA is not a cluster)
    cluster_ids <- unique(cluster_table$cluster_number)
    cluster_ids <- cluster_ids[!is.na(cluster_ids)]
    # Load gene set model
    pathways <- gmtPathways(gene_set_path)
    # Iterate through the clusters in the data table and compute fgsea
    for (cluster in cluster_ids){
        # Rows belonging to this cluster. which() discards NA comparisons so
        # rows with a missing cluster_number never show up as all-NA rows.
        cluster_rows <- which(cluster_table$cluster_number == cluster)
        cluster_set <- cluster_table[cluster_rows, ]
        # Named vector of that cluster's z-scores, keyed by gene
        ranks <- setNames(cluster_set$z_score, cluster_set$gene)
        # Run permutation-based fgsea: gene sets of 15 to 500 genes,
        # 15000 permutations, GSEA weight parameter 0.5
        fgseaRes <- fgsea(pathways,
                      ranks,
                      minSize=15,
                      maxSize=500,
                      nperm=15000,
                      gseaParam = 0.5)

        # File name of this cluster's plotGseaTable figure
        output_gsea_table_file_name <- paste(output_label, cluster, "gseaTable.png", sep="_")

        # Up to five pathways with positive ES and up to five with negative
        # ES, each group ranked by smallest p-value
        topPathwaysUp <- fgseaRes[ES > 0, ][head(order(pval), n=5), pathway]
        topPathwaysDown <- fgseaRes[ES < 0, ][head(order(pval), n=5), pathway]
        topPathways <- c(topPathwaysUp, rev(topPathwaysDown))

        # Generate plotGseaTable for that cluster and write it as a png
        png(filename=output_gsea_table_file_name,
            units="in",
            width=15,
            height=10,
            pointsize=12,
            res=72)
        # finally = dev.off() closes the png device even when plotting fails,
        # so a failing cluster cannot leave a graphics device open.
        tryCatch({
            gsea_table_plot <- plotGseaTable(pathways[topPathways],
                                             ranks,
                                             fgseaRes,
                                             gseaParam = 0.5)
            # Newer fgsea releases return a ggplot object instead of drawing
            # it; inside a loop it must be printed explicitly or the png
            # stays blank. Older releases draw directly and return a
            # non-ggplot object, so nothing is drawn twice.
            if (inherits(gsea_table_plot, "ggplot")) {
                print(gsea_table_plot)
            }
        }, finally = dev.off())

        # Write the fgseaRes table for that cluster; list columns are
        # flattened to space-separated values via sep2
        output_fgseaRes_file_name <- paste(output_label, cluster, "fgseaRes.txt", sep="_")
        fwrite(fgseaRes, file=output_fgseaRes_file_name, sep="\t", sep2=c("", " ", ""))
    }
    invisible(NULL)
}

In [None]:
### Executable Cell ### (Main)

# Read the cluster table from the user-supplied path
cluster_table <- generate_cluster_table(cluster_path = cluster_path)

# NOTE: output_label is chosen by the user and names the output fgseaRes
# tables and gsea table plots. In this case, the label reflects the week
# in which the cells were sequenced.
execute_fgsea(gene_set_path = gene_set_path,
              cluster_table = cluster_table,
              output_label = "week_13_cluster")