In [None]:
# Command-line usage:
#   Rscript <script>.R <gene_dir> <start> <end>
#
#   args[1] = directory containing the input files, e.g. "./wound_genes/"
#   args[2] = start (index of the first file to process)
#   args[3] = end   (index of the last file to process)

args <- commandArgs(trailingOnly = TRUE)

# Fail fast with a usage message: the rest of the script indexes args[1..3],
# and a missing argument would otherwise surface later as an unrelated error.
if (length(args) < 3L) {
    stop("Usage: Rscript <script>.R <gene_dir> <start> <end>", call. = FALSE)
}

message(sprintf("Hello %s", args[1L]))

library(stats)
library(sets)
library(data.table)
# Background set of genes (its `gene` column is used as the gene universe
# for the enrichment tests further down).
background_set <- fread("background_set.txt")

# Pathway collections, looked up by collection name (e.g. "GO") below.
pathways <- readRDS("pathways.rds")

# Input files live in the directory given as the first argument.
files <- list.files(args[1])

# Keep only files whose name starts with the literal prefix "GSE".
# The previous pattern "^GSE*" meant "GS followed by zero or more E" and so
# also accepted any name that merely starts with "GS".
files <- files[grepl("^GSE", files)]

# Report how many input files were found (the old code printed the length of
# an undefined variable, `wound_files`, which aborted the script).
print(length(files))
# Pathway over-representation analysis using a one-sided Fisher's exact test.
#
# For every pathway a 2x2 table is built (filled row by row):
#
#                     foreground   background
#   in pathway            a            b
#   not in pathway        c            d
#
# and tested with alternative = "greater".
#
# Arguments:
#   foreground_genes  vector of gene identifiers of interest.
#   all_genes         vector of all genes considered (the universe). Used to
#                     derive the "not in pathway" set and, when
#                     background_genes is NULL, the background.
#   background_genes  optional vector of background genes; defaults to
#                     setdiff(all_genes, foreground_genes).
#   pathways          optional named list mapping pathway name -> gene vector.
#                     When NULL, load_pathways() is called and every pathway
#                     is restricted to genes present in all_genes.
#
# Returns:
#   A data.table with one row per pathway and the columns pathway, p_value,
#   odds_ratio, num_p_fg, num_p_bg, num_not_p_fg, num_not_p_bg and q_value.
compute_enrichment <- function(foreground_genes, all_genes,
                               background_genes = NULL, pathways = NULL) {
    if (is.null(pathways)) {
        # NOTE(review): load_pathways() is not defined in the visible part of
        # this file; it must be provided elsewhere for this branch to work.
        pathways <- load_pathways()
        pathways <- lapply(pathways, function(pathway_genes) {
            pathway_genes[pathway_genes %in% all_genes]
        })
    }

    # -1 is a placeholder; every row is overwritten in the loop below.
    fisher_enrichment_dt <- data.table(pathway = names(pathways),
                                       p_value = -1, odds_ratio = -1)

    if (is.null(background_genes)) {
        background_genes <- setdiff(all_genes, foreground_genes)
        #background_genes <- unique(unlist(pathways))
    }

    for (pathway in names(pathways)) {
        pathway_genes <- pathways[[pathway]]
        non_pathway_genes <- setdiff(all_genes, pathway_genes)

        # Plain length() calls are used so the function does not depend on a
        # pipe operator happening to be attached by some other package.
        num_in_pathway_and_foreground <-
            length(intersect(pathway_genes, foreground_genes))
        num_in_pathway_and_not_foreground <-
            length(intersect(pathway_genes, background_genes))
        num_not_in_pathway_and_foreground <-
            length(intersect(non_pathway_genes, foreground_genes))
        num_not_in_pathway_and_not_foreground <-
            length(intersect(non_pathway_genes, background_genes))

        fisher_mat <- matrix(c(num_in_pathway_and_foreground,
                               num_in_pathway_and_not_foreground,
                               num_not_in_pathway_and_foreground,
                               num_not_in_pathway_and_not_foreground),
                             nrow = 2, ncol = 2, byrow = TRUE)

        test_res <- fisher.test(fisher_mat, alternative = "greater")

        # Copy the loop variable under another name: inside `[.data.table`
        # the symbol `pathway` refers to the column, not the loop variable.
        pathway_ <- pathway
        fisher_enrichment_dt[pathway == pathway_, `:=`(
            p_value = test_res$p.value,
            odds_ratio = test_res$estimate,
            num_p_fg = num_in_pathway_and_foreground,
            num_p_bg = num_in_pathway_and_not_foreground,
            num_not_p_fg = num_not_in_pathway_and_foreground,
            num_not_p_bg = num_not_in_pathway_and_not_foreground)]
    }

    # NOTE(review): p.adjust() is called with its default method; confirm
    # that this is the intended correction for a column named `q_value`.
    fisher_enrichment_dt[, q_value := p.adjust(p_value)]

    return(fisher_enrichment_dt)
}
# Run the pathway enrichment for a set of input files and save the results.
#
# For each file name in `current_files`, the file is read from `current_dir`
# as a header-less table whose single column is treated as the foreground
# gene list. compute_enrichment() is run against the global `background_set`
# and the pathway collection `pathways[[current_pathways]]`, and the q_value
# column of the result is written with saveRDS() to
#   <current_dir><current_pathways>_enrichment_<file name>
#
# Arguments:
#   current_files     character vector of file names (not full paths).
#   current_pathways  name of the pathway collection in the global `pathways`
#                     list, e.g. "GO".
#   current_dir       directory prefix; it is concatenated verbatim with the
#                     file name, so it must end with a path separator.
#
# Returns:
#   NULL, invisibly. The function is called for its side effect of writing
#   one .rds file per input file.
get_enrichment_data <- function(current_files, current_pathways, current_dir){
    # Slicing the file list past its end yields NA entries; drop them so the
    # last batch of a job array does not fail on a non-existent file.
    current_files <- current_files[!is.na(current_files)]

    #total_enrichment_pathways <- names(pathways[[current_pathways]])
    #pathwaysDF <- data.frame("pathways" = total_enrichment_pathways)

    # Iterating over the values (instead of 1:length(...)) is also safe when
    # the vector is empty.
    for (current_file in current_files) {
        current_file_path <- paste0(current_dir, current_file)
        new_file_name <- paste0(current_dir, current_pathways,
                                "_enrichment_", current_file)

        total_genes <- fread(current_file_path, header = FALSE)

        colnames(total_genes) <- c("genes")
        enrichment_test <- compute_enrichment(
            foreground_genes = total_genes$genes,
            all_genes = background_set$gene,
            pathways = pathways[[current_pathways]])
        subset_enrichment_test <- enrichment_test[, c("q_value")]
        # NOTE(review): the result is a data.table; confirm that downstream
        # readers can recover the pathway names assigned as row names here.
        rownames(subset_enrichment_test) <- enrichment_test$pathway

        #pathwaysDF <- cbind(pathwaysDF, subset_enrichment_test)
        saveRDS(subset_enrichment_test, file = new_file_name)
    }

    #all_qvals <- pathwaysDF[,2:length(pathwaysDF)]
    #rownames(all_qvals) <- total_enrichment_pathways
    #colnames(all_qvals) <- current_files[1:length(all_qvals)]

    #enrichment_heatmap <- pheatmap(as.matrix(log(all_qvals)), show_rownames = F)
    #return(list("matrix" = all_qvals, "heatmap" = enrichment_heatmap))
    invisible(NULL)
}

# Process the slice of input files selected by the start/end command-line
# arguments against the "GO" pathway collection.
# The old call indexed `end[3]`; `end` is a function from the stats package,
# so that expression raised an error. The end index is args[3].
# Command-line arguments arrive as strings, hence the explicit conversion.
start_index <- as.integer(args[2])
end_index <- as.integer(args[3])
get_enrichment_data(files[start_index:end_index], "GO", args[1])

In [None]:
#!/bin/bash
# Single test run: load R 3.6.3 and run get_go_enrichment.R with the three
# positional arguments <gene_dir> <start> <end>.
module load R/3.6.3

gene_dir=./wound_genes/
start=1
end=2
Rscript get_go_enrichment.R "$gene_dir" "$start" "$end"

In [24]:
# Superseded draft of the command, kept for reference:
### command = "for f in {1..14..238}; do \necho \"$f\" \nprintf \"#!/bin/bash\nmodule load R\nRscript Rscript get_go_enrichment.R ./wound_genes/ $f $f+10 /data/timonaj/Allelic_Imbalance/eRNA_analysis/RPKM_non_coding/\" > temp_$f.job \nsbatch --partition=ccr --mem=300g --cpus-per-task=4 --gres=lscratch:2 --time 4:00:00 temp_$f.job \ndone"

# Submit the GO enrichment for ./wound_genes/ in batches.
# The shell loop steps f from 1 to 238 in increments of 14, sets a = f + 13,
# writes a job script temp_<f>.job that loads R and runs
# get_go_enrichment.R on ./wound_genes/ with start f and end a, and hands
# that script to sbatch.
command <- "for f in {1..238..14}; do \necho \"$f\" \n((a=$f+13)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_go_enrichment.R ./wound_genes/ $f $a\" > temp_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 temp_$f.job \ndone"

# Show the exact command, then run it through the system shell.
print(command)
try(system(command))

[1] "for f in {1..238..14}; do \necho \"$f\" \n((a=$f+13)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_go_enrichment.R ./wound_genes/ $f $a\" > temp_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 temp_$f.job \ndone"


In [27]:
# Superseded draft of the command, kept for reference:
### command = "for f in {1..14..238}; do \necho \"$f\" \nprintf \"#!/bin/bash\nmodule load R\nRscript Rscript get_go_enrichment.R ./wound_genes/ $f $f+10 /data/timonaj/Allelic_Imbalance/eRNA_analysis/RPKM_non_coding/\" > temp_$f.job \nsbatch --partition=ccr --mem=300g --cpus-per-task=4 --gres=lscratch:2 --time 4:00:00 temp_$f.job \ndone"

# Submit the C2 enrichment for ./wound_genes/ in batches.
# The shell loop steps f from 1 to 238 in increments of 14, sets a = f + 13,
# writes a job script wound_c2_<f>.job that loads R and runs
# get_C2_enrichment.R on ./wound_genes/ with start f and end a, and hands
# that script to sbatch.
command <- "for f in {1..238..14}; do \necho \"$f\" \n((a=$f+13)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_C2_enrichment.R ./wound_genes/ $f $a\" > wound_c2_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 wound_c2_$f.job \ndone"

# Show the exact command, then run it through the system shell.
print(command)
try(system(command))

[1] "for f in {1..238..14}; do \necho \"$f\" \n((a=$f+13)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_C2_enrichment.R ./wound_genes/ $f $a\" > wound_c2_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 wound_c2_$f.job \ndone"


In [25]:
# Submit the GO enrichment for ./regen_genes/ in batches.
# The shell loop steps f from 1 to 114 in increments of 6, sets a = f + 5,
# writes a job script regen_<f>.job that loads R and runs
# get_go_enrichment.R on ./regen_genes/ with start f and end a, and hands
# that script to sbatch.
command <- "for f in {1..114..6}; do \necho \"$f\" \n((a=$f+5)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_go_enrichment.R ./regen_genes/ $f $a\" > regen_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 regen_$f.job \ndone"

# Show the exact command, then run it through the system shell.
print(command)
try(system(command))

[1] "for f in {1..114..6}; do \necho \"$f\" \n((a=$f+5)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_go_enrichment.R ./regen_genes/ $f $a\" > regen_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 regen_$f.job \ndone"


In [28]:
# Submit the C2 enrichment for ./regen_genes/ in batches.
# The shell loop steps f from 1 to 114 in increments of 6, sets a = f + 5,
# writes a job script regen_c2_<f>.job that loads R and runs
# get_C2_enrichment.R on ./regen_genes/ with start f and end a, and hands
# that script to sbatch.
command <- "for f in {1..114..6}; do \necho \"$f\" \n((a=$f+5)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_C2_enrichment.R ./regen_genes/ $f $a\" > regen_c2_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 regen_c2_$f.job \ndone"

# Show the exact command, then run it through the system shell.
print(command)
try(system(command))

[1] "for f in {1..114..6}; do \necho \"$f\" \n((a=$f+5)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_C2_enrichment.R ./regen_genes/ $f $a\" > regen_c2_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 regen_c2_$f.job \ndone"


In [26]:
# Submit the GO enrichment for ./stress_genes/ in batches.
# The shell loop steps f from 1 to 160 in increments of 10, sets a = f + 9,
# writes a job script stress_<f>.job that loads R and runs
# get_go_enrichment.R on ./stress_genes/ with start f and end a, and hands
# that script to sbatch.
command <- "for f in {1..160..10}; do \necho \"$f\" \n((a=$f+9)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_go_enrichment.R ./stress_genes/ $f $a\" > stress_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 stress_$f.job \ndone"

# Show the exact command, then run it through the system shell.
print(command)
try(system(command))

[1] "for f in {1..160..10}; do \necho \"$f\" \n((a=$f+9)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_go_enrichment.R ./stress_genes/ $f $a\" > stress_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 stress_$f.job \ndone"


In [29]:
# Submit the C2 enrichment for ./stress_genes/ in batches.
# The shell loop steps f from 1 to 160 in increments of 10, sets a = f + 9,
# writes a job script stress_c2_<f>.job that loads R and runs
# get_C2_enrichment.R on ./stress_genes/ with start f and end a, and hands
# that script to sbatch.
command <- "for f in {1..160..10}; do \necho \"$f\" \n((a=$f+9)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_C2_enrichment.R ./stress_genes/ $f $a\" > stress_c2_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 stress_c2_$f.job \ndone"

# Show the exact command, then run it through the system shell.
print(command)
try(system(command))

[1] "for f in {1..160..10}; do \necho \"$f\" \n((a=$f+9)) \necho \"$a\" \nprintf \"#!/bin/bash\nmodule load R\nRscript get_C2_enrichment.R ./stress_genes/ $f $a\" > stress_c2_$f.job \nsbatch --partition=ccr --mem=10g --cpus-per-task=2 --gres=lscratch:2 --time 4:00:00 stress_c2_$f.job \ndone"
