# Gene set enrichment analyses

Note: this script imports objects generated by the first cells in the csaQTL_Figures notebook.

In [1]:
library(fgsea)
library(GSA)
library(xtable)

## Load gene sets

In [2]:
# MSigDB GMT files, in the fixed order the rest of the notebook indexes them by:
# 1 = immune (C7), 2 = hallmark (H), 3 = canonical (C2).
# The labels are only for readability here; they are stripped so the vector
# stays unnamed.
geneset_files <- unname(c(
  immune = "/data/srlab/lrumker/MSigDB_Sets/c7.all.v7.0.symbols.gmt.txt",
  hallmark = "/data/srlab/lrumker/MSigDB_Sets/h.all.v7.0.symbols.gmt.txt",
  canonical = "/data/srlab/lrumker/MSigDB_Sets/c2.all.v7.0.symbols.gmt.txt"
))

# Inclusive bounds on the number of recognized genes a gene set may contain.
GeneSet_SizeBound_Upper <- 500 # Maximum gene set size considered
GeneSet_SizeBound_Lower <- 0 # Minimum gene set size considered

## Helper functions

In [3]:
#' Read a GMT gene-set file and keep the sets whose recognized size is in range
#'
#' A gene set's "recognized size" is the number of its entries that appear in
#' `known_genes`. Sets are kept when that count lies within
#' [`sizeBound_Lower`, `sizeBound_Upper`] (both inclusive). The retained gene
#' sets themselves are returned unmodified, i.e. they are NOT trimmed down to
#' the recognized genes.
#'
#' @param filepath Path to the GMT file; passed straight to `GSA.read.gmt()`.
#' @param known_genes Vector of gene labels counted as recognized.
#' @param sizeBound_Upper Largest recognized size to retain (inclusive).
#' @param sizeBound_Lower Smallest recognized size to retain (inclusive).
#' @return The list produced by `GSA.read.gmt()` with `genesets`,
#'   `geneset.names` and `geneset.descriptions` subset to the retained sets,
#'   plus `geneset.sizes` holding the recognized size of each retained set.
parseGeneSetGMT <- function(filepath, known_genes, sizeBound_Upper=500, sizeBound_Lower=0){
  Gene_Sets_DB <- GSA.read.gmt(filepath)

  # Count recognized genes per set directly. This avoids building a dense
  # NA-padded matrix and, unlike 1:n indexing, is well defined for an empty
  # gene set or a file with no gene sets at all.
  recognized_sizes <- vapply(
    Gene_Sets_DB$genesets,
    function(genes) sum(genes %in% known_genes),
    integer(1),
    USE.NAMES = FALSE
  )

  retain_GeneSet <- (recognized_sizes >= sizeBound_Lower) &
    (recognized_sizes <= sizeBound_Upper)

  Gene_Sets_DB$genesets <- Gene_Sets_DB$genesets[retain_GeneSet]
  Gene_Sets_DB$geneset.names <- Gene_Sets_DB$geneset.names[retain_GeneSet]
  Gene_Sets_DB$geneset.descriptions <- Gene_Sets_DB$geneset.descriptions[retain_GeneSet]
  Gene_Sets_DB$geneset.sizes <- recognized_sizes[retain_GeneSet]
  return(Gene_Sets_DB)
}

In [4]:
#' Run fgsea for a ranked gene list against one of the configured GMT files
#'
#' @param genesettype One of the strings "immune", "hallmark" or "canonical";
#'   selects entry 1, 2 or 3 of the global `geneset_files` respectively.
#' @param rankList Numeric vector of ranking statistics, named with the gene
#'   label of each value.
#' @param filter_output If TRUE, order results by `padj`, keep rows with
#'   `padj < 0.05`, and keep only columns 1, 3 and 4 (selected by position).
#' @param GeneSet_SizeBound_Upper,GeneSet_SizeBound_Lower Inclusive bounds on
#'   the recognized gene-set size, forwarded to `parseGeneSetGMT()`. Note that
#'   the `fgsea()` call below additionally applies its own fixed
#'   `minSize = 15` / `maxSize = 500`, independent of these arguments.
#' @return The `fgsea()` result, filtered as described when `filter_output`
#'   is TRUE. An unrecognized `genesettype` raises an error.
run_GSEA_byFile <-function(genesettype, rankList, filter_output=TRUE, 
                           GeneSet_SizeBound_Upper = 500, GeneSet_SizeBound_Lower = 0){
    # Position of each gene-set collection within `geneset_files`.
    file_index <- c(immune = 1, hallmark = 2, canonical = 3)

    # Fail fast with the intended message. Previously a bare `return` (without
    # parentheses) was a no-op, so execution fell through to an unrelated
    # "object 'i_file' not found" error.
    if(!is.character(genesettype) || length(genesettype) != 1L ||
       !(genesettype %in% names(file_index))){
        stop("Geneset file label not recognized.", call. = FALSE)
    }
    i_file <- file_index[[genesettype]]

    Gene_Sets_DB <- parseGeneSetGMT(geneset_files[i_file], names(rankList), 
                                    GeneSet_SizeBound_Upper, GeneSet_SizeBound_Lower)
    input_genesets <- Gene_Sets_DB$genesets
    names(input_genesets) <- Gene_Sets_DB$geneset.names

    FGSEA_output <- fgsea(input_genesets, rankList, eps=0, 
                          minSize = 15, maxSize = 500, nproc = 0,
                          gseaParam = 1, BPPARAM = NULL)
    
    if(filter_output){
        FGSEA_output <- FGSEA_output[order(FGSEA_output$padj),]
        FGSEA_output <- FGSEA_output[which(FGSEA_output$padj<0.05),c(1,3,4)]
    }
    return(FGSEA_output)
}

## Test enrichment for HALLMARK gene sets

In [5]:
# Test HALLMARK gene-set enrichment of the per-gene correlations for each lead
# SNP and collect the significant pathways (padj < 0.05) into one table.
set.seed(0) # fixed seed set once, before the sequence of fgsea runs
all_res = data.frame()
for( lead_snp in c('11:128070535:A:G', '12:10583611:C:T', '19:16441973:G:A')){
    res = read.csv(paste0("/data/srlab/lrumker/MCSC_Project/cna-qtl/results/gwas_NK/cna_res_",
                          lead_snp,"_vargene_cors.csv"))
    # Ranking statistic: the `cor` column, named by the `gene` column.
    cors = res$cor
    names(cors) = res$gene
    # Unfiltered output is requested so sorting/thresholding happens here.
    sets = run_GSEA_byFile("hallmark", cors, filter_output = FALSE)
    sets = sets[order(sets$padj),]
    new = sets[sets$padj<0.05,]
    new = new[,c("pathway", "padj")]
    new$csaQTL = rep(lead_snp, nrow(new))
    all_res = rbind(all_res,new)
}
all_res = all_res[,c("csaQTL", "pathway", "padj")]
colnames(all_res) = c("csaQTL", "Pathway", "P-adjusted")

# Render adjusted p-values as strings: scientific notation with one digit after
# the decimal point below 0.01, otherwise rounded to three decimal places.
# Done in one vectorized step so a table with zero rows is handled cleanly
# (a 1:nrow() loop would iterate over c(1, 0) and fail).
p_adj = all_res[['P-adjusted']]
p_strs = as.character(ifelse(p_adj < 0.01,
                             formatC(p_adj, format = "e", digits = 1),
                             as.character(round(p_adj, 3))))

all_res[['P-adjusted']] = p_strs

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849501
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849501
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849501
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [6]:
# Write the enrichment table to disk as LaTeX (row names omitted), then
# display it in the notebook.
print(
  xtable(all_res, type = "latex"),
  include.rownames = FALSE,
  file = "/data/srlab/lrumker/MCSC_Project/cna-qtl/tables/supptable.csaQTL_GSEA.tex"
)
all_res

csaQTL,Pathway,P-adjusted
<chr>,<chr>,<chr>
11:128070535:A:G,HALLMARK_TNFA_SIGNALING_VIA_NFKB,8.4e-07
11:128070535:A:G,HALLMARK_INTERFERON_GAMMA_RESPONSE,0.04
12:10583611:C:T,HALLMARK_TNFA_SIGNALING_VIA_NFKB,5.2e-12
12:10583611:C:T,HALLMARK_HYPOXIA,2.3e-06
12:10583611:C:T,HALLMARK_MTORC1_SIGNALING,0.0044
12:10583611:C:T,HALLMARK_IL2_STAT5_SIGNALING,0.0058
12:10583611:C:T,HALLMARK_INFLAMMATORY_RESPONSE,0.0067
12:10583611:C:T,HALLMARK_INTERFERON_GAMMA_RESPONSE,0.0067
12:10583611:C:T,HALLMARK_ESTROGEN_RESPONSE_LATE,0.012
12:10583611:C:T,HALLMARK_IL6_JAK_STAT3_SIGNALING,0.012


## Export gene sets for cytokine responses with enrichment

In [7]:
# Load the hallmark collection (second entry of geneset_files). A placeholder
# gene list is supplied as the known genes; the size bounds are the
# notebook-level constants.
Gene_Sets_DB <- parseGeneSetGMT(
  filepath = geneset_files[2],
  known_genes = c("dummy"),
  sizeBound_Upper = GeneSet_SizeBound_Upper,
  sizeBound_Lower = GeneSet_SizeBound_Lower
)
# Gene sets keyed by their set name for direct lookup below.
input_genesets <- setNames(Gene_Sets_DB$genesets, Gene_Sets_DB$geneset.names)

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849501
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [8]:
# Export the member genes of the TNFA-signaling hallmark set.
write.csv(
  x = input_genesets[["HALLMARK_TNFA_SIGNALING_VIA_NFKB"]],
  file = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/annotation/TNFA_geneset.csv"
)

In [9]:
# Export the member genes of the interferon-gamma response hallmark set.
write.csv(
  x = input_genesets[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]],
  file = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/annotation/IFNG_geneset.csv"
)

In [10]:
# Export the member genes of the IL2/STAT5 signaling hallmark set.
write.csv(
  x = input_genesets[["HALLMARK_IL2_STAT5_SIGNALING"]],
  file = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/annotation/IL2_geneset.csv"
)

In [11]:
# Export the member genes of the IL6/JAK/STAT3 signaling hallmark set.
write.csv(
  x = input_genesets[["HALLMARK_IL6_JAK_STAT3_SIGNALING"]],
  file = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/annotation/IL6_geneset.csv"
)