# Gene set enrichment analyses

Note: this script imports objects generated by the first cells in the csaQTL_Figures notebook.

In [1]:
library(fgsea)
library(GSA)

## Load gene sets

In [2]:
# Gene-set collection files (MSigDB, gene symbols). The position of each entry
# matters: other cells pick a collection by its index in this vector.
geneset_files <- c(
  "/data/srlab/lrumker/MSigDB_Sets/c7.all.v7.0.symbols.gmt.txt", # 1: immune
  "/data/srlab/lrumker/MSigDB_Sets/h.all.v7.0.symbols.gmt.txt",  # 2: hallmark
  "/data/srlab/lrumker/MSigDB_Sets/c2.all.v7.0.symbols.gmt.txt"  # 3: canonical
)

# Inclusive limits on the gene-set sizes taken into consideration.
GeneSet_SizeBound_Upper <- 500 # largest gene-set size considered
GeneSet_SizeBound_Lower <- 0   # smallest gene-set size considered

## Helper functions

In [3]:
# Read a GMT gene-set file and keep only the gene sets whose number of
# recognized genes lies within [sizeBound_Lower, sizeBound_Upper].
#
# Args:
#   filepath: path of the GMT file, handed to GSA.read.gmt().
#   known_genes: vector of gene labels. A member of a gene set counts toward
#     that set's size only if it appears in this vector.
#   sizeBound_Upper: largest recognized-gene count that is retained (inclusive).
#   sizeBound_Lower: smallest recognized-gene count that is retained (inclusive).
#
# Returns:
#   The object produced by GSA.read.gmt(), with `genesets`, `geneset.names` and
#   `geneset.descriptions` restricted to the retained sets, plus a
#   `geneset.sizes` element holding the recognized-gene count of each retained
#   set. Note that `genesets` itself still lists every member of a retained
#   set, recognized or not.
parseGeneSetGMT <- function(filepath, known_genes, sizeBound_Upper = 500, sizeBound_Lower = 0) {
  Gene_Sets_DB <- GSA.read.gmt(filepath)

  # Count recognized genes set by set. Working on each set directly (instead of
  # padding all sets into one NA-filled matrix) copes with an empty collection
  # and with zero-length sets, and never mistakes padding for a gene.
  GeneSet_sizes <- vapply(
    Gene_Sets_DB$genesets,
    function(genes) sum(genes %in% known_genes),
    integer(1),
    USE.NAMES = FALSE
  )

  retain_GeneSet <- (GeneSet_sizes >= sizeBound_Lower) & (GeneSet_sizes <= sizeBound_Upper)

  Gene_Sets_DB$genesets <- Gene_Sets_DB$genesets[retain_GeneSet]
  Gene_Sets_DB$geneset.names <- Gene_Sets_DB$geneset.names[retain_GeneSet]
  Gene_Sets_DB$geneset.descriptions <- Gene_Sets_DB$geneset.descriptions[retain_GeneSet]
  Gene_Sets_DB$geneset.sizes <- GeneSet_sizes[retain_GeneSet]
  return(Gene_Sets_DB)
}

In [4]:
# Run fgsea against one of the gene-set collections listed in the file-level
# `geneset_files` vector.
#
# Args:
#   genesettype: one of "immune", "hallmark" or "canonical"; selects entry 1, 2
#     or 3 of `geneset_files` respectively.
#   rankList: numeric vector of per-gene statistics, named with gene labels.
#     Its names are the gene universe used when sizing the gene sets.
#   filter_output: if TRUE, sort the result by padj, keep only rows with
#     padj < 0.05 and keep only columns 1, 3 and 4 of the fgsea result.
#   GeneSet_SizeBound_Upper, GeneSet_SizeBound_Lower: inclusive bounds on the
#     number of recognized genes per set, forwarded to parseGeneSetGMT().
#
# Returns:
#   The fgsea() result, reduced as described above when filter_output is TRUE.
#
# Raises an error when genesettype is not one of the three labels. (The earlier
# version used a bare `return`, which does not exit the function, so it went on
# to fail with an unrelated "object 'i_file' not found" error.)
run_GSEA_byFile <- function(genesettype, rankList, filter_output = TRUE,
                            GeneSet_SizeBound_Upper = 500, GeneSet_SizeBound_Lower = 0) {
    # Labels are listed in the same order as the entries of `geneset_files`.
    geneset_labels <- c("immune", "hallmark", "canonical")
    i_file <- match(genesettype, geneset_labels)
    if (length(i_file) != 1L || is.na(i_file)) {
        stop("Geneset file label not recognized.", call. = FALSE)
    }

    Gene_Sets_DB <- parseGeneSetGMT(geneset_files[i_file], names(rankList),
                                    GeneSet_SizeBound_Upper, GeneSet_SizeBound_Lower)
    input_genesets <- Gene_Sets_DB$genesets
    names(input_genesets) <- Gene_Sets_DB$geneset.names

    # NOTE(review): fgsea's own size limits are fixed at 15 and 500 here and do
    # not follow the GeneSet_SizeBound_* arguments -- confirm this is intended.
    FGSEA_output <- fgsea(input_genesets, rankList, eps = 0,
                          minSize = 15, maxSize = 500, nproc = 0,
                          gseaParam = 1, BPPARAM = NULL)

    if (filter_output) {
        FGSEA_output <- FGSEA_output[order(FGSEA_output$padj), ]
        # Columns are picked by position; presumably pathway, padj and log2err
        # with the fgsea version used here -- verify if fgsea is upgraded.
        FGSEA_output <- FGSEA_output[which(FGSEA_output$padj < 0.05), c(1, 3, 4)]
    }
    return(FGSEA_output)
}

## Test enrichment for HALLMARK gene sets

In [5]:
# Seed the RNG once, ahead of the per-SNP loop.
set.seed(0)

# For every lead SNP: read its per-gene correlation table, build a ranking
# vector from the `cor` column named by the `gene` column, run the hallmark
# enrichment without filtering, and print the ten rows with the smallest padj.
for (lead_snp in c("11:128070535:A:G", "12:10583611:C:T", "19:16441973:G:A")) {
    res <- read.csv(paste0(
        "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/gwas_NK/cna_res_",
        lead_snp, "_vargene_cors.csv"
    ))
    cors <- setNames(res$cor, res$gene)
    sets <- run_GSEA_byFile("hallmark", cors, filter_output = FALSE)
    sets <- sets[order(sets$padj), ]
    print(head(sets, 10))
}

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849501
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
                               pathway         pval         padj   log2err
 1:   HALLMARK_TNFA_SIGNALING_VIA_NFKB 3.668209e-08 8.436880e-07 0.7195128
 2: HALLMARK_INTERFERON_GAMMA_RESPONSE 3.489197e-03 4.012577e-02 0.4317077
 3:                   HALLMARK_HYPOXIA 9.583741e-03 7.347535e-02 0.3807304
 4:       HALLMARK_IL2_STAT5_SIGNALING 1.774570e-02 1.020378e-01 0.3524879
 5:    HALLMARK_ESTROGEN_RESPONSE_LATE 3.531653e-02 1.353800e-01 0.3217759
 6:     HALLMARK_INFLAMMATORY_RESPONSE 3.330415e-02 1.353800e-01 0.3217759
 7:                 HALLMARK_APOPTOSIS 5.349995e-02 1.757855e-01 0.3217759
 8:                HALLMARK_MYOGENESIS 8.944544e-02 2.571556e-01 0.1999152
 9:           HALLMARK_MITOTIC_SPINDLE 1.021505e-01 2.573427e-01 0.1864326
10:          HALLMARK

## Export gene sets for TNF-alpha and IFN-gamma response

In [6]:
# Load the second collection in geneset_files and turn it into a list of gene
# vectors keyed by gene-set name. The placeholder gene list is only there to
# satisfy parseGeneSetGMT(); presumably no real gene matches it, so with a lower
# size bound of 0 no set is dropped -- verify if the bounds are changed.
Gene_Sets_DB <- parseGeneSetGMT(
    geneset_files[2], c('dummy'),
    GeneSet_SizeBound_Upper, GeneSet_SizeBound_Lower
)
input_genesets <- setNames(Gene_Sets_DB$genesets, Gene_Sets_DB$geneset.names)

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849501
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [7]:
# Write the members of the interferon-gamma response hallmark set to CSV.
write.csv(
    input_genesets[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]],
    file = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/annotation/IFNG_geneset.csv"
)

In [8]:
# Write the members of the TNF-alpha signaling via NF-kB hallmark set to CSV.
write.csv(
    input_genesets[["HALLMARK_TNFA_SIGNALING_VIA_NFKB"]],
    file = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/annotation/TNFA_geneset.csv"
)