# Identify gene sets enriched in cell populations associated with genetic risk

In [None]:
library(fgsea)
library(GSA)

In [None]:
# Paths to the MSigDB gene set collections; run_GSEA_byFile selects one of
# these by position, so the order matters.
geneset_files <- c(
  "/data/srlab/lrumker/MSigDB_Sets/c7.all.v7.0.symbols.gmt.txt", # 1: Immune
  "/data/srlab/lrumker/MSigDB_Sets/h.all.v7.0.symbols.gmt.txt",  # 2: Hallmark
  "/data/srlab/lrumker/MSigDB_Sets/c2.all.v7.0.symbols.gmt.txt"  # 3: Canonical
)

# Bounds on the gene set sizes considered
GeneSet_SizeBound_Upper <- 500 # maximum
GeneSet_SizeBound_Lower <- 0   # minimum

## Helper functions

In [None]:
# Read a GMT gene set file and keep only the gene sets whose number of
# recognized genes lies within the requested bounds (inclusive).
#
# Args:
#   filepath:        path to the GMT file, handed to GSA.read.gmt().
#   known_genes:     vector of gene labels that count as "recognized".
#   sizeBound_Upper: largest recognized-gene count a retained set may have.
#   sizeBound_Lower: smallest recognized-gene count a retained set may have.
#
# Returns:
#   The list produced by GSA.read.gmt() with `genesets`, `geneset.names` and
#   `geneset.descriptions` subset to the retained sets, plus `geneset.sizes`
#   holding the recognized-gene count of each retained set. The retained gene
#   sets themselves are not trimmed: they still list unrecognized genes.
parseGeneSetGMT <- function(filepath, known_genes, sizeBound_Upper=500, sizeBound_Lower=0){
  Gene_Sets_DB <- GSA.read.gmt(filepath)

  # Count recognized genes set by set. Working on each set directly (instead
  # of an NA-padded matrix) keeps empty gene sets and empty files from
  # failing, and keeps padding from being counted when known_genes holds NA.
  GeneSet_sizes <- vapply(
    Gene_Sets_DB$genesets,
    function(genes) sum(genes %in% known_genes),
    integer(1),
    USE.NAMES = FALSE
  )

  retain_GeneSet <- (GeneSet_sizes >= sizeBound_Lower) &
    (GeneSet_sizes <= sizeBound_Upper)

  Gene_Sets_DB$genesets <- Gene_Sets_DB$genesets[retain_GeneSet]
  Gene_Sets_DB$geneset.names <- Gene_Sets_DB$geneset.names[retain_GeneSet]
  Gene_Sets_DB$geneset.descriptions <- Gene_Sets_DB$geneset.descriptions[retain_GeneSet]
  Gene_Sets_DB$geneset.sizes <- GeneSet_sizes[retain_GeneSet]
  return(Gene_Sets_DB)
}

In [None]:
# Run fgsea for one gene set collection against a ranked gene list.
#
# Args:
#   genesettype:   one of the strings "immune", "hallmark" or "canonical";
#                  selects which entry of the global `geneset_files` is read.
#   rankList:      vector of per-gene values, named with gene labels.
#   filter_output: if TRUE, sort the results by `padj` and keep only rows
#                  with padj < 0.05.
#   GeneSet_SizeBound_Upper, GeneSet_SizeBound_Lower:
#                  bounds forwarded to parseGeneSetGMT() when pre-filtering
#                  gene sets by their number of recognized genes.
#
# Returns:
#   The fgsea() result, optionally sorted and filtered as described above.
#
# Stops with an error when `genesettype` is not one of the accepted labels.
run_GSEA_byFile <- function(genesettype, rankList, filter_output=TRUE,
                            GeneSet_SizeBound_Upper = 500, GeneSet_SizeBound_Lower = 0){
    # Position of each collection within `geneset_files`; switch() yields
    # NULL when the label matches none of the alternatives.
    i_file <- switch(genesettype,
                     immune = 1,
                     hallmark = 2,
                     canonical = 3)
    if (is.null(i_file)) {
        stop("Geneset file label not recognized.", call. = FALSE)
    }

    Gene_Sets_DB <- parseGeneSetGMT(geneset_files[i_file], names(rankList),
                                    GeneSet_SizeBound_Upper, GeneSet_SizeBound_Lower)
    input_genesets <- Gene_Sets_DB$genesets
    names(input_genesets) <- Gene_Sets_DB$geneset.names

    # NOTE: minSize/maxSize given to fgsea are fixed here and do not follow
    # the GeneSet_SizeBound_* arguments. nperm is deliberately not passed
    # (a previous setting of 10000 is disabled).
    FGSEA_output <- fgsea(input_genesets, rankList,
                          minSize = 15, maxSize = 500, nproc = 0,
                          gseaParam = 1, BPPARAM = NULL)

    if (filter_output) {
        FGSEA_output <- FGSEA_output[order(FGSEA_output$padj),]
        # which() drops rows whose padj is NA as well as those >= 0.05
        FGSEA_output <- FGSEA_output[which(FGSEA_output$padj<0.05),]
    }
    return(FGSEA_output)
}

## Test enrichment for SLE PRS-associated phenotype

In [None]:
# Fix the RNG seed before running the enrichment
set.seed(0)

# Per-gene values for the SLE PRS-associated myeloid phenotype; the `cor`
# column is used as the ranking statistic and `gene` as its labels.
res <- read.csv("/data/srlab/lrumker/MCSC_Project/cna-prs/results/PRS/SLE_Myeloid_vargenecors.csv")
cors <- setNames(res$cor, res$gene)

# Enrichment against the Hallmark collection; show the first result row
sets <- run_GSEA_byFile("hallmark", cors)
print(head(sets, 1))

### Export interferon response gene set

In [None]:
# Re-read the second entry of geneset_files (the Hallmark collection)
Gene_Sets_DB <- GSA.read.gmt(geneset_files[2])

# Locate the interferon alpha response set by name and pull out its genes
is_ifna_set <- Gene_Sets_DB$geneset.names == "HALLMARK_INTERFERON_ALPHA_RESPONSE"
ifna_genes <- Gene_Sets_DB$genesets[[which(is_ifna_set)]]

write.csv(ifna_genes,
          file = "/data/srlab/lrumker/MCSC_Project/cna-prs/results/PRS/ifna_geneset.csv")