In [None]:
# Run the over-representation analyses (GO and KEGG) on the impact ratios.
# Creates:
#  SupplTable-S8-overrepresentation-analysis.xls

In [1]:
library(org.Hs.eg.db) # gene annotation, for mapping symbols to entrez/ensembl
library(clusterProfiler) # GO enrichment analysis
library(xlsx)
library(parallel)

Loading required package: AnnotationDbi

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: Biobase

Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


Loading required package: IRanges

Loading required package: S4Vectors

In [275]:
# Project root as an absolute path; the trailing slash is needed because
# sub-paths are built with paste0().
basePath <- "/data/bcu_projects/MelBrainSys_PostdocProject_Gruetzmann/publications/2022-my-MelBrainSys-paper/scripts-etc-for-publication/"
# regNet sub-directory of the project root
myPath <- paste0(basePath, "regNet/")
# relative file names used by the readRDS() calls in this notebook resolve against the project root
setwd(basePath)

In [276]:
# directory that receives the supplementary table written by this notebook
outDirectory <- paste0(basePath, "FiguresTables/")

In [277]:
# Load the met. pair annotation and keep the two elements used in this notebook:
#   samplePairPerSubgroup - indexed by subgroup ID to get the sample pairs of that subgroup
#   subgroupNames         - indexed by subgroup ID to get a descriptive subgroup name
tmp <- readRDS(file = "annotation/samplePairs-annotation-colors-clusters.rds")
samplePairPerSubgroup <- tmp$samplePairPerSubgroup
subgroupNames <- tmp$subgroupNames

In [278]:
# Impact-ratio matrix: rows are target genes (row names = gene symbols),
# columns are sample pairs.
tmp <- readRDS(file = "metPairs-impactRatios-onAllTargetGenes.rds")
impRatios <- tmp$meanLogMedianImpRatiosMat
head(impRatios)

Unnamed: 0,P03_BLun,P04_BSki_1,P08_BSof_1,P08_BSof_2,P08_BSof_3,P16_BLun,P18_BLun_1,P18_BLun_2,P39_BLun,P42_BLym_1,P42_BLym_2
NOC2L,-0.5131848,0.2623647,-0.8237658,-0.04970504,-0.3760514,0.197820589,-0.29186426,-0.6056519,-1.0009443,0.427153,-0.31009925
KLHL17,-0.375472,0.5195951,-0.8366981,-0.26829236,-0.3536398,-0.008449568,-0.90867937,-0.5893162,-1.1208155,0.3428841,-0.29978671
HES4,-1.0220673,0.3757407,-0.7503114,-0.014979,0.2059771,-1.191426658,-0.16027045,-0.816423,-1.9369733,-1.0808609,-0.5177954
ISG15,-0.4778416,0.2964649,-0.855836,0.30105722,0.0270101,0.398693258,0.0584264,-0.3556355,-0.8483902,0.3199234,-0.3710177
AGRN,-0.8571891,0.5567957,-0.9730449,0.11305796,-0.3439669,-0.111755277,-0.09810649,-0.4588938,-1.0630752,0.6730077,-0.09384669
C1orf159,-0.4562156,0.3341612,-0.8622403,-0.06471666,-0.6813369,0.540506357,-0.21481722,-0.4516024,-1.1938917,0.41877,0.25238287


In [280]:
# symbols of all genes in the impact-ratio matrix
allGenesNWs <- rownames(impRatios)

In [281]:
# mapping all genes to Entrez
# "ENTREZID"
allGenesNWsMapped <- bitr(allGenesNWs, fromType = "SYMBOL", toType = c("ENSEMBL", "ENTREZID"),
                          OrgDb = org.Hs.eg.db)
# 5.2% failed mapping (when only mapping on ENSEMBL/ENTREZ it is 5.16 and 5.18%)
# NOTE(review): the warning recorded with this cell reports 6.08%, so the figures above may be stale.
# A symbol can map to several IDs (1:many mapping); keep only the first row per
# symbol so that SYMBOL can serve as a unique row name.
isDuplicatedSymbol <- duplicated(allGenesNWsMapped$SYMBOL)
sum(isDuplicatedSymbol) # 324 duplicated, remove the duplicates
# Logical indexing stays correct when nothing is duplicated, whereas
# x[-which(...), ] with an empty which() result would drop every row.
allGenesNWsMapped <- allGenesNWsMapped[!isDuplicatedSymbol, ]
rownames(allGenesNWsMapped) <- allGenesNWsMapped$SYMBOL
head(allGenesNWsMapped, 2)

'select()' returned 1:many mapping between keys and columns

“6.08% of input gene IDs are fail to map...”


Unnamed: 0_level_0,SYMBOL,ENSEMBL,ENTREZID
Unnamed: 0_level_1,<chr>,<chr>,<chr>
NOC2L,NOC2L,ENSG00000188976,26155
KLHL17,KLHL17,ENSG00000187961,339451


In [301]:
# GO ID / descr mapping:
# Build a lookup vector from category ID to category description.
#   res:     data frame with (at least) the columns "ID" and "Description"
#   returns: vector of descriptions named by ID; if an ID occurs several
#            times, the description of its first occurrence is kept
goIDdescrMap <- function(res) {
    idAndDescr <- res[, c("ID", "Description")]
    firstOccurrence <- !duplicated(idAndDescr$ID)
    setNames(idAndDescr$Description[firstOccurrence], idAndDescr$ID[firstOccurrence])
}

In [283]:
# Over-representation analysis (GO MF/CC/BP and KEGG) for one gene set.
#   geneSet:     gene symbols, used to index the rows of allGenesSet by name
#   allGenesSet: data frame with symbols as row names and the columns "ENSEMBL"
#                and "ENTREZID"; all of its IDs are used as background universe
#   maxQval:     q-value cutoff passed to enrichGO() / enrichKEGG()
#   returns:     data frame with the columns categ ("MF", "CC", "BP" or "KEGG") and
#                num_genes (size of the mapped gene set) followed by the columns of
#                the enrichment result, or NULL if no category passed the cutoff
calcOverrepr1set <- function(geneSet, allGenesSet, maxQval) {
    # symbols that are not a row name of allGenesSet yield NA and are dropped
    geneSetEnsg <- na.omit(allGenesSet[geneSet, "ENSEMBL"])
    geneSetEntr <- na.omit(allGenesSet[geneSet, "ENTREZID"])
    res <- NULL
    for (goCateg in c("MF", "CC", "BP")) {
        go <- enrichGO(gene = geneSetEnsg, universe = allGenesSet$ENSEMBL,
                       OrgDb = org.Hs.eg.db, keyType = 'ENSEMBL', ont = goCateg,
                       pAdjustMethod = "BH", pvalueCutoff = 1, qvalueCutoff = maxQval,
                       readable = TRUE)
        # `&&` short-circuits, so nrow() is never evaluated on a NULL result; the
        # vectorised `&` produced a zero-length condition (an error in `if`) in that case
        if (!is.null(go) && nrow(go) > 0) {
            res <- rbind(res, data.frame(stringsAsFactors = FALSE, categ = goCateg,
                                         num_genes = length(geneSetEnsg), go))
        }
    }
    kegg <- enrichKEGG(gene = geneSetEntr, universe = allGenesSet$ENTREZID, qvalueCutoff = maxQval,
                       keyType = 'ncbi-geneid', organism = 'hsa', pvalueCutoff = 1)
    if (!is.null(kegg) && nrow(kegg) > 0) {
        res <- rbind(res, data.frame(stringsAsFactors = FALSE, categ = "KEGG",
                                     num_genes = length(geneSetEntr), kegg))
    }
    res
}

In [284]:
# Over-representation analysis of the most extreme genes of every sample pair.
#   impRatios:   numeric matrix, rows = genes (row names = symbols), columns = sample pairs
#   decreasing:  TRUE  -> take the genes with the highest values and keep only values > 0
#                FALSE -> take the genes with the lowest values and keep only values < 0
#   nbCPUs:      number of cores passed to mclapply()
#   percentile:  percentage of genes to take; the number of genes is this percentage of
#                nrow(allGenesSet), i.e. of the mapped genes, not of nrow(impRatios)
#   allGenesSet: symbol -> ID mapping handed to calcOverrepr1set(); defaults to the
#                global allGenesNWsMapped, which earlier versions always used
#   maxQval:     q-value cutoff handed to calcOverrepr1set(); defaults to the previously
#                hard-coded 0.1
#   returns:     data frame of all enriched categories with a leading samplePair column,
#                or NULL if no sample pair had any enriched category
calcOverReprWcutoff <- function(impRatios, decreasing, nbCPUs, percentile,
                                allGenesSet = allGenesNWsMapped, maxQval = 0.1) {
    # identical for every sample pair, so compute it once
    nbTopGenes <- round(percentile * nrow(allGenesSet) / 100)
    GOres <- mclapply(mc.cores = nbCPUs, X = colnames(impRatios), FUN = function(samplePair) {
        topGenes <- head(sort(impRatios[, samplePair], decreasing = decreasing), nbTopGenes)
        # keep only genes on the requested side of zero
        if (decreasing) {
            topGenes <- topGenes[which(topGenes > 0)]
        } else {
            topGenes <- topGenes[which(topGenes < 0)]
        }
        message(samplePair, " ", length(topGenes), " topGenes")
        if (length(topGenes) == 0) {
            return(NULL)
        }
        enriched <- calcOverrepr1set(geneSet = names(topGenes), allGenesSet = allGenesSet,
                                     maxQval = maxQval)
        if (is.null(enriched)) {
            return(NULL)
        }
        data.frame(stringsAsFactors = FALSE, samplePair = samplePair, enriched)
    })
    # Forked mclapply() workers return errors as "try-error" objects instead of
    # raising them; fail loudly rather than rbind-ing error strings into the table.
    failed <- vapply(GOres, inherits, logical(1), what = "try-error")
    if (any(failed)) {
        stop("over-representation analysis failed for: ",
             paste(colnames(impRatios)[failed], collapse = ", "),
             "; first error: ", as.character(GOres[[which(failed)[1]]]), call. = FALSE)
    }
    do.call(rbind, GOres)
}

In [285]:
# fixed seed for reproducible results
set.seed(42)

In [311]:
# parallelization doesn't work, enrichment functions break with error "database disk image is malformed"
nbCPUs <- 1
# lower 5%: genes with the most negative impact ratios of every sample pair
# (takes 15-30min)
allRevMeanGOres <- calcOverReprWcutoff(
    impRatios = impRatios,
    decreasing = FALSE,
    nbCPUs = nbCPUs,
    percentile = 5
)

P03_BLun 388 topGenes

P04_BSki_1 388 topGenes

P08_BSof_1 388 topGenes

P08_BSof_2 388 topGenes

P08_BSof_3 388 topGenes

P16_BLun 388 topGenes

P18_BLun_1 388 topGenes

P18_BLun_2 388 topGenes

P39_BLun 388 topGenes

P42_BLym_1 358 topGenes

P42_BLym_2 388 topGenes



In [312]:
# upper 5%: genes with the most positive impact ratios of every sample pair
# (takes 15-30min)
allFwdMeanGOres <- calcOverReprWcutoff(
    impRatios = impRatios,
    decreasing = TRUE,
    nbCPUs = nbCPUs,
    percentile = 5
)

P03_BLun 388 topGenes

P04_BSki_1 388 topGenes

P08_BSof_1 45 topGenes

P08_BSof_2 388 topGenes

P08_BSof_3 332 topGenes

P16_BLun 388 topGenes

P18_BLun_1 388 topGenes

P18_BLun_2 156 topGenes

P39_BLun 67 topGenes

P42_BLym_1 388 topGenes

P42_BLym_2 388 topGenes



In [313]:
# first and last two rows of the upper-5% result
head(allFwdMeanGOres, n = 2)
tail(allFwdMeanGOres, n = 2)

Unnamed: 0_level_0,samplePair,categ,num_genes,ID,Description,GeneRatio,BgRatio,pvalue,p.adjust,qvalue,geneID,Count
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<int>
GO:0009055,P03_BLun,MF,362,GO:0009055,electron transfer activity,16/348,67/7474,4.436038e-08,1.425754e-05,1.389729e-05,ADH5/SDHC/NDUFS8/COX6B1/NDUFA8/COX4I1/NDUFA4/NDUFB3/AKR1B1/NDUFB7/NDUFB9/COX5B/COX6A1/NDUFB4/NDUFA1/NDUFV2,16
GO:0015453,P03_BLun,MF,362,GO:0015453,oxidoreduction-driven active transmembrane transporter activity,13/348,44/7474,5.410829e-08,1.425754e-05,1.389729e-05,NDUFS8/COX6B1/NDUFA8/COX4I1/NDUFA4/NDUFB3/NDUFB7/NDUFB9/COX5B/COX6A1/NDUFB4/NDUFA1/NDUFV2,13


Unnamed: 0_level_0,samplePair,categ,num_genes,ID,Description,GeneRatio,BgRatio,pvalue,p.adjust,qvalue,geneID,Count
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<int>
GO:0050729,P42_BLym_2,BP,371,GO:0050729,positive regulation of inflammatory response,8/344,54/7283,0.003539606,0.1061455,0.09902026,STAT5B/AIM2/ADAM8/ALOX5AP/LGALS1/IL15/CEBPA/CCR7,8
GO:00507762,P42_BLym_2,BP,371,GO:0050776,regulation of immune response,30/344,379/7283,0.003544456,0.1061455,0.09902026,LGALS9/TRAFD1/DDX3X/NOD2/RBCK1/PRKCH/SLAMF6/GATA3/PRKD2/TLR6/SKAP1/ICOSLG/TRAF6/BTN3A3/RNF31/IRF1/STAT5B/AIM2/CD79B/ADAM8/ARG2/PTPN22/DUSP3/IL15/BCL10/PAK1/CRKL/CCR7/EIF2AK4/SERPINB9,30


In [329]:
# save for paper: the first call creates the workbook (append = FALSE), the second
# call adds a further sheet to the same file (append = TRUE)
write.xlsx(allFwdMeanGOres,
           file = paste0(outDirectory, "SupplTable-S8-overrepresentation-analysis.xls"),
           sheetName = "upper 5%", row.names = FALSE, append = FALSE)
write.xlsx(allRevMeanGOres,
           file = paste0(outDirectory, "SupplTable-S8-overrepresentation-analysis.xls"),
           sheetName = "lower 5%", row.names = FALSE, append = TRUE)

### Exclusive categories
Categories (GO/KEGG) that are over-represented in at least two met. pairs of one subgroup and in no met. pair of any other subgroup.

In [315]:
# ID -> description lookup over the upper- and lower-5% results
goIDdescr <- goIDdescrMap(res = rbind(allFwdMeanGOres, allRevMeanGOres))

In [316]:
# both result tables, keyed by the name of the gene list they were computed from
FwdAndRevResults <- list(
    "upper 5%" = allFwdMeanGOres,
    "lower 5%" = allRevMeanGOres
)

In [317]:
# Collect, per gene list (upper/lower 5%) and subgroup, the categories that are
# over-represented in at least two sample pairs of that subgroup.
sharedGOs <- NULL # contains all results that are in >= 2 samples
# anyGOs: for each list (upper/lower), and each SG, which GO terms are overrepresented
# needed later to get exclusive GO terms
# Every entry starts as an empty list: assigning into NULL with `[[<-` can create an
# atomic vector instead of a list when the first value has length one, and the
# assignment for the next subgroup then fails.
anyGOs <- lapply(FwdAndRevResults, function(unused) list())
for (geneList in names(FwdAndRevResults)) {
    message(geneList)
    goRes <- FwdAndRevResults[[geneList]]
    for (SG in names(subgroupNames)) {
        SPs <- samplePairPerSubgroup[[SG]]
        inSubgroup <- goRes$samplePair %in% SPs
        anyGOs[[geneList]][[SG]] <- unique(sort(goRes$ID[inSubgroup]))
        # number of sample pairs of this subgroup in which each category was found
        frqGO <- sort(table(goRes$ID[inSubgroup]), decreasing = TRUE)
        frqGO <- frqGO[frqGO > 1]
        message("subgroup ", paste0(SG, ": ", paste0(SPs, collapse = " ")),
                ", ", length(frqGO), " shared categories")
        if (length(frqGO) > 0) {
            tmp <- data.frame(stringsAsFactors = FALSE, gene_list = geneList,
                              subgroup = SG, num_sample_pairs = length(SPs),
                              num_sample_pairs_sharing = as.numeric(frqGO),
                              GO_ID = names(frqGO), description = goIDdescr[names(frqGO)])
            # log at most the first 10 rows as a "|"-separated table
            out <- ""
            for (i in seq_len(min(10, nrow(tmp)))) {
                out <- paste0(out, paste0(" | ", tmp[i, ], collapse = ""), " |\n")
            }
            message(out)
            sharedGOs <- rbind(sharedGOs, tmp)
        }
    }
}

upper 5%

subgroup SG1: P04_BSki_1 P08_BSof_2 P16_BLun P42_BLym_1, 2 shared categories

 | upper 5% | SG1 | 4 | 2 | GO:1903053 | regulation of extracellular matrix organization |
 | upper 5% | SG1 | 4 | 2 | GO:1903055 | positive regulation of extracellular matrix organization |


subgroup SG2: P08_BSof_1 P18_BLun_2 P39_BLun, 0 shared categories

subgroup SG3: P03_BLun P08_BSof_3 P18_BLun_1 P42_BLym_2, 34 shared categories

 | upper 5% | SG3 | 4 | 2 | GO:0001775 | cell activation |
 | upper 5% | SG3 | 4 | 2 | GO:0002253 | activation of immune response |
 | upper 5% | SG3 | 4 | 2 | GO:0002429 | immune response-activating cell surface receptor signaling pathway |
 | upper 5% | SG3 | 4 | 2 | GO:0002684 | positive regulation of immune system process |
 | upper 5% | SG3 | 4 | 2 | GO:0002694 | regulation of leukocyte activation |
 | upper 5% | SG3 | 4 | 2 | GO:0002696 | positive regulation of leukocyte activation |
 | upper 5% | SG3 | 4 | 2 | GO:0002757 | immune response-activating signal tra

In [318]:
# first three shared categories
head(sharedGOs, n = 3)

Unnamed: 0_level_0,gene_list,subgroup,num_sample_pairs,num_sample_pairs_sharing,GO_ID,description
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<chr>,<chr>
GO:1903053,upper 5%,SG1,4,2,GO:1903053,regulation of extracellular matrix organization
GO:1903055,upper 5%,SG1,4,2,GO:1903055,positive regulation of extracellular matrix organization
GO:0001775,upper 5%,SG3,4,2,GO:0001775,cell activation


In [319]:
# From the shared categories keep those that are exclusive to their subgroup, i.e.
# that were not found in any sample pair of the other subgroups (see anyGOs).
exclSharedGOs <- NULL
for (geneList in unique(sharedGOs$gene_list)) {
    message(geneList)
    for (SG in unique(sharedGOs$subgroup)) {
        otherSGs <- setdiff(names(subgroupNames), SG)
        # every category seen in at least one sample pair of the other subgroups
        unwantedGOs <- unique(unlist(anyGOs[[geneList]][otherSGs]))
        isCurrentGroup <- sharedGOs$gene_list == geneList & sharedGOs$subgroup == SG
        tmp <- sharedGOs[isCurrentGroup, ]
        a <- length(unique(tmp$GO_ID))
        isExclusive <- !(tmp$GO_ID %in% unwantedGOs)
        tmp <- tmp[isExclusive, ]
        b <- length(unique(tmp$GO_ID))
        message("  ", SG, " -> other SGs: ", paste0(otherSGs, collapse = " "), ", ",
                length(unwantedGOs), " unwanted GOs, ", a, " before ", b, " after removal")
        exclSharedGOs <- rbind(exclSharedGOs, tmp)
    }
}

upper 5%

  SG1 -> other SGs: SG2 SG3, 279 unwanted GOs, 2 before 2 after removal

  SG3 -> other SGs: SG1 SG2, 137 unwanted GOs, 34 before 29 after removal

  SG2 -> other SGs: SG1 SG3, 366 unwanted GOs, 0 before 0 after removal

lower 5%

  SG1 -> other SGs: SG2 SG3, 470 unwanted GOs, 283 before 72 after removal

  SG3 -> other SGs: SG1 SG2, 761 unwanted GOs, 13 before 1 after removal

  SG2 -> other SGs: SG1 SG3, 722 unwanted GOs, 90 before 1 after removal



In [322]:
# first three exclusive categories
head(exclSharedGOs, n = 3)

Unnamed: 0_level_0,gene_list,subgroup,num_sample_pairs,num_sample_pairs_sharing,GO_ID,description
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<chr>,<chr>
GO:1903053,upper 5%,SG1,4,2,GO:1903053,regulation of extracellular matrix organization
GO:1903055,upper 5%,SG1,4,2,GO:1903055,positive regulation of extracellular matrix organization
GO:0001775,upper 5%,SG3,4,2,GO:0001775,cell activation


In [326]:
# number of shared categories per subgroup and gene list
table(sharedGOs[["subgroup"]], sharedGOs[["gene_list"]])

     
      lower 5% upper 5%
  SG1      283        2
  SG2       90        0
  SG3       13       34

In [323]:
# number of exclusive categories per subgroup and gene list
table(exclSharedGOs[["subgroup"]], exclSharedGOs[["gene_list"]])

     
      lower 5% upper 5%
  SG1       72        2
  SG2        1        0
  SG3        1       29

In [324]:
# Keep the subgroup ID in its own column, then replace `subgroup` by a descriptive label.
# Not idempotent: a second run would look up the labels instead of the IDs.
exclSharedGOs$subgroup_ID <- exclSharedGOs$subgroup
exclSharedGOs$subgroup <- paste(subgroupNames[exclSharedGOs$subgroup_ID], "in brain")

In [327]:
# column order for the supplementary table
exclSharedGOs <- exclSharedGOs[, c(
    'gene_list', 'subgroup', 'subgroup_ID',
    'num_sample_pairs', 'num_sample_pairs_sharing',
    'GO_ID', 'description'
)]
head(exclSharedGOs)
# cross-check that every label belongs to exactly one subgroup ID
table(exclSharedGOs[["subgroup"]], exclSharedGOs[["subgroup_ID"]])

Unnamed: 0_level_0,gene_list,subgroup,subgroup_ID,num_sample_pairs,num_sample_pairs_sharing,GO_ID,description
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<dbl>,<chr>,<chr>
GO:1903053,upper 5%,higher in brain,SG1,4,2,GO:1903053,regulation of extracellular matrix organization
GO:1903055,upper 5%,higher in brain,SG1,4,2,GO:1903055,positive regulation of extracellular matrix organization
GO:0001775,upper 5%,slightly lower in brain,SG3,4,2,GO:0001775,cell activation
GO:0002253,upper 5%,slightly lower in brain,SG3,4,2,GO:0002253,activation of immune response
GO:0002429,upper 5%,slightly lower in brain,SG3,4,2,GO:0002429,immune response-activating cell surface receptor signaling pathway
GO:0002694,upper 5%,slightly lower in brain,SG3,4,2,GO:0002694,regulation of leukocyte activation


                         
                          SG1 SG2 SG3
  higher in brain          74   0   0
  lower in brain            0   1   0
  slightly lower in brain   0   0  30

In [330]:
# add the exclusive categories as a further sheet to the existing workbook
write.xlsx(exclSharedGOs,
           file = paste0(outDirectory, "SupplTable-S8-overrepresentation-analysis.xls"),
           sheetName = "exclusive categories", row.names = FALSE, append = TRUE)