-
Notifications
You must be signed in to change notification settings - Fork 2
/
topGOanalysis.R
executable file
·448 lines (326 loc) · 18 KB
/
topGOanalysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
#!/usr/bin/env Rscript
#source("http://bioconductor.org/biocLite.R")
#biocLite("topGO")
library(topGO)
library(memisc)
library(Rgraphviz)
args=(commandArgs(TRUE))
if(length(args) < 5 ){
print("Missing input arguments :" )
print("args[1] Full path to gene-Goterm relationship file")
print("args[2] Output type of gene-Goterm relationship file [blast2GO/argot2/other]")
print("args[4] Full path to candidate genes, e.g DE expressed genes.")
print("args[3] Type of function. This is dependent on the naming in the gene-Goterm relationship file but in most cases it is one of [F/P/C]")
print("args[4] Full path to candidate genes, e.g DE expressed genes.")
print("args[5] Prefix for results for filename of GO analysis")
print("args[6] Full path to background genes [Optional if not entire gene set should be used]")
print("args[7] Ortholog gene relationship [Optional if go terms are associated with another species genes]")
print("")
print("Example start line :")
print("./topGOanalysis.R data/Hmel1.21.cdhit.ARGOT2.GOannotation.txt argot2 P data/GOI.txt firstGenes")
print("")
}
ModelGeneGOrelationshipFile = args[1] #Full path to GOterm annotated genes
GOtermFunction = args[2] #Output type of gene-Goterm relationship file [blast2GO/argot2/other]
significantGenesFile = args[3] #Full path to candidate genes File
backgroundGenesFile = "Not present"
if(length(args) > 3){
backgroundGenesFile = args[4] #Genes that will be considered as background
}
ModelGeneOrthologsFile = "Not present"
if(length(args) > 4){
ModelGeneOrthologsFile = args[5] #Genes that will be considered as background
}
ModelGeneGOrelationshipFileFormat = args[6] #Outputfrom [blast2GO/argot2]
if(length(args) > 5){
ModelGeneOrthologsFile = args[5] #Genes that will be considered as background
}
outName = args[7] # Prefix for results for filename of GO analysis
getGOtermFunctions <- function(ModelGeneGOrelationshipFile,ModelGeneGOrelationshipFileFormat){
if(ModelGeneGOrelationshipFileFormat == "blast2GO"){
# SeqName Hit-Desc GO-Group GO-ID Term
# HMEL025033-PA homeobox protein abdominal-a homolog F GO:0003700 sequence-specific DNA binding transcription factor activity
ModelGeneGOrelationship <- read.table(ModelGeneGOrelationshipFile, header=T, fill=T, sep="\t", quote="", stringsAsFactors=F)
colnames(ModelGeneGOrelationship) <- c("Locus", "Name", "Function", "GOterm", "Description")
}else if(ModelGeneGOrelationshipFileFormat == "argot2"){
# Sequence Aspect GO ID Name Total Score Internal Confidence Information Content
# HMEL015695-PA F GO:0004222 metalloendopeptidase activity 6.728758473824388 0.5 7.3129653461121995
ModelGeneGOrelationship <- read.table(ModelGeneGOrelationshipFile, header=T, fill=T, sep="\t", quote="", stringsAsFactors=F)
colnames(ModelGeneGOrelationship) <- c("Locus", "Function", "GOterm", "Name", "Total Score", "Internal Confidence","Information Content")
}else{
##Asumes that there are three colums with 1.GeneName 2.Function 3. Goterm and it containts a header
# Locus Function GOterm
# HMEL015695-PA F GO:0004222
ModelGeneGOrelationship <- read.table(ModelGeneGOrelationshipFile, header=T, fill=T, sep="\t", quote="", stringsAsFactors=F)
colnames(ModelGeneGOrelationship) <- c("Locus", "Function", "GOterm")
}
Functions = unique(ModelGeneGOrelationship$Function)
return (Functions)
}
getTopGOdata <- function(ModelGeneGOrelationshipFile,ModelGeneGOrelationshipFileFormat,GOtermFunction,significantGenesFile,
backgroundGenesFile = "Not present",ModelGeneOrthologsFile="Not present"){
#############################################################
# Load the ortholog information START
##############################################################
#Data structure specific for genomes found in phytozome.
if(ModelGeneOrthologsFile != "Not present"){
ModelGeneOrthologs <- read.table(ModelGeneOrthologsFile, header=F, fill=T, sep="\t", quote="", stringsAsFactors=F)
#Add names to make it functional for this script
names(ModelGeneOrthologs) <-c("Locus","Ortholog")
}else{
ModelGeneOrthologs <- "Not present"
}
#############################################################
# Load the ortholog information STOP
##############################################################
#############################################################
# Load the background list and Gene of interest information START
##############################################################
#If named "Not present" then all Locuses will be considered
if(backgroundGenesFile != "Not present"){
BackgroundList <- unique(read.table(backgroundGenesFile, header=F)[[1]])
print("")
print(paste("Defining set of locuses that will serve as background from this file",backgroundGenesFile, sep=" : ") )
print(paste("Number of unique locuses ",length(BackgroundList), sep=" : ") )
print("")
print("")
}else{
BackgroundList <- "Not present"
print("")
print(paste("No background locus list is defined. All locuses associated with a GO term will serve as background") )
print("")
print("")
}
#Genes of interest, should be a subset of BackgroundList
GOIList <- read.table(significantGenesFile, header=F)[[1]]
print("")
print(paste("Defining set of locuses that will serve as significant genes from this file",backgroundGenesFile, sep=" : ") )
print(paste("Number of unique locuses ",length(GOIList), sep=" : ") )
print("")
print("")
############################################################
# Load the background list and Gene of interest information STOP
##############################################################
#############################################################
# Load the Gene GOTerm relationship table START
##############################################################
if(ModelGeneGOrelationshipFileFormat == "blast2GO"){
# SeqName Hit-Desc GO-Group GO-ID Term
# HMEL025033-PA homeobox protein abdominal-a homolog F GO:0003700 sequence-specific DNA binding transcription factor activity
ModelGeneGOrelationship <- read.table(ModelGeneGOrelationshipFile, header=T, fill=T, sep="\t", quote="", stringsAsFactors=F)
colnames(ModelGeneGOrelationship) <- c("Locus", "Name", "Function", "GOterm", "Description")
}else if(ModelGeneGOrelationshipFileFormat == "argot2"){
# Sequence Aspect GO ID Name Total Score Internal Confidence Information Content
# HMEL015695-PA F GO:0004222 metalloendopeptidase activity 6.728758473824388 0.5 7.3129653461121995
ModelGeneGOrelationship <- read.table(ModelGeneGOrelationshipFile, header=T, fill=T, sep="\t", quote="", stringsAsFactors=F)
colnames(ModelGeneGOrelationship) <- c("Locus", "Function", "GOterm", "Name", "Total Score", "Internal Confidence","Information Content")
}else{
##Asumes that there are three colums with 1.GeneName 2.Function 3. Goterm and it containts a header
# Locus Function GOterm
# HMEL015695-PA F GO:0004222
ModelGeneGOrelationship <- read.table(ModelGeneGOrelationshipFile, header=T, fill=T, sep="\t", quote="", stringsAsFactors=F)
colnames(ModelGeneGOrelationship) <- c("Locus", "Function", "GOterm")
}
#############################################################
# Load the Gene GOTerm relationship table STOP
##############################################################
#############################################################
# Reformat the ortholog information START
##############################################################
if(ModelGeneOrthologs != "Not present"){
ModelGeneOrthologsRightFormat = getRightOrthoInfo(ModelGeneOrthologs,mRNA2Gene = TRUE)
printOrthoInfo(ModelGeneOrthologsRightFormat)
}else{
ModelGeneOrthologsRightFormat = data.frame(Locus=unique(ModelGeneGOrelationship$Locus),Ortholog=unique(ModelGeneGOrelationship$Locus))
}
#############################################################
# Reformat the ortholog information STOP
##############################################################
#############################################################
# Filter for background list of genes and
# and mark genes of interest (GOI) START
##############################################################
if(BackgroundList == "Not present"){
BackgroundList = ModelGeneOrthologsRightFormat$Locus
}
ModelGeneOrthologsRightFormat = filterGeneList(ModelGeneOrthologsRightFormat,BackgroundList,GOIList)
#printOrthoInfoAfterFiltering(ModelGeneOrthologsRightFormat)
# Run function which filters the files based on BackgroundList and marks based on GOIList
#############################################################
# Filter for background list of genes and
# and mark genes of interest (GOI) STOP
##############################################################
#############################################################
# Change the GOterm table into right format
# and filter for only those present in List START
##############################################################
ModelGeneGOrelationship <- getGOtermFormat(ModelGeneGOrelationship,Function=GOtermFunction,LocusList =ModelGeneOrthologsRightFormat$Ortholog)
printGOtableInfo(ModelGeneGOrelationship)
#############################################################
# Change the GOterm table into right format
# and filter for only those present in List START
##############################################################
#############################################################
# Move the format into the way TopGO wants it.
# and load TopGO START
##############################################################
geneList = getTopGOGenelistFormat(ModelGeneGOrelationship$Locus,ModelGeneOrthologsRightFormat$Ortholog[ModelGeneOrthologsRightFormat$GOI==1])
gene2GOlist =getTopGOGene2GOrelationshipFormat(ModelGeneGOrelationship)
topGOdata <- new("topGOdata", description=outName, ontology="BP", allGenes=geneList, annot = annFUN.gene2GO, gene2GO = gene2GOlist, nodeSize=0)
#############################################################
# Move the format into the way TopGO wants it.
# and load TopGO STOP
##############################################################
return (topGOdata)
}
#statistic tests
printTopGOresults <- function(topGOdata,outName){
test.statFis <- new("classicCount", testStatistic=GOFisherTest, name="Fisher test")
resultFis <- getSigGroups(topGOdata, test.statFis)
resultFisadj <- resultFis
Fisadj <- p.adjust(score(resultFis),method="BH")
score(resultFisadj) <- Fisadj
# test.statKS <- new("classicScore", testStatistic=GOKSTest, name="KS tests")
# resultKS <- getSigGroups(topGOdata, test.statKS)
# resultKSadj <- resultKS
# KSadj <- p.adjust(score(resultKS),method="BH")
# score(resultKSadj) <- KSadj
# test.statElim <- new("elimCount", testStatistic=GOFisherTest, name="Fisher test", cutOff=0.01)
# resultElim <- getSigGroups(topGOdata, test.statElim)
# test.statWei <- new("weightCount", testStatistic=GOFisherTest, name="Fisher test", sigRatio="ratio")
# resultWei <- getSigGroups(topGOdata, test.statWei)
#list <- list(classic=score(resultFis), KS=score(resultKS), elim=score(resultElim), weight=score(resultWei))
#listadj <- list(classicadj=score(resultFisadj), KSadj=score(resultKSadj), elim=score(resultElim), weight=score(resultWei))
#allRes <- GenTable(topGOdata, classicFisher = resultFis, classicKS = resultKS, elimFisher = resultElim, WeightedFisher = resultWei, orderBy="classicFisher", ranksOf="classicFisher", topNodes=30)
#allResadj <- GenTable(topGOdata, classicFisheradj = resultFisadj, classicKSadj = resultKSadj, elimFisher = resultElim, WeightedFisher = resultWei, orderBy="WeightedFisher", ranksOf="classicFisheradj", topNodes=30)
list <- list(classic=score(resultFis))
listadj <- list(classicadj=score(resultFisadj))
allRes <- GenTable(topGOdata, classicFisher = resultFis, orderBy="classicFisher", ranksOf="classicFisher", topNodes=30)
allResadj <- GenTable(topGOdata, classicFisheradj = resultFisadj, orderBy="classicFisheradj", ranksOf="classicFisheradj", topNodes=30)
#output of data
#saves everything in Results order within the folder one is at
if (!file.exists("results")){
dir.create(file.path("results"))
}
write.table(allResadj, file=paste("results/",outName,"_stat_adj.txt",sep=""), row.names=FALSE, col.names=TRUE)
# pdf(paste("results/GO_graphFis_",outName,"_15nodes_adj.pdf",sep=""))
# showSigOfNodes(topGOdata, score(resultFisadj), firstSigNodes=15, useInfo="all")
# dev.off()
write.table(allRes, file=paste("results/",outName,"_stat.txt",sep=""), row.names=FALSE, col.names=TRUE)
pdf(paste("results/P_value_distribution_",outName,".pdf",sep=""))
par(mfrow = c(2,2))
for (nn in names(list)){
p.val <- list[[nn]]
hist(p.val[p.val < 1], br = 50, xlab = "p values",
main = paste("Histogram for method:", nn, sep = " "))
}
dev.off()
pdf(paste("results/GO_graphFis_",outName,"_15nodes.pdf",sep=""))
showSigOfNodes(topGOdata, score(resultFis), firstSigNodes=15, useInfo="all")
dev.off()
#pdf(paste("results/GO_graphFis_TEST_NEW_3_15nodes.pdf",sep=""))
#showSigOfNodes(topGOdata, score(resultFis), firstSigNodes=15, useInfo="all")
#dev.off()
#pdf(paste("results/GO_graphWeiFis_",outName,"_15nodes.pdf",sep=""))
#showSigOfNodes(topGOdata, score(resultWei), firstSigNodes=15, useInfo="all")
#dev.off()
}
getGOtermFormat <- function(ModelGeneGOrelationship, Function="P",LocusList=ModelGeneGOrelationship$Locus,GOtermList=ModelGeneGOrelationship$GOterm){
if(checkFormat(ModelGeneGOrelationship,c("Locus","GOterm","Function"))){
cat("GOterm table format is correct. \n")
#Keep only columns that are interesting for this purpose
ModelGeneGOrelationship = ModelGeneGOrelationship[,c("Locus","GOterm","Function")]
#Filter based on function
ModelGeneGOrelationship = ModelGeneGOrelationship[which(ModelGeneGOrelationship$Function == Function),]
#Filter based on Locus
ModelGeneGOrelationship = ModelGeneGOrelationship[ModelGeneGOrelationship$Locus %in% LocusList,]
#Filter based on GOterm
ModelGeneGOrelationship = ModelGeneGOrelationship[ModelGeneGOrelationship$GOterm %in% GOtermList,]
return (ModelGeneGOrelationship)
}else{
stop("Something wrong with the GOterm table column names")
}
return (NULL)
}
printOrthoInfo <- function(ModelGeneOrthologsRightFormat){
cat("This is the informtion about the orthologs\n")
print(head(ModelGeneOrthologsRightFormat))
cat("Nr of unique Locuses:\n")
cat(length(unique(ModelGeneOrthologsRightFormat$Locus)))
cat("\nNr of unique orthologs:\n")
cat(length(unique(ModelGeneOrthologsRightFormat$Ortholog)))
cat("\n\n")
}
printOrthoInfoAdvanced <- function(ModelGeneOrthologsRightFormat){
cat("This is the informtion about the orthologs\n")
print(head(ModelGeneOrthologsRightFormat))
cat("Nr of unique Locuses:\n")
cat(length(unique(ModelGeneOrthologsRightFormat$Locus)))
cat("\nNr of unique orthologs:\n")
cat(length(ModelGeneOrthologsRightFormat$GOI==1))
cat("\n\n")
}
printGOtableInfo <- function(ModelGeneOrthologsRightFormat){
cat("This is the informtion about the GOtable\n")
print(head(ModelGeneOrthologsRightFormat))
cat("\nNr of unique Functions:\n")
cat(length(unique(ModelGeneOrthologsRightFormat$Function)))
cat("\nNr of unique Locuses:\n")
cat(length(unique(ModelGeneOrthologsRightFormat$Locus)))
cat("\nNr of unique GOterms:\n")
cat(length(unique(ModelGeneOrthologsRightFormat$GOterm)))
cat("\n\n")
}
checkFormat <- function(dat,Mandatory){
found = Mandatory %in% colnames(dat)
return (all(found))
}
getRightOrthoInfo <- function(ModelGeneOrthologs,mRNA2Gene=FALSE){
if(checkFormat(ModelGeneOrthologs,c("Locus","Ortholog"))){
cat("Ortholog format is correct. Getting right Ortho info\n")
#Keep only interesting columns
ModelGeneOrthologs <- ModelGeneOrthologs[ , c("Locus","Ortholog")]
#Remove rows where there is no ortholog
ModelGeneOrthologs = ModelGeneOrthologs[ModelGeneOrthologs$Ortholog>0 , ]
if(mRNA2Gene){
##Going from isoform to gene not always applicable
ModelGeneOrthologs$Ortholog =getLocusFromGeneName(ModelGeneOrthologs$Ortholog)
}
return (ModelGeneOrthologs)
}else{
stop("Something wrong with the Ortholog column names")
}
return (NULL)
}
getLocusFromGeneName <-function(GeneName){
#This assumes geneName to be AT1GXXXXX.1
Raw = unlist(strsplit(GeneName,"[.]") )
Locus = Raw[seq(1,length(Raw)-1,by=2)]
return (Locus)
}
filterGeneList <- function(GenesWithOrthologs, SubsetList=GenesWithOrthologs$Locus, GOIList){
#GenesWithOrthologs should be a two row data.frame with names "Locus" "Ortholog"
#First remove all GeneNames that is not in the SubsetList
FilteredGenesWithOrthologs <- GenesWithOrthologs[GenesWithOrthologs$Locus %in% SubsetList,]
#Add column if Gene with ortholog is interesting
FilteredGenesWithOrthologs$GOI <- as.integer(FilteredGenesWithOrthologs$Locus %in% GOIList)
return(FilteredGenesWithOrthologs)
}
getTopGOGenelistFormat <- function(Genes,GenesOfInterest){
#First step
Genes = unique(Genes)
GenesGOI <- factor(as.integer(Genes %in% GenesOfInterest))
names(GenesGOI) <- Genes
return (GenesGOI)
}
getTopGOGene2GOrelationshipFormat <- function(ModelGeneGOrelationship){
gene2GOlist_sb <- split(ModelGeneGOrelationship$GOterm, ModelGeneGOrelationship$Locus)
gene2GOlist_sb <- lapply(gene2GOlist_sb, unique) #remove duplicates
return (gene2GOlist_sb)
}
topGOdata = getTopGOdata(ModelGeneGOrelationshipFile,ModelGeneGOrelationshipFileFormat,GOtermFunction,significantGenesFile,backgroundGenesFile=backgroundGenesFile)
printTopGOresults(topGOdata,outName)
# set workddir for the RNAs
#nodeSize: prunes GO hierarchy for less than X genes for one GO term
#statistic tests
#Fis & KS adjusted, others not adjusted