/
diffbind2.R
335 lines (284 loc) · 18.3 KB
/
diffbind2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
#####################################
##
## What: diffbind2.R
## Who : Sergi Sayols, Frank Rühle
## When: 10-05-2016, updated 03-03-2021
##
## Script to perform differential binding analysis between 2 conditions (pairwise) using DiffBind v2
##
## Args:
## -----
## targets=targets.txt # file describing the targets.
## contrasts=contrasts.txt # file describing the contrasts
## cwd=./ # current working directory where the files .tsv files are located
## bams=paste0(CWD, "/mapped") # directory with the bam files
## peaks=paste0(CWD, "/results/macs2") # directory with peak caller output
## out=paste0(CWD, "/results") # prefix filename for output
## fragsize=200 # average fragment size
## substractControl=TRUE # substract input
## fullLibrarySize=TRUE # use total number of reads in bam for normalization (FALSE=only peaks)
## tagwiseDispersion=TRUE # calculate dispersion tagwise (use FALSE if no replicates)
## annotate=TRUE # annotate after DB analysis?
## pe=FALSE # paired end experiment?
## tss=c(-3000,3000) # region around the tss
## txdb=TxDb.Mmusculus.UCSC.mm9.knownGene # Bioconductor transcript database, for annotation
## annodb=org.Mm.eg.db # Bioconductor gene annotation database
##
######################################
options(stringsAsFactors=FALSE)
library(DiffBind)
library(openxlsx)
library(dplyr)
library(ggplot2)
##
## get arguments from the command line
##
parseArgs <- function(args,string,default=NULL,convert="as.character") {
if(length(i <- grep(string,args,fixed=T)) == 1)
return(do.call(convert,list(gsub(string,"",args[i]))))
if(!is.null(default)) default else do.call(convert,list(NA))
}
run_custom_code <- function(x) {
eval(parse(text=x))
}
args <- commandArgs(T)
DIFFBINDVERSION <- parseArgs(args,"diffbindversion=", 2, "as.numeric") # selected DiffBind version
FTARGETS <- parseArgs(args,"targets=","targets.txt") # file describing the targets
FCONTRASTS <- parseArgs(args,"contrasts=","contrasts_diffbind.txt") # file describing the contrasts
CWD <- parseArgs(args,"cwd=","./") # current working directory
BAMS <- parseArgs(args,"bams=",paste0(CWD, "/mapped")) # directory with the bam files
PEAKS <- parseArgs(args,"peaks=",paste0(CWD, "/results/macs2")) # directory with the peak files
OUT <- parseArgs(args,"out=", paste0(CWD, "/results")) # directory where the output files will go
FRAGSIZE <- parseArgs(args,"fragsize=", 200, "as.numeric")# fragment size
SUMMITS <- parseArgs(args,"summits=", 0, "as.numeric") # summits for re-centering consensus peaks
FILTER <- parseArgs(args,"filter=", 0, "as.numeric") # value to use for filtering intervals with low read counts
SUBSTRACTCONTROL <- parseArgs(args,"substractControl=", FALSE, "as.logical") # substract input
ANALYSISMETHOD <- parseArgs(args,"analysisMethod=", "DESeq2", "as.character") # method for which to normalize (either "DESeq2" or "edgeRGLM")
LIBRARYSIZE <- parseArgs(args,"librarySize=", "full", "as.character") # use total number of reads in bam for normalization (FALSE=only peaks)
FULLLIBRARYSIZE <- if(tolower(LIBRARYSIZE) %in% tolower(c("full", "background", "default", "true", "t"))) {TRUE} else {FALSE}
TAGWISEDISPERSION <- parseArgs(args,"tagwiseDispersion=", FALSE, "as.logical") # calculate dispersion tagwise (use FALSE if no replicates)
FDR_TRESHOLD <- parseArgs(args,"fdr_threshold=", 0.05, "as.numeric") # summits for re-centering consensus peaks
FOLD <- parseArgs(args,"fold=", 0, "as.numeric") # summits for re-centering consensus peaks
ANNOTATE <- parseArgs(args,"annotate=", FALSE, "as.logical") # annotate after DB analysis?
PE <- parseArgs(args,"pe=", FALSE, "as.logical") # paired end experiment?
TSS <- parseArgs(args,"tss=", "c(-3000,3000)", "run_custom_code") # region around the tss
TXDB <- parseArgs(args,"txdb=", "TxDb.Mmusculus.UCSC.mm9.knownGene") # Bioconductor transcript database, for annotation
ANNODB <- parseArgs(args,"annodb=", "org.Mm.eg.db") # Bioconductor gene annotation database
GENOMEDB <- parseArgs(args,"genomedb=", "mm10") # Bioconductor gene annotation database
runstr <- "Rscript diffbind.R [targets=targets.txt] [contrasts=contrasts.txt] [cwd=./] [bams=./mapped] [peaks=./results/macs2] [out=./results] [fragsize=200] [summits=0] [annotate=TRUE] [pe=TRUE] [tss=c(-3000,3000)] [txdb=TxDb.Mmusculus.UCSC.mm9.knownGene] [annodb=org.Mm.eg.db]"
if(!file.exists(CWD)) stop("Dir",CWD,"does NOT exist. Run with:\n",runstr)
setwd(CWD)
if(!file.exists(FTARGETS)) stop("File",FTARGETS,"does NOT exist. Run with:\n",runstr)
if(!file.exists(FCONTRASTS)) stop("File",FCONTRASTS,"does NOT exist. Run with:\n",runstr)
if(!file.exists(BAMS)) stop("Dir",BAMS,"does NOT exist. Run with:\n",runstr)
if(!file.exists(PEAKS)) stop("Dir",PEAKS,"does NOT exist. Run with:\n",runstr)
if(!is.numeric(FRAGSIZE)) stop("Fragment size not numeric. Run with:\n",runstr)
if(!is.numeric(SUMMITS)) stop("Summits is not numeric. Run with:\n",runstr)
if(!is.numeric(FILTER)) stop("Filter threshold is not numeric. Run with:\n",runstr)
if(!is.logical(SUBSTRACTCONTROL)) stop("substractControl not logical. Run with:\n",runstr)
if(!is.logical(FULLLIBRARYSIZE)) stop("fulllibrarySize not logical. Run with:\n",runstr)
if(!is.logical(TAGWISEDISPERSION)) stop("tagwiseDispersion not logical. Run with:\n",runstr)
if(!is.numeric(FDR_TRESHOLD)) stop("fdr threshold is not numeric. Run with:\n",runstr)
if(!is.numeric(FOLD)) stop("Fold threshold is not numeric. Run with:\n",runstr)
if(!is.logical(ANNOTATE)) stop("Annotate not logical. Run with:\n",runstr)
if(!is.logical(PE)) stop("Paired end (pe) not logical. Run with:\n",runstr)
if(ANNOTATE & !is.numeric(TSS)) stop("Region around TSS not numeric. Run with:\n",runstr)
if(ANNOTATE & !require(ANNODB, character.only=TRUE)) stop("Annotation DB", ANNODB, "not installed\n")
if(ANNOTATE) {
if(grepl("\\.gtf$", TXDB)){ # check the input format for the transcript annotation
library(GenomicFeatures)
txdb <- makeTxDbFromGFF(TXDB, format="gtf") # if the input format is gtf file, then this file will be used to create a TxDb object
} else {
if(!require(TXDB, character.only=TRUE)) stop("Transcript DB", TXDB, "not installed\n")
txdb <- eval(parse(text=TXDB)) # if the input format is bioconductor, then the transcript annoation library will be used
}
}
# check for DiffBind version
currentDiffbindVersion <- packageVersion('DiffBind')
DiffBindWarningText <- ""
if(currentDiffbindVersion < 3) {
warning(paste0("You are using an outdated DiffBind version: ", currentDiffbindVersion, ". Beginning with version 3, DiffBind has included new functionalities and modified default settings. This script is written for the older version. However, you may also consider using the newer version.\n"))
} else {
DiffBindWarningText <- "This script is made for compatibility with older DiffBind versions < 3 and may not run properly for the newer DiffBind version."
warning(paste0(DiffBindWarningText, " You are currently using DiffBind version ", currentDiffbindVersion, ". Please set ESSENTIAL_DIFFBIND_VERSION=", currentDiffbindVersion$major, " to use the appropriate DiffBind module.\n"))
}
# some function calls differ depending on DiffBind version
##
## make DB analysis
##
# load targets and make analysis
conts <- read.delim(FCONTRASTS, head=T, stringsAsFactors = F, comment.char="#")
targets <- read.delim(FTARGETS, head=T, colClasses="character", comment.char="#")
# determine file suffixes for targets
donefiles <- list.files(PEAKS,pattern=".done$")
bam_suffix <- sub("^[^\\.]*\\.", "", gsub("_macs2.done$", "", donefiles[1]))
bam_suffix <- paste0(bam_suffix, ".bam")
# check if blacklist filtering was applied
peakfiles <- list.files(PEAKS,pattern=".xls")
isBlacklistFilt <- any(grepl("blacklist_filtered", peakfiles))
peak_suffix <- if(isBlacklistFilt) {"_macs2_blacklist_filtered_peaks.xls"} else {"_macs2_peaks.xls"}
# check if any targets$INPUT is indicated as "none". If so, Peak calling was done without Input samples.
isInputNone <- any(tolower(targets$INPUT) == "none")
# create modified targets file for diffbind
targets <- data.frame(
SampleID= targets$IPname,
Condition= targets$group,
Replicate= targets$Replicate,
bamReads= paste0(BAMS, "/", targets$IP, ".", bam_suffix),
ControlID= targets$INPUTname,
bamControl= paste0(BAMS, "/", targets$INPUT, ".", bam_suffix),
Peaks= paste0(PEAKS, "/", targets$IPname, if(!isInputNone) {paste0(".vs.", targets$INPUTname)}, peak_suffix),
PeakCaller= targets$PeakCaller
)
if(isInputNone) {
targets$ControlID <- NULL
targets$bamControl <- NULL
}
# Construct DBA object
db <- dba(sampleSheet=targets, config=data.frame(AnalysisMethod=DBA_DESEQ2, fragmentSize=FRAGSIZE, th=FDR_TRESHOLD,
bCorPlot=F, singleEnd=!PE))
# create DBA object containing consensus peaks per group (needed later)
db2 <- dba.peakset(db, consensus=DBA_CONDITION)
# Heatmap using occupancy (peak caller score) data
png(paste0(OUT, "/heatmap_occupancy.png"), width = 150, height = 150, units = "mm", res=300)
dba.plotHeatmap(db, main="Correlation heatmap\noccupancy data")
dev.off()
# plot Overlap rate
png(paste0(OUT, "/Overlap_rate_plot.png"), width = 150, height = 150, units = "mm", res=300)
olap.rate <- dba.overlap(db, mode=DBA_OLAP_RATE)
plot(olap.rate, type='b', ylab='# peaks', xlab='Overlap at least this many peaksets', main="Overlap rate plot")
dev.off()
## process DBA object
if(SUMMITS>0) {
# bUseSummarizeOverlaps method is slower and memory hungry, mandatory only for PE data.
# If summits is set, bUseSummarizeOverlaps must be set to FALSE (in DiffBind version before 3)
db <- dba.count(db, bUseSummarizeOverlaps=F, score=DBA_SCORE_TMM_MINUS_FULL, filter=FILTER, summits=SUMMITS)
} else {
db <- dba.count(db, bUseSummarizeOverlaps=PE, score=DBA_SCORE_TMM_MINUS_FULL, filter=FILTER)
# for bUseSummarizeOverlaps=T dba.count must be called without summits (summits=0, FALSE, NULL or NA doesn't work)
}
# retrieve dba object info
infodb <- dba.show(db)
infodb$Intervals <- NULL
infodb$Caller <- NULL
# apply the contrasts
for (i in 1:nrow(conts)) {
# parse formula in cont
cat("\nIn diffbind2 the mmatrix column is ignored, group column is used as Condition\n")
cont.name <- conts[i,1]
cont.form <- conts[i,2]
factors <- gsub("(^\\s+|\\s+$)", "", unlist(strsplit(cont.form,"\\W")))
factors <- factors[factors != ""]
c1 <- dba.mask(db, DBA_CONDITION, factors[1])
c2 <- dba.mask(db, DBA_CONDITION, factors[2])
db <- dba.contrast(db, group1=c1, group2=c2, name1=factors[1], name2=factors[2], categories=DBA_CONDITION)
}
# run the diffbind analysis (DESeq2) for all contrasts
db <- dba.analyze(db, bSubControl=SUBSTRACTCONTROL, bFullLibrarySize=FULLLIBRARYSIZE, bTagwise=TAGWISEDISPERSION)
# PCA plot
png(paste0(OUT, "/pca_plot_all_samples.png"), width = 150, height = 150, units = "mm", res=300)
dba.plotPCA(db, DBA_CONDITION, label=DBA_ID)
dev.off()
# Heatmap using affinity (read count) data
png(paste0(OUT, "/heatmap_affinity_scores.png"), width = 150, height = 150, units = "mm", res=300)
dba.plotHeatmap(db, main="Correlation heatmap\naffinity scores")
dev.off()
# Boxplot of consensus peaks
lpeaks <- db2$peaks # db2 contains group consensus peaksets additional to the sample peaksets
dfcolnames <- colnames(lpeaks[[1]]) # Consensus peak dfs don't have colnames, therefore name columns
lpeaks <- lapply(1:length(lpeaks), function(x) setNames(lpeaks[[x]], dfcolnames) )
names(lpeaks) <- names(db2$masks$Consensus) # name list elements
names(lpeaks)[db2$masks$Consensus] <- paste0("Consensus\n", names(lpeaks)[db2$masks$Consensus])
lpeaks <- lapply(lpeaks, function(x) { # calculate peak sizes
x$width <- abs(x[,grep("end", dfcolnames, ignore.case = T)] - x[,grep("start", dfcolnames, ignore.case = T)]) # columns names may upper or lower case
return(x)
})
for(x in unique(db2$samples$Condition)) { # concatenate peak sizes of samples from the same group
lpeaks[[paste0("Replicates\n", x)]] <- dplyr::bind_rows(lpeaks[which(db2$samples$Condition==x)])
}
lpeaks <- lpeaks[grepl("(Consensus)|(Replicates)", names(lpeaks))] # samples not needed anymore
plotdata <- dplyr::bind_rows(lpeaks, .id="group") # create data.frame for plotting
png(paste0(OUT, "/peak_width_boxplot.png"), width = 150, height = 150, units = "mm", res=300)
try(ggplot(plotdata, aes(x=group, y=width, fill=group)) + geom_boxplot(show.legend = FALSE) +
theme(text = element_text(size=15), axis.text.x = element_text(angle=90, vjust=0.5)) +
labs(title="Size of consensus peaks") + theme(plot.title = element_text(hjust = 0.5)) )
dev.off()
# Venn diagram of all contrasts
png(paste0(OUT, "/venn_plot_all_contrasts.png"), width = 150, height = 150, units = "mm", res=300)
try(dba.plotVenn(db, main="Binding Site Overlaps Per Contrast", contrast=1:nrow(dba.show(db, bContrast=T))))
dev.off()
# prepare results and plots for each contrast
result <- lapply(1:nrow(conts), function(cont) {
cont.name <- conts[cont,1]
png(paste0(OUT, "/", cont.name, "_ma_plot.png"), width = 150, height = 150, units = "mm", res=300)
try(dba.plotMA(db, contrast=cont, fold=FOLD))
dev.off()
png(paste0(OUT, "/", cont.name, "_boxplot_diff_sites.png"), width = 150, height = 150, units = "mm", res=300)
try(dba.plotBox(db, contrast=cont)) # try, in case there are no significant peaks
dev.off()
png(paste0(OUT, "/", cont.name, "_volcano_plot.png"), width = 150, height = 150, units = "mm", res=300)
rep <- try(dba.report(db, contrast=cont, th=1)) # scale dot size by Conc (max dot size set to 4)
try(dba.plotVolcano(db, contrast=cont, fold=FOLD, dotSize=4*rep$Conc/max(rep$Conc)))
dev.off()
png(paste0(OUT, "/", cont.name, "_correlation_heatmap.png"), width = 150, height = 150, units = "mm", res=300)
try(dba.plotHeatmap(db, contrast=cont)) # try, in case there are no significant peaks
dev.off()
png(paste0(OUT, "/", cont.name, "_pca_plot.png"), width = 150, height = 150, units = "mm", res=300)
try(dba.plotPCA(db, contrast=cont, label=DBA_ID)) # try, in case there are no significant peaks
dev.off()
# plot consensus peaks
# 1st group name between diffbind versions
vennmask <- dba.mask(db, DBA_CONDITION, factor(dba.show(db, bContrast=T)[cont, grep("group", names(dba.show(db, bContrast=T)), ignore.case = T)]))
if(sum(vennmask) <= 4) { # if less than 2 replicates per group (or 4 replicates in total)
png(paste0(OUT, "/", cont.name, "_venn_plot.png"), width = 150, height = 150, units = "mm", res=300)
try(dba.plotVenn(db, mask=vennmask, main="Binding Site Overlaps Per Sample")) # plot all relevant samples together
dev.off()
} else { # generate a consensus peakset otherwise
vennmask2 <- dba.mask(db2, DBA_CONDITION, factor(dba.show(db2, bContrast=T)[cont, grep("group", names(dba.show(db2, bContrast=T)), ignore.case = T)])) # db2 see above
png(paste0(OUT, "/", cont.name, "_venn_plot.png"), width = 150, height = 150, units = "mm", res=300)
try(dba.plotVenn(db2, main="Binding Site Overlaps Per Group",
mask=db2$masks$Consensus & names(db2$masks$Consensus) %in% factor(dba.show(db, bContrast=T)[cont, grep("group", names(dba.show(db, bContrast=T)), ignore.case = T)]) ))
dev.off()
}
tryCatch(dba.report(db, contrast=cont, bCalled=T, bUsePval=F, th=1, fold=0), # th=db$config$th, fold=FOLD # filtering is done later
error=function(e) NULL) # dba.report crashes if there is exactly 1 significant hit to report
})
##
## Annotate peaks
##
if(ANNOTATE) {
library(ChIPseeker)
result <- lapply(result, function(x) {
tryCatch({
x.ann <- annotatePeak(x, TxDb=txdb, annoDb=ANNODB, tssRegion=TSS, verbose=T)
plotAnnoBar(x.ann)
plotDistToTSS(x.ann)
x.ann
},
error=function(e) NULL
)
})
}
# create overview table with diffbind settings
diffbindSettings <- rbind(c(Parameter="DiffBind package version", Value=as.character(currentDiffbindVersion), Comment= paste(if(currentDiffbindVersion$major != floor(DIFFBINDVERSION)) {paste0("Major version not concordant with intended DiffBind v", DIFFBINDVERSION, ". Please check R module." )} else {""}, DiffBindWarningText)),
c("Fragment size", FRAGSIZE, ""),
c("Summits", SUMMITS, if(SUMMITS==0) {"no re-centering of peaks."} else {paste0("re-center peaks around consensus summit with peak width 2x", SUMMITS, ".", if(PE) {"'bUseSummarizeOverlaps' was set to FALSE."} else {""})}),
c("Filter threshold", FILTER, "threshold for filtering intervals with low read counts"),
c("Analysis method", ANALYSISMETHOD, ""),
c("Full library size", FULLLIBRARYSIZE, if(FULLLIBRARYSIZE){"total number of reads used for normalization"} else {"reads overlapping consensus peaks used for normalization"}),
c("Subtract control", SUBSTRACTCONTROL, "for each site subtract read counts from input controls"),
c("Tagwise dispersion", TAGWISEDISPERSION, "calculate dispersion tagwise"),
c("Design", "~group", "DiffBind2 does not allow for complex designs, mmatrix in contrast_diffbind.txt is ignored"),
c("FDR threshold", FDR_TRESHOLD, "significance threshold for differential binding analysis"),
c("Fold threshold", FOLD, "log Fold threshold for differential binding analysis")
)
writeLines(capture.output(sessionInfo()),paste(OUT, "/diffbind_session_info.txt", sep=""))
write.table(diffbindSettings, file=file.path(OUT, "diffbind_settings.txt"), row.names = F, quote = F, sep="\t")
write.table(infodb, file=file.path(OUT, "info_dba_object.txt"), row.names = F, quote = F, sep="\t")
result <- lapply(result, as.data.frame)
names(result) <- conts$contrast.name
write.xlsx(result, file=paste0(OUT, "/diffbind_all_sites.xlsx"))
result <- lapply(result, function(x) {x[x$FDR<=db$config$th & abs(x$Fold)>=FOLD, ]}) # filter result tables for significance
write.xlsx(result, file=paste0(OUT, "/diffbind.xlsx"))
saveRDS(result, file=paste0(OUT, "/diffbind.rds"))
dba.save(db, dir=OUT, file='diffbind', pre="")