-
Notifications
You must be signed in to change notification settings - Fork 32
/
segmentationGATK4.R
145 lines (142 loc) · 6.37 KB
/
segmentationGATK4.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#' GATK4 ModelSegments segmentation function
#'
#' A wrapper for GATK4s ModelSegmentation function, useful when normalization
#' is performed with other tools than GATK4, for example PureCN.
#' This function is called via the
#' \code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}. The
#' arguments are passed via \code{args.segmentation}.
#'
#'
#' @param normal Coverage data for normal sample. Ignored in this function.
#' @param tumor Coverage data for tumor sample.
#' @param log.ratio Copy number log-ratios, one for each exon in coverage file.
#' @param seg If segmentation was provided by the user, this data structure
#' will contain this segmentation. Useful for minimal segmentation functions.
#' Otherwise PureCN will re-segment the data. This segmentation function
#' ignores this user provided segmentation.
#' @param vcf Optional \code{CollapsedVCF} object with germline allelic ratios.
#' @param tumor.id.in.vcf Id of tumor in case multiple samples are stored in
#' VCF.
#' @param normal.id.in.vcf Id of normal in in VCF. Currently not used.
#' @param min.logr.sdev Minimum log-ratio standard deviation used in the
#' model. Useful to make fitting more robust to outliers in very clean
#' data.
#' @param prune.hclust.h Ignored in this function.
#' @param prune.hclust.method Ignored in this function.
#' @param changepoints.penality The \code{--number-of-changepoints-penalty-factor}.
#' If \code{NULL}, find a sensible default. Ignored when provided in
#' \code{additional.cmd.args}.
#' @param additional.cmd.args \code{character(1)}. By default,
#' \code{ModelSegments} is called with default parameters. Provide additional
#' arguments here.
#' @param chr.hash Not needed here since \code{ModelSegments} does not
#' require numbered chromosome names.
#' @param ... Currently unused arguments provided to other segmentation
#' functions.
#' @return \code{data.frame} containing the segmentation.
#' @author Markus Riester
#'
#' @seealso \code{\link{runAbsoluteCN}}
#' @examples
#'
#' normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt",
#' package="PureCN")
#' tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt",
#' package="PureCN")
#' vcf.file <- system.file("extdata", "example.vcf.gz",
#' package="PureCN")
#'
#' # The max.candidate.solutions, max.ploidy and test.purity parameters are set to
#' # non-default values to speed-up this example. This is not a good idea for real
#' # samples.
#'\dontrun{
#' ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file,
#' tumor.coverage.file=tumor.coverage.file, vcf.file=vcf.file,
#' sampleid="Sample1", genome="hg19",
#' fun.segmentation = segmentationGATK4, max.ploidy=4,
#' args.segmentation = list(additional.cmd.args = "--gcs-max-retries 19"),
#' test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1)
#'}
#'
#' @importFrom utils compareVersion
#' @export segmentationGATK4
segmentationGATK4 <- function(normal, tumor, log.ratio, seg,
vcf = NULL, tumor.id.in.vcf = 1, normal.id.in.vcf = NULL,
min.logr.sdev = 0.15, prune.hclust.h = NULL, prune.hclust.method = NULL,
changepoints.penality = NULL, additional.cmd.args = "",
chr.hash = NULL, ...) {
min.version <- "4.1.7.0"
if (.checkGATK4Version(min.version) < 0) {
.stopUserError("Unable to find a recent (>= %s) gatk binary.",
min.version)
}
.checkParametersSegmentation(alpha = NULL, undo.SD = NULL,
max.segments = NULL, min.logr.sdev = min.logr.sdev,
prune.hclust.h = prune.hclust.h)
output.vcf.file <- NULL
if (!is.null(vcf)) {
output.vcf.file <- tempfile(fileext = ".tsv")
.writeAllelicCountsFileGatk(vcf, tumor.id.in.vcf, output.vcf.file)
}
if (length(log.ratio) != length(tumor)) {
.stopRuntimeError("tumor and log.ratio do not align in segmentationGATK4.")
}
output.lr.file <- tempfile(fileext = ".tsv")
# following .writeLogRatioFileGATK4 needs the GRanges information, so
# simply update it (should be the same though!)
tumor$log.ratio <- log.ratio
.writeLogRatioFileGATK4(list(vcf=vcf, log.ratio = tumor), tumor.id.in.vcf, output.lr.file)
output.dir <- tempdir()
sampleid <- .getSampleIdFromVcf(vcf, tumor.id.in.vcf)
changepoints.arg <- ""
if (!grepl("number-of-changepoints-penalty-factor",
additional.cmd.args)[1]) {
if (is.null(changepoints.penality)) {
changepoints.penality <- .getSDundo(log.ratio, min.logr.sdev)
}
flog.info("Setting --number-of-changepoints-penalty-factor to %f.",
changepoints.penality)
changepoints.arg <- paste("--number-of-changepoints-penalty-factor",
changepoints.penality)
}
args <- paste("ModelSegments",
"--allelic-counts", output.vcf.file,
"--denoised-copy-ratios", output.lr.file,
"--output-prefix", "tumor",
changepoints.arg,
additional.cmd.args,
"-O", output.dir)
output <- try(system2("gatk", args, stderr = TRUE, stdout = TRUE))
if (is(output, "try-error")) {
.stopUserError("GATK4 ModelSegments runtime error: ",
output, "\nArguments: ", args)
}
seg.file <- file.path(output.dir, "tumor.modelFinal.seg")
seg <- readSegmentationFile(seg.file, sampleid)
idx.enough.markers <- seg$num.mark > 1
rownames(seg) <- NULL
attr(seg, "ModelSegments.Output") <- readChar(seg.file, file.info(seg.file)$size)
attr(seg, "ModelSegments.Log") <- output
attr(seg, "ModelSegments.Args") <- args
file.remove(output.vcf.file)
file.remove(output.lr.file)
seg[idx.enough.markers,]
}
.getSampleIdFromVcf <- function(vcf, tumor.id.in.vcf) {
if (is(tumor.id.in.vcf, "character") &&
tumor.id.in.vcf %in% samples(header(vcf))) return(tumor.id.in.vcf)
if (is(tumor.id.in.vcf, "numeric") &&
length(samples(header(vcf))) <= tumor.id.in.vcf) {
return(samples(header(vcf))[tumor.id.in.vcf])
}
return(NA)
}
.checkGATK4Version <- function(min.version) {
output <- try(system2("gatk", "--version", stdout=TRUE), silent = TRUE)
if (is(output, "try-error")) {
flog.warn("Cannot find gatk binary in path.")
return(-1)
}
prefix <- "The Genome Analysis Toolkit (GATK) v"
compareVersion(gsub(prefix, "", output[grep(prefix, output, fixed = TRUE)], fixed = TRUE), min.version)
}