-
Notifications
You must be signed in to change notification settings - Fork 12
/
machinelearning-functions-ksvm.R
249 lines (232 loc) · 9.58 KB
/
machinelearning-functions-ksvm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
##' Classification parameter optimisation for the support vector
##' machine algorithm.
##'
##' Note that when performance scores precision, recall and (macro) F1
##' are calculated, any NA values are replaced by 0. This decision is
##' motivated by the fact that any class that would have either a NA
##' precision or recall would result in an NA F1 score and,
##' eventually, a NA macro F1 (i.e. mean(F1)). Replacing NAs by 0s
##' leads to F1 values of 0 and a reduced yet defined final macro F1
##' score.
##'
##' @title ksvm parameter optimisation
##' @param object An instance of class \code{"\linkS4class{MSnSet}"}.
##' @param fcol The feature meta-data containing marker definitions.
##' Default is \code{markers}.
##' @param cost The hyper-parameter. Default values are \code{2^-4:4}.
##' @param times The number of times internal cross-validation is performed.
##' Default is 100.
##' @param test.size The size of test data. Default is 0.2 (20 percent).
##' @param xval The \code{n}-cross validation. Default is 5.
##' @param fun The function used to summarise the \code{xval} macro F1 matrices.
##' @param seed The optional random number generator seed.
##' @param verbose A \code{logical} defining whether a progress bar is displayed.
##' @param ... Additional parameters passed to \code{\link{ksvm}} from package \code{kernlab}.
##' @return An instance of class \code{"\linkS4class{GenRegRes}"}.
##' @seealso \code{\link{ksvmClassification}} and example therein.
##' @aliases ksvmRegularisation ksvmOptimization
##' @author Laurent Gatto
ksvmOptimisation <- function(object,
                             fcol = "markers",
                             cost = 2^(-4:4),
                             times = 100,
                             test.size = .2,
                             xval = 5,
                             fun = mean,
                             seed,
                             verbose = TRUE,
                             ...) {
    ## Number of hyper-parameters tuned over the grid: ksvm only tunes
    ## 'cost', hence 1 (sibling algorithms in this family use 2).
    nparams <- 1 ## 2 or 1, depending on the algorithm
    ## Package-internal helper -- presumably extracts the
    ## marker-annotated (labelled) features of the MSnSet as a
    ## data.frame with a 'markers' column (TODO confirm).
    mydata <- subsetAsDataFrame(object, fcol, train = TRUE)
    ## Draw a reproducible seed when none is supplied, so the exact
    ## run can be replayed from the stored GenRegRes@seed.
    if (missing(seed)) {
        seed <- sample(.Machine$integer.max, 1)
    }
    .seed <- as.integer(seed)
    set.seed(.seed)
    ## ksvm() prints messages while fitting; redirect stdout into a
    ## throw-away file around each call (see sink() pairs below).
    tmp <- tempfile() ## to get rid of ksvm message
    con <- file(tmp, open = "wt")
    on.exit({
        close(con)
        unlink(tmp)
    })
    ## initialise output
    ## NOTE(review): .warnings is never modified in this function; it
    ## only feeds the optional log slot at the bottom -- verify whether
    ## a helper was expected to append to it.
    .warnings <- NULL
    .f1Matrices <- vector("list", length = times)
    .testPartitions <- .cmMatrices <- vector("list", length = times) ## NEW
    ## One row per outer iteration: the macro F1 obtained on the
    ## held-out test set plus the cost value that produced it.
    .results <- matrix(NA, nrow = times, ncol = nparams + 1)
    colnames(.results) <- c("F1", "cost")
    if (verbose) {
        pb <- txtProgressBar(min = 0,
                             max = xval * times,
                             style = 3)
        ._k <- 0
    }
    ## Outer loop: repeated stratified hold-out splits.
    for (.times in 1:times) {
        ## Per-class test-set sizes, proportional to class frequency.
        .size <- ceiling(table(mydata$markers) * test.size)
        ## size is ordered according to levels, but strata
        ## expects them to be ordered as they appear in the data
        .size <- .size[unique(mydata$markers)]
        ## Stratified simple random sampling without replacement
        ## (sampling::strata) to pick the 'unseen' test indices.
        test.idx <- strata(mydata, "markers",
                           size = .size,
                           method = "srswor")$ID_unit
        .testPartitions[[.times]] <- test.idx ## NEW
        .test1  <- mydata[ test.idx, ] ## 'unseen' test set
        .train1 <- mydata[-test.idx, ] ## to be used for parameter optimisation
        ## Inner xval-fold partition of the training data
        ## (caret::createFolds, returnTrain = TRUE gives train indices).
        xfolds <- createFolds(.train1$markers, xval, returnTrain = TRUE)
        ## stores the xval F1 matrices
        .matrixF1L <- vector("list", length = xval)
        ## Inner loop: cross-validated grid search over 'cost'.
        for (.xval in 1:xval) {
            if (verbose) {
                setTxtProgressBar(pb, ._k)
                ._k <- ._k + 1
            }
            .train2 <- .train1[ xfolds[[.xval]], ]
            .test2  <- .train1[-xfolds[[.xval]], ]
            ## The second argument in makeF1matrix will be
            ## used as rows, the first one for columns
            .matrixF1 <- makeF1matrix(list(cost = cost))
            ## grid search for parameter selection
            for (.cost in cost) {
                sink(con) ## silence ksvm's fitting output
                model <- ksvm(markers ~ ., .train2, C = .cost, ...)
                sink()
                ans <- kernlab::predict(model, .test2, type = "response")
                conf <- confusionMatrix(ans, .test2$markers)$table
                ## checkNumbers is package-internal -- presumably it
                ## sanitises NA/NaN scores (TODO confirm).
                .p <- checkNumbers(MLInterfaces:::.precision(conf))
                .r <- checkNumbers(MLInterfaces:::.recall(conf))
                ## naAs0 = TRUE implements the NA-to-0 policy described
                ## in the roxygen details above.
                .f1 <- MLInterfaces:::.macroF1(.p, .r, naAs0 = TRUE)
                .matrixF1[1, as.character(.cost)] <- .f1
            }
            ## we have a complete grid to be saved
            .matrixF1L[[.xval]] <- .matrixF1
        }
        ## we have xval grids to be summerised
        .summaryF1 <- summariseMatList(.matrixF1L, fun)
        .f1Matrices[[.times]] <- .summaryF1
        .bestParams <- getBestParams(.summaryF1)[1:nparams, 1] ## takes a random best param
        ## Refit on the full training partition with the winning cost
        ## and score it once on the untouched test set.
        sink(con)
        model <- ksvm(markers ~ ., .train1, C = .bestParams["cost"], ...)
        sink()
        ans <- kernlab::predict(model, .test1, type = "response")
        .cmMatrices[[.times]] <- conf <- confusionMatrix(ans, .test1$markers)$table ## NEW
        p <- checkNumbers(MLInterfaces:::.precision(conf),
                          tag = "precision", params = .bestParams)
        r <- checkNumbers(MLInterfaces:::.recall(conf),
                          tag = "recall", params = .bestParams)
        f1 <- MLInterfaces:::.macroF1(p, r, naAs0 = TRUE) ## macro F1 score for .time's iteration
        .results[.times, ] <- c(f1, .bestParams["cost"])
    }
    if (verbose) {
        setTxtProgressBar(pb, ._k)
        close(pb)
    }
    .hyperparams <- list(cost = cost)
    .design <- c(xval = xval,
                 test.size = test.size,
                 times = times)
    ## Bundle everything into the GenRegRes result object. The
    ## datasize entries record the dimensions of the *last* iteration's
    ## partitions (train1/test1/train2/test2 from the final loop pass).
    ans <- new("GenRegRes",
               algorithm = "ksvm",
               seed = .seed,
               hyperparameters = .hyperparams,
               design = .design,
               results = .results,
               f1Matrices = .f1Matrices,
               cmMatrices = .cmMatrices, ## NEW
               testPartitions = .testPartitions, ## NEW
               datasize = list(
                   "data" = dim(mydata),
                   "data.markers" = table(mydata[, "markers"]),
                   "train1" = dim(.train1),
                   "test1" = dim(.test1),
                   "train1.markers" = table(.train1[, "markers"]),
                   "train2" = dim(.train2),
                   "test2" = dim(.test2),
                   "train2.markers" = table(.train2[, "markers"])))
    if (!is.null(.warnings)) {
        ans@log <- list(warnings = .warnings)
        sapply(.warnings, warning)
    }
    return(ans)
}
## American-spelling alias, exported alongside ksvmOptimisation
## (see the @aliases roxygen tag above the main definition).
ksvmOptimization <-
    ksvmOptimisation
##' Deprecated entry point kept for backwards compatibility; use
##' \code{ksvmOptimisation} instead. All arguments are forwarded
##' unchanged.
ksvmRegularisation <- function(...) {
    ## Signal the standard deprecation condition first, then delegate.
    .Deprecated(msg = "This function has been replaced by 'ksvmOptimisation'.")
    ksvmOptimisation(...)
}
##' Classification using the support vector
##' machine algorithm.
##'
##' @title ksvm classification
##' @param object An instance of class \code{"\linkS4class{MSnSet}"}.
##' @param assessRes An instance of class
##' \code{"\linkS4class{GenRegRes}"}, as generated by
##' \code{\link{ksvmOptimisation}}.
##' @param scores One of \code{"prediction"}, \code{"all"} or
##' \code{"none"} to report the score for the predicted class
##' only, for all classes or none.
##' @param cost If \code{assessRes} is missing, a \code{cost} must be
##' provided.
##' @param fcol The feature meta-data containing marker definitions.
##' Default is \code{markers}.
##' @param ... Additional parameters passed to \code{\link{ksvm}} from
##' package \code{kernlab}.
##' @return An instance of class \code{"\linkS4class{MSnSet}"} with
##' \code{ksvm} and \code{ksvm.scores} feature variables storing
##' the classification results and scores respectively.
##' @author Laurent Gatto
##' @aliases ksvmPrediction
##' @examples
##' library(pRolocdata)
##' data(dunkley2006)
##' ## reducing parameter search space and iterations
##' params <- ksvmOptimisation(dunkley2006, cost = 2^seq(-1,4,5), times = 3)
##' params
##' plot(params)
##' f1Count(params)
##' levelPlot(params)
##' getParams(params)
##' res <- ksvmClassification(dunkley2006, params)
##' getPredictions(res, fcol = "ksvm")
##' getPredictions(res, fcol = "ksvm", t = 0.75)
##' plot2D(res, fcol = "ksvm")
ksvmClassification <- function(object,
                               assessRes,
                               scores = c("prediction", "all", "none"),
                               cost,
                               fcol = "markers",
                               ...) {
    scores <- match.arg(scores)
    ## Resolve the cost hyper-parameter: either taken from a previous
    ## ksvmOptimisation run, or supplied explicitly by the user.
    if (missing(assessRes)) {
        if (missing(cost))
            stop("First run 'ksvmOptimisation' or set 'cost' manually.")
        params <- c("cost" = cost)
    } else {
        params <- getParams(assessRes)
        if (is.na(params["cost"]))
            stop("No 'cost' found.")
    }
    ## Labelled features (anything not "unknown" in fcol) form the
    ## training set; MLearn then predicts the remaining features.
    trainInd <- which(fData(object)[, fcol] != "unknown")
    form <- as.formula(paste0(fcol, " ~ ."))
    ans <- MLearn(form, t(object), ksvmI, trainInd,
                  C = params["cost"], ...)
    fData(object)$ksvm <- predictions(ans)
    if (scores == "all") {
        scoreMat <- predScores(ans)
        colnames(scoreMat) <- paste0(colnames(scoreMat), ".ksvm.scores")
        ## Fixed: was 'svm.all.scores', a copy/paste slip from the svm
        ## variant -- every other feature variable here uses the 'ksvm'
        ## prefix (see 'ksvm', 'ksvm.scores' and the colname suffix).
        fData(object)$ksvm.all.scores <- scoreMat
    } else if (scores == "prediction") {
        fData(object)$ksvm.scores <- predScore(ans)
    } ## else scores is "none"
    ## Record the prediction step and its parameters in the MSnSet's
    ## processing log.
    object@processingData@processing <- c(processingData(object)@processing,
                                          paste0("Performed ksvm prediction (",
                                                 paste(paste(names(params), params, sep = "="),
                                                       collapse = " "), ") ",
                                                 date()))
    if (validObject(object))
        return(object)
}
##' Deprecated entry point kept for backwards compatibility; use
##' \code{ksvmClassification} instead. All arguments are forwarded
##' unchanged.
ksvmPrediction <- function(...) {
    ## Signal the standard deprecation condition first, then delegate.
    .Deprecated(msg = "This function has been replaced by 'ksvmClassification'.")
    ksvmClassification(...)
}