Skip to content

Commit

Permalink
tune impact boundary
Browse files Browse the repository at this point in the history
  • Loading branch information
ja-thomas committed May 7, 2018
1 parent 8277d81 commit 43ba777
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 34 deletions.
48 changes: 29 additions & 19 deletions R/autoxgboost.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@
#' Default is \code{4/5}.
#' @param design.size [\code{integer(1)}]\cr
#' Size of the initial design. Default is \code{15L}.
#' @param impact.encoding.boundary [\code{integer(1)}]\cr
#' Defines the threshold on how factor variables are handled. Factors with more levels than the \code{"impact.encoding.boundary"} get impact encoded while factor variables with less or equal levels than the \code{"impact.encoding.boundary"} get dummy encoded.
#' @param upper.impact.encoding.boundary [\code{integer(1)}]\cr
#' Defines the upper bound for tuning the threshold on how factor variables are handled. Factors with more levels than the \code{"impact.encoding.boundary"} get impact encoded while factor variables with less or equal levels than the \code{"impact.encoding.boundary"} get dummy encoded.
#' The boundary itself is tuned as an integer parameter between \code{1} and \code{upper.impact.encoding.boundary}: for a tuned boundary of \code{1}, (almost) all factor variables get impact encoded, while for a sufficiently large boundary all of them get dummy encoded.
#' Default is \code{10L}.
#' @param mbo.learner [\code{\link[mlr]{Learner}}]\cr
Expand All @@ -62,7 +62,7 @@
#' }
autoxgboost = function(task, measure = NULL, control = NULL, iterations = 160L, time.budget = 3600L,
par.set = NULL, max.nrounds = 10^6, early.stopping.rounds = 10L, early.stopping.fraction = 4/5,
build.final.model = TRUE, design.size = 15L, impact.encoding.boundary = 10L, mbo.learner = NULL,
build.final.model = TRUE, design.size = 15L, upper.impact.encoding.boundary = 10L, mbo.learner = NULL,
nthread = NULL, tune.threshold = TRUE) {


Expand All @@ -78,7 +78,7 @@ autoxgboost = function(task, measure = NULL, control = NULL, iterations = 160L,
assertNumeric(early.stopping.fraction, lower = 0, upper = 1, len = 1L)
assertFlag(build.final.model)
assertIntegerish(design.size, lower = 1L, len = 1L)
assertIntegerish(impact.encoding.boundary, lower = 0, len = 1L)
assertIntegerish(upper.impact.encoding.boundary, lower = 0, len = 1L)
assertIntegerish(nthread, lower = 1, len = 1L, null.ok = TRUE)
assertFlag(tune.threshold)

Expand Down Expand Up @@ -129,36 +129,39 @@ autoxgboost = function(task, measure = NULL, control = NULL, iterations = 160L,
stop("Task must be regression or classification")
}

# Create pipeline

preproc.pipeline = NULLCPO

#if (!is.null(task$feature.information$timestamps))
# preproc.pipeline %<>>% cpoExtractTimeStampInformation(affect.names = unlist(task$feature.information$timestamps))
if (has.cat.feats) {
preproc.pipeline %<>>% generateCatFeatPipeline(task, impact.encoding.boundary)
par.set = c(par.set, makeParamSet(makeIntegerParam("impact.encoding.boundary", lower = 1, upper = upper.impact.encoding.boundary)))
}

preproc.pipeline %<>>% cpoDropConstants()


# process data and apply pipeline

# split early stopping data
rinst = makeResampleInstance(makeResampleDesc("Holdout", split = early.stopping.fraction), task)
task.test = subsetTask(task, rinst$test.inds[[1]])
task.train = subsetTask(task, rinst$train.inds[[1]])

task.train %<>>% preproc.pipeline
task.test %<>>% retrafo(task.train)
base.learner = setHyperPars(base.learner, early.stopping.data = task.test)

# Optimize

opt = smoof::makeSingleObjectiveFunction(name = "optimizeWrapper",
fn = function(x) {

# Create pipeline

preproc.pipeline = NULLCPO

#if (!is.null(task$feature.information$timestamps))
# preproc.pipeline %<>>% cpoExtractTimeStampInformation(affect.names = unlist(task$feature.information$timestamps))
if (has.cat.feats) {
preproc.pipeline %<>>% generateCatFeatPipeline(task, x$impact.encoding.boundary)
x$impact.encoding.boundary = NULL
}

preproc.pipeline %<>>% cpoDropConstants()
task.train %<>>% preproc.pipeline
task.test %<>>% retrafo(task.train)

x = x[!vlapply(x, is.na)]
lrn = setHyperPars(base.learner, par.vals = x)
lrn = setHyperPars(base.learner, early.stopping.data = task.test, par.vals = x)
mod = train(lrn, task.train)
pred = predict(mod, task.test)
nrounds = getBestIteration(mod)
Expand All @@ -181,6 +184,13 @@ autoxgboost = function(task, measure = NULL, control = NULL, iterations = 160L,

optim.result = mbo(fun = opt, control = control, design = des, learner = mbo.learner)

preproc.pipeline = NULLCPO

if (has.cat.feats) {
preproc.pipeline %<>>% generateCatFeatPipeline(task, optim.result$x$impact.encoding.boundary)
}

preproc.pipeline %<>>% cpoDropConstants()

lrn = buildFinalLearner(optim.result, objective, predict.type, par.set = par.set, preproc.pipeline = preproc.pipeline)

Expand Down
2 changes: 1 addition & 1 deletion R/buildFinalLearner.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Create xgboost learner based on the optimization result
buildFinalLearner = function(optim.result, objective, predict.type = NULL, par.set, preproc.pipeline) {

nrounds = getBestNrounds(optim.result)
pars = trafoValue(par.set, optim.result$x)
pars$impact.encoding.boundary = NULL
pars = pars[!vlapply(pars, is.na)]
lrn = if (!is.null(predict.type)) {
makeLearner("classif.xgboost.custom", nrounds = nrounds, objective = objective,
Expand Down
8 changes: 4 additions & 4 deletions man/autoxgboost.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 10 additions & 10 deletions tests/testthat/test_autoxgboost.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
checkAutoxgboost = function(task, build.final.model, impact.encoding.boundary, control, mbo.learner, tune.threshold) {
checkAutoxgboost = function(task, build.final.model, upper.impact.encoding.boundary, control, mbo.learner, tune.threshold) {
r = autoxgboost(task, build.final.model = build.final.model, max.nrounds = 1L,
impact.encoding.boundary = impact.encoding.boundary, control = control,
upper.impact.encoding.boundary = upper.impact.encoding.boundary, control = control,
mbo.learner = mbo.learner, nthread = 1, tune.threshold = tune.threshold)
td = getTaskDesc(task)

Expand Down Expand Up @@ -35,9 +35,9 @@ test_that("autoxgboost works on different tasks", {
iris.fac
)

for (im in c(0L, 2L, .Machine$integer.max)) {
for (im in c(1L, 2L, 10L)) {
for (t in tasks) {
checkAutoxgboost(task = t, build.final.model = TRUE, impact.encoding.boundary = im,
checkAutoxgboost(task = t, build.final.model = TRUE, upper.impact.encoding.boundary = im,
control = ctrl, mbo.learner = mbo.learner, tune.threshold = FALSE)
}
}
Expand All @@ -46,32 +46,32 @@ test_that("autoxgboost works on different tasks", {

context("Thresholds")
test_that("autoxgboost thresholding works", {
checkAutoxgboost(task = sonar.task, build.final.model = TRUE, impact.encoding.boundary = .Machine$integer.max,
checkAutoxgboost(task = sonar.task, build.final.model = TRUE, upper.impact.encoding.boundary = .Machine$integer.max,
control = ctrl, mbo.learner = mbo.learner, tune.threshold = TRUE)
#FIXME: Wait for faster multiclass threshold tuning in mlr
#checkAutoxgboost(task = iris.task, build.final.model = TRUE, impact.encoding.boundary = .Machine$integer.max,
#checkAutoxgboost(task = iris.task, build.final.model = TRUE, upper.impact.encoding.boundary = .Machine$integer.max,
# control = ctrl, mbo.learner = mbo.learner, tune.threshold = TRUE)
})

#context("Weights")
#test_that("weights work", {
# iris.weighted = makeClassifTask(data = iris, target = "Species", weights = sample(c(1,20), 150, replace = TRUE))
# bh.weighted = makeRegrTask(data = getTaskData(bh.task)[1:50, -4], target = "medv", weights = sample(c(1,20), 50, replace = TRUE))
# checkAutoxgboost(task = iris.weighted, build.final.model = FALSE, mbo.learner = mbo.learner, impact.encoding.boundary = .Machine$integer.max, control = ctrl, tune.threshold = FALSE)
# checkAutoxgboost(task = bh.weighted, build.final.model = FALSE, mbo.learner = mbo.learner, impact.encoding.boundary = .Machine$integer.max, control = ctrl, tune.threshold = FALSE)
# checkAutoxgboost(task = iris.weighted, build.final.model = FALSE, mbo.learner = mbo.learner, upper.impact.encoding.boundary = .Machine$integer.max, control = ctrl, tune.threshold = FALSE)
# checkAutoxgboost(task = bh.weighted, build.final.model = FALSE, mbo.learner = mbo.learner, upper.impact.encoding.boundary = .Machine$integer.max, control = ctrl, tune.threshold = FALSE)
#})

#context("Timestamps")
#test_that("Timestamps work", {
# iris.time = addFeatureInformation(iris.time, "timestamps", "time1")
# checkAutoxgboost(task = iris.time, build.final.model = TRUE, impact.encoding.boundary = .Machine$integer.max,
# checkAutoxgboost(task = iris.time, build.final.model = TRUE, upper.impact.encoding.boundary = .Machine$integer.max,
# control = ctrl, mbo.learner = mbo.learner, tune.threshold = FALSE)
#})
#
#context("Featurehashing")
#test_that("Featurehashing work", {
# iris.fac = addFeatureInformation(iris.fac, "categ.featuresets", c("bla", "bla2"))
# checkAutoxgboost(task = iris.fac, build.final.model = TRUE, impact.encoding.boundary = .Machine$integer.max,
# checkAutoxgboost(task = iris.fac, build.final.model = TRUE, upper.impact.encoding.boundary = .Machine$integer.max,
# control = ctrl, mbo.learner = mbo.learner, tune.threshold = FALSE)
#})

Expand Down

0 comments on commit 43ba777

Please sign in to comment.