Skip to content

Commit

Permalink
tune impact boundary
Browse files Browse the repository at this point in the history
  • Loading branch information
ja-thomas committed May 7, 2018
1 parent 8277d81 commit 43ba777
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 34 deletions.
48 changes: 29 additions & 19 deletions R/autoxgboost.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@
#' Default is \code{4/5}.
#' @param design.size [\code{integer(1)}]\cr
#' Size of the initial design. Default is \code{15L}.
#' @param impact.encoding.boundary [\code{integer(1)}]\cr
#' Defines the threshold on how factor variables are handled. Factors with more levels than the \code{"impact.encoding.boundary"} get impact encoded while factor variables with less or equal levels than the \code{"impact.encoding.boundary"} get dummy encoded.
#' @param upper.impact.encoding.boundary [\code{integer(1)}]\cr
#' Defines the upper bound for tuning the threshold on how factor variables are handled. Factors with more levels than the \code{"impact.encoding.boundary"} get impact encoded while factor variables with less or equal levels than the \code{"impact.encoding.boundary"} get dummy encoded.
#' The boundary itself is tuned as an integer parameter between \code{1} and \code{upper.impact.encoding.boundary}: for a tuned boundary of \code{1}, (almost) all factor variables get impact encoded, while for a sufficiently large boundary all of them get dummy encoded.
#' Default is \code{10L}.
#' @param mbo.learner [\code{\link[mlr]{Learner}}]\cr
Expand All @@ -62,7 +62,7 @@
#' }
autoxgboost = function(task, measure = NULL, control = NULL, iterations = 160L, time.budget = 3600L,
par.set = NULL, max.nrounds = 10^6, early.stopping.rounds = 10L, early.stopping.fraction = 4/5,
build.final.model = TRUE, design.size = 15L, impact.encoding.boundary = 10L, mbo.learner = NULL,
build.final.model = TRUE, design.size = 15L, upper.impact.encoding.boundary = 10L, mbo.learner = NULL,
nthread = NULL, tune.threshold = TRUE) {


Expand All @@ -78,7 +78,7 @@ autoxgboost = function(task, measure = NULL, control = NULL, iterations = 160L,
assertNumeric(early.stopping.fraction, lower = 0, upper = 1, len = 1L)
assertFlag(build.final.model)
assertIntegerish(design.size, lower = 1L, len = 1L)
assertIntegerish(impact.encoding.boundary, lower = 0, len = 1L)
assertIntegerish(upper.impact.encoding.boundary, lower = 0, len = 1L)
assertIntegerish(nthread, lower = 1, len = 1L, null.ok = TRUE)
assertFlag(tune.threshold)

Expand Down Expand Up @@ -129,36 +129,39 @@ autoxgboost = function(task, measure = NULL, control = NULL, iterations = 160L,
stop("Task must be regression or classification")
}

# Create pipeline

preproc.pipeline = NULLCPO

#if (!is.null(task$feature.information$timestamps))
# preproc.pipeline %<>>% cpoExtractTimeStampInformation(affect.names = unlist(task$feature.information$timestamps))
if (has.cat.feats) {
preproc.pipeline %<>>% generateCatFeatPipeline(task, impact.encoding.boundary)
par.set = c(par.set, makeParamSet(makeIntegerParam("impact.encoding.boundary", lower = 1, upper = upper.impact.encoding.boundary)))
}

preproc.pipeline %<>>% cpoDropConstants()


# process data and apply pipeline

# split early stopping data
rinst = makeResampleInstance(makeResampleDesc("Holdout", split = early.stopping.fraction), task)
task.test = subsetTask(task, rinst$test.inds[[1]])
task.train = subsetTask(task, rinst$train.inds[[1]])

task.train %<>>% preproc.pipeline
task.test %<>>% retrafo(task.train)
base.learner = setHyperPars(base.learner, early.stopping.data = task.test)

# Optimize

opt = smoof::makeSingleObjectiveFunction(name = "optimizeWrapper",
fn = function(x) {

# Create pipeline

preproc.pipeline = NULLCPO

#if (!is.null(task$feature.information$timestamps))
# preproc.pipeline %<>>% cpoExtractTimeStampInformation(affect.names = unlist(task$feature.information$timestamps))
if (has.cat.feats) {
preproc.pipeline %<>>% generateCatFeatPipeline(task, x$impact.encoding.boundary)
x$impact.encoding.boundary = NULL
}

preproc.pipeline %<>>% cpoDropConstants()
task.train %<>>% preproc.pipeline
task.test %<>>% retrafo(task.train)

x = x[!vlapply(x, is.na)]
lrn = setHyperPars(base.learner, par.vals = x)
lrn = setHyperPars(base.learner, early.stopping.data = task.test, par.vals = x)
mod = train(lrn, task.train)
pred = predict(mod, task.test)
nrounds = getBestIteration(mod)
Expand All @@ -181,6 +184,13 @@ autoxgboost = function(task, measure = NULL, control = NULL, iterations = 160L,

optim.result = mbo(fun = opt, control = control, design = des, learner = mbo.learner)

preproc.pipeline = NULLCPO

if (has.cat.feats) {
preproc.pipeline %<>>% generateCatFeatPipeline(task, optim.result$x$impact.encoding.boundary)
}

preproc.pipeline %<>>% cpoDropConstants()

lrn = buildFinalLearner(optim.result, objective, predict.type, par.set = par.set, preproc.pipeline = preproc.pipeline)

Expand Down
2 changes: 1 addition & 1 deletion R/buildFinalLearner.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Create xgboost learner based on the optimization result
buildFinalLearner = function(optim.result, objective, predict.type = NULL, par.set, preproc.pipeline) {

nrounds = getBestNrounds(optim.result)
pars = trafoValue(par.set, optim.result$x)
pars$impact.encoding.boundary = NULL
pars = pars[!vlapply(pars, is.na)]
lrn = if (!is.null(predict.type)) {
makeLearner("classif.xgboost.custom", nrounds = nrounds, objective = objective,
Expand Down
8 changes: 4 additions & 4 deletions man/autoxgboost.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 10 additions & 10 deletions tests/testthat/test_autoxgboost.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
checkAutoxgboost = function(task, build.final.model, impact.encoding.boundary, control, mbo.learner, tune.threshold) {
checkAutoxgboost = function(task, build.final.model, upper.impact.encoding.boundary, control, mbo.learner, tune.threshold) {
r = autoxgboost(task, build.final.model = build.final.model, max.nrounds = 1L,
impact.encoding.boundary = impact.encoding.boundary, control = control,
upper.impact.encoding.boundary = upper.impact.encoding.boundary, control = control,
mbo.learner = mbo.learner, nthread = 1, tune.threshold = tune.threshold)
td = getTaskDesc(task)

Expand Down Expand Up @@ -35,9 +35,9 @@ test_that("autoxgboost works on different tasks", {
iris.fac
)

for (im in c(0L, 2L, .Machine$integer.max)) {
for (im in c(1L, 2L, 10L)) {
for (t in tasks) {
checkAutoxgboost(task = t, build.final.model = TRUE, impact.encoding.boundary = im,
checkAutoxgboost(task = t, build.final.model = TRUE, upper.impact.encoding.boundary = im,
control = ctrl, mbo.learner = mbo.learner, tune.threshold = FALSE)
}
}
Expand All @@ -46,32 +46,32 @@ test_that("autoxgboost works on different tasks", {

context("Thresholds")
test_that("autoxgboost thresholding works", {
checkAutoxgboost(task = sonar.task, build.final.model = TRUE, impact.encoding.boundary = .Machine$integer.max,
checkAutoxgboost(task = sonar.task, build.final.model = TRUE, upper.impact.encoding.boundary = .Machine$integer.max,
control = ctrl, mbo.learner = mbo.learner, tune.threshold = TRUE)
#FIXME: Wait for faster multiclass threshold tuning in mlr
#checkAutoxgboost(task = iris.task, build.final.model = TRUE, impact.encoding.boundary = .Machine$integer.max,
#checkAutoxgboost(task = iris.task, build.final.model = TRUE, upper.impact.encoding.boundary = .Machine$integer.max,
# control = ctrl, mbo.learner = mbo.learner, tune.threshold = TRUE)
})

#context("Weights")
#test_that("weights work", {
# iris.weighted = makeClassifTask(data = iris, target = "Species", weights = sample(c(1,20), 150, replace = TRUE))
# bh.weighted = makeRegrTask(data = getTaskData(bh.task)[1:50, -4], target = "medv", weights = sample(c(1,20), 50, replace = TRUE))
# checkAutoxgboost(task = iris.weighted, build.final.model = FALSE, mbo.learner = mbo.learner, impact.encoding.boundary = .Machine$integer.max, control = ctrl, tune.threshold = FALSE)
# checkAutoxgboost(task = bh.weighted, build.final.model = FALSE, mbo.learner = mbo.learner, impact.encoding.boundary = .Machine$integer.max, control = ctrl, tune.threshold = FALSE)
# checkAutoxgboost(task = iris.weighted, build.final.model = FALSE, mbo.learner = mbo.learner, upper.impact.encoding.boundary = .Machine$integer.max, control = ctrl, tune.threshold = FALSE)
# checkAutoxgboost(task = bh.weighted, build.final.model = FALSE, mbo.learner = mbo.learner, upper.impact.encoding.boundary = .Machine$integer.max, control = ctrl, tune.threshold = FALSE)
#})

#context("Timestamps")
#test_that("Timestamps work", {
# iris.time = addFeatureInformation(iris.time, "timestamps", "time1")
# checkAutoxgboost(task = iris.time, build.final.model = TRUE, impact.encoding.boundary = .Machine$integer.max,
# checkAutoxgboost(task = iris.time, build.final.model = TRUE, upper.impact.encoding.boundary = .Machine$integer.max,
# control = ctrl, mbo.learner = mbo.learner, tune.threshold = FALSE)
#})
#
#context("Featurehashing")
#test_that("Featurehashing work", {
# iris.fac = addFeatureInformation(iris.fac, "categ.featuresets", c("bla", "bla2"))
# checkAutoxgboost(task = iris.fac, build.final.model = TRUE, impact.encoding.boundary = .Machine$integer.max,
# checkAutoxgboost(task = iris.fac, build.final.model = TRUE, upper.impact.encoding.boundary = .Machine$integer.max,
# control = ctrl, mbo.learner = mbo.learner, tune.threshold = FALSE)
#})

Expand Down

0 comments on commit 43ba777

Please sign in to comment.