In [1]:
format(Sys.time(), "%a %b %d %X %Y")

In [2]:
# limpio la memoria
rm(list=ls(all.names=TRUE)) # remove all objects
gc(full=TRUE, verbose=FALSE) # garbage collection

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,656952,35.1,1439211,76.9,1439211,76.9
Vcells,1225107,9.4,8388608,64.0,1924961,14.7


In [3]:
# Paso 1: Instalación de paquetes necesarios
if (!require("lightgbm")) install.packages("lightgbm")
if (!require("mlrMBO")) install.packages("mlrMBO")
if (!require("data.table")) install.packages("data.table")

library(lightgbm)
library(mlrMBO)
library(data.table)
library(parallel)
library(smoof)
library(mlr)


Loading required package: lightgbm

Loading required package: mlrMBO

Loading required package: mlr

Loading required package: ParamHelpers

Loading required package: smoof

Loading required package: checkmate

Loading required package: data.table



In [4]:
# Paso 2: Análisis exploratorio simple
iris_dt <- as.data.table(iris)
print(iris_dt[, .N, by = Species])

      Species     N
       <fctr> <int>
1:     setosa    50
2: versicolor    50
3:  virginica    50


In [5]:
# Paso 3: Dataset sin duplicar (baseline)
iris_base <- copy(iris_dt)
iris_base[, target := ifelse(Species == "virginica", 1L, 0L)]
iris_base[, Species := NULL]

In [6]:
# Paso 4: Dataset duplicado (con ruido negativo)
iris_dup <- copy(iris_dt)
iris_dup[, target := ifelse(Species == "virginica", 1L, 0L)]
iris_pos <- iris_dup[target == 1L]
iris_neg_copies <- copy(iris_pos)[, target := 0L]
iris_augmented <- rbindlist(list(iris_dup, iris_neg_copies))
iris_augmented[, Species := NULL]

start_time <- Sys.time()

In [7]:
# Paso 5: Función de entrenamiento y evaluación
train_evaluate <- function(data, seed = 123, iters = 10) {
  set.seed(seed)
  idx_train <- sample(1:nrow(data), 0.7 * nrow(data))
  train <- data[idx_train]
  test <- data[-idx_train]

  dtrain <- lgb.Dataset(data = data.matrix(train[, !"target", with = FALSE]), label = train$target)

  param_base <- list(objective = "binary",
                     metric = "auc",
                     verbosity = -1,
                     feature_pre_filter = FALSE)

  ps <- makeParamSet(
    makeNumericParam("learning_rate", lower = 0.01, upper = 0.2),
    makeIntegerParam("num_leaves", lower = 10L, upper = 200L),
    makeIntegerParam("min_data_in_leaf", lower = 5L, upper = 100L)
  )

  obj_fun <- makeSingleObjectiveFunction(
    fn = function(x) {
      x <- as.list(x)
      param <- modifyList(param_base, x)
      suppressMessages(suppressWarnings({
        cv <- lgb.cv(params = param,
                     data = dtrain,
                     nfold = 5,
                     nrounds = 100,
                     early_stopping_rounds = 10,
                     verbose = -1)
        return(max(unlist(cv$record_evals$valid$auc$eval)))
      }))
    },
    par.set = ps,
    minimize = FALSE
  )

  ctrl <- makeMBOControl()
  ctrl <- setMBOControlTermination(ctrl, iters = iters)
  ctrl <- setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI())
  surrogate <- makeLearner("regr.km", predict.type = "se", config = list(on.par.without.desc = "quiet"))

  run <- suppressMessages(suppressWarnings(mbo(obj_fun, learner = surrogate, control = ctrl)))

  dbest <- suppressMessages(suppressWarnings(lgb.train(params = modifyList(param_base, as.list(run$x)),
                                                       data = dtrain,
                                                       nrounds = 100)))

  y_pred <- predict(dbest, data.matrix(test[, !"target", with = FALSE]))
  if (!require("pROC")) install.packages("pROC")
  library(pROC)
  auc_final <- auc(test$target, y_pred)

  list(run = run, auc = auc_final, predictions = y_pred, test = test)
}


In [8]:
# Paso 6: Ejecutar ambos experimentos
res_base <- train_evaluate(iris_base, iters = 30)


optimisation start
------------------
* estimation method   : MLE 
* optimisation method : BFGS 
* analytical gradient : used
* trend model : ~1
* covariance model : 
  - type :  matern5_2 
  - nugget : NO
  - parameters lower bounds :  1e-10 1e-10 1e-10 
  - parameters upper bounds :  0.3634012 346 184 
  - best initial criterion value(s) :  7.781818 

N = 3, M = 5 machine precision = 2.22045e-16
At X0, 0 variables are exactly at the bounds
At iterate     0  f=      -7.7818  |proj g|=     0.089329
At iterate     1  f =      -7.8513  |proj g|=      0.094818
At iterate     2  f =      -8.0839  |proj g|=      0.070258
At iterate     3  f =      -8.2248  |proj g|=     0.0083914
At iterate     4  f =       -8.226  |proj g|=     0.0062749
At iterate     5  f =      -8.2263  |proj g|=     0.0061795
At iterate     6  f =       -8.227  |proj g|=     0.0060759
At iterate     7  f =      -8.2288  |proj g|=     0.0068454
At iterate     8  f =      -8.2387  |proj g|=      0.017997
At iterate     

Loading required package: pROC

Type 'citation("pROC")' for a citation.


Attaching package: ‘pROC’


The following objects are masked from ‘package:stats’:

    cov, smooth, var


Setting levels: control = 0, case = 1

Setting direction: controls < cases



In [9]:
res_augm <- train_evaluate(iris_augmented, iters = 30)


optimisation start
------------------
* estimation method   : MLE 
* optimisation method : BFGS 
* analytical gradient : used
* trend model : ~1
* covariance model : 
  - type :  matern5_2 
  - nugget : NO
  - parameters lower bounds :  1e-10 1e-10 1e-10 
  - parameters upper bounds :  0.3441321 350 184 
  - best initial criterion value(s) :  11.12934 

N = 3, M = 5 machine precision = 2.22045e-16
At X0, 0 variables are exactly at the bounds
At iterate     0  f=      -11.129  |proj g|=      0.36188
At iterate     1  f =      -11.325  |proj g|=        0.3869
ys=-7.615e-02  -gs= 1.572e-01, BFGS update SKIPPED
At iterate     2  f =      -14.221  |proj g|=       0.25808
At iterate     3  f =      -14.685  |proj g|=       0.10861
At iterate     4  f =       -14.75  |proj g|=      0.040846
At iterate     5  f =      -14.759  |proj g|=     0.0041089
At iterate     6  f =      -14.759  |proj g|=     0.0041106
At iterate     7  f =       -14.76  |proj g|=     0.0052991
At iterate     8  f =   

Setting levels: control = 0, case = 1

Setting direction: controls < cases



In [10]:
params_log <- data.table(
  experimento = c("original", "duplicado"),
  learning_rate = c(res_base$run$x$learning_rate, res_augm$run$x$learning_rate),
  num_leaves = c(res_base$run$x$num_leaves, res_augm$run$x$num_leaves),
  min_data_in_leaf = c(res_base$run$x$min_data_in_leaf, res_augm$run$x$min_data_in_leaf),
  auc_train = c(as.numeric(res_base$auc), as.numeric(res_augm$auc))
)

pred_base_final <- res_base$predictions
pred_augm_final <- res_augm$predictions
auc_base_final <- auc(res_base$test$target, pred_base_final)
auc_augm_final <- auc(res_augm$test$target, pred_augm_final)

params_log[, auc_test := c(as.numeric(auc_base_final), as.numeric(auc_augm_final))]
print(params_log)

fwrite(params_log, "resumen_resultados.csv")

end_time <- Sys.time()
cat("Tiempo total de ejecución:", round(difftime(end_time, start_time, units = "secs"), 2), "segundos\n")

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases



   experimento learning_rate num_leaves min_data_in_leaf auc_train  auc_test
        <char>         <num>      <int>            <int>     <num>     <num>
1:    original     0.1999929        143               39 0.9735577 0.9735577
2:   duplicado     0.0416827        200               48 0.8403110 0.8403110
Tiempo total de ejecución: 46.61 segundos
