In [None]:
# Show the current date and time so the run can be identified later.
format(x = Sys.time(), format = "%a %b %d %X %Y")

In [None]:
# Start from a clean workspace: drop every object (hidden ones included),
# then run a full garbage collection.
rm(list = ls(all.names = TRUE))
gc(verbose = FALSE, full = TRUE)

In [None]:
# Paso 1: Instalación de paquetes necesarios
if (!require("lightgbm")) install.packages("lightgbm")
if (!require("mlrMBO")) install.packages("mlrMBO")
if (!require("data.table")) install.packages("data.table")
if (!require("pROC")) install.packages("pROC")

library(lightgbm)
library(mlrMBO)
library(data.table)
library(parallel)
library(smoof)
library(mlr)
library(pROC)


In [None]:
# Step 2: quick exploratory look - number of rows per species.
iris_dt <- as.data.table(iris)
print(iris_dt[, .(N = .N), by = "Species"])

In [None]:
# Step 3: baseline dataset (no duplicated rows).
# The binary target is 1 for virginica and 0 for the other species; the
# Species column is removed afterwards so only the measurements remain.
iris_base <- copy(iris_dt)[
  , target := as.integer(Species == "virginica")
][
  , Species := NULL
]

In [None]:
# Step 4: augmented dataset with label noise.
# Every positive (virginica) row is appended a second time with its label
# set to 0, so identical feature vectors appear with both labels.
iris_dup <- copy(iris_dt)[, target := as.integer(Species == "virginica")]
iris_pos <- iris_dup[target == 1L]

iris_neg_copies <- copy(iris_pos)
iris_neg_copies[, target := 0L]

iris_augmented <- rbindlist(list(iris_dup, iris_neg_copies))
iris_augmented[, Species := NULL]

# Reference point for the elapsed-time report printed at the end of the script.
start_time <- Sys.time()

In [None]:
# Step 5: training and evaluation function

#' Tune, train and evaluate a LightGBM binary classifier on one dataset.
#'
#' The rows of `data` are split 70/30 into a training and a test part. On the
#' training part, `learning_rate`, `num_leaves` and `min_data_in_leaf` are
#' tuned with mlrMBO, maximising the best 5-fold cross-validated AUC reported
#' by `lgb.cv()`. A final model is then fitted on the whole training part with
#' the selected hyperparameters and scored on the held-out test part.
#'
#' @param data A data.table containing a 0/1 column named `target`; every
#'   other column is used as a feature.
#' @param seed Seed passed to `set.seed()` before the train/test split.
#' @param iters Number of mlrMBO iterations.
#' @return A list with `run` (the object returned by `mbo()`), `auc` (pROC
#'   AUC on the test part), `predictions` (test-part predictions) and `test`
#'   (the test rows, including `target`).
train_evaluate <- function(data, seed = 123, iters = 10) {
  stopifnot("target" %in% names(data))
  if (!requireNamespace("pROC", quietly = TRUE)) {
    stop("Package 'pROC' is required to compute the test AUC.", call. = FALSE)
  }

  # 70/30 split. seq_len() and setdiff() stay correct even when the sampled
  # index is empty, where `1:n` and negative indexing would misbehave.
  set.seed(seed)
  n_rows <- nrow(data)
  idx_train <- sample(seq_len(n_rows), floor(0.7 * n_rows))
  idx_test <- setdiff(seq_len(n_rows), idx_train)
  train <- data[idx_train]
  test <- data[idx_test]

  feature_cols <- setdiff(names(data), "target")
  dtrain <- lgb.Dataset(
    data = data.matrix(train[, feature_cols, with = FALSE]),
    label = train$target
  )

  # Fixed LightGBM settings shared by the search and by the final model.
  param_base <- list(objective = "binary",
                     metric = "auc",
                     verbosity = -1,
                     feature_pre_filter = FALSE)

  # Search space explored by the Bayesian optimiser.
  ps <- makeParamSet(
    makeNumericParam("learning_rate", lower = 0.01, upper = 0.2),
    makeIntegerParam("num_leaves", lower = 10L, upper = 200L),
    makeIntegerParam("min_data_in_leaf", lower = 5L, upper = 100L)
  )

  # Objective: best cross-validated AUC for one hyperparameter candidate.
  obj_fun <- makeSingleObjectiveFunction(
    fn = function(x) {
      param <- modifyList(param_base, as.list(x))
      # Messages and warnings from lgb.cv() are silenced on purpose to keep
      # the notebook output readable.
      cv <- suppressMessages(suppressWarnings(
        lgb.cv(params = param,
               data = dtrain,
               nfold = 5,
               nrounds = 100,
               early_stopping_rounds = 10,
               verbose = -1)
      ))
      max(unlist(cv$record_evals$valid$auc$eval))
    },
    par.set = ps,
    minimize = FALSE
  )

  # Optimiser setup: `iters` iterations, expected-improvement infill and a
  # kriging surrogate that also predicts a standard error.
  ctrl <- makeMBOControl()
  ctrl <- setMBOControlTermination(ctrl, iters = iters)
  ctrl <- setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI())
  surrogate <- makeLearner("regr.km", predict.type = "se", config = list(on.par.without.desc = "quiet"))

  run <- suppressMessages(suppressWarnings(mbo(obj_fun, learner = surrogate, control = ctrl)))

  # Final model on the full training part with the selected hyperparameters.
  best_params <- modifyList(param_base, as.list(run$x))
  dbest <- suppressMessages(suppressWarnings(
    lgb.train(params = best_params, data = dtrain, nrounds = 100)
  ))

  y_pred <- predict(dbest, data.matrix(test[, feature_cols, with = FALSE]))

  # Fix the class order (0 = control, 1 = case) and the direction (cases are
  # expected to score higher). With pROC's default direction = "auto" the
  # curve is flipped whenever controls score higher, so a model worse than
  # random would still be reported with an AUC above 0.5.
  auc_final <- pROC::auc(test$target, y_pred,
                         levels = c(0, 1), direction = "<", quiet = TRUE)

  list(run = run, auc = auc_final, predictions = y_pred, test = test)
}


In [None]:
# Step 6: run both experiments.
# First the baseline dataset, with 30 optimisation iterations.
res_base <- train_evaluate(data = iris_base, iters = 30)

In [None]:
# Same experiment on the dataset with the relabelled duplicate rows.
res_augm <- train_evaluate(data = iris_augmented, iters = 30)

In [None]:
# Summary table: selected hyperparameters and AUC figures for each experiment.
# `auc_train` holds the best cross-validated AUC found on the training split
# during the hyperparameter search (the objective value stored in `run$y`).
# It used to be filled with the test-set AUC, which made it an exact copy of
# `auc_test`.
params_log <- data.table(
  experimento = c("original", "duplicado"),
  learning_rate = c(res_base$run$x$learning_rate, res_augm$run$x$learning_rate),
  num_leaves = c(res_base$run$x$num_leaves, res_augm$run$x$num_leaves),
  min_data_in_leaf = c(res_base$run$x$min_data_in_leaf, res_augm$run$x$min_data_in_leaf),
  auc_train = c(as.numeric(res_base$run$y), as.numeric(res_augm$run$y))
)

# Test-set AUC from the stored predictions. The class order and direction are
# fixed so that a model worse than random is not flipped above 0.5 by pROC's
# automatic direction detection. pROC is called through its namespace so the
# result does not depend on which package's `auc` is first on the search path.
pred_base_final <- res_base$predictions
pred_augm_final <- res_augm$predictions
auc_base_final <- pROC::auc(res_base$test$target, pred_base_final,
                            levels = c(0, 1), direction = "<", quiet = TRUE)
auc_augm_final <- pROC::auc(res_augm$test$target, pred_augm_final,
                            levels = c(0, 1), direction = "<", quiet = TRUE)

params_log[, auc_test := c(as.numeric(auc_base_final), as.numeric(auc_augm_final))]
print(params_log)

# Persist the summary next to the notebook.
fwrite(params_log, "resumen_resultados.csv")

# Elapsed time since `start_time` was recorded.
end_time <- Sys.time()
cat("Tiempo total de ejecución:", round(difftime(end_time, start_time, units = "secs"), 2), "segundos\n")