In [16]:
# Paso 1: Instalación de paquetes necesarios
if (!require("lightgbm")) install.packages("lightgbm")
if (!require("mlrMBO")) install.packages("mlrMBO")
if (!require("data.table")) install.packages("data.table")

library(lightgbm)
library(mlrMBO)
library(data.table)
library(parallel)
library(smoof)
library(mlr)
library(ggplot2)

In [17]:
# Paso 2: Análisis exploratorio simple
iris_dt <- as.data.table(iris)
print(iris_dt[, .N, by = Species])

      Species     N
       <fctr> <int>
1:     setosa    50
2: versicolor    50
3:  virginica    50


In [18]:
# Paso 3: Dataset sin duplicar (baseline)
iris_base <- copy(iris_dt)
iris_base[, target := ifelse(Species == "virginica", 1L, 0L)]
iris_base[, Species := NULL]

In [19]:
# Paso 4: Dataset duplicado (con ruido negativo)
iris_dup <- copy(iris_dt)
iris_dup[, target := ifelse(Species == "virginica", 1L, 0L)]
iris_pos <- iris_dup[target == 1L]
iris_neg_copies <- copy(iris_pos)[, target := 0L]
iris_augmented <- rbindlist(list(iris_dup, iris_neg_copies))
iris_augmented[, Species := NULL]

In [20]:
start_time <- Sys.time()

# Paso 5: Función de entrenamiento y evaluación
train_evaluate <- function(data, seed = 123, iters = 10) {
  set.seed(seed)
  idx_train <- sample(1:nrow(data), 0.7 * nrow(data))
  train <- data[idx_train]
  test <- data[-idx_train]

  dtrain <- lgb.Dataset(data = data.matrix(train[, !"target", with = FALSE]), label = train$target)

  param_base <- list(objective = "binary",
                     metric = "auc",
                     verbosity = -1,
                     feature_pre_filter = FALSE)

  ps <- makeParamSet(
    makeNumericParam("learning_rate", lower = 0.01, upper = 0.2),
    makeIntegerParam("num_leaves", lower = 10L, upper = 100L),
    makeIntegerParam("min_data_in_leaf", lower = 5L, upper = 50L)
  )

  obj_fun <- makeSingleObjectiveFunction(
    fn = function(x) {
      x <- as.list(x)
      param <- modifyList(param_base, x)
      cv <- suppressMessages(lgb.cv(params = param,
                   data = dtrain,
                   nfold = 3,
                   nrounds = 100,
                   early_stopping_rounds = 10,
                   verbose = -1))
      return(max(unlist(cv$record_evals$valid$auc$eval)))
    },
    par.set = ps,
    minimize = FALSE
  )

  ctrl <- makeMBOControl()
  ctrl <- setMBOControlTermination(ctrl, iters = iters)
  ctrl <- setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI())
  surrogate <- makeLearner("regr.km", predict.type = "se")

  run <- suppressMessages(mbo(obj_fun, learner = surrogate, control = ctrl))

  dbest <- suppressMessages(lgb.train(params = modifyList(param_base, as.list(run$x)),
                     data = dtrain,
                     nrounds = 100))

  y_pred <- predict(dbest, data.matrix(test[, !"target", with = FALSE]))
  if (!require("pROC")) install.packages("pROC")
  library(pROC)
  auc_final <- auc(test$target, y_pred)

  list(run = run, auc = auc_final, predictions = y_pred, test = test)
}


In [21]:
# Paso 6: Ejecutar ambos experimentos
res_base <- train_evaluate(iris_base, iters = 20)
res_augm <- train_evaluate(iris_augmented, iters = 20)


optimisation start
------------------
* estimation method   : MLE 
* optimisation method : BFGS 
* analytical gradient : used
* trend model : ~1
* covariance model : 
  - type :  matern5_2 
  - nugget : NO
  - parameters lower bounds :  1e-10 1e-10 1e-10 
  - parameters upper bounds :  0.3634012 164 88 
  - best initial criterion value(s) :  8.538131 

N = 3, M = 5 machine precision = 2.22045e-16
At X0, 0 variables are exactly at the bounds
At iterate     0  f=      -8.5381  |proj g|=     0.084911
At iterate     1  f =      -8.5576  |proj g|=      0.081257
At iterate     2  f =      -8.6204  |proj g|=       0.04403
At iterate     3  f =      -8.6506  |proj g|=     0.0074206
At iterate     4  f =      -8.6516  |proj g|=      0.002045
At iterate     5  f =      -8.6517  |proj g|=     0.0020186
At iterate     6  f =      -8.6517  |proj g|=     0.0020076
At iterate     7  f =      -8.6517  |proj g|=     0.0019586
At iterate     8  f =      -8.6519  |proj g|=     0.0025045
At iterate     9

Loading required package: pROC

Type 'citation("pROC")' for a citation.


Attaching package: ‘pROC’


The following objects are masked from ‘package:stats’:

    cov, smooth, var


Setting levels: control = 0, case = 1

Setting direction: controls < cases




optimisation start
------------------
* estimation method   : MLE 
* optimisation method : BFGS 
* analytical gradient : used
* trend model : ~1
* covariance model : 
  - type :  matern5_2 
  - nugget : NO
  - parameters lower bounds :  1e-10 1e-10 1e-10 
  - parameters upper bounds :  0.3441321 166 88 
  - best initial criterion value(s) :  13.90651 

N = 3, M = 5 machine precision = 2.22045e-16
At X0, 0 variables are exactly at the bounds
At iterate     0  f=      -13.907  |proj g|=      0.35349
At iterate     1  f =      -14.201  |proj g|=       0.38557
At iterate     2  f =      -15.018  |proj g|=       0.14109
At iterate     3  f =      -15.043  |proj g|=      0.093375
At iterate     4  f =      -15.063  |proj g|=     0.0043872
At iterate     5  f =      -15.063  |proj g|=     0.0043728
At iterate     6  f =      -15.064  |proj g|=     0.0069968
At iterate     7  f =      -15.065  |proj g|=      0.016004
At iterate     8  f =      -15.067  |proj g|=       0.03172
At iterate     9

Setting levels: control = 0, case = 1

Setting direction: controls < cases



In [22]:
# Paso 7: Comparación visual y resumen
# Guardar resultados como tabla CSV
results_summary <- data.table(
  experimento = c("original", "duplicado"),
  auc = c(as.numeric(res_base$auc), as.numeric(res_augm$auc))
)
fwrite(results_summary, "auc_resultados.csv")
cat("AUC dataset original:", res_base$auc, "\n")
cat("AUC dataset duplicado:", res_augm$auc, "\n")

AUC dataset original: 0.9855769 
AUC dataset duplicado: 0.7739234 


In [23]:
# Gráfica ROC del duplicado
ggroc <- ggroc(list(Original = roc_base, Duplicado = roc_dup))
ggplot(ggroc) +
  geom_line(aes(color = name), size = 1.2) +
  scale_color_manual(values = c("Original" = "red", "Duplicado" = "blue")) +
  labs(title = "Curvas ROC - Comparación", x = "1 - Especificidad", y = "Sensibilidad") +
  theme_minimal() +
  theme(legend.title = element_blank())

ggsave("curvas_roc_comparacion.png", width = 7, height = 5)

ERROR: Error: object 'roc_base' not found
