In [17]:
format(Sys.time(), "%a %b %d %X %Y")

In [18]:
# limpio la memoria
rm(list=ls(all.names=TRUE)) # remove all objects
gc(full=TRUE, verbose=FALSE) # garbage collection

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,2011519,107.5,2978672,159.1,2978672,159.1
Vcells,3556634,27.2,41189062,314.3,51486327,392.9


In [19]:
PARAM <- list()
PARAM$experimento <- 50002

In [20]:
# Paso 1: Instalación de paquetes necesarios
if (!require("lightgbm")) install.packages("lightgbm")
if (!require("mlrMBO")) install.packages("mlrMBO")
if (!require("data.table")) install.packages("data.table")
if (!require("DiceKriging")) install.packages("DiceKriging")

library(DiceKriging)
library(lightgbm)
library(mlrMBO)
library(data.table)
library(parallel)
library(smoof)
library(mlr)


In [21]:
# carpeta de trabajo
setwd("/content/buckets/b1/exp")
experimento_folder <- paste0("Cuantico", PARAM$experimento)
dir.create(experimento_folder, showWarnings=FALSE)
setwd( paste0("/content/buckets/b1/exp/", experimento_folder ))

In [22]:
df <- fread("/content/datasets/creditcard.csv")

df[, Class := as.integer(Class)]
cat("Distribución original (positivo = fraude = 1):\n")
print(df[, .N, by = Class])

Distribución original (positivo = fraude = 1):
   Class      N
   <int>  <int>
1:     0 284315
2:     1    492


In [23]:
# Separar train/test estratificado (1 sola vez)
set.seed(123)
df[, idx := .I]
train_idx <- df[, .SD[sample(.N * 0.7)], by = Class]$idx
train <- df[J(train_idx), on = "idx"]
test  <- df[!J(train_idx), on = "idx"]

In [24]:
# Columnas predictoras
features <- setdiff(names(train), c("Class", "idx"))

In [25]:
optimizar_y_entrenar <- function(dataset, nombre_escenario) {
  dataset <- copy(dataset)
  dataset[, idx := NULL]
  
  dtrain <- lgb.Dataset(data = data.matrix(dataset[, ..features]), label = dataset$Class)
  
  param_base <- list(objective="binary", metric="auc", verbosity=-1, feature_pre_filter=FALSE)
  ps <- makeParamSet(
    makeNumericParam("learning_rate", lower=0.01, upper=0.2),
    makeIntegerParam("num_leaves", lower=10L, upper=100L),
    makeIntegerParam("min_data_in_leaf", lower=5L, upper=50L)
  )
  
  obj_fun <- makeSingleObjectiveFunction(
    fn = function(x) {
      x <- as.list(x)
      param <- modifyList(param_base, x)
      cv <- lgb.cv(params=param, data=dtrain, nfold=3, nrounds=100, early_stopping_rounds=10, verbose=-1)
      return(max(unlist(cv$record_evals$valid$auc$eval)))
    },
    par.set = ps, minimize = FALSE
  )
  
  ctrl <- makeMBOControl()
  ctrl <- setMBOControlTermination(ctrl, iters=30)
  ctrl <- setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI())
  surrogate <- makeLearner("regr.km", predict.type="se")
  
  run <- suppressMessages(suppressWarnings(mbo(obj_fun, learner=surrogate, control=ctrl)))
  
  # Entrenar modelo final con mejores hiperparámetros
  best_param <- modifyList(param_base, as.list(run$x))
  modelo <- lgb.train(params = best_param, data = dtrain, nrounds = 100, verbose = -1)
  
  # Predecir en test
  yhat <- predict(modelo, data.matrix(test[, ..features]))
  auc_test <- auc(test$Class, yhat)
  
  return(data.table(
    escenario = nombre_escenario,
    learning_rate = run$x$learning_rate,
    num_leaves = run$x$num_leaves,
    min_data_in_leaf = run$x$min_data_in_leaf,
    auc_test = as.numeric(auc_test)
  ))
}


In [26]:
# Escenario original
res_original <- optimizar_y_entrenar(train, "original")

# Escenario con duplicación de positivos como negativos
duplicados <- copy(train[Class == 1])
duplicados[, Class := 0]
train_duplicado <- rbind(train, duplicados)

res_duplicado <- optimizar_y_entrenar(train_duplicado, "duplicado")

# Combinar resultados
resumen <- rbind(res_original, res_duplicado)
print(resumen)



optimisation start
------------------
* estimation method   : MLE 
* optimisation method : BFGS 
* analytical gradient : used
* trend model : ~1
* covariance model : 
  - type :  matern5_2 
  - nugget : NO
  - parameters lower bounds :  1e-10 1e-10 1e-10 
  - parameters upper bounds :  0.3419455 160 82 
  - best initial criterion value(s) :  37.85172 

N = 3, M = 5 machine precision = 2.22045e-16
At X0, 0 variables are exactly at the bounds
At iterate     0  f=      -37.852  |proj g|=      0.13559
At iterate     1  f =      -38.073  |proj g|=       0.11303
At iterate     2  f =      -38.181  |proj g|=      0.099643
At iterate     3  f =      -38.288  |proj g|=       0.29402
At iterate     4  f =      -38.307  |proj g|=       0.15182
At iterate     5  f =      -38.307  |proj g|=      0.035843
At iterate     6  f =      -38.307  |proj g|=      0.018412
At iterate     7  f =      -38.307  |proj g|=      0.018504
At iterate     8  f =      -38.307  |proj g|=      0.060052
At iterate     9

Setting levels: control = 0, case = 1

Setting direction: controls < cases




optimisation start
------------------
* estimation method   : MLE 
* optimisation method : BFGS 
* analytical gradient : used
* trend model : ~1
* covariance model : 
  - type :  matern5_2 
  - nugget : NO
  - parameters lower bounds :  1e-10 1e-10 1e-10 
  - parameters upper bounds :  0.3389201 166 86 
  - best initial criterion value(s) :  17.07944 

N = 3, M = 5 machine precision = 2.22045e-16
At X0, 0 variables are exactly at the bounds
At iterate     0  f=      -17.079  |proj g|=       0.2843
At iterate     1  f =      -17.225  |proj g|=        0.2695
At iterate     2  f =       -17.42  |proj g|=       0.15584
At iterate     3  f =      -17.548  |proj g|=      0.051774
At iterate     4  f =      -17.569  |proj g|=      0.013955
At iterate     5  f =      -17.572  |proj g|=      0.013345
At iterate     6  f =      -17.572  |proj g|=      0.013305
At iterate     7  f =      -17.572  |proj g|=      0.013245
At iterate     8  f =      -17.574  |proj g|=      0.013152
At iterate     9

Setting levels: control = 0, case = 1

Setting direction: controls < cases



   escenario learning_rate num_leaves min_data_in_leaf  auc_test
      <char>         <num>      <int>            <int>     <num>
1:  original    0.01862937         10               16 0.9262816
2: duplicado    0.01000595         30               46 0.9568304
