In [2]:
# ============================================================
# Setup
# ============================================================
set.seed(123)

suppressPackageStartupMessages({
  library(data.table)
  library(xgboost)
  library(caret)
  library(pROC)
  library(PRROC)
  library(ggplot2)
})

In [3]:
# ============================================================
# 1) Datos: baseline y utilidades
# ============================================================
iris_dt <- as.data.table(iris)
iris_dt[, target := as.integer(Species == "virginica")]
iris_dt[, Species := NULL]
feature_cols <- setdiff(names(iris_dt)[sapply(iris_dt, is.numeric)], "target")

In [4]:
# ============================================================
# 2) Augmentación con grupos
# ============================================================
augment_negatives <- function(dt, n_extra = 50L, noise_sd = 0.15, weight_aug = 0.5, cols = NULL) {
  if (is.null(cols)) cols <- setdiff(names(dt)[sapply(dt, is.numeric)], "target")
  
  dt <- copy(dt)[, w := 1L]
  dt[, group_id := NA_integer_]
  
  neg <- dt[target == 0]
  neg[, group_id := .I]  # asignar grupo único
  
  idx <- sample(seq_len(nrow(neg)), size = n_extra, replace = TRUE)
  aug <- copy(neg[idx])
  
  sds <- sapply(dt[, ..cols], sd)
  for (col in cols) {
    sd_col <- ifelse(is.na(sds[[col]]) || sds[[col]] == 0, 1, sds[[col]])
    aug[[col]] <- aug[[col]] + rnorm(nrow(aug), 0, sd_col * noise_sd)
  }
  
  aug[, w := weight_aug]
  # duplicados heredan el group_id del original
  aug[, group_id := neg[idx]$group_id]
  
  # combinar y asignar grupo individual a positivos
  combined <- rbindlist(list(dt, aug), use.names = TRUE)
  combined[is.na(group_id), group_id := .I + max(group_id, na.rm = TRUE)]
  
  combined[]
}

In [5]:
# ============================================================
# 3) Folds por grupo
# ============================================================
generate_group_folds <- function(dt, k = 5, seed = 123) {
  set.seed(seed)
  groups <- dt$group_id
  groupKFold(groups, k = k)
}

In [6]:
# ============================================================
# 4) Entrenar y evaluar
# ============================================================
train_evaluate <- function(dt,
                           folds,
                           params = list(),
                           nrounds = 500,
                           early_stopping_rounds = 30,
                           seed = 123,
                           threshold = NULL) {
  if (!"w" %in% names(dt)) dt[, w := 1L]
  
  dtrain <- xgb.DMatrix(
    data = as.matrix(dt[, ..feature_cols]),
    label = dt$target,
    weight = dt$w
  )
  
  base_params <- list(
    objective = "binary:logistic",
    eval_metric = "auc",
    max_depth = 3,
    eta = 0.1,
    subsample = 0.8,
    colsample_bytree = 0.8
  )
  
  par <- modifyList(base_params, params)
  
  set.seed(seed)
  cv <- xgb.cv(
    params = par,
    data = dtrain,
    folds = folds,
    nrounds = nrounds,
    early_stopping_rounds = early_stopping_rounds,
    verbose = 0
  )
  
  best_nrounds <- cv$best_iteration
  
  # entrenamiento final (usar todo como train)
  model <- xgb.train(
    params = par,
    data = dtrain,
    nrounds = best_nrounds,
    verbose = 0
  )
  
  # predicciones y métricas
  pred_prob <- predict(model, dtrain)
  roc_obj <- pROC::roc(response = dt$target, predictor = pred_prob, quiet = TRUE)
  auc_val <- as.numeric(pROC::auc(roc_obj))
  
  pr_obj <- PRROC::pr.curve(
    scores.class0 = pred_prob[dt$target == 1],
    scores.class1 = pred_prob[dt$target == 0],
    curve = TRUE
  )
  pr_auc <- pr_obj$auc.integral
  
  thr <- if (is.null(threshold)) {
    as.numeric(pROC::coords(roc_obj, "best", ret = "threshold"))
  } else threshold
  
  pred_lbl <- as.integer(pred_prob >= thr)
  TP <- sum(pred_lbl == 1 & dt$target == 1)
  FP <- sum(pred_lbl == 1 & dt$target == 0)
  TN <- sum(pred_lbl == 0 & dt$target == 0)
  FN <- sum(pred_lbl == 0 & dt$target == 1)
  
  accuracy  <- (TP + TN) / (TP + TN + FP + FN)
  precision <- ifelse(TP + FP == 0, NA, TP / (TP + FP))
  recall    <- ifelse(TP + FN == 0, NA, TP / (TP + FN))
  f1        <- ifelse(is.na(precision) | is.na(recall), NA, 2 * precision * recall / (precision + recall))
  
  list(
    model = model,
    auc = auc_val,
    pr_auc = pr_auc,
    threshold = thr,
    metrics = list(
      accuracy = accuracy,
      precision = precision,
      recall = recall,
      f1 = f1
    ),
    roc = roc_obj,
    pr = pr_obj
  )
}

In [None]:
# ============================================================
# 5) Ejecutar
# ============================================================
# 5) Ejecutar
dt_aug <- augment_negatives(
  iris_dt,
  n_extra   = 50,
  noise_sd  = 0.15,
  weight_aug = 0.5
)

folds <- generate_group_folds(dt_aug, k = 5, seed = 2024)

# Chequeos rápidos (opcional pero útil)
stopifnot(is.list(folds), all(lengths(folds) > 0))
stopifnot(all(unlist(folds) >= 1), all(unlist(folds) <= nrow(dt_aug)))
stopifnot(length(unique(unlist(folds))) <= nrow(dt_aug))

results <- train_evaluate(dt_aug, folds = folds)

print(results$metrics)
cat("AUC:", results$metrics$auc, "\n")
cat("PR AUC:", results$metrics$pr_auc, "\n")


In [None]:
# ============================================================
# 6) Resultados
# ============================================================
print(results$metrics)
cat("AUC:", results$auc, "\n")
cat("PR AUC:", results$pr_auc, "\n")
cat("Umbral de decisión:", results$threshold, "\n")

“0.500000 (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (column 6 named 'w')”
“no non-missing arguments to max; returning -Inf”
“-inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (column 7 named 'group_id')”


ERROR: Error in if ((env$maximize && score > env$best_score) || (!env$maximize && : missing value where TRUE/FALSE needed
