---
title: "Data Splitting"
jupyter: ir
---


In [None]:
#| include: false
# Make sure every package this document calls is installed before any other
# cell runs. "data.table" is listed explicitly because the summary-table cell
# calls data.table::data.table() directly.
req_pkg <- c("splitTools", "mlbench", "knitr", "kdry", "data.table")

# Query the installed-package index once instead of once per package:
# installed.packages() scans every library directory and is slow.
missing_pkg <- setdiff(req_pkg, installed.packages()[, "Package"])
if (length(missing_pkg) > 0) {
  install.packages(missing_pkg)
}


In [None]:
#| echo: true
library(splitTools)
library(mlbench)
data("BostonHousing")

# partition() produces a list holding the row indices
# that belong to each of the data sets
data_split <- splitTools::partition(
  y = BostonHousing[["medv"]],
  p = c(train = 0.5, validation = 0.25, test = 0.25),
  seed = 123,
  type = "stratified"
)


In [None]:
#| include: false

# Variables to show in the summary table
cols <- c("crim", "zn", "indus", "age", "rad", "ptratio", "lstat")

# For each data set in the split, summarise every selected variable with
# kdry::rep_mean_sd(). vapply() pins the result to exactly one string per
# variable, so an unexpected return shape fails loudly instead of silently
# changing the table layout.
# NOTE(review): assumes rep_mean_sd() returns a single character value per
# column - confirm against the kdry documentation.
split_summaries <- lapply(
  data_split,
  function(idx) {
    vapply(BostonHousing[idx, cols], kdry::rep_mean_sd, character(1))
  }
)

# Build the table in one step (row labels first, then one column per data
# set) rather than growing it with cbind() inside a loop.
results_table <- data.table::as.data.table(
  c(list(Variable = cols), split_summaries)
)

# Column labels: each data set name followed by its number of observations.
# Derived from data_split itself so the labels follow any change to the split.
colnames(results_table) <- c(
  "Variable",
  paste0(names(data_split), " (n=", lengths(data_split), ")")
)


In [None]:
# Render the summary table with a caption
knitr::kable(
  x = results_table,
  caption = "Train-Validation-Test Split: Mittelwert und Standardabweichung."
)