---
title: "ML-Einführung - Unsupervised Learning"
jupyter: ir
---


In [None]:
#| echo: false
#| include: false
# Load data (setup taken from the ML introduction)

# Required packages; any that are not installed yet get installed.
req_pkg <- c(
  "riskCommunicator", "data.table", "tinyplot", "see",
  "splitTools", "tidymodels", "parsnip", "yardstick"
)
for (r in req_pkg) {
  # system.file() returns "" when the package cannot be found. This avoids
  # re-scanning the whole library with installed.packages() on every
  # iteration and does not load the package's namespace.
  if (!nzchar(system.file(package = r))) {
    install.packages(r)
  }
}

# Read the Framingham data shipped with riskCommunicator as a data.table
dataset_full <- riskCommunicator::framingham |>
  data.table::data.table()
# Subset: baseline examination (PERIOD == 1)
dataset <- dataset_full[get("PERIOD") == 1, ]

# Columns kept for the analysis
use_cols <- c(
  "SEX", "TOTCHOL", "AGE", "SYSBP",
  "CURSMOKE", "CIGPDAY", "BMI", "DIABETES",
  "HYPERTEN"
)
# Keep only the relevant columns and drop rows with missing values
dataset <- dataset[
  , .SD, .SDcols = use_cols
] |>
  na.omit()

# Categorical variables, selected by name rather than by position so that
# reordering `use_cols` cannot silently change which columns are converted.
# "CIGPDAY" is deliberately not part of this set and stays numeric.
cat_vars <- c("SEX", "CURSMOKE", "DIABETES", "HYPERTEN")
stopifnot(all(cat_vars %in% use_cols))
# Convert the categorical columns to factors (updated by reference via `:=`)
dataset[
  , (cat_vars) := lapply(X = .SD, FUN = factor),
  .SDcols = cat_vars
]
# Remove helper objects that are no longer needed
rm(dataset_full, cat_vars, use_cols, r, req_pkg)

# splitTools::partition() produces a list of indices, one entry per subset.
# Stratifying on the target variable "HYPERTEN" is meant to keep its
# distribution comparable across the subsets.
data_splits <- splitTools::partition(
  y = dataset[["HYPERTEN"]],
  type = "stratified",
  p = c(train = 0.7, validation = 0.15, test = 0.15),
  seed = 123
)

# Separate data sets for the regression and the classification examples.
# Regression: target is "SYSBP", so "HYPERTEN" is dropped.
dataset_reg <- dataset[
  , setdiff(colnames(dataset), "HYPERTEN"),
  with = FALSE
]
# Classification: target is "HYPERTEN", so "SYSBP" is dropped.
dataset_cls <- dataset[
  , setdiff(colnames(dataset), "SYSBP"),
  with = FALSE
]


In [None]:
#| echo: true
# The PCA is computed on the numeric variables only, so all factor
# (categorical) columns are dropped from the training rows first.
set.seed(123)
dataset_pca <- subset(
  dataset_cls[data_splits$train, ],
  # vapply() gives a plain named logical vector (one entry per column)
  # instead of relying on `!` coercing a one-row data.table to a matrix.
  select = names(dataset_cls)[
    !vapply(dataset_cls, is.factor, logical(1))
  ]
)
# scale. = TRUE standardises each variable to unit variance before the PCA
pca_m1 <- stats::prcomp(dataset_pca, scale. = TRUE)

# Print the fit: standard deviations and loadings (rotation matrix) of the
# principal components
pca_m1


In [None]:
#| echo: true
# Principal component scores of the first six observations
pca_m1$x |>
  head(n = 6)

# Importance summary of the components
pca_m1_s <- pca_m1 |>
  summary()
pca_m1_s


In [None]:
#| echo: true
#| out-width: 80%
#| fig-align: center
# Scatterplot of the scores on the first two principal components
with(
  as.data.frame(pca_m1$x),
  tinyplot::plt(x = PC1, y = PC2, xlab = "PC1", ylab = "PC2")
)


In [None]:
#| echo: true
#| out-width: 93%
#| fig-align: center
# HYPERTEN labels of the training rows, used to group (colour) the scores
hyperten_lbl <- dataset_cls[["HYPERTEN"]][data_splits$train]
tinyplot::plt(
  x = pca_m1$x[, "PC1"],
  y = pca_m1$x[, "PC2"],
  by = hyperten_lbl,
  xlab = "PC1",
  ylab = "PC2",
  legend = list(title = "HYPERTEN")
)


In [None]:
#| echo: true
#| out-width: 45%
#| fig-align: center
# Share of variance explained by each principal component (in percent).
# The x positions are derived from the summary itself, so the plot stays
# correct if the number of components changes. The row is addressed by name
# instead of by the position index 2.
# NOTE(review): `draw = FALSE` is kept unchanged from the original call;
# confirm that it is intended.
tinyplot::plt(
  x = seq_len(ncol(pca_m1_s$importance)),
  y = round(pca_m1_s$importance["Proportion of Variance", ] * 100),
  ylab = "Anteil der erklärten Varianz (%)", xlab = "Hauptkomponente",
  type = "b", draw = FALSE
)


In [None]:
#| echo: true
# Fix the seed before kmeans() so the clustering is reproducible
set.seed(123)
# Training rows of the classification data without the label "HYPERTEN"
# NOTE(review): the remaining factor columns (SEX, CURSMOKE, DIABETES) are
# passed to kmeans() as they are and the variables are not scaled - confirm
# that this is intended.
dataset_clust <- dataset_cls[
  data_splits$train, .SD,
  .SDcols = setdiff(colnames(dataset_cls), "HYPERTEN")
]
# "centers" corresponds to K, the number of clusters
clust_m1 <- stats::kmeans(dataset_clust, centers = 2)

# Cluster centres
clust_m1$centers


In [None]:
#| echo: false
#| out-width: 100%
#| fig-align: center
#| layout-ncol: 2
# One scatterplot per variable pair, points grouped by their K = 2 cluster
# assignment. The pairs are drawn in the listed order.
invisible(lapply(
  list(
    c(x = "TOTCHOL", y = "BMI"),
    c(x = "AGE", y = "TOTCHOL"),
    c(x = "CIGPDAY", y = "TOTCHOL"),
    c(x = "AGE", y = "CIGPDAY")
  ),
  function(axes) {
    x_var <- axes[["x"]]
    y_var <- axes[["y"]]
    tinyplot::plt(
      x = dataset_clust[[x_var]],
      y = dataset_clust[[y_var]],
      by = factor(clust_m1$cluster), xlab = x_var, ylab = y_var,
      legend = list(title = "K=2")
    )
  }
))


In [None]:
#| echo: true
set.seed(123)
# Search for a good value of K: fit k-means for K = 1..10 and record the
# total within-cluster sum of squares of each fit.
# The result vector is preallocated instead of being grown with c() on
# every iteration.
twsq <- numeric(10) # total within-sum-of-squares, one entry per K
for (k in seq_along(twsq)) {
  clust_m <- dataset_clust |>
    stats::kmeans(centers = k)
  twsq[k] <- clust_m$tot.withinss
}


In [None]:
#| echo: true
#| out-width: 75%
#| fig-align: center
# Elbow plot: total within-cluster sum of squares against K.
# The x positions follow the length of `twsq`, so the plot stays in sync
# with the range of K values that was actually evaluated.
# NOTE(review): `draw = FALSE` is kept unchanged from the original call;
# confirm that it is intended.
tinyplot::plt(
  x = seq_along(twsq), y = twsq,
  ylab = "Summe der 'within-cluster' Varianzen",
  xlab = "K", type = "b", draw = FALSE
)