---
title: "Logistische Regression"
jupyter: ir
---


In [None]:
#| include: false
# Packages this notebook depends on; any that are missing get installed.
req_pkg <- c("MedDataSets", "data.table", "performance", "tinyplot", "see")
# system.file(package = ) returns "" when a package cannot be found. It only
# looks up the requested name, whereas installed.packages() reads the
# metadata of every installed package and is documented as unsuitable for
# checking whether a single named package is installed.
pkg_missing <- !nzchar(vapply(
  req_pkg,
  function(pkg) system.file(package = pkg),
  character(1)
))
if (any(pkg_missing)) {
  install.packages(req_pkg[pkg_missing])
}


In [None]:
# Attach the data package and load the `Pima_te_df` data set
library(MedDataSets)
data("Pima_te_df")

# Columns used in the analysis
use_cols <- c("glu", "bmi", "age", "type")

# Convert to a data.table, keep only the selected columns and drop every
# row that contains a missing value
dataset <- na.omit(data.table::data.table(Pima_te_df)[, ..use_cols])

# Recode the outcome in place (by reference): "No" -> "0", "Yes" -> "1",
# stored as a factor
data.table::set(
  dataset,
  j = "type",
  value = factor(dataset$type, levels = c("No", "Yes"), labels = c("0", "1"))
)

# Compact overview of the resulting structure
str(dataset)


In [None]:
# Show the first 10 rows of the data set
dataset[seq_len(10L)]


In [None]:
#| out-width: 80%
#| fig-align: center
library(tinyplot)
# Scatter plot of the outcome, converted from factor to numeric, against glu
tinyplot(
  as.numeric(as.character(type)) ~ glu,
  data = dataset,
  palette = "Okabe-Ito",
  ylab = "Predicted Probability of Diabetes"
)


In [None]:
#| out-width: 70%
#| fig-align: center
# Simple logistic regression: outcome (factor converted to numeric)
# explained by glu
m1 <- glm(
  as.numeric(as.character(type)) ~ glu,
  data = dataset,
  family = "binomial"
)
# Grid of 100 evenly spaced glu values covering the observed range.
# `length.out` is spelled out in full rather than relying on partial
# argument matching (`len`).
newdat <- data.frame(
  glu = seq(min(dataset$glu), max(dataset$glu), length.out = 100)
)
# Predicted probabilities of the logistic model on that grid
newdat$type_m1 <- predict(m1, newdata = newdat, type = "response")

library(tinyplot)
# Observed outcomes with the fitted logistic curve drawn on top (red)
tinyplot(
  as.numeric(as.character(type)) ~ glu,
  data = dataset,
  palette = "Okabe-Ito",
  ylab = "Predicted Probability of Diabetes"
)
lines(type_m1 ~ glu, data = newdat, col = "red", lwd = 2)


In [None]:
#| out-width: 70%
#| fig-align: center
# Ordinary linear regression on the same numeric outcome, for comparison
m0 <- lm(as.numeric(as.character(type)) ~ glu, data = dataset)
# Predictions of the linear model on the glu grid stored in newdat
newdat$type_m0 <- predict(m0, newdata = newdat, type = "response")

# Observed outcomes with both fitted curves: linear (blue), logistic (red)
tinyplot(
  as.numeric(as.character(type)) ~ glu,
  data = dataset,
  palette = "Okabe-Ito",
  ylab = "Predicted Probability of Diabetes",
  main = "Lineare (blau) vs. logistische (rot) Regression"
)
lines(type_m0 ~ glu, data = newdat, col = "blue", lwd = 2)
lines(type_m1 ~ glu, data = newdat, col = "red", lwd = 2)


In [None]:
#| out-width: 70%
#| fig-align: center
# Convert probabilities to odds, element-wise: prob / (1 - prob)
prob_to_odds <- function(prob) {
  prob / (1 - prob)
}
# Logit, i.e. log(odds), derived from the predicted probabilities of m1
newdat$type_m1_logit <- log(prob_to_odds(newdat$type_m1))
# For display in the plot the logit is rescaled to lie between 0 and 1.
# Min-max scaling: subtract the minimum, then divide by the value range.
minmax <- function(x) {
  lowest <- min(x)
  (x - lowest) / (max(x) - lowest)
}
# Rescaled logit, so that it fits on the 0-1 axis of the plot
newdat$type_m1_logit_norm <- minmax(newdat$type_m1_logit)

# Observed outcomes with the rescaled logit drawn on top (red)
tinyplot(
  as.numeric(as.character(type)) ~ glu,
  data = dataset,
  palette = "Okabe-Ito",
  ylab = "Predicted Probability of Diabetes",
  main = "ln(Odds)"
)
lines(type_m1_logit_norm ~ glu, data = newdat, col = "red", lwd = 2)


In [None]:
# Print the model summary of m1
summary(m1)


In [None]:
# 95% confidence intervals for the coefficients of m1
m1 |> confint(level = 0.95)


In [None]:
# 95% confidence intervals on the odds-ratio scale: the exponentiated
# limits of the coefficient intervals of m1
m1 |>
  confint(level = 0.95) |>
  exp()


In [None]:
# Multiple logistic regression: outcome explained by glu and bmi
m2 <- glm(
  as.numeric(as.character(type)) ~ glu + bmi,
  data = dataset,
  family = "binomial"
)
# predict() needs a value for every predictor, so a sequence of 100 BMI
# values covering the observed range is added to the new data.
# `length.out` is spelled out in full rather than relying on partial
# argument matching (`len`).
# NOTE(review): the i-th BMI value is paired with the i-th row of newdat, so
# predictions are made for these row-wise pairs only, not for all
# combinations of the predictors - confirm this is intended.
newdat$bmi <- seq(min(dataset$bmi), max(dataset$bmi), length.out = 100)
newdat$type_m2 <- predict(m2, newdata = newdat, type = "response")


In [None]:
# Print the model summary of m2
summary(m2)


In [None]:
# Odds ratios: exponentiated coefficients of m2
m2 |>
  coef() |>
  exp()


In [None]:
# 95% confidence intervals on the odds-ratio scale: the exponentiated
# limits of the coefficient intervals of m2
m2 |>
  confint(level = 0.95) |>
  exp()


In [None]:
#| out-width: 70%
#| fig-align: center
# Compare two binary classifiers by means of their ROC curves.
# To keep overfitting from flattering the comparison, the data set is split
# into 75% training data and 25% test data.
# A fixed seed makes the random split, and therefore the rendered curves,
# reproducible.
set.seed(42)
# Row indices of the test set; the size is truncated explicitly to a whole
# number of rows.
folds <- sample(
  nrow(dataset),
  size = floor(nrow(dataset) * 0.25),
  replace = FALSE
)
train_data <- dataset[-folds, ]
test_data <- dataset[folds, ]
# Both models (simple / multiple logistic regression) are fitted on the
# training data only
m3 <- glm(
  as.numeric(as.character(type)) ~ glu,
  data = train_data,
  family = "binomial"
)
m4 <- glm(
  as.numeric(as.character(type)) ~ glu + bmi,
  data = train_data,
  family = "binomial"
)
# The ROC curves are computed on the test data. performance_roc() is
# documented to ignore `new_data` when several models are passed at once,
# which would silently evaluate both models on their training data.
# Each curve is therefore built from the observed test outcome and the
# model's predicted probabilities for the test rows, and the curves are
# then combined.
test_outcome <- as.numeric(as.character(test_data$type))
roc_on_test <- function(model, label) {
  curve <- performance::performance_roc(
    x = test_outcome,
    predictions = predict(model, newdata = test_data, type = "response")
  )
  # A `Model` column labels each curve when several are combined
  curve$Model <- label
  curve
}
roc <- rbind(roc_on_test(m3, "m3"), roc_on_test(m4, "m4"))
plot(roc)