# Logistische Regression

## Quelle der Daten

https://www.kaggle.com/datasets/captainozlem/framingham-chd-preprocessed-data (zuletzt aufgerufen: 01/2024)

https://www.framinghamheartstudy.org/ (zuletzt aufgerufen: 01/2024)

## Import der Bibliotheken

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

import numpy as np

## Einlesen der Daten

In [None]:
# Location of the Excel workbook; the rows are read from its "Tabelle1" sheet.
data_url = "https://github.com/timwgnd/Lehrbuch-Kuenstliche-Intelligenz-in-der-Medizin/raw/refs/heads/main/FraminghamHeartStudy.xlsx"

# Read the sheet and discard every row that contains a missing value.
data = pd.read_excel(io=data_url, sheet_name="Tabelle1").dropna()

# Preview the first rows as a psql-style text table.
preview = data.head()
print(preview.to_markdown(index=False, tablefmt='psql'))

In [None]:
# One-hot encode the categorical "Geschlecht" column; every other column of
# `data` is carried over unchanged.
categorical_columns = ["Geschlecht"]
data_new = pd.get_dummies(data, columns=categorical_columns)

# Preview the first rows of the encoded table.
encoded_preview = data_new.head()
print(encoded_preview.to_markdown(index=False, tablefmt='psql'))

In [None]:
# Number of rows per distinct value of the "Diabetes" column.
count_diabetes = data_new["Diabetes"].value_counts()

# Print the counts as a psql-style table ...
counts_table = count_diabetes.to_markdown(tablefmt='psql')
print(counts_table)

# ... and draw them as a bar chart with un-rotated tick labels.
count_diabetes.plot.bar(rot=0)

## Aufteilung der Daten und Resampling

In [None]:
# Single predictor: the column at position 13, reshaped to one column per row
# because the estimators used below expect a 2-D feature array.
# NOTE(review): presumably the blood-glucose column (the later plot labels this
# axis "Blutzucker") -- confirm against the sheet layout.
feature_column = data_new.iloc[:, 13]
x = feature_column.to_numpy().reshape(-1, 1)

# Target: the column at position 7.
# NOTE(review): presumably the "Diabetes" column -- confirm against the sheet layout.
y = data_new.iloc[:, 7]

In [None]:
# Show the feature array x to check its values and (n_rows, 1) layout.
print(x)

In [None]:
# Show the target Series y to check its values and index.
print(y)

In [None]:
# Hold out 15 % of the rows as a test set.
# - stratify=y keeps the class proportions of the target the same in the
#   training and test parts, so the small test set is not skewed by chance.
# - random_state fixes the shuffle so the split (and every score derived from
#   it) is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

In [None]:
# Oversample the training split only; the test split stays untouched so the
# evaluation is done on real observations. The fixed seed makes the generated
# synthetic samples reproducible between runs.
sm = SMOTE(random_state=42)

x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

# Class counts of the target after resampling.
print(y_train_res.value_counts().to_markdown(tablefmt='psql'))

## Definition, Training und Evaluation des Modells

In [None]:
# Fit a logistic regression with default settings on the resampled training
# data (fit returns the estimator itself).
model = LogisticRegression().fit(x_train_res, y_train_res)

# Score on the held-out test split; as the last expression of the cell the
# value is displayed as the cell result.
model.score(x_test, y_test)

In [None]:
# Observed test points: feature value against the 0/1 target.
plt.scatter(x_test, y_test, color="gray")

# 100 evenly spaced feature values spanning the test range, shaped as a single
# column for predict_proba. The array's own min()/max() reduce over all
# elements in native code and return scalars, instead of iterating the rows
# with the Python builtins and comparing one-element arrays.
x_values = np.linspace(x_test.min(), x_test.max(), 100).reshape(-1, 1)

# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (the positive class for a 0/1 target).
y_pred = model.predict_proba(x_values)[:, 1]

# Fitted probability curve on top of the scatter plot.
plt.plot(x_values, y_pred, color="blue")

plt.xlabel("Blutzucker")
plt.ylabel("Diabetes")

In [None]:
# Three predictors, selected by column position (1, 8 and 13).
# NOTE(review): positional indices depend on the sheet layout -- confirm which
# columns these are before changing the input data.
x = data_new.iloc[:, [1, 8, 13]]

# Target: the column at position 7 (same target as for the single-feature model).
y = data_new.iloc[:, 7]

# Hold out 15 % of the rows as a test set.
# - stratify=y keeps the class proportions of the target the same in the
#   training and test parts.
# - random_state fixes the shuffle so the split is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

In [None]:
# Oversample the training split only; the test split stays untouched. The
# fixed seed makes the generated synthetic samples reproducible between runs.
sm = SMOTE(random_state=42)

x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

# Class counts of the target after resampling, printed as a psql-style table
# like the other tables in this notebook.
print(y_train_res.value_counts().to_markdown(tablefmt='psql'))

In [None]:
# Fit a second logistic regression with default settings, this time on the
# resampled multi-feature training data (fit returns the estimator itself).
model_2 = LogisticRegression().fit(x_train_res, y_train_res)

# Score on the held-out test split; displayed as the cell result.
model_2.score(x_test, y_test)