# Daten und Modelloptimierung - Teil 1

## Quelle der Daten

https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data (zuletzt aufgerufen: 01/2024)

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29 (zuletzt aufgerufen: 01/2024)

## Import der Bibliotheken

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

from sklearn import svm

from sklearn.model_selection import cross_validate

from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import confusion_matrix

from sklearn.metrics import plot_confusion_matrix

from sklearn.metrics import plot_roc_curve, auc

## Einlesen der Daten 

In [None]:
# Location of the Excel workbook that holds the breast-cancer data set.
data_url = "https://github.com/timwgnd/Lehrbuch-Kuenstliche-Intelligenz-in-der-Medizin/raw/refs/heads/main/Brustkrebs.xlsx"

# Read the sheet and drop every row that contains a missing value.
data = pd.read_excel(data_url, sheet_name="Tabelle1").dropna()

# Show the first rows as a quick sanity check.
data.head()

In [None]:
# Numeric encoding of the diagnosis labels: benign -> 0, malignant -> 1.
diagnosis_new = {"benign": 0, "malignant": 1}

# Cast explicitly to int instead of relying on `replace` to downcast the
# object column on its own: newer pandas versions deprecate that silent
# downcast, which would leave the target column with object dtype.
# The cast also fails fast if a label is not covered by the mapping.
data["diagnosis"] = data["diagnosis"].replace(diagnosis_new).astype(int)

data.head()

In [None]:
# Number of samples per diagnosis class, drawn as a bar chart to inspect
# how balanced the classes are.
diagnosis_counts = data["diagnosis"].value_counts()

diagnosis_counts.plot.bar(rot=0)

## Aufteilung der Daten und Resampling

In [None]:
# Separate the target column from the feature columns.
# NOTE(review): assumes the first column holds the diagnosis label and all
# remaining columns are features -- confirm against the workbook layout.
y = data.iloc[:, 0]
x = data.iloc[:, 1:]

In [None]:
# Preview the first rows of the feature table.
x.head()

In [None]:
# Preview the first entries of the target column.
y.head()

In [None]:
# Hold out 15 % of the samples as the test set.
# random_state makes the split (and every metric derived from it) reproducible
# across runs; stratify=y keeps the class proportions of the full data set in
# both the training and the test part.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.15, random_state=42, stratify=y
)

In [None]:
# Oversample the training split with SMOTE. The resampled data is only
# inspected here and not used for training, because the classes are
# sufficiently balanced.
x_train_res, y_train_res = SMOTE().fit_resample(x_train, y_train)

# Class counts after resampling.
print(y_train_res.value_counts())

## Erstellen und Trainieren des KI-Modells

In [None]:
# Support vector classifier with a linear kernel, trained on the original
# (not resampled) training split.
model = svm.SVC(kernel="linear")

model.fit(X=x_train, y=y_train)

## Cross Validation

In [None]:
# 10-fold cross-validation of the classifier on the complete data set,
# scored by accuracy.
scores = cross_validate(estimator=model, X=x, y=y, scoring="accuracy", cv=10)

# Accuracy of each individual fold.
fold_accuracies = scores["test_score"]
print(fold_accuracies)

# Mean accuracy over all folds.
print("Gesamt-Accuracy:", sum(fold_accuracies) / len(fold_accuracies))

## Bagging

In [None]:
# Bagging ensemble of 10 copies of the SVM, each trained on a bootstrap sample.
# max_samples is given as a fraction (80 % of the training rows per estimator):
# an integer is interpreted as an absolute number of rows, so the previous
# value of 8 trained every SVM on only eight samples, which can even contain
# a single class.
bagging = BaggingClassifier(model, n_estimators=10, max_samples=0.8, bootstrap=True)

bagging.fit(x_train, y_train)

# Accuracy of the ensemble on the held-out test split.
print("Accuracy:", bagging.score(x_test, y_test))

## Modellevaluation

In [None]:
# Accuracy of the single SVM on the held-out test split.
test_accuracy = model.score(x_test, y_test)
print("Accuracy:", test_accuracy)

In [None]:
# Predicted class labels for the held-out test samples.
y_pred = model.predict(x_test)

# Raw confusion matrix of true versus predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Keep the display object under its own name: binding it to `confusion_matrix`
# would shadow the imported sklearn function, so any later (or repeated) call
# such as confusion_matrix(y_test, y_pred) would fail.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
cm_display = plot_confusion_matrix(model, x_test, y_test)

# Set title and axis labels directly on the axes that hold the matrix.
cm_display.ax_.set_title("Confusion Matrix")
cm_display.ax_.set_xlabel("Predicted Label")
cm_display.ax_.set_ylabel("True Label")

plt.show()

In [None]:
# Derive the confusion-matrix counts from the actual test predictions instead
# of hard-coding them, so the metrics always match the current split.
# Malignant (label 1) is treated as the positive class.
actual_positive = y_test.to_numpy() == 1
predicted_positive = model.predict(x_test) == 1

tp = int((actual_positive & predicted_positive).sum())    # true positives
fn = int((actual_positive & ~predicted_positive).sum())   # false negatives
tn = int((~actual_positive & ~predicted_positive).sum())  # true negatives
fp = int((~actual_positive & predicted_positive).sum())   # false positives

# Sensitivity (recall) = TP / (TP + FN)
sensitivity = tp / (tp + fn)
print("Sensitivity:", sensitivity)

# Specificity = TN / (TN + FP)
specificity = tn / (tn + fp)
print("Specificity:", specificity)

# Positive predictive value (precision) = TP / (TP + FP)
ppv = tp / (tp + fp)
print("Precision/PPW:", ppv)

# Negative predictive value = TN / (TN + FN)
# (recall is the same as sensitivity, not the NPV, hence the corrected label)
npv = tn / (tn + fn)
print("NPW:", npv)

# F1 score = harmonic mean of precision (PPV) and recall (sensitivity)
f1 = 2 * (ppv * sensitivity) / (ppv + sensitivity)
print("F1:", f1)

In [None]:
# ROC curve of the classifier on the held-out test split.
plot_roc_curve(estimator=model, X=x_test, y=y_test)