# Daten und Modelloptimierung - Teil 2

## Quelle der Daten

https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data (zuletzt aufgerufen: 01/2024)

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29 (zuletzt aufgerufen: 01/2024)

## Importieren der Bibliotheken

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf

from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import StratifiedKFold

from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

## Einlesen der Daten

In [None]:
# Location of the Excel workbook that contains the data set.
data_url = "https://github.com/timwgnd/Lehrbuch-Kuenstliche-Intelligenz-in-der-Medizin/raw/refs/heads/main/Brustkrebs.xlsx"

# Read the sheet "Tabelle1" into a DataFrame and drop every row that
# contains a missing value.
data = pd.read_excel(io=data_url, sheet_name="Tabelle1").dropna()

# Print the first rows as a psql-style table to get an overview of the data.
print(data.head().to_markdown(index=False, tablefmt="psql"))

In [None]:
# Encode the diagnosis labels ("benign" / "malignant") as integers (0 / 1).
diagnosis_new = {"benign": 0, "malignant": 1}

# Series.replace relies on silently downcasting the object column to int,
# which pandas 2.2 deprecates; without that downcast the target would stay
# object-typed. Map every label explicitly and cast to int64 instead.
# Values that are not keys of the mapping (e.g. 0/1 when this cell is run a
# second time) are passed through unchanged; anything that cannot be cast to
# an integer raises here rather than later during training.
data["diagnosis"] = (
    data["diagnosis"]
    .map(lambda label: diagnosis_new.get(label, label))
    .astype("int64")
)

# Print the first rows again to check the encoded column.
print(data.head().to_markdown(index=False, tablefmt="psql"))

## Aufteilung der Daten

In [None]:
# Separate the table by position: every column after the first becomes the
# feature matrix x, the first column becomes the target y.
# NOTE(review): presumably column 0 is "diagnosis" -- confirm against the sheet.
x, y = data.iloc[:, 1:], data.iloc[:, 0]

In [None]:
# Print the first five rows of the feature table (x) as a psql-style table.
print(x.head(5).to_markdown(index=False, tablefmt="psql"))

In [None]:
# Print the first five entries of the target (y) as a psql-style table.
print(y.head(5).to_markdown(index=False, tablefmt="psql"))

## Principal Component Analysis

In [None]:
# Standardize the features: learn the scaling parameters from x and apply
# them to x in a single step.
# NOTE(review): the scaler is fitted on the complete data set, i.e. before
# the train/test split further down -- confirm that this is intended.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(x)

In [None]:
# Fit a PCA with 15 components on the standardized features; the result is
# used for the scree plot in the next cell.
pca = PCA(n_components=15)
pca.fit(X=scaled_data)

In [None]:
# Scree plot: share of variance explained by each fitted principal component.
# Component numbers start at 1 and double as the x-axis ticks.
PC_values = np.arange(1, pca.n_components_ + 1)

plt.plot(PC_values, pca.explained_variance_ratio_, "o-", linewidth=2)
plt.xticks(PC_values)

plt.title("Scree Plot")
plt.xlabel("Principal Component")
plt.ylabel("Variance Explained")

In [None]:
# Refit the PCA with only 3 components; this instance is the one used to
# transform the data further down.
pca = PCA(n_components=3)
pca.fit(X=scaled_data)

In [None]:
# Scree plot for the reduced PCA: explained-variance share per component,
# with the component numbers (starting at 1) as x-axis ticks.
PC_values = np.arange(1, pca.n_components_ + 1)

plt.plot(PC_values, pca.explained_variance_ratio_, "o-", linewidth=2)
plt.xticks(PC_values)

plt.title("Scree Plot")
plt.xlabel("Principal Component")
plt.ylabel("Variance Explained")

In [None]:
# Project the standardized features onto the fitted principal components
# and print the resulting array.
x_pca = pca.transform(X=scaled_data)
print(x_pca)

In [None]:
# Scatter plot of the first two principal components, colored by diagnosis.
plt.figure(figsize=(8, 6))

first_two = (x_pca[:, 0], x_pca[:, 1])
plt.scatter(*first_two, c=data["diagnosis"])

plt.xlabel("PC1")
plt.ylabel("PC2")

In [None]:
# Hold out 15 % of the PCA-transformed samples as the test set.
# stratify=y keeps the class ratio of y identical in both parts (matching the
# StratifiedKFold used for the hyperparameter search), and the fixed
# random_state makes the split -- and every score derived from it --
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_pca, y, test_size=0.15, stratify=y, random_state=42
)

## Erstellen, Trainieren und Evaluieren des KI-Modells

In [None]:
# Baseline network: three fully connected ReLU layers (64, 256, 128 units)
# followed by a 2-unit softmax output layer.
model_1 = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(64, activation=tf.nn.relu),
        tf.keras.layers.Dense(256, activation=tf.nn.relu),
        tf.keras.layers.Dense(128, activation=tf.nn.relu),
        tf.keras.layers.Dense(2, activation=tf.nn.softmax),
    ]
)

In [None]:
# Variant of the baseline network whose first layer uses He-uniform weight
# initialization, an L1 penalty (0.01) on the weights and an L2 penalty
# (0.01) on the biases. The remaining layers match the baseline.
regularized_input_layer = dict(
    activation=tf.nn.relu,
    kernel_initializer="he_uniform",
    kernel_regularizer=tf.keras.regularizers.L1(0.01),
    bias_regularizer=tf.keras.regularizers.L2(0.01),
)

model_2 = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(64, **regularized_input_layer),
        tf.keras.layers.Dense(256, activation=tf.nn.relu),
        tf.keras.layers.Dense(128, activation=tf.nn.relu),
        tf.keras.layers.Dense(2, activation=tf.nn.softmax),
    ]
)

In [None]:
# Configure training of the baseline model: SGD optimizer, sparse categorical
# cross-entropy loss (integer class labels) and accuracy as reported metric.
# `metrics` is passed as a list, as in create_model: Keras 3 rejects a bare
# string for this argument.
model_1.compile(
    optimizer="SGD",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train the baseline model on the training split for 5 epochs.
model_1.fit(x=x_train, y=y_train, epochs=5)

In [None]:
# Compute loss and metrics of the baseline model on the held-out test split.
model_1.evaluate(x=x_test, y=y_test)

## Hyperparameter-Optimierung mit Random Search

In [None]:
def create_model(optimizer = "SGD", loss = "sparse_categorical_crossentropy", neurons = 128):
    """Build and compile the network used for the hyperparameter search.

    The network has three fully connected ReLU layers (``neurons``, 256,
    ``neurons`` units) followed by a 2-unit softmax output layer.

    Args:
        optimizer: Optimizer passed on to ``compile``.
        loss: Loss function passed on to ``compile``.
        neurons: Number of units in the first and the third hidden layer.

    Returns:
        The compiled Sequential model, reporting accuracy as metric.
    """
    hidden_units = (neurons, 256, neurons)

    layers = [
        tf.keras.layers.Dense(units, activation=tf.nn.relu)
        for units in hidden_units
    ]
    layers.append(tf.keras.layers.Dense(2, activation=tf.nn.softmax))

    model = tf.keras.models.Sequential(layers)
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
    return model

In [None]:
# Wrap create_model as a scikit-learn compatible estimator (silent training)
# so that it can be passed to RandomizedSearchCV. This rebinds model_1.
model_1 = KerasClassifier(
    build_fn=create_model,
    verbose=0,
)

In [None]:
# Candidate values for the random search; the keys correspond to the
# parameters of create_model.
# NOTE(review): "MSE" is evaluated against integer class labels while the
# network outputs two softmax probabilities -- confirm this candidate is intended.
param_dist = dict(
    neurons=[64, 128],
    optimizer=["adam", "SGD"],
    loss=["sparse_categorical_crossentropy", "MSE"],
)

In [None]:
# Randomized hyperparameter search over param_dist with stratified 5-fold
# cross-validation; n_jobs = -1 runs the fits in parallel.
random_search = RandomizedSearchCV(
    estimator=model_1,
    param_distributions=param_dist,
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)

# Search on the training split of the PCA-transformed data. Fitting on the
# raw x / y would tune the hyperparameters on unscaled features that none of
# the models is trained on, and would let the held-out test rows influence
# the selection.
random_search_result = random_search.fit(x_train, y_train)

In [None]:
# Report the best cross-validation score and the parameters that achieved it.
print(
    "Best: %f using %s"
    % (random_search_result.best_score_, random_search_result.best_params_)
)

In [None]:
# Final network: three fully connected ReLU layers (64, 256, 64 units)
# followed by a 2-unit softmax output layer.
model_3 = tf.keras.models.Sequential()

model_3.add(tf.keras.layers.Dense(64, activation=tf.nn.relu))
model_3.add(tf.keras.layers.Dense(256, activation=tf.nn.relu))
model_3.add(tf.keras.layers.Dense(64, activation=tf.nn.relu))

model_3.add(tf.keras.layers.Dense(2, activation=tf.nn.softmax))

# Adam optimizer with sparse categorical cross-entropy (integer class labels).
# `metrics` is passed as a list, as in create_model: Keras 3 rejects a bare
# string for this argument.
model_3.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train on the training split for 5 epochs.
model_3.fit(x_train, y_train, epochs=5)

In [None]:
# Compute loss and metrics of the final model on the held-out test split.
model_3.evaluate(x=x_test, y=y_test)