# Clustering

## Import der Bibliotheken

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

from sklearn.preprocessing import MinMaxScaler

from scipy.cluster import hierarchy

from sklearn.cluster import AgglomerativeClustering

from sklearn.metrics import silhouette_samples, silhouette_score

## Einlesen und Visualisierung der Daten

In [None]:
# Excel workbook with the example data, fetched directly from GitHub.
data_url = "https://github.com/timwgnd/Lehrbuch-Kuenstliche-Intelligenz-in-der-Medizin/raw/refs/heads/main/Pneumonie_Clustering.xlsx"

# Read the sheet "Tabelle1" and discard every row that contains a missing value.
data = pd.read_excel(data_url, sheet_name="Tabelle1").dropna()

# Preview the first rows (rendered as the cell output in a notebook).
data.head()

In [None]:
# Numeric code for each diagnosis label in the "Diagnose" column.
Diagnose_neu = {"gesund": 0, "COVID-19": 1, "BaktPneumonie": 2}

# Translate the labels through the dictionary.
# Series.map is used instead of Series.replace: replacing strings by integers
# relies on implicit downcasting of the object column, which pandas 2.2
# deprecates (FutureWarning) and newer versions no longer perform.
# Values without a dictionary entry (for example codes left over from an
# earlier run of this cell) are kept unchanged, as replace() would keep them.
data["Diagnose"] = data["Diagnose"].map(
    lambda label: Diagnose_neu.get(label, label)
)

# Preview the recoded table.
data.head()

In [None]:
# Overview scatter plot of all rows: CRP on the x-axis, temperature on the y-axis.
plt.scatter(x=data["CRP (mg/dl)"], y=data["Temperatur (°C)"])

# Caption each axis with the name of the plotted column.
plt.xlabel(xlabel="CRP (mg/dl)")
plt.ylabel(ylabel="Temperatur (°C)")

In [None]:
# Split the table by diagnosis code; the codes 0, 1, 2 are the ones defined
# in Diagnose_neu.
group1, group2, group3 = (data[data["Diagnose"] == code] for code in (0, 1, 2))

# One scatter series per diagnosis, drawn in code order so that the legend
# lists them as gesund, COVID-19, BaktPneumonie.
for group, diagnosis in zip(
    (group1, group2, group3),
    ("gesund", "COVID-19", "BaktPneumonie"),
):
    plt.scatter(group["CRP (mg/dl)"], group["Temperatur (°C)"], label=diagnosis)

plt.xlabel("CRP (mg/dl)")
plt.ylabel("Temperatur (°C)")

plt.legend()

## k-Means-Clustering

In [None]:
# k-means with three clusters on the two unscaled features.
# No random_state is passed, so the result may differ from run to run.
kmeans = KMeans(n_clusters=3)

raw_features = data[["CRP (mg/dl)", "Temperatur (°C)"]]

# Fit the model and keep the cluster index assigned to every row.
model_kmeans = kmeans.fit(raw_features).labels_

In [None]:
# Store the k-means cluster index of every row in the "cluster" column
# and print the resulting table.
data = data.assign(cluster=model_kmeans)

print(data)

In [None]:
# Split the rows by the cluster index stored in the "cluster" column.
group1 = data[data.cluster == 0]
group2 = data[data.cluster == 1]
group3 = data[data.cluster == 2]

# Cluster indices are arbitrary: index 0 is not tied to any diagnosis and the
# numbering can change between runs. The legend therefore names the clusters
# neutrally instead of claiming "gesund" / "COVID-19" / "BaktPneumonie".
for cluster_id, group in enumerate((group1, group2, group3)):
    plt.scatter(
        group["CRP (mg/dl)"],
        group["Temperatur (°C)"],
        label=f"Cluster {cluster_id}",
    )

plt.xlabel("CRP (mg/dl)")
plt.ylabel("Temperatur (°C)")

plt.legend()

In [None]:
# Min-max scale both features (MinMaxScaler with default settings) into new
# "*_skaliert" columns. The scaler is refitted for each column, so after this
# cell `scaler` holds the fit for the CRP column.
scaler = MinMaxScaler()

for source_column, scaled_column in (
    ("Temperatur (°C)", "Temperatur (°C)_skaliert"),
    ("CRP (mg/dl)", "CRP (mg/dl)_skaliert"),
):
    # fit_transform = fit on this single column, then transform it.
    data[scaled_column] = scaler.fit_transform(data[[source_column]])

In [None]:
# Split the table by diagnosis code; the codes 0, 1, 2 are the ones defined
# in Diagnose_neu.
group1, group2, group3 = (data[data["Diagnose"] == code] for code in (0, 1, 2))

# One scatter series per diagnosis, this time using the scaled columns.
for group, diagnosis in zip(
    (group1, group2, group3),
    ("gesund", "COVID-19", "BaktPneumonie"),
):
    plt.scatter(
        group["CRP (mg/dl)_skaliert"],
        group["Temperatur (°C)_skaliert"],
        label=diagnosis,
    )

plt.xlabel("CRP (mg/dl)_skaliert")
plt.ylabel("Temperatur (°C)_skaliert")

plt.legend()

In [None]:
# k-means with three clusters on the two scaled features.
# No random_state is passed, so the result may differ from run to run.
kmeans = KMeans(n_clusters=3)

scaled_features = data[["CRP (mg/dl)_skaliert", "Temperatur (°C)_skaliert"]]

# Fit the model and keep the cluster index assigned to every row.
model_kmeans_skaliert = kmeans.fit(scaled_features).labels_

In [None]:
# Overwrite the "cluster" column with the k-means result obtained on the
# scaled features and print the resulting table.
data = data.assign(cluster=model_kmeans_skaliert)

print(data)

In [None]:
# Split the rows by the cluster index stored in the "cluster" column.
group1 = data[data.cluster == 0]
group2 = data[data.cluster == 1]
group3 = data[data.cluster == 2]

# Cluster indices are arbitrary: index 0 is not tied to any diagnosis and the
# numbering can change between runs. The legend therefore names the clusters
# neutrally instead of claiming "gesund" / "COVID-19" / "BaktPneumonie".
for cluster_id, group in enumerate((group1, group2, group3)):
    plt.scatter(
        group["CRP (mg/dl)_skaliert"],
        group["Temperatur (°C)_skaliert"],
        label=f"Cluster {cluster_id}",
    )

plt.xlabel("CRP (mg/dl)_skaliert")
plt.ylabel("Temperatur (°C)_skaliert")

plt.legend()

## Hierarchical Clustering

In [None]:
# Hierarchical clustering of the two scaled features with Ward linkage;
# `hierarchical` holds the linkage result returned by SciPy.
scaled_features = data[["CRP (mg/dl)_skaliert", "Temperatur (°C)_skaliert"]]
hierarchical = hierarchy.linkage(scaled_features, "ward")

# Draw the dendrogram for that linkage and display the figure.
hierarchy.dendrogram(hierarchical)
plt.show()

In [None]:
# Agglomerative clustering into three clusters with Ward linkage.
# Note: this rebinds `hierarchical` from the linkage result to the estimator.
hierarchical = AgglomerativeClustering(linkage="ward", n_clusters=3)

scaled_features = data[["CRP (mg/dl)_skaliert", "Temperatur (°C)_skaliert"]]

# Keep the fitted estimator under its own name, then read the cluster index
# of every row from it.
model_hierarchical = hierarchical.fit(scaled_features)
labels = model_hierarchical.labels_

In [None]:
# Overwrite the "cluster" column with the labels from the agglomerative
# clustering and print the resulting table.
data = data.assign(cluster=labels)

print(data)

In [None]:
# Split the rows by the cluster index stored in the "cluster" column.
group1 = data[data.cluster == 0]
group2 = data[data.cluster == 1]
group3 = data[data.cluster == 2]

# Cluster indices are arbitrary and not tied to any diagnosis, so the legend
# names the clusters neutrally instead of claiming
# "gesund" / "COVID-19" / "BaktPneumonie".
for cluster_id, group in enumerate((group1, group2, group3)):
    plt.scatter(
        group["CRP (mg/dl)_skaliert"],
        group["Temperatur (°C)_skaliert"],
        label=f"Cluster {cluster_id}",
    )

# The scaled columns are plotted, so the axis captions must name the scaled
# columns rather than the raw measurements.
plt.xlabel("CRP (mg/dl)_skaliert")
plt.ylabel("Temperatur (°C)_skaliert")

plt.legend()

## Ellbogen- und Silhouetten-Methode

In [None]:
# Elbow method: fit k-means for k = 1..9 on the scaled features and plot the
# inertia of each fit against k.
Sum_of_squared_distances = []
k = range(1, 10)

# The feature matrix does not depend on k, so select it once outside the loop.
scaled_features = data[["CRP (mg/dl)_skaliert", "Temperatur (°C)_skaliert"]]

for num_clusters in k:
    # n_init and random_state are pinned explicitly: the default n_init differs
    # between scikit-learn versions, and an unseeded run can end in a different
    # local optimum each time, which makes the curve noisy and the plot
    # impossible to reproduce.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    kmeans.fit(scaled_features)
    Sum_of_squared_distances.append(kmeans.inertia_)

plt.plot(k, Sum_of_squared_distances, "o-")
plt.xlabel("k")
plt.ylabel("Inertia")

In [None]:
# Silhouette method: fit k-means for k = 2..9 on the scaled features and plot
# the mean silhouette score of each clustering against k.
silhouette_avg = []
range_n_clusters = range(2, 10)

# The feature matrix is the same for fitting and scoring and does not depend
# on k, so select it once outside the loop.
scaled_features = data[["CRP (mg/dl)_skaliert", "Temperatur (°C)_skaliert"]]

for num_clusters in range_n_clusters:
    # n_init and random_state are pinned explicitly: the default n_init differs
    # between scikit-learn versions, and an unseeded run can end in a different
    # local optimum each time, which makes the scores impossible to reproduce.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    kmeans.fit(scaled_features)
    cluster_labels = kmeans.labels_

    silhouette_avg.append(silhouette_score(scaled_features, cluster_labels))

plt.plot(range_n_clusters, silhouette_avg, "o-")
plt.xlabel("k")
plt.ylabel("Silhouettenkoeffizient")