# Clustering mit K-Means und Klassifikation mit K-Nearest-Neighbors

## Datensatz erzeugen

In [None]:
import numpy as np
from sklearn import datasets

In [None]:
# Draw 1000 points from three Gaussian blobs with standard deviations 2.0, 1.0 and 3.0.
_x, _y = datasets.make_blobs(
    n_samples=1000,
    centers=3,
    cluster_std=[2.0, 1.0, 3.0],
    shuffle=True,
    random_state=17,
)

# Keep every point of blob 0, but only the first 100 points of blob 1 and the
# first 200 points of blob 2; `y` holds the matching labels in the same order.
x = np.concatenate([_x[_y == 0], _x[_y == 1][:100], _x[_y == 2][:200]], axis=0)
y = [0] * int(np.count_nonzero(_y == 0)) + [1] * 100 + [2] * 200

### Daten visualisieren

## Clustering

### Optimale Anzahl Cluster finden

In [None]:
# Re-create the blob data with standard deviations 1.0, 0.5 and 2.0.
# This overwrites the `x` and `y` defined by the earlier dataset cell.
_x, _y = datasets.make_blobs(
    n_samples=1000,
    centers=3,
    cluster_std=[1.0, 0.5, 2.0],
    shuffle=True,
    random_state=17,
)

# All points of blob 0, the first 100 of blob 1 and the first 200 of blob 2,
# stacked row-wise; `y` lists the labels in that same order.
x = np.concatenate([_x[_y == 0], _x[_y == 1][:100], _x[_y == 2][:200]], axis=0)
y = [0] * int(np.count_nonzero(_y == 0)) + [1] * 100 + [2] * 200

In [None]:
# Candidate cluster counts (2 through 5) to compare in the silhouette analysis.
possible_n_clusters = list(range(2, 6))

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# Silhouette analysis: for each candidate cluster count, fit K-Means on `x` and
# draw one figure with two panels
#   left:  silhouette values of all samples, grouped by cluster and sorted,
#   right: the samples coloured by predicted cluster plus the cluster centres.
for n_clusters in possible_n_clusters:
    fig, (ax1, ax2) = plt.subplots(1, 2)

    # Only silhouette values in [-0.1, 1] are visible on the x axis.
    ax1.set_xlim([-0.1, 1])
    # One row per sample plus a 10-row gap around every cluster band.
    ax1.set_ylim([0, len(x) + (n_clusters + 1) * 10])

    # Fixed seed so that repeated runs produce the same clustering and figures.
    model = KMeans(n_clusters=n_clusters, n_init="auto", random_state=17)
    model.fit(x)

    y_pred = model.predict(x)

    silhouette_avg = silhouette_score(x, y_pred)
    sample_silhouette_values = silhouette_samples(x, y_pred)

    y_lower = 10

    for i in range(n_clusters):
        # Boolean indexing returns a copy, so the in-place sort does not
        # disturb `sample_silhouette_values`.
        ith_cluster_silhouette_values = sample_silhouette_values[y_pred == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the band with its cluster index, vertically centred.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a 10-row gap before the next cluster's band.
        y_lower = y_upper + 10

    # The average is the same for every cluster, so the marker line is drawn once.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_title("Silhouetten-Plot")
    ax1.set_xlabel("Silhouettenkoeffizient")
    ax1.set_ylabel("Cluster")
    ax1.set_yticks([])

    # Same colour mapping as the silhouette bands on the left panel.
    colors = cm.nipy_spectral(y_pred.astype(float) / n_clusters)
    ax2.scatter(
        x[:, 0], x[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )

    centers = model.cluster_centers_
    ax2.scatter(centers[:, 0], centers[:, 1], marker="o", c="black", s=200)
    ax2.set_title("Geclusterte Daten")

    # Without a caption the figures for different cluster counts are indistinguishable.
    fig.suptitle(
        f"n_clusters = {n_clusters}, mittlerer Silhouettenkoeffizient = {silhouette_avg:.3f}"
    )
    plt.show()

**Aufgabe:** Verwende `KMeans`, um den Datensatz `x` zu clustern.

In [None]:
# Read the exercise data from a text file in the working directory.
# This replaces the `x` used by the preceding cells.
x = np.loadtxt(fname="04_data.txt")

## Klassifikation mit K-Nearest-Neighbors

### Datenset laden

In [None]:
from sklearn.datasets import load_wine

In [None]:
# Load scikit-learn's wine dataset; as_frame=True requests pandas objects
# instead of plain NumPy arrays for the data and target.
wine_data = load_wine(return_X_y=False, as_frame=True)

### Modell trainieren

### Modell quantitativ evaluieren

**Aufgabe:** Berechne Accuracy, Precision und Recall auf den Test- und Trainingsdaten.

### Confusion-Matrix

### Modell visuell evaluieren

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.inspection import DecisionBoundaryDisplay

In [None]:
# Draw the decision regions of `model` on a fresh single-axes figure.
# `model` and `x_test` must come from earlier cells (the training section).
# NOTE(review): from_estimator is documented to need exactly two features in
# x_test - confirm the training section reduces the data accordingly.
fig, ax = plt.subplots(nrows=1, ncols=1)
disp = DecisionBoundaryDisplay.from_estimator(
    estimator=model,
    X=x_test,
    ax=ax,
    alpha=0.5,
    response_method="predict",
)

### Skalierung von Features

### Modell auf skaliertem Datensatz trainieren

**Aufgabe:**

1) KNeighborsClassifier auf skaliertem Trainings-Set trainieren

2) DecisionBoundaryDisplay-Plot auf dem skalierten Test-Set erstellen

3) Confusion-Matrix für das skalierte Test-Set errechnen