In [None]:
# if you use google colab use this line
# if you have local installation please make sure to use this version of matplotlib
# otherwise, some of the visualizations may not work
!pip install matplotlib==3.7.0



In [None]:
import numpy as np
import matplotlib.pyplot as plt

from scipy.spatial import Voronoi, voronoi_plot_2d
import imageio

from IPython.display import Image
from copy import deepcopy

# Seed NumPy's global random number generator so random draws are reproducible.
np.random.seed(42)

In [None]:
# helper functions for visualization
def build_figure(samples, centroids, labels, voronoi=True):
    """Draw samples colored by cluster label together with the centroids.

    The figure is built but not shown, so the caller can render it to an
    image (see animate_kmeans_results) or close it.

    Args:
        samples: array indexed as ``samples[:, 0]`` and ``samples[:, 1]``
            (the two plotted coordinates).
        centroids: sequence of centroids; the first two entries of each
            centroid are plotted.
        labels: one integer cluster label per sample, used to pick colors
            via get_color_palette.
        voronoi: if True, overlay the Voronoi diagram of the centroids.

    Returns:
        The matplotlib figure containing the plot.
    """
    fig = plt.figure(figsize=(9, 6))
    plt.rc('grid', linestyle="--", color='gray', alpha=0.5)

    if voronoi:
        vor = Voronoi(np.array(centroids))
        # voronoi_plot_2d draws into the current axes and returns their figure.
        fig = voronoi_plot_2d(vor, show_vertices=False, show_points=False, ax=plt.gca())

    # Rely on get_color_palette's default palette: the name `color_map` that
    # was passed here before is not defined in the visible notebook cells and
    # raised a NameError.
    plt.scatter(samples[:, 0], samples[:, 1], marker='x', s=10, c=get_color_palette(labels), alpha=0.2)

    for centroid in centroids:
        plt.scatter(centroid[0], centroid[1], marker='x', s=100, color='red')

    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.xlabel("$x_1$")
    plt.ylabel("$x_2$")
    plt.title("Visualization of Samples from " + str(len(centroids)) + " Clusters")
    plt.grid(True)

    return fig

def animate_cluster(samples, centroids, labels, voronoi=True):
    """Build and return the figure for a single animation frame.

    Simply forwards all arguments to build_figure.
    """
    return build_figure(samples, centroids, labels, voronoi=voronoi)

def animate_kmeans_results(X, results, fps=2, voronoi=True):
    """Render the stored K-means steps as an animated GIF and display it.

    Args:
        X: samples passed on to animate_cluster for every frame.
        results: sequence of steps; for each step, entry 0 holds the
            centroids and entry 1 the labels of the samples.
        fps: frames per second of the animation.
        voronoi: forwarded to animate_cluster.

    Side effects:
        Writes ``./example.gif`` and displays it in the notebook.
    """
    images = []
    for step in results:
        fig = animate_cluster(X, step[0], step[1], voronoi=voronoi)
        fig.canvas.draw()
        # buffer_rgba() replaces canvas.tostring_rgb(), which newer matplotlib
        # releases deprecate and remove. Drop the alpha channel and copy so
        # the frame does not reference the canvas buffer after closing.
        rgba = np.asarray(fig.canvas.buffer_rgba())
        images.append(rgba[..., :3].copy())
        # Close exactly this figure instead of whatever is "current".
        plt.close(fig)

    duration = int(1000 * 1 / fps)  # frame duration in milliseconds
    imageio.mimsave('./example.gif', images, fps=fps, duration=duration, loop=0)
    with open('./example.gif', 'rb') as f:
        # NOTE(review): the file is a GIF but is displayed with format='png';
        # presumably this relies on the frontend detecting the real image
        # type - confirm before changing.
        display(Image(data=f.read(), format='png'))

def plot_elbow(wcss):
    """Plot WCSS values for the elbow method.

    Args:
        wcss: sequence of WCSS values; entry ``i`` is plotted at
            x-position ``i + 1`` (i.e. k = 1, ..., len(wcss)).
    """
    plt.figure(figsize=(9, 6))
    plt.rc('grid', linestyle="--", color='gray', alpha=0.5)

    # Integer positions 1..len(wcss) instead of float values from linspace.
    ks = np.arange(1, len(wcss) + 1)
    plt.plot(ks, wcss)

    # Label the plot so that it can be read on its own.
    plt.xlabel("Number of clusters $k$")
    plt.ylabel("WCSS")
    plt.title("Elbow Method")

    plt.grid(True)
    plt.show()

# Task 9.1: A Structured Dataset

Create a dataset of two-dimensional vectors. Draw 1000 samples each from the normal distributions
with

$
\mu_1 = \begin{pmatrix} 1 \\ 1\end{pmatrix},
\Sigma_1 = \begin{pmatrix} 0.5 & 0.2 \\ 0.2 & 0.5 \end{pmatrix},
\mu_2 = \begin{pmatrix} 0 \\ -2\end{pmatrix},
\Sigma_2 = \begin{pmatrix} 0.3 & 0.2 \\ 0.2 & 0.2 \end{pmatrix},
\mu_3 = \begin{pmatrix} -2 \\ -1\end{pmatrix},
\Sigma_3 = \begin{pmatrix} 0.1 & 0.2 \\ 0.2 & 0.8 \end{pmatrix}
$

Each normal distribution results in one cluster of points.
Visualize the position of the points and clusters and use this set for the next exercises.

In [None]:
# helper functions for visualization
# Default palette: cluster label i is drawn in COLOR_MAP[i].
COLOR_MAP = ["green", "purple", "blue", "magenta", "orange", "gray", "pink", "maroon", "cyan", "olive"]

def get_color_palette(labels, color_map=COLOR_MAP):
    """Map integer cluster labels to color names.

    Args:
        labels: iterable of integer cluster labels.
        color_map: list of color names indexed by label.

    Returns:
        A list with exactly one color per label. Labels beyond the palette
        wrap around (modulo its length) instead of being dropped, so the
        result always has as many entries as there are labels and can be
        passed to ``scatter`` alongside the samples.
    """
    n_colors = len(color_map)
    return [color_map[l % n_colors] for l in labels]

def visualize_cluster(samples, centroids, labels, voronoi=True):
    """Show samples colored by cluster label together with the centroids.

    Args:
        samples: array indexed as ``samples[:, 0]`` and ``samples[:, 1]``
            (the two plotted coordinates).
        centroids: sequence of centroids; the first two entries of each
            centroid are plotted.
        labels: one integer cluster label per sample, used to pick colors
            via get_color_palette.
        voronoi: if True, overlay the Voronoi diagram of the centroids.
    """
    fig = plt.figure(figsize=(9, 6))
    plt.rc('grid', linestyle="--", color='gray', alpha=0.5)

    if voronoi:
        vor = Voronoi(np.array(centroids))
        # voronoi_plot_2d draws into the current axes and returns their figure.
        fig = voronoi_plot_2d(vor, show_vertices=False, show_points=False, ax=plt.gca())

    # Rely on get_color_palette's default palette (COLOR_MAP): the lowercase
    # name `color_map` that was passed here before is not defined in the
    # visible notebook cells and raised a NameError.
    plt.scatter(samples[:, 0], samples[:, 1], marker='x', s=10, c=get_color_palette(labels), alpha=0.2)

    for centroid in centroids:
        plt.scatter(centroid[0], centroid[1], marker='x', s=100, color='red')

    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.xlabel("$x_1$")
    plt.ylabel("$x_2$")
    plt.title("Visualization of Samples from " + str(len(centroids)) + " Clusters")
    plt.grid(True)
    plt.show()

In [None]:
# TODO: generate the data points from the three clusters and visualize the dataset (use function visualize_cluster or implement your own visualization)

# TODO: Define parameters for the normal distributions

# TODO: Generate samples from the three normal distributions

# TODO: Visualize the clusters

# Task 9.2: K-means Clustering

a) Implement the “K-means Clustering” algorithm and use it on the dataset you created in the last task. Initialize the algorithm with $K = 3$ randomly chosen prototypes first. Then repeat the experiment with $K = 5$ and $K = 10$ prototypes.

In [None]:

class kMeans:
    """Exercise skeleton for (batch) K-means clustering.

    Attributes set in __init__:
        k: number of clusters.
        max_iter: maximum number of iterations of the loop in fit.
        tol: tolerance used as stopping criterion.
        centroids: current centroids (starts as an empty list).
        results: per-iteration snapshots ``[centroids, labels]`` collected
            by fit, e.g. for animate_kmeans_results.
        wcss: within-cluster sum of squares (starts at 0.0).
    """

    def __init__(self, k=2, max_iter=1000, tol=1e-4):
        self.k = k
        self.max_iter = max_iter
        self.tol = tol # tolerance as stopping criterion
        self.centroids = []
        self.results = []
        self.wcss = 0.0

    def fit(self, X):
        """Run K-means on X (the algorithm steps are still TODO).

        Each iteration appends a snapshot ``[centroids, labels]`` to
        ``self.results`` before the (to be implemented) update steps.
        """
        # TODO: implement K-means clustering

        # TODO: initialize centroids

        for i in range(self.max_iter):
            # store intermediate results if you want to use function animate_kmeans_results
            # A deep copy is stored so that later (possibly in-place) updates
            # of self.centroids cannot change snapshots that were already
            # recorded.
            self.results.append([deepcopy(self.centroids), self.predict(X)])

            # TODO: E-Step: Assign clusters

            # TODO: M-Step: Calculate new centroids

            # TODO: Check for convergence using tolerance

            # TODO: Update centroids

    def predict(self, X):
        """Assign clusters to the data X by delegating to assign_classes."""
        # Assign clusters to new data
        return self.assign_classes(X)

    def assign_classes(self, X):
        """Assign each point in X to its closest centroid (TODO).

        Currently returns None until the steps below are implemented.
        """
        # TODO: Calculate distances to centroids
        # TODO: Assign the closest cluster to each point
        return

    def calculate_wcss(self, X):
        """Calculate the within-cluster sum of squares for X (TODO, task b).

        Currently returns None until implemented.
        """
        # TODO: task b) Calculate the within-cluster sum of squares (WCSS)
        return

In [None]:
# TODO: Run kMeans with k = 3

# TODO: visualize result (you can use visualize_cluster, animate_kmeans_results or implement your own visualization)

In [None]:
# TODO: Run kMeans with k = 5

# TODO: visualize result (you can use visualize_cluster, animate_kmeans_results or implement your own visualization)

In [None]:
# TODO: Run kMeans with k = 10

# TODO: visualize result (you can use visualize_cluster, animate_kmeans_results or implement your own visualization)

b) K-means starts from $k$ randomly initialized centroids and adjusts them iteratively until they reach an equilibrium state. Before running the algorithm, however, we have to decide how many clusters to use. The “Elbow Method” helps with this choice: for every $k \in \{1, \dots, k_{max}\}$, calculate the within-cluster sum of squares (WCSS). Then plot the WCSS values against $k$ and determine the optimal $k$.

In [None]:
# TODO: In class kMeans implement the function calculate_wcss

# TODO: Calculate the WCSS for k = [1, 10] and plot the WCSS's for all k.

c) Now implement the online variant of K-means and use it on the same dataset. Choose the prototypes as above. Again visualize the steps of the algorithm. Explain the results and the differences compared to a).

In [None]:
class OnlinekMeans(kMeans):
    """Exercise skeleton for the online variant of K-means.

    Args:
        prototypes: optional initial centroids. If given (and non-empty)
            they are stored as a float32 array in ``self.centroids``;
            otherwise ``self.centroids`` is None.
        *args, **kwargs: forwarded to kMeans.__init__.

    Attributes:
        counts: array of ``k`` zeros, one counter per centroid.
    """

    def __init__(self, prototypes=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Explicit None/empty check: `not prototypes` raises a ValueError
        # when prototypes is a NumPy array with more than one element.
        if prototypes is None or np.size(prototypes) == 0:
            self.centroids = None
        else:
            self.centroids = np.array(prototypes, dtype=np.float32)
        self.counts = np.zeros(self.k)

    def fit(self, X):
        """Run online K-means on X (the algorithm steps are still TODO).

        Iterates up to ``max_iter`` times over all samples in X; a copy of
        the centroids is taken before each pass, presumably for the
        convergence check that is still to be implemented.
        """
        # implement the online K-means algorithm

        # TODO: Randomly initialize centroids

        for _ in range(self.max_iter):
            centroids_pre = deepcopy(self.centroids)

            for x in X:
                # TODO: E-Step: Find the closest centroid

                # TODO: Update the count for the closest centroid

                # TODO: M-Step: Update centroids

                # Placeholder so that the loop has a body and the cell
                # parses; remove it once the steps above are implemented.
                pass

            # TODO: stop if change is smaller than tolerance


IndentationError: expected an indented block after 'for' statement on line 15 (<ipython-input-18-f85213fb839b>, line 24)

In [None]:
# TODO: Run online Kmeans with k = 3 and visualize result (use visualize_cluster or implement your own visualization)

d) Again, determine the optimal number of cluster centroids k for the online variant of K-means.

In [None]:
# TODO: Calculate the WCSS for k = [1, 10], plot the WCSS's for all k, and determine the optimal k.

# Task 9.3: Another Structured Dataset

a) Generate a dataset of three-dimensional vectors $(x_1, x_2, x_3)^T$. Draw 2000 samples in the following way:

For $x_3 = 0$, draw $x_1, x_2$ normally distributed with $\mu_1 = \begin{pmatrix} 0 \\ 0 \end{pmatrix}, \Sigma_1 = \begin{pmatrix} 0.5 & 0 \\ 0 & 0.5 \end{pmatrix}$ (1000 samples). The
other 1000 samples should be uniformly distributed within a cylinder whose base lies in the $x_1$-$x_2$
plane (center $(0, 0)$, radius 0.5) and whose axis runs along the $x_3$-axis from $x_3 = 0$ to $x_3 = 5$.


In [None]:
# helper functions for visualization

def visualize_3d(X, centroids=None, labels=None):
    """Show a 3D scatter plot of the samples and, optionally, the centroids.

    Args:
        X: array indexed as ``X[:, 0]``, ``X[:, 1]`` and ``X[:, 2]``
            (the three plotted coordinates).
        centroids: optional sequence of centroids; the first three entries
            of each centroid are plotted as green crosses.
        labels: optional integer cluster label per sample; if given, the
            samples are colored via get_color_palette.
    """
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    if labels is not None:
        # Rely on get_color_palette's default palette: the name `color_map`
        # that was passed here before is not defined in the visible notebook
        # cells and raised a NameError.
        ax.scatter(X[:, 0], X[:, 1], X[:, 2], alpha=0.05, c=get_color_palette(labels))
    else:
        ax.scatter(X[:, 0], X[:, 1], X[:, 2], alpha=0.05)
    if centroids is not None:
        for centroid in centroids:
            ax.scatter(centroid[0], centroid[1], centroid[2], marker='x', color='green', s=100)


    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
    ax.set_zlabel('$x_3$')
    ax.set_title('3D Dataset Visualization')
    plt.show()

In [None]:
# TODO: generate dataset

# TODO: visualize dataset

b) Use online K-means with the following initialization on your dataset

1. More than two prototypes on the $x_3$-axis;

In [None]:
# TODO: define initial prototypes, run online K-means, and visualize the results

2. More than two prototypes on the $x_3$-axis, one of those with $x_3 = 0$

In [None]:
# TODO: define initial prototypes, run online K-means, and visualize the results

3. Two prototypes with $x_3 = 0$

In [None]:
# TODO: define initial prototypes, run online K-means, and visualize the results

4. One prototype with $x_3 = 0$, one on the $x_1, x_2$ surface.

In [None]:
# TODO: define initial prototypes, run online K-means, and visualize the results

c) Again, determine the optimal number of cluster centroids $k$ for the online variant of K-means.


In [None]:
# TODO: calculate wcss and visualize results