In [322]:
from typing import Callable

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [323]:
# Load the sample set, labelling the two attribute columns explicitly.
df = pd.read_csv(
    "./../data.csv",
    names=["x1", "x2"],
)
df

Unnamed: 0,x1,x2
0,-0.0824,0.9435
1,0.0913,0.9575
2,0.0476,1.0683
3,0.2311,1.0452
4,0.2269,0.9615
...,...,...
96,-0.5031,1.8191
97,-0.3340,1.8515
98,-0.3200,1.9158
99,-0.2074,1.9929


In [324]:
def plot_k_means(df: pd.DataFrame, center_points: pd.DataFrame, iteration: int, metric_name: str) -> None:
    """Save a scatter plot of the grouped samples and the group centers to a PNG file.

    Args:
        df: Samples with columns ``x1``, ``x2`` and ``grupa`` (group label of each sample).
        center_points: Group centers with columns ``x1`` and ``x2``.
        iteration: Iteration number, shown in the title and embedded in the file name.
        metric_name: Name of the distance measure, embedded in the file name.

    The image is written to ``k_means{iteration}_{metric_name}.png`` in the current
    working directory.
    """
    # Draw on a dedicated figure instead of the implicit global pyplot figure, so the
    # plot cannot be affected by (or leak into) other figures.
    fig, ax = plt.subplots()
    try:
        # One scatter series per group, in sorted label order for a stable legend.
        for grupa in sorted(df["grupa"].unique()):
            subset = df[df["grupa"] == grupa]
            ax.scatter(subset["x1"], subset["x2"], label=f"Grupa {grupa}")

        ax.scatter(center_points["x1"], center_points["x2"], c="red", s=100, marker="X", label='Środek grupy')

        ax.set_xlabel("x1")
        ax.set_ylabel("x2")
        ax.legend(loc='upper right')
        ax.set_title(f"Iteracja {iteration}")
        fig.savefig(f"k_means{iteration}_{metric_name}.png")
    finally:
        # Close the figure even if saving fails; otherwise every call leaves an open
        # figure behind (and an empty figure is rendered as cell output).
        plt.close(fig)

In [325]:
def k_means(
        m: int,
        iters: int,
        df: pd.DataFrame,
        miara: Callable,
        plot_iters: list | None = None,
        position_iters: list | None = None,
) -> None:
    data = df.copy()
    # 1. Wybierz losowo m różnych próbek i uznaj je jako środki grup (V)
    initial_centers = data.sample(m).reset_index(drop=True)
    centers = {
        i: (r['x1'], r['x2']) for i, r in initial_centers.iterrows()
    }
    data["grupa"] = None
    data["center_x1"] = None
    data["center_x2"] = None

    # 2. Pętla wykonywana zadaną liczbę iteracji (iters
    for iter in range(iters):

        # 2.1. Pętla po wszystkich M próbkach, s to indeks aktualnej próbki
        for s, row in data.iterrows():
            distances = {}

            # 2.1.1. Wylicz odległości między próbką s a każdym środkiem grupy (V)
            for center_id, coords in centers.items():
                distance = miara((row["x1"], row["x2"]), coords)
                distances[center_id] = distance

            # 2.1.2. Wyznacz us równy indeksowi najbliższego środka grupy
            us = min(distances, key=distances.get)
            data.loc[s, "grupa"] = us

        # 2.2. Pętla po wszystkich m grupach, j to indeks aktualnej grupy
        new_centers = {}
        group_ids = range(m)
        for j in group_ids:

            # 2.2.1. Wybierz próbki, należące do tej grupy (zbiór próbek o indeksach s, takich, że us == j), niech zbiór ten nazywa się Xgr
            is_in_group = data['grupa'] == j
            Xgr = data[is_in_group]

            # 2.2.2. Jeśli zbiór Xgr jest pusty, wtedy pomiń wykonanie dalszej części tej pętli.
            if Xgr.empty:
                new_centers[j] = centers[j]
                continue

            # 2.2.3. Pętla po wszystkich atrybutach, i to index poszczególnego atrybutu
            current_new_center = []
            for attribute in ["x1", "x2"]:
                # 2.2.3.1 Wartość i-tego atrybutu grupy j-tej to średnia wartość atrybutu i-tego wszystkich próbek Xgr
                mean_value = Xgr[attribute].mean()
                df.loc[is_in_group, f"center_{attribute}"] = mean_value
                current_new_center.append(mean_value)

            new_centers[j] = tuple(current_new_center)
        centers = new_centers

        if position_iters and iter + 1 in position_iters:
            print(f"{iter + 1}")
            for g_id, (c1, c2) in centers.items():
                is_in_group = data['grupa'] == g_id
                data.loc[is_in_group, 'center_x1'] = c1
                data.loc[is_in_group, 'center_x2'] = c2
            report = data.groupby('grupa').agg(
                center_x1=('center_x1', 'first'),
                center_x2=('center_x2', 'first'),
                liczebność=('x1', 'count'),
                min_x1=('x1', 'min'),
                max_x1=('x1', 'max'),
                min_x2=('x2', 'min'),
                max_x2=('x2', 'max')
            ).reset_index()
            report.set_index('grupa', inplace=True)
            report.to_excel(f"k_means{iter + 1}_{miara.__name__}.xlsx")
            print(report)

        if plot_iters and iter + 1 in plot_iters:
            centers_df = pd.DataFrame.from_dict(
                centers,
                orient='index',
                columns=['x1', 'x2']
            )
            centers_df.index.name = 'grupa'
            plot_k_means(data, centers_df, iter + 1, miara.__name__)


In [326]:
def miara_euklidesowa(pa: tuple, pb: tuple):
    """Return the Euclidean distance between two points, using their first two coordinates."""
    delta_x1 = pa[0] - pb[0]
    delta_x2 = pa[1] - pb[1]
    squared_distance = delta_x1 ** 2 + delta_x2 ** 2
    return np.sqrt(squared_distance)


def miara_custom(pa: tuple, pb: tuple):
    """Return the absolute difference of the first coordinates; the second is ignored."""
    first_coordinate_gap = pa[0] - pb[0]
    return abs(first_coordinate_gap)

# Raport 1

In [327]:
# Report 1: three groups, Euclidean measure; plot and report after iterations 4 and 10.
m, iters = 3, 10

k_means(
    m=m,
    iters=iters,
    df=df,
    miara=miara_euklidesowa,
    plot_iters=[4, 10],
    position_iters=[4, 10],
)

4
      center_x1 center_x2  liczebność  min_x1  max_x1  min_x2  max_x2
grupa                                                                
0     -1.255043 -0.691483          30 -1.7790 -0.1725 -1.5277  0.5807
1      0.880788 -0.389198          42 -0.1226  1.3375 -1.5564  0.8573
2     -0.541883  1.336572          29 -1.6673  0.5113  0.7270  1.9929
10
      center_x1 center_x2  liczebność  min_x1  max_x1  min_x2  max_x2
grupa                                                                
0     -1.126397 -0.874121          29 -1.7790 -0.0566 -1.5564  0.2806
1      0.929307 -0.333402          40  0.1669  1.3375 -1.4508  0.8573
2     -0.652487  1.261106          32 -1.7544  0.5113  0.4760  1.9929


<Figure size 640x480 with 0 Axes>

# Raport 2

In [328]:
# Report 2: four groups, custom (x1-only) measure; plot and report after iteration 10.
m, iters = 4, 10

k_means(
    m=m,
    iters=iters,
    df=df,
    miara=miara_custom,
    plot_iters=[10],
    position_iters=[10],
)

10
      center_x1 center_x2  liczebność  min_x1  max_x1  min_x2  max_x2
grupa                                                                
0      0.327768 -0.031555          22 -0.0824  0.6782 -1.4538  1.9473
1     -0.572389 -0.059858          19 -0.9805 -0.1226 -1.5564  1.9929
2     -1.508873  0.191393          30 -1.7790 -1.0784 -1.2181  1.6521
3      1.085263 -0.074683          30  0.7825  1.3375 -1.1675  0.8573


<Figure size 640x480 with 0 Axes>