# k-means clustering

In [41]:
from typing import Iterable, Tuple, Sequence, Dict, List
from math import fsum, sqrt
from functools import partial
from collections import defaultdict
from pprint import pprint
from random import sample

# A point is a tuple of numeric coordinates. `float` is used rather than `int`
# because centroids are built from coordinate means and therefore hold floats;
# under PEP 484 an `int` is still accepted wherever `float` is expected.
Point = Tuple[float, ...]
# A centroid has the same shape as the points it summarises.
Centroid = Point

def mean(data: Iterable[float]) -> float:
    """Return the arithmetic mean of *data*.

    The input is copied into a list first, so one-shot iterators are
    accepted. The total is accumulated with ``fsum``. An empty input
    raises ``ZeroDivisionError``.
    """
    values = [*data]
    total = fsum(values)
    return total / len(values)

def dist(p: Point, q: Point, fsum=fsum, sqrt=sqrt, zip=zip) -> float:
    """Return the Euclidean distance between two multi-dimensional points.

    Args:
        p: First point.
        q: Second point; must have as many coordinates as ``p``.
        fsum, sqrt, zip: Default to the functions of the same name; they are
            bound as defaults so the lookups inside the body are local.

    Returns:
        The square root of the sum of squared coordinate differences.

    Raises:
        ValueError: If ``p`` and ``q`` differ in dimension.
    """
    # zip() stops at the shorter argument, so without this check a dimension
    # mismatch would silently produce a distance over a truncated point.
    if len(p) != len(q):
        raise ValueError(
            f"points must have the same dimension, got {len(p)} and {len(q)}"
        )
    return sqrt(fsum([(x - y) ** 2 for x, y in zip(p, q)]))

def assign_data(centroids: Sequence[Centroid], data: Iterable[Point]) -> Dict[Centroid, List[Point]]:
    """Group every point of *data* under its nearest centroid.

    Nearness is measured with ``dist``; when several centroids are equally
    near, the one appearing first in *centroids* wins. Only centroids that
    received at least one point appear as keys in the returned dict.
    """
    clusters: Dict[Centroid, List[Point]] = {}
    for point in data:
        nearest = min(centroids, key=lambda centroid: dist(point, centroid))
        clusters.setdefault(nearest, []).append(point)
    return clusters

def compute_centroids(groups: Iterable[Sequence[Point]]) -> List[Centroid]:
    """Return one centroid per group: the coordinate-wise mean of its points."""
    centroids = []
    for members in groups:
        # Transpose the group so each column holds one coordinate axis.
        columns = zip(*members)
        centroids.append(tuple(mean(column) for column in columns))
    return centroids

def k_mean(data: Iterable[Point], k: int=2, iterations: int=50) -> List[Centroid]:
    """Cluster *data* with the k-means algorithm and return the centroids.

    Args:
        data: Points to cluster; copied into a list, so any iterable works.
        k: Number of initial centroids, drawn at random from *data* with
            ``random.sample``.
        iterations: Maximum number of assign/recompute rounds.

    Returns:
        The centroids after the last round. A centroid that attracts no
        points is not carried into the next round, so fewer than *k*
        centroids may be returned.
    """
    data = list(data)
    centroids = sample(data, k)
    for _ in range(iterations):
        labeled = assign_data(centroids, data)
        updated = compute_centroids(labeled.values())
        # Once a round leaves the centroids unchanged, every later round
        # would repeat the same assignment, so stop early.
        converged = updated == centroids
        centroids = updated
        if converged:
            break
    return centroids

In [57]:
# Sample data set: six 3-dimensional integer points.
points = [
    (10, 41, 23),
    (22, 30, 29),
    (11, 42, 5),
    (20, 32, 4),
    (12, 40, 12),
    (21, 36, 23),
]

# Cluster the sample around two centroids; the bare call is the cell's last
# expression. The initial centroids are chosen at random inside k_mean.
k_mean(points, k = 2)

[(11.0, 41.0, 13.333333333333334),
 (21.0, 32.666666666666664, 18.666666666666668)]

In [60]:
# Fit two centroids, then map each centroid to the points nearest to it
# and pretty-print that mapping.
centroids = k_mean(points, k = 2)
d = assign_data(centroids, points)
pprint(d)

{(11.0, 41.0, 13.333333333333334): [(10, 41, 23), (11, 42, 5), (12, 40, 12)],
 (21.0, 32.666666666666664, 18.666666666666668): [(22, 30, 29),
                                                  (20, 32, 4),
                                                  (21, 36, 23)]}
