In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
%matplotlib inline

In [None]:
K = 5       # number of blobs to generate; also the cluster count requested from k-means below
SEED = 0    # fixed so that Restart Kernel -> Run All regenerates exactly the same data
# Seed NumPy's global generator too: the np.random.choice calls in later cells draw from it.
np.random.seed(SEED)
# K*5 two-dimensional sample points scattered around K randomly placed blob centres.
x, _ = make_blobs(n_samples=K*5, n_features=2, centers=K, cluster_std=2,
                  center_box=(-50, 30), random_state=SEED)
plt.scatter(x[:, 0], x[:, 1], s=50, alpha=0.5)

In [None]:
def distance(x, y):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    x, y : array-like of numbers
        Coordinates of the two points. Both must have the same length.
        Any number of components is supported (not only two).

    Returns
    -------
    numpy.float64
        sqrt(sum((x_i - y_i) ** 2)) over all components.
    """
    # asarray lets plain lists/tuples be passed as well as numpy rows
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

In [None]:
# For every sample, report min / max / mean distance to the other samples.
# Rows whose coordinates are all equal to the current sample (the sample itself
# and any exact duplicates) are left out of the statistics.
for i, p in enumerate(x):
    others = [q for q in x if np.any(p != q)]
    dist = np.array([distance(p, q) for q in others])
    stats = (i, dist.min(), dist.max(), dist.mean())
    print('%3d：min=%.5f：max=%.5f：mean=%.5f' % stats)

### k-means

In [None]:
def k_means(x, K, centroids=None, epochs=10):
    """Cluster the rows of ``x`` into ``K`` groups with Lloyd's k-means iteration.

    Parameters
    ----------
    x : array-like, shape (N, D)
        Sample points, one per row.
    K : int
        Number of clusters.
    centroids : array-like, shape (K, D), optional
        Initial centroids. When omitted, ``K`` distinct rows of ``x`` are drawn
        at random (via ``np.random.choice``). The passed array is not modified.
    epochs : int
        Maximum number of assign/update iterations; the loop stops earlier as
        soon as an iteration leaves every centroid unchanged.

    Returns
    -------
    clusters : numpy.ndarray, shape (N,)
        Index of the centroid assigned to each sample.
    centroids : numpy.ndarray, shape (K, D)
        Final centroid positions.
    """
    x = np.asarray(x, dtype=float)
    N = x.shape[0]
    if centroids is None:
        centroids = x[np.random.choice(N, K, replace=False)]
    # Private float copy: updated in place below without touching the caller's array.
    centroids = np.array(centroids, dtype=float)
    clusters = np.zeros(N, dtype=int)
    for _ in range(epochs):
        previous = centroids.copy()
        # Assign each sample to its closest centroid (squared distance is enough for argmin).
        sq_dist = ((x[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
        clusters = sq_dist.argmin(axis=1)
        # Move every centroid to the mean of its members.
        for k in range(K):
            members = x[clusters == k]
            # An empty cluster has no mean (it would be NaN, and argmin would then
            # pick that NaN column for every sample), so its centroid stays put.
            if len(members) > 0:
                centroids[k] = members.mean(axis=0)
        if np.array_equal(previous, centroids):
            break
    return clusters, centroids

In [None]:
# Hand-written k-means, random starting centroids, at most 100 iterations.
clusters, centroids = k_means(x, K, centroids=None, epochs=100)
# Samples coloured by assigned cluster ...
plt.scatter(x[:, 0], x[:, 1], s=50, alpha=0.5, c=clusters)
# ... with the final centroids overlaid as small red dots.
plt.scatter(centroids[:, 0], centroids[:, 1], s=10, c='r')

### sklearn版k-means

In [None]:
from sklearn.cluster import KMeans
# scikit-learn k-means with purely random initial centroids (best of 10 restarts).
km = KMeans(n_clusters=K, init='random', n_init=10, max_iter=x.size, tol=1e-4, random_state=0)
# fit_predict returns one cluster label per sample (not the centroids),
# so it has to feed the colour argument of the scatter plot.
clusters = km.fit_predict(x)
# The fitted centroid coordinates are exposed separately.
centroids = km.cluster_centers_
plt.scatter(x[:, 0], x[:, 1], c=clusters, s=50, alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', s=10)

### k-means++

In [None]:
def k_means_pp(x, K, epochs=10):
    """Run k-means starting from k-means++ seeded centroids.

    The first centroid is a uniformly random sample. Every further centroid is
    a sample drawn with probability proportional to its squared distance to the
    *nearest* centroid chosen so far, which spreads the seeds across the data.
    The seeds are then handed to ``k_means``.

    Parameters
    ----------
    x : array-like, shape (N, D)
        Sample points, one per row.
    K : int
        Number of clusters.
    epochs : int
        Maximum number of k-means iterations (passed through to ``k_means``).

    Returns
    -------
    Whatever ``k_means(x, K, centroids, epochs)`` returns
    (cluster index per sample, final centroids).
    """
    x = np.asarray(x, dtype=float)
    N, D = x.shape
    centroids = np.zeros((K, D))
    # Squared distance from each sample to its closest already-chosen centroid.
    nearest_sq = np.full(N, np.inf)
    for k in range(K):
        if k == 0:
            weights = None  # first centroid: uniform draw
        else:
            total = nearest_sq.sum()
            # All samples coincide with a chosen centroid -> nothing to weight by,
            # fall back to a uniform draw instead of dividing by zero.
            weights = nearest_sq / total if total > 0 else None
        centroids[k] = x[np.random.choice(N, p=weights)]
        # Keep the minimum over all chosen centroids, not just the latest one.
        nearest_sq = np.minimum(nearest_sq, np.sum((x - centroids[k]) ** 2, axis=1))
    return k_means(x, K, centroids, epochs)

In [None]:
# k-means with k-means++ seeding; x.size is used as the iteration cap.
clusters, centroids = k_means_pp(x, K, epochs=x.size)
# Samples coloured by assigned cluster ...
plt.scatter(x[:, 0], x[:, 1], s=50, alpha=0.5, c=clusters)
# ... with the final centroids overlaid as small red dots.
plt.scatter(centroids[:, 0], centroids[:, 1], s=10, c='r')

### sklearn版k-means++

In [None]:
from sklearn.cluster import KMeans
# scikit-learn k-means with k-means++ initialisation (best of 10 restarts).
km = KMeans(n_clusters=K, init='k-means++', n_init=10, max_iter=x.size, tol=1e-4, random_state=0)
# fit_predict returns one cluster label per sample (not the centroids),
# so it has to feed the colour argument of the scatter plot.
clusters = km.fit_predict(x)
# The fitted centroid coordinates are exposed separately.
centroids = km.cluster_centers_
plt.scatter(x[:, 0], x[:, 1], c=clusters, s=50, alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', s=10)

### DBSCAN

In [None]:
def dbscan(pts, f, eps=10.0, min_pts=3):
    """Density-based clustering (DBSCAN) of the rows of ``pts``.

    Parameters
    ----------
    pts : numpy.ndarray, shape (N, D)
        Sample points, one per row.
    f : callable
        ``f(a, b)`` returns the distance between two points.
    eps : float
        Neighbourhood radius. A neighbour of ``p`` is any point ``q`` with
        ``0 < f(p, q) < eps``; the point itself and exact duplicates
        (distance 0) are therefore not counted.
    min_pts : int
        Minimum number of neighbours a point needs to be a core point,
        i.e. to start or extend a cluster.

    Returns
    -------
    numpy.ndarray, shape (N,), float
        Cluster id (0, 1, 2, ...) of every point, or -1 for points that
        ended up in no cluster (noise).
    """
    def neighbours(center):
        # Indices of all points strictly inside the eps-ball around ``center``.
        return [i for i, q in enumerate(pts) if 0 < f(center, q) < eps]

    visited = set()
    noise = set()
    # Start everything as noise (-1) so unclustered points cannot be mistaken
    # for members of cluster 0.
    clusters = np.full(pts.shape[0], -1.0)
    cid = 0
    for pid, P in enumerate(pts):
        if pid in visited:
            continue
        visited.add(pid)
        seeds = neighbours(P)
        if len(seeds) < min_pts:
            # Not dense enough to start a cluster; may still be claimed later
            # as a border point of another cluster.
            noise.add(pid)
            continue
        cluster = [pid]
        while seeds:
            qid = seeds.pop()
            if qid not in visited:
                visited.add(qid)
                # Look the point up in ``pts`` (the data actually passed in),
                # not in some global array.
                reachable = neighbours(pts[qid])
                # Only core points propagate the cluster further.
                if len(reachable) >= min_pts:
                    seeds += reachable
                cluster.append(qid)
            elif qid in noise:
                # Previously rejected point lies within reach of this cluster:
                # it becomes a border point.
                noise.remove(qid)
                cluster.append(qid)
        clusters[cluster] = cid
        cid += 1
    return clusters

In [None]:
# DBSCAN with the Euclidean distance defined above: radius 6.0, at least 3 neighbours.
clusters = dbscan(x, distance, eps=6.0, min_pts=3)
# Samples coloured by the label DBSCAN gave them.
plt.scatter(x[:, 0], x[:, 1], s=50, alpha=0.5, c=clusters)