In [1]:
from sklearn import metrics
from sklearn.cluster import KMeans
from fast_ivf.kmeans import MiniBatchKMeans
from sklearn.datasets import make_blobs

# External clustering-quality scores from sklearn.metrics. Each one is later
# called as score(true_labels, predicted_labels) to compare the two models.
clustering_metrics = [
    getattr(metrics, metric_name)
    for metric_name in (
        "homogeneity_score",
        "completeness_score",
        "v_measure_score",
        "adjusted_rand_score",
        "adjusted_mutual_info_score",
    )
]

In [2]:


# Synthetic benchmark data: Gaussian blobs with known generating labels.
# NOTE: `centers` is not passed, so make_blobs falls back to its default of
# 3 blobs for an integer n_samples; `y` therefore holds 3 distinct labels
# while the models fitted below use 10 clusters.
n_samples, n_features, random_state = 200_000, 128, 170

X, y = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    random_state=random_state,
)
X.shape, y.shape

((200000, 128), (200000,))

In [3]:
%%time

kmeans = KMeans(init="random", n_clusters=10, n_init=4, random_state=0)
kmeans.fit(X);

CPU times: user 15.6 s, sys: 57.5 ms, total: 15.7 s
Wall time: 2.83 s


In [4]:
# Score the scikit-learn cluster assignments against the generating labels `y`.
[
    (score_fn.__name__, score_fn(y, kmeans.labels_))
    for score_fn in clustering_metrics
]

[('homogeneity_score', 1.0000000000000009),
 ('completeness_score', 0.4790918206636857),
 ('v_measure_score', 0.647818903425092),
 ('adjusted_rand_score', 0.3697403582013092),
 ('adjusted_mutual_info_score', 0.6478095576464267)]

In [5]:
# Sanity check: cluster ids predicted by the fitted model for the first 10 rows of X.
kmeans.predict(X[:10])

array([8, 7, 9, 4, 2, 5, 2, 7, 4, 2], dtype=int32)

In [7]:
%%time

kmeans = MiniBatchKMeans(num_centroids=10, batch_size=64, tol=0.00005)
kmeans.train(X)
kmeans.add(X)

Convergence delta = 0.00005:  26%|██▋       | 1318/5000 [00:00<00:01, 2595.99it/s]
Assigning: 100%|██████████| 3125/3125 [00:00<00:00, 81092.85it/s]


CPU times: user 580 ms, sys: 73.8 ms, total: 654 ms
Wall time: 577 ms


In [8]:
# Score the fast_ivf cluster assignments against the generating labels `y`.
# NOTE(review): `_labels` is a private attribute — prefer a public accessor
# if fast_ivf exposes one.
[
    (score_fn.__name__, score_fn(y, kmeans._labels))
    for score_fn in clustering_metrics
]

[('homogeneity_score', 1.0000000000000007),
 ('completeness_score', 0.49261411860441723),
 ('v_measure_score', 0.660068952134806),
 ('adjusted_rand_score', 0.4130456102918084),
 ('adjusted_mutual_info_score', 0.6600597608067874)]

In [9]:
# Sanity check: centroid ids predicted by the fast_ivf model for the first 10 rows of X.
# Cluster ids are arbitrary per model, so they need not match the scikit-learn ids above.
kmeans.predict(X[:10])

array([3, 7, 7, 5, 4, 6, 4, 3, 6, 0])