In [5]:
from time import time
from numba import jit, njit, prange, float32
import numpy as np
from joblib import Memory
from sklearn.cluster.k_means_ import _k_init
from sklearn.datasets import make_blobs
from sklearn.utils import check_random_state

In [6]:
# Disk-backed memoisation so re-running the notebook does not regenerate the
# blobs or redo the k-means++ seeding.
m = Memory(location='/tmp/joblib')
make_blobs = m.cache(make_blobs)
_k_init = m.cache(_k_init)
n_clusters = 1000
seed = 42
# Kept for backward compatibility with cells that may still reference `rng`.
rng = check_random_state(seed)
kmeanspp_size = int(1e4)

# Do NOT share one RandomState instance between the two cached calls: a cache
# hit on make_blobs skips the call and leaves the generator un-advanced, so
# the state handed to _k_init (and therefore its cache key and its result)
# would depend on whether the cache happened to be warm. An integer seed
# hashes identically on every run and yields the same blobs as a fresh
# RandomState(seed).
data, true_labels = make_blobs(n_samples=int(1e5), centers=100,
                               n_features=100, cluster_std=30,
                               random_state=seed)
data = data.astype(np.float32)

# k-means++ seeding only looks at the first `kmeanspp_size` samples.
kmeanspp_data = data[:kmeanspp_size]
data_squared_norms = np.sum(kmeanspp_data * kmeanspp_data, axis=1)
# A fresh generator in a known state keeps the seeding reproducible and
# cache-friendly regardless of what ran before.
centroids = _k_init(kmeanspp_data, n_clusters,
                    data_squared_norms, check_random_state(seed))

In [3]:
@njit('void(f4[:, ::1], f4[:, ::1], f4[:, ::1], u4[::1])',
      locals={'best_dist': float32, 'dist': float32},
      fastmath=True,
      # parallel=True,  # left disabled: turning the sample loop into a prange
      #                 # would make the += accumulations below race.
)
def kmeans_kernel(data, centroids, centroids_sum, centroids_pop):
    """Assign each sample to its nearest centroid and accumulate statistics.

    For every row of ``data`` the squared Euclidean distance to every row of
    ``centroids`` is computed; the sample is then added to the winning row of
    ``centroids_sum`` and the matching entry of ``centroids_pop`` is
    incremented. Both accumulators are updated in place and are not reset.

    Parameters
    ----------
    data : float32, C-contiguous, 2-D
        Samples to assign, one per row.
    centroids : float32, C-contiguous, 2-D
        Candidate centroids, one per row; indexed with the feature count of
        ``data``.
    centroids_sum : float32, C-contiguous, 2-D
        Per-centroid running sum of assigned samples (modified in place).
    centroids_pop : uint32, C-contiguous, 1-D
        Per-centroid count of assigned samples (modified in place).

    Notes
    -----
    Ties go to the lowest centroid index. With no centroids the function is a
    no-op.
    """
    n_samples, n_features = data.shape
    n_centroids = centroids.shape[0]
    if n_centroids == 0:
        # Nothing to assign to; indexing row 0 would be out of bounds.
        return
    for i in range(n_samples):
        # The running minimum is seeded from the first centroid (j == 0)
        # instead of a magic upper bound: a fixed sentinel mis-assigns every
        # sample whose distances all exceed it, and an infinity is not safe
        # to compare against under fastmath.
        best_dist = 0.
        best_j = 0
        for j in range(n_centroids):
            dist = 0.
            for k in range(n_features):
                dist += (data[i, k] - centroids[j, k]) ** 2
            if j == 0 or dist < best_dist:
                best_dist = dist
                best_j = j
        for k in range(n_features):
            centroids_sum[best_j, k] += data[i, k]
        centroids_pop[best_j] += 1

In [4]:
# Inputs for the Numba benchmark: the first 1000 samples plus zeroed
# accumulators (per-centroid feature sums and per-centroid sample counts).
data_chunk = data[0:1000]
centroids_sum = np.zeros_like(centroids)
centroids_pop = np.zeros(len(centroids), dtype=np.uint32)

In [5]:
%%timeit
# Times the Numba kernel on `data_chunk`. The accumulators are allocated once
# outside the timed body and are not reset between timing loops, so
# centroids_sum / centroids_pop keep growing across repetitions.
kmeans_kernel(data_chunk, centroids, centroids_sum, centroids_pop)

11.8 ms ± 22 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [1]:
# Register the %%cython cell magic used to compile the Cython kernels.
%load_ext cython

In [2]:
import os
import shutil

# Build the Cython extensions with gcc-8 when it is installed. If it is not
# on PATH, leave CC untouched so the build falls back to the default
# toolchain instead of failing on a missing compiler.
if shutil.which('gcc-8') is not None:
    os.environ["CC"] = 'gcc-8'

In [7]:
%%cython -c=-Ofast -c=-march=native -f
#cython: initializedcheck=False, nonecheck=False, boundscheck=False, wraparound=False, cdivision=True, overflowcheck=False, overflowcheck.fold=False

cimport numpy as np
cimport cython

# Fused floating-point type: every function in this cell that uses it is
# specialised for both float32 and float64.
ctypedef fused floating_t:
    np.float32_t
    np.float64_t

# numpy array
cpdef void kmeans_chunk_np(np.ndarray[floating_t, ndim=2, mode='c'] X_chunk,
                           np.ndarray[floating_t, ndim=2, mode='c'] C,
                           np.ndarray[floating_t, ndim=2, mode='c'] sums,
                           np.ndarray[np.int32_t, ndim=1, mode='c'] pops):
    """Nearest-centroid accumulation using typed NumPy buffer arguments.

    Each row of ``X_chunk`` is assigned to the row of ``C`` with the smallest
    squared Euclidean distance (ties go to the lowest index). The sample is
    added to that row of ``sums`` and the matching entry of ``pops`` is
    incremented, both in place.

    The cell is compiled with ``boundscheck=False`` and ``wraparound=False``,
    so the caller must pass arrays whose shapes agree: ``X_chunk`` and
    ``sums`` are indexed with ``C.shape[1]`` columns, and ``sums`` / ``pops``
    with up to ``C.shape[0]`` rows / entries.

    With zero clusters the function returns without touching the outputs.
    """
    cdef:
        Py_ssize_t n_samples_chunk = X_chunk.shape[0]
        Py_ssize_t n_clusters = C.shape[0]
        Py_ssize_t n_features = C.shape[1]

        floating_t x, sq_dist, min_sq_dist = 0.0
        Py_ssize_t best_cluster = 0

        Py_ssize_t si, ci, fi

    if n_clusters == 0:
        # No centroid to assign to; any index into sums/pops would be invalid.
        return

    for si in range(n_samples_chunk):
        for ci in range(n_clusters):
            sq_dist = 0.0
            for fi in range(n_features):
                x = X_chunk[si, fi] - C[ci, fi]
                sq_dist += x * x
            # Seed the minimum from the first cluster rather than a fixed
            # sentinel: with a sentinel, a sample farther than it from every
            # centroid kept best_cluster == -1, an out-of-bounds write because
            # wraparound and bounds checks are disabled. (An infinity is not
            # an option either: -Ofast assumes finite math.)
            if ci == 0 or sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci

        pops[best_cluster] += 1
        for fi in range(n_features):
            sums[best_cluster, fi] += X_chunk[si, fi]

# pointer   
# Raw-pointer kernel: X_chunk, C and sums are flat row-major buffers with
# n_features values per row; pops has one counter per cluster. Each sample is
# assigned to the cluster with the smallest squared Euclidean distance (ties
# go to the lowest index), then added to that cluster's row of `sums` and
# counted in `pops`. No bounds are checked: the caller guarantees the buffer
# sizes. With n_clusters <= 0 the function does nothing.
cdef void kmeans_chunk_ptr(floating_t *X_chunk,
                           floating_t *C,
                           floating_t *sums,
                           np.int32_t *pops,
                           Py_ssize_t n_samples_chunk,
                           Py_ssize_t n_clusters,
                           Py_ssize_t n_features) nogil:
    cdef:
        floating_t x, sq_dist, min_sq_dist = 0.0
        # Py_ssize_t (not int32) so the flat offsets below never go through a
        # narrowed index.
        Py_ssize_t best_cluster = 0

        Py_ssize_t si, ci, fi
        Py_ssize_t x_row, c_row, s_row

    if n_clusters <= 0:
        # No centroid to assign to; any write to sums/pops would be invalid.
        return

    for si in range(n_samples_chunk):
        x_row = si * n_features
        for ci in range(n_clusters):
            c_row = ci * n_features
            sq_dist = 0.0
            for fi in range(n_features):
                x = X_chunk[x_row + fi] - C[c_row + fi]
                sq_dist += x * x
            # Seed the minimum from the first cluster instead of a fixed
            # sentinel: with a sentinel, a sample farther than it from every
            # centroid kept best_cluster == -1 and wrote before the start of
            # pops/sums. (-Ofast assumes finite math, so an infinity sentinel
            # is not safe either.)
            if ci == 0 or sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci

        pops[best_cluster] += 1
        s_row = best_cluster * n_features
        for fi in range(n_features):
            sums[s_row + fi] += X_chunk[x_row + fi]
        

cpdef kmeans_chunk_ptrw(floating_t[:, ::1] X_chunk,
                        floating_t[:, ::1] C,
                        floating_t[:, ::1] sums,
                        np.int32_t[::1] pops):
    """Python-callable wrapper around the raw-pointer kernel.

    Extracts the shapes from the C-contiguous memoryviews and forwards
    pointers to their first elements to ``kmeans_chunk_ptr``, which updates
    ``sums`` and ``pops`` in place.

    Raises
    ------
    ValueError
        If the shapes are inconsistent. The kernel works on raw pointers
        without any bounds checking, so a mismatch would otherwise read or
        write outside the buffers.
    """
    cdef:
        Py_ssize_t n_samples_chunk = X_chunk.shape[0]
        Py_ssize_t n_features = X_chunk.shape[1]
        Py_ssize_t n_clusters = C.shape[0]

    if C.shape[1] != n_features:
        raise ValueError("C and X_chunk must have the same number of features")
    # The kernel addresses `sums` as flat rows of n_features values.
    if sums.shape[1] != n_features or sums.shape[0] < n_clusters:
        raise ValueError("sums must have one row of n_features values per cluster")
    if pops.shape[0] < n_clusters:
        raise ValueError("pops must have one entry per cluster")

    if n_samples_chunk == 0 or n_clusters == 0:
        # Nothing to assign (or nothing to assign to): leave outputs untouched.
        return

    kmeans_chunk_ptr(&X_chunk[0, 0], &C[0, 0], &sums[0, 0], &pops[0],
                     n_samples_chunk, n_clusters, n_features)
    
    
# memoryview          
cpdef void kmeans_chunk_mv(floating_t[:, ::1] X_chunk,
                           floating_t[:, ::1] C,
                           floating_t[:, ::1] sums,
                           np.int32_t[::1] pops) nogil:
    """Nearest-centroid accumulation using typed memoryviews.

    Each row of ``X_chunk`` is assigned to the row of ``C`` with the smallest
    squared Euclidean distance (ties go to the lowest index). The sample is
    added to that row of ``sums`` and the matching entry of ``pops`` is
    incremented, both in place.

    The cell is compiled with ``boundscheck=False`` and ``wraparound=False``,
    so the caller must pass views whose shapes agree: ``C`` and ``sums`` are
    indexed with ``X_chunk.shape[1]`` columns, and ``sums`` / ``pops`` with up
    to ``C.shape[0]`` rows / entries.

    With zero clusters the function returns without touching the outputs.
    """
    cdef:
        Py_ssize_t n_samples_chunk = X_chunk.shape[0]
        Py_ssize_t n_clusters = C.shape[0]
        Py_ssize_t n_features = X_chunk.shape[1]

        floating_t x, sq_dist, min_sq_dist = 0.0
        Py_ssize_t best_cluster = 0

        Py_ssize_t si, ci, fi

    if n_clusters == 0:
        # No centroid to assign to; any index into sums/pops would be invalid.
        return

    for si in range(n_samples_chunk):
        for ci in range(n_clusters):
            sq_dist = 0.0
            for fi in range(n_features):
                x = X_chunk[si, fi] - C[ci, fi]
                sq_dist += x * x
            # Seed the minimum from the first cluster rather than a fixed
            # sentinel: with a sentinel, a sample farther than it from every
            # centroid kept best_cluster == -1, an out-of-bounds write because
            # wraparound and bounds checks are disabled. (An infinity is not
            # an option either: -Ofast assumes finite math.)
            if ci == 0 or sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci

        pops[best_cluster] += 1
        for fi in range(n_features):
            sums[best_cluster, fi] += X_chunk[si, fi]

In [8]:
%%timeit
data_chunk = data[:1000]
centroids_sum = np.zeros_like(centroids)
centroids_pop = np.zeros(centroids.shape[0], dtype=np.int32)
kmeans_chunk_np(data_chunk,
                centroids,
                centroids_sum,
                centroids_pop)

15.2 ms ± 189 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [36]:
%%timeit
data_chunk = data[:10]
centroids_sum = np.zeros_like(centroids)
centroids_pop = np.zeros(centroids.shape[0], dtype=np.int32)
kmeans_chunk_ptrw(data_chunk,
                  centroids,
                  centroids_sum,
                  centroids_pop)

168 µs ± 3.43 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [37]:
%%timeit
data_chunk = data[:10]
centroids_sum = np.zeros_like(centroids)
centroids_pop = np.zeros(centroids.shape[0], dtype=np.int32)
kmeans_chunk_mv(data_chunk,
                centroids,
                centroids_sum,
                centroids_pop)

172 µs ± 837 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
