In [1]:
from time import time
from numba import jit, njit, prange, float32
import numpy as np
from joblib import Memory
from sklearn.cluster.k_means_ import _k_init
from sklearn.datasets import make_blobs
from sklearn.utils import check_random_state

In [66]:
# Memoize the two expensive steps (blob generation and centroid seeding)
# on disk so that re-running this cell is cheap.
m = Memory(location='/tmp/joblib')
make_blobs = m.cache(make_blobs)
_k_init = m.cache(_k_init)

# Benchmark configuration: number of centroids to seed, and how many
# leading samples the seeding step gets to look at.
n_clusters = 1000
kmeanspp_size = int(1e4)
rng = check_random_state(42)

# 1e5 samples with 100 features drawn around 100 blob centers, stored as
# float32 because every kernel below works in single precision.
data, true_labels = make_blobs(
    n_samples=int(1e5), centers=100, n_features=100, cluster_std=30,
    random_state=rng,
)
data = data.astype(np.float32)

# Seed the centroids from the first `kmeanspp_size` samples only;
# `_k_init` takes the squared row norms of that subset as an argument.
kmeanspp_sample = data[:kmeanspp_size]
data_squared_norms = (kmeanspp_sample * kmeanspp_sample).sum(axis=1)
centroids = _k_init(kmeanspp_sample, n_clusters, data_squared_norms, rng)

In [3]:
# NOTE(review): `parallel=True` is intentionally not enabled. Different
# samples can update the same row of `centroids_sum`, so parallelising the
# sample loop would presumably need per-thread accumulators first.
@njit('void(f4[:, ::1], f4[:, ::1], f4[:, ::1], u4[::1])',
      locals={'best_dist': float32, 'dist': float32},
      fastmath=True,
)
def kmeans_kernel(data, centroids, centroids_sum, centroids_pop):
    """Assign each sample to its nearest centroid and accumulate statistics.

    For every row of ``data`` the centroid with the smallest squared
    Euclidean distance is selected (the first one wins on ties). The row is
    then added to that centroid's row of ``centroids_sum`` and the matching
    entry of ``centroids_pop`` is incremented. Both output arrays are
    updated in place and are NOT reset here.

    Parameters
    ----------
    data : float32 C-contiguous array, shape (n_samples, n_features)
    centroids : float32 C-contiguous array, shape (n_centroids, n_features)
    centroids_sum : float32 C-contiguous array, same shape as ``centroids``
    centroids_pop : uint32 array, shape (n_centroids,)
    """
    n_samples, n_features = data.shape
    n_centroids = centroids.shape[0]
    if n_centroids == 0:
        # Nothing to assign to; avoids writing to a non-existent row 0.
        return
    for i in range(n_samples):
        # The first centroid seeds the running minimum. This avoids a magic
        # "large enough" sentinel (which silently breaks for widely spread
        # data) and avoids infinities, which fastmath may assume away.
        best_dist = 0.
        best_j = 0
        for j in range(n_centroids):
            dist = 0.
            for k in range(n_features):
                dist += (data[i, k] - centroids[j, k]) ** 2
            if j == 0 or dist < best_dist:
                best_dist = dist
                best_j = j
        for k in range(n_features):
            centroids_sum[best_j, k] += data[i, k]
        centroids_pop[best_j] += 1

In [69]:
# Inputs and output buffers for the Numba kernel benchmark: a view on the
# first 1000 samples, a zeroed sum buffer shaped like `centroids`, and one
# uint32 counter per centroid (uint32 matches the kernel's `u4[::1]`).
data_chunk = data[0:1000]
centroids_sum = np.zeros_like(centroids)
centroids_pop = np.zeros(len(centroids), dtype=np.uint32)

In [70]:
%%timeit
# Times the Numba kernel on `data_chunk`. The output buffers are not reset
# between timing loops, so `centroids_sum` / `centroids_pop` keep
# accumulating and should not be read as the result of a single pass.
kmeans_kernel(data_chunk, centroids, centroids_sum, centroids_pop)

12.5 ms ± 587 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
%load_ext cython

In [13]:
import os
# Sets the CC environment variable to gcc-8 for the rest of the kernel
# session. NOTE(review): presumably this selects the compiler used by the
# %%cython build; the value is machine-specific -- confirm gcc-8 exists
# on the target system before running.
os.environ["CC"] = 'gcc-8'

In [82]:
%%cython -c=-Ofast -c=-mavx2 -c=-fprefetch-loop-arrays -f
cimport numpy as np
cimport cython
import array
from cpython cimport array
from libc.string cimport memcpy

# Fused type covering single- and double-precision floats.
# NOTE(review): none of the functions visible in this cell reference it;
# they are all written against float32 directly.
ctypedef fused floating_t:
    np.float32_t
    np.float64_t

@cython.cdivision(True)
@cython.initializedcheck(False)
@cython.nonecheck(False)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void kmeans_chunk_np(np.ndarray[float, ndim=2] X_chunk,
                           np.ndarray[float, ndim=2] C,
                           np.ndarray[float, ndim=2] sums,
                           np.ndarray[int, ndim=1] pops):
    """Nearest-centroid assignment on NumPy buffer arguments.

    Each row of ``X_chunk`` is assigned to the row of ``C`` with the
    smallest squared Euclidean distance (first one wins on ties). The
    sample is added to the matching row of ``sums`` and the matching entry
    of ``pops`` is incremented. ``sums`` and ``pops`` are updated in place
    and are not reset by this function.

    Bounds checking is disabled: the caller must pass ``X_chunk`` with
    ``C.shape[1]`` columns, ``sums`` with the shape of ``C`` and ``pops``
    with ``C.shape[0]`` entries.
    """
    cdef:
        int n_samples_chunk = X_chunk.shape[0]
        int n_clusters = C.shape[0]
        int n_features = C.shape[1]

        float diff = 0.0
        float sq_dist = 0.0
        float min_sq_dist = 0.0
        int best_cluster = 0

        int si = 0, ci = 0, fi = 0

    if n_clusters == 0:
        # No centroid to assign to; avoids an out-of-bounds write.
        return

    for si in range(n_samples_chunk):
        # The first centroid seeds the running minimum. No infinity
        # sentinel is used because the cell is compiled with -Ofast, which
        # lets the compiler assume finite arithmetic.
        best_cluster = 0
        min_sq_dist = 0.0
        for ci in range(n_clusters):
            sq_dist = 0.0
            for fi in range(n_features):
                diff = X_chunk[si, fi] - C[ci, fi]
                sq_dist += diff * diff
            if ci == 0 or sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci

        pops[best_cluster] += 1                      # update population
        for fi in range(n_features):                 # update feature sums
            sums[best_cluster, fi] += X_chunk[si, fi]
    
@cython.cdivision(True)
@cython.initializedcheck(False)
@cython.nonecheck(False)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void kmeans_chunk_(np.float32_t *X_chunk,
                        np.float32_t *C,
                        np.float32_t *sums,
                        np.int32_t *pops,
                        Py_ssize_t n_samples_chunk,
                        Py_ssize_t n_clusters,
                        Py_ssize_t n_features) nogil:
    # Nearest-centroid assignment on raw row-major float32 buffers.
    #
    # X_chunk holds n_samples_chunk rows and C / sums hold n_clusters rows,
    # each of n_features floats; pops holds n_clusters counters. Every
    # sample is assigned to the centroid with the smallest squared
    # Euclidean distance (first one wins on ties). The sample is then
    # added to that centroid's row of `sums` and its `pops` counter is
    # incremented. `sums` and `pops` are updated in place, never reset.
    cdef:
        np.float32_t diff = 0.0
        np.float32_t sq_dist = 0.0
        np.float32_t min_sq_dist = 0.0
        Py_ssize_t best_cluster = 0

        Py_ssize_t si = 0, ci = 0, fi = 0

        np.float32_t *x_row = X_chunk
        np.float32_t *c_row = C
        np.float32_t *sum_row = sums

    if n_clusters <= 0:
        # No centroid to assign to; avoids writing before the buffers.
        return

    for si in range(n_samples_chunk):
        x_row = X_chunk + si * n_features
        # The first centroid seeds the running minimum. No infinity
        # sentinel is used because the cell is compiled with -Ofast, which
        # lets the compiler assume finite arithmetic.
        best_cluster = 0
        min_sq_dist = 0.0
        for ci in range(n_clusters):
            c_row = C + ci * n_features
            sq_dist = 0.0
            for fi in range(n_features):
                diff = x_row[fi] - c_row[fi]
                sq_dist += diff * diff
            if ci == 0 or sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci

        pops[best_cluster] += 1                      # update population
        sum_row = sums + best_cluster * n_features
        for fi in range(n_features):                 # update feature sums
            sum_row[fi] += x_row[fi]

@cython.cdivision(True)
@cython.initializedcheck(False)
@cython.nonecheck(False)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef kmeans_chunk_py(np.float32_t[:, ::1] X_chunk,
                      np.float32_t[:, ::1] C,
                      np.float32_t[:, ::1] sums,
                      np.int32_t[::1] pops):
    """Python entry point for the raw-pointer kernel ``kmeans_chunk_``.

    Extracts the dimensions of the C-contiguous memoryviews, checks that
    the four buffers agree with each other, and passes pointers to their
    first elements to ``kmeans_chunk_``, which does the per-sample cluster
    assignment and updates ``sums`` and ``pops`` in place.

    Parameters
    ----------
    X_chunk : float32 memoryview, shape (n_samples_chunk, n_features)
    C : float32 memoryview, shape (n_clusters, n_features)
    sums : float32 memoryview, shape (n_clusters, n_features)
    pops : int32 memoryview, shape (n_clusters,)

    Raises
    ------
    ValueError
        If the shapes of the buffers are inconsistent.
    """
    cdef:
        # Read the two dimensions explicitly as C integers instead of
        # unpacking the fixed-size shape array into Python objects.
        Py_ssize_t n_samples_chunk = X_chunk.shape[0]
        Py_ssize_t n_features = X_chunk.shape[1]
        Py_ssize_t n_clusters = C.shape[0]

    # The kernel works on raw pointers with bounds checking disabled, so
    # inconsistent shapes would read or write out of bounds.
    if C.shape[1] != n_features:
        raise ValueError("C must have the same number of features as X_chunk")
    if sums.shape[0] != n_clusters or sums.shape[1] != n_features:
        raise ValueError("sums must have the same shape as C")
    if pops.shape[0] != n_clusters:
        raise ValueError("pops must have one entry per row of C")

    if n_samples_chunk == 0 or n_clusters == 0:
        # Nothing to do, and no first element to take the address of.
        return

    kmeans_chunk_(&X_chunk[0, 0], &C[0, 0], &sums[0, 0], &pops[0],
                  n_samples_chunk, n_clusters, n_features)
    
    
@cython.cdivision(True)
@cython.initializedcheck(False)
@cython.nonecheck(False)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void kmeans_chunk_a(np.float32_t *X_chunk,
                         np.float32_t *C,
                         np.float32_t *sums,
                         int *pops,
                         Py_ssize_t n_samples_chunk,
                         Py_ssize_t n_clusters,
                         Py_ssize_t n_features) nogil:
    # Nearest-centroid assignment on raw row-major buffers, with the
    # population counters passed as a plain C `int` pointer.
    #
    # X_chunk holds n_samples_chunk rows and C / sums hold n_clusters rows,
    # each of n_features floats; pops holds n_clusters counters. Every
    # sample is assigned to the centroid with the smallest squared
    # Euclidean distance (first one wins on ties). The sample is then
    # added to that centroid's row of `sums` and its `pops` counter is
    # incremented. `sums` and `pops` are updated in place, never reset.
    cdef:
        np.float32_t diff = 0.0
        np.float32_t sq_dist = 0.0
        np.float32_t min_sq_dist = 0.0
        Py_ssize_t best_cluster = 0

        Py_ssize_t si = 0, ci = 0, fi = 0

        np.float32_t *x_row = X_chunk
        np.float32_t *c_row = C
        np.float32_t *sum_row = sums

    if n_clusters <= 0:
        # No centroid to assign to; avoids writing before the buffers.
        return

    for si in range(n_samples_chunk):
        x_row = X_chunk + si * n_features
        # The first centroid seeds the running minimum. No infinity
        # sentinel is used because the cell is compiled with -Ofast, which
        # lets the compiler assume finite arithmetic.
        best_cluster = 0
        min_sq_dist = 0.0
        for ci in range(n_clusters):
            c_row = C + ci * n_features
            sq_dist = 0.0
            for fi in range(n_features):
                diff = x_row[fi] - c_row[fi]
                sq_dist += diff * diff
            if ci == 0 or sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci

        pops[best_cluster] += 1                      # update population
        sum_row = sums + best_cluster * n_features
        for fi in range(n_features):                 # update feature sums
            sum_row[fi] += x_row[fi]
            
@cython.cdivision(True)
@cython.initializedcheck(False)
@cython.nonecheck(False)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef kmeans_chunk_py_a(np.float32_t[::1] X_chunk,
                        np.float32_t[::1] C,
                        np.float32_t[::1] sums,
                        np.int32_t[::1] pops,
                        Py_ssize_t n_features=100):
    """Nearest-centroid assignment staged through ``array.array`` buffers.

    The flattened inputs are copied into freshly cloned CPython arrays and
    the assignment loop runs on their raw pointers. The per-centroid
    feature sums and populations computed for this chunk are then added
    into ``sums`` and ``pops``.

    Parameters
    ----------
    X_chunk : float32 memoryview, length n_samples_chunk * n_features
        Row-major flattened samples.
    C : float32 memoryview, length n_clusters * n_features
        Row-major flattened centroids.
    sums : float32 memoryview, same length as ``C``
        Accumulator for the per-centroid feature sums (updated in place).
    pops : int32 memoryview, length n_clusters
        Accumulator for the per-centroid populations (updated in place).
    n_features : Py_ssize_t, default 100
        Row length of the flattened buffers. The default keeps the value
        that used to be hard-coded.

    Raises
    ------
    ValueError
        If ``n_features`` is not positive or the buffer lengths are
        inconsistent with it.
    """
    cdef array.array INT_ARRAY = array.array('i')
    cdef array.array FLOAT_ARRAY = array.array('f')

    cdef:
        Py_ssize_t n_samples_chunk = 0
        Py_ssize_t n_clusters = 0
        Py_ssize_t best_cluster = 0
        Py_ssize_t si = 0, ci = 0, fi = 0

        array.array X_chunk_a
        array.array C_a
        array.array sums_a
        array.array pops_a

        float *xptr
        float *cptr
        float *sptr
        int *pptr

        float *x_row
        float *c_row
        float *sum_row

        float diff = 0.0
        float sq_dist = 0.0
        float min_sq_dist = 0.0

    # Bounds checking is disabled, so validate the flattened lengths
    # before any raw memory access.
    if n_features <= 0:
        raise ValueError("n_features must be positive")
    if X_chunk.shape[0] % n_features != 0 or C.shape[0] % n_features != 0:
        raise ValueError("X_chunk and C lengths must be multiples of n_features")
    n_samples_chunk = X_chunk.shape[0] // n_features
    n_clusters = C.shape[0] // n_features
    if sums.shape[0] != C.shape[0]:
        raise ValueError("sums must have the same length as C")
    if pops.shape[0] != n_clusters:
        raise ValueError("pops must have one entry per centroid")

    if n_samples_chunk == 0 or n_clusters == 0:
        # Nothing to assign, and no first element to take the address of.
        return

    # Scratch buffers: the inputs are overwritten by memcpy (no zeroing
    # needed), the accumulators start from zero.
    X_chunk_a = array.clone(FLOAT_ARRAY, n_samples_chunk * n_features, False)
    C_a = array.clone(FLOAT_ARRAY, n_clusters * n_features, False)
    sums_a = array.clone(FLOAT_ARRAY, n_clusters * n_features, True)
    pops_a = array.clone(INT_ARRAY, n_clusters, True)

    xptr = X_chunk_a.data.as_floats
    cptr = C_a.data.as_floats
    sptr = sums_a.data.as_floats
    pptr = pops_a.data.as_ints

    # Each copy is sized from its own buffer, in units of float.
    memcpy(xptr, &X_chunk[0], n_samples_chunk * n_features * sizeof(float))
    memcpy(cptr, &C[0], n_clusters * n_features * sizeof(float))

    for si in range(n_samples_chunk):
        x_row = xptr + si * n_features
        # The first centroid seeds the running minimum. No infinity
        # sentinel is used because the cell is compiled with -Ofast, which
        # lets the compiler assume finite arithmetic.
        best_cluster = 0
        min_sq_dist = 0.0
        for ci in range(n_clusters):
            c_row = cptr + ci * n_features
            sq_dist = 0.0
            for fi in range(n_features):
                diff = x_row[fi] - c_row[fi]
                sq_dist += diff * diff
            if ci == 0 or sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci

        pptr[best_cluster] += 1                      # update population
        sum_row = sptr + best_cluster * n_features
        for fi in range(n_features):                 # update feature sums
            sum_row[fi] += x_row[fi]

    # Publish the chunk statistics: without this step the results would
    # stay in the scratch arrays and be discarded on return.
    for ci in range(n_clusters):
        pops[ci] += pptr[ci]
    for fi in range(n_clusters * n_features):
        sums[fi] += sptr[fi]
            


@cython.cdivision(True)
@cython.initializedcheck(False)
@cython.nonecheck(False)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void kmeans_chunk_mv(float[::1] X_chunk,
                           float[::1] C,
                           float[::1] sums,
                           float[::1] pops,
                           Py_ssize_t n_samples_chunk,
                           Py_ssize_t n_clusters,
                           Py_ssize_t n_features):
    """Nearest-centroid assignment on flattened 1-D memoryviews.

    ``X_chunk``, ``C`` and ``sums`` are row-major flattened matrices with
    ``n_features`` values per row, so element (row, col) lives at flat
    index ``row * n_features + col``. Each sample is assigned to the
    centroid with the smallest squared Euclidean distance (first one wins
    on ties). The sample is added to the matching row of ``sums`` and the
    matching entry of ``pops`` is incremented. ``sums`` and ``pops`` are
    updated in place and are not reset by this function.

    Bounds checking is disabled: the caller must pass buffers whose
    lengths match ``n_samples_chunk``, ``n_clusters`` and ``n_features``.

    NOTE(review): ``pops`` is declared as a float memoryview, unlike the
    integer counters used by the other kernels; the declared type is kept
    so existing callers are unaffected.
    """
    cdef:
        float diff = 0.0
        float sq_dist = 0.0
        float min_sq_dist = 0.0
        Py_ssize_t best_cluster = 0

        Py_ssize_t si = 0, ci = 0, fi = 0
        Py_ssize_t x_off = 0, c_off = 0, s_off = 0

    if n_clusters <= 0:
        # No centroid to assign to; avoids an out-of-bounds write.
        return

    for si in range(n_samples_chunk):
        x_off = si * n_features
        # The first centroid seeds the running minimum. No infinity
        # sentinel is used because the cell is compiled with -Ofast, which
        # lets the compiler assume finite arithmetic.
        best_cluster = 0
        min_sq_dist = 0.0
        for ci in range(n_clusters):
            c_off = ci * n_features
            sq_dist = 0.0
            for fi in range(n_features):
                # The memoryviews are 1-D, so rows are addressed through
                # explicit flat offsets rather than two subscripts.
                diff = X_chunk[x_off + fi] - C[c_off + fi]
                sq_dist += diff * diff
            if ci == 0 or sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci

        pops[best_cluster] += 1                      # update population
        s_off = best_cluster * n_features
        for fi in range(n_features):                 # update feature sums
            sums[s_off + fi] += X_chunk[x_off + fi]

In [21]:
# Inputs and output buffers for the Cython kernel benchmarks. The chunk is
# an independent copy of the first 1000 samples, and the counters are
# int32 to match the integer buffers the Cython kernels declare.
data_chunk = data[:1000].copy()
centroids_sum = np.zeros_like(centroids)
centroids_snorms = np.square(centroids).sum(axis=1)
centroids_pop = np.zeros(len(centroids), dtype='int32')
# NOTE(review): `centroids_snorms` and `labels` are not used by any cell
# visible in this notebook.
labels = np.empty(100000, dtype=int)

In [22]:
%%timeit
# Times the NumPy-buffer Cython kernel. The output buffers are not reset
# between timing loops, so `centroids_sum` / `centroids_pop` keep
# accumulating across runs.
kmeans_chunk_np(data_chunk, centroids, centroids_sum, centroids_pop)

102 ms ± 186 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [73]:
%%timeit
# NOTE: the slice and the two allocations below are part of the timed
# body, so the reported time includes resetting the output buffers as well
# as the kernel call itself.
data_chunk = data[:1000]
centroids_sum = np.zeros_like(centroids)
centroids_pop = np.zeros(centroids.shape[0], dtype='int32')
kmeans_chunk_py(data_chunk, centroids, centroids_sum, centroids_pop)

20 ms ± 799 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [63]:
#from libc.string cimport memset
#memset(a.data.as_voidptr, 0, len(a) * sizeof(int))

In [83]:
%%timeit
# NOTE: the slice and the two allocations below are part of the timed
# body. The kernel takes flattened 1-D buffers; the hard-coded 1000*100
# matches the 1000 samples / 1000 centroids and 100 features configured
# earlier in the notebook and must be changed together with them.
data_chunk = data[:1000]
centroids_sum = np.zeros_like(centroids)
centroids_pop = np.zeros(centroids.shape[0], dtype='int32')
kmeans_chunk_py_a(data_chunk.reshape(1000*100), centroids.reshape(1000*100), centroids_sum.reshape(1000*100), centroids_pop)

12.3 ms ± 56.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
