In [1]:
import numpy as np

In [2]:
%load_ext cython
import os
os.environ["CC"] = 'gcc-8'

In [89]:
%%cython -c=-Ofast -c=-march=native -f
#cython: initializedcheck=False, nonecheck=False, boundscheck=False, wraparound=False, cdivision=True, overflowcheck=False, overflowcheck.fold=False
import numpy as np
cimport numpy as np
cimport cython
from scipy.linalg.cython_blas cimport sgemm
from cpython cimport array
import array
from cython cimport view

ctypedef fused floating_t:
    np.float32_t
    np.float64_t

# numpy array
cpdef void kmeans_chunk_np(np.ndarray[floating_t, ndim=2, mode='c'] X_chunk,
                           np.ndarray[floating_t, ndim=2, mode='c'] C,
                           np.ndarray[floating_t, ndim=2, mode='c'] sums,
                           np.ndarray[np.int32_t, ndim=1, mode='c'] pops):
    cdef:
        Py_ssize_t n_samples_chunk = X_chunk.shape[0],
        Py_ssize_t n_clusters = C.shape[0],
        Py_ssize_t n_features = C.shape[1]
        
        floating_t x, sq_dist, min_sq_dist = 0.0
        Py_ssize_t best_cluster = -1

        Py_ssize_t si, ci, fi = 0

    for si in xrange(n_samples_chunk):
        min_sq_dist = 10000000
        best_cluster = -1
        for ci in xrange(n_clusters):
            sq_dist = 0.0
            for fi in xrange(n_features):
                x = X_chunk[si, fi] - C[ci, fi]
                sq_dist += x * x
            if sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci

        pops[best_cluster] += 1 
        for fi in xrange(n_features):  
            sums[best_cluster, fi] += X_chunk[si, fi]

# pointer   
cdef void kmeans_chunk_ptr(floating_t *X_chunk,
                           floating_t *C,
                           floating_t *sums,
                           np.int32_t *pops,
                           Py_ssize_t n_samples_chunk,
                           Py_ssize_t n_clusters,
                           Py_ssize_t n_features) nogil:
    cdef:
        floating_t x, sq_dist, min_sq_dist = 0.0
        np.int32_t best_cluster = -1

        Py_ssize_t si, ci, fi = 0

    for si in xrange(n_samples_chunk):
        min_sq_dist = 10000000
        best_cluster = -1
        for ci in xrange(n_clusters):
            sq_dist = 0.0
            for fi in xrange(n_features):
                x = X_chunk[si * n_features + fi] - C[ci * n_features + fi]
                sq_dist += x * x
            if sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci

        pops[best_cluster] += 1             
        for fi in xrange(n_features):    
            sums[best_cluster * n_features + fi] += X_chunk[si * n_features + fi]
        

cpdef kmeans_chunk_ptrw(floating_t[:, ::1] X_chunk,
                        floating_t[:, ::1] C,
                        floating_t[:, ::1] sums,
                        np.int32_t[::1] pops):
    cdef:
        Py_ssize_t n_samples_chunk = X_chunk.shape[0]
        Py_ssize_t n_features = X_chunk.shape[1]
        Py_ssize_t n_clusters = C.shape[0]
        
    kmeans_chunk_ptr(&X_chunk[0,0], &C[0,0], &sums[0,0], &pops[0], n_samples_chunk, n_clusters, n_features)
    
    
# memoryview          
cpdef void kmeans_chunk_mv(floating_t[:, ::1] X_chunk,
                           floating_t[:, ::1] C,
                           floating_t[:, ::1] sums,
                           np.int32_t[::1] pops) nogil:
    cdef:
        Py_ssize_t n_samples_chunk = X_chunk.shape[0]
        Py_ssize_t n_clusters = C.shape[0]
        #Py_ssize_t n_features = X_chunk.shape[1]
        Py_ssize_t n_features = 2

        floating_t x, sq_dist, min_sq_dist = 0.0
        Py_ssize_t best_cluster = -1

        Py_ssize_t si, ci, fi = 0

    for si in xrange(n_samples_chunk):
        min_sq_dist = 10000000
        best_cluster = -1
        for ci in xrange(n_clusters):
            sq_dist = 0.0
            for fi in xrange(n_features):
                x = X_chunk[si, fi] - C[ci, fi]
                sq_dist += x * x
            if sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                best_cluster = ci
                
        pops[best_cluster] += 1          
        for fi in xrange(n_features):      
            sums[best_cluster, fi] += X_chunk[si, fi]
            

# memoryview 2    
cpdef void kmeans_chunk_gemm(np.float32_t[:, ::1] X_chunk,
                             np.float32_t[:, ::1] C,
                             np.float32_t[::1] C_snorms,
                             np.float32_t[:, ::1] sums,
                             np.int32_t[::1] pops):
    cdef:
        int n_samples_chunk = X_chunk.shape[0]
        int n_clusters = C.shape[0]
        int n_features = X_chunk.shape[1]
        
        np.float32_t x, sq_dist, max_sq_dist = 0.0
        int best_cluster = -1

        int si, ci, fi, fc= 0
        
        np.float32_t[:, ::1] dots = view.array(shape=(n_samples_chunk, n_clusters), itemsize=sizeof(np.float32_t), format="f")
        
        np.float32_t alpha = 1.0
        np.float32_t beta = 0.0
        char *transa = 'n'
        char *transb = 't'
        np.float32_t *a0=&X_chunk[0,0]
        np.float32_t *b0=&C[0,0]
        np.float32_t *c0=&dots[0,0]
        
    sgemm(transb, transa, &n_clusters, &n_samples_chunk, &n_features, &alpha, b0, &n_features, a0, &n_features, &beta, c0, &n_clusters)

    for si in xrange(n_samples_chunk):
        max_sq_dist = -10000000.0
        best_cluster = -1
        for ci in xrange(n_clusters):
            sq_dist = C_snorms[ci] + dots[si, ci]
            if sq_dist > max_sq_dist:
                max_sq_dist = sq_dist
                best_cluster = ci
                
        pops[best_cluster] += 1          
        for fi in xrange(n_features):      
            sums[best_cluster, fi] += X_chunk[si, fi]
    

In [86]:
n_samples = 100000
n_clusters = 1000
n_features = 10
n_samples_chunk = 1000

data = np.random.random_sample((n_samples, n_features)).astype(np.float32)
centroids = data[np.random.choice(np.arange(data.shape[0]), n_clusters, replace=False)]
centroids_snorms = (centroids**2).sum(axis=1) * -0.5
data_chunk = data[:n_samples_chunk]

In [65]:
%%timeit -n 100
centroids_sum = np.zeros((n_clusters, n_features), dtype=np.float32)
centroids_pop = np.zeros(n_clusters, dtype=np.int32)

kmeans_chunk_np(data_chunk, centroids, centroids_sum, centroids_pop)
# numpy

4.54 ms ± 524 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [58]:
%%timeit -n 100
centroids_sum = np.zeros((n_clusters, n_features), dtype=np.float32)
centroids_pop = np.zeros(n_clusters, dtype=np.int32)

kmeans_chunk_ptrw(data_chunk, centroids, centroids_sum, centroids_pop)
# pointer

4.15 ms ± 353 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [77]:
%%timeit -n 100
centroids_sum = np.zeros((n_clusters, n_features), dtype=np.float32)
centroids_pop = np.zeros(n_clusters, dtype=np.int32)

kmeans_chunk_mv(data_chunk, centroids, centroids_sum, centroids_pop)
# memoryview

1.26 ms ± 53.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [87]:
%%timeit -n 100
centroids_sum = np.zeros((n_clusters, n_features), dtype=np.float32)
centroids_pop = np.zeros(n_clusters, dtype=np.int32)

kmeans_chunk_gemm(data_chunk, centroids, centroids_snorms, centroids_sum, centroids_pop)
# mem view + gemm

1.52 ms ± 108 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [44]:
data = data.astype(np.float64)
centroids = centroids.astype(np.float64)
centroids_snorms = (centroids**2).sum(axis=1) * -0.5
data_chunk = data[:n_samples_chunk]

centroids_sum = np.zeros((n_clusters, n_features), dtype=np.float64)
centroids_pop = np.zeros(n_clusters, dtype=np.int32)

In [45]:
%%timeit -n 100
kmeans_chunk_np(data_chunk, centroids, centroids_sum, centroids_pop)
# numpy + float64

24.3 ms ± 474 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [48]:
%%timeit -n 100
kmeans_chunk_ptrw(data_chunk, centroids, centroids_sum, centroids_pop)
# pointer + float64

24.3 ms ± 569 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [36]:
%%timeit -n 100
kmeans_chunk_mv(data_chunk, centroids, centroids_sum, centroids_pop)
# mem view + float64

4.25 ms ± 74.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [46]:
def kmeans_chunk_py(X_chunk, C, sums, pops):
    n_samples_chunk = X_chunk.shape[0]
    n_clusters = C.shape[0]
    n_features = X_chunk.shape[1]

    C_snorms = (C**2).sum(axis=1)
    X_chunk_snorms = (X_chunk**2).sum(axis=1)
    
    D = np.empty((n_samples_chunk, n_clusters), dtype=C.dtype)
    
    np.dot(X_chunk, C.T, out=D)
    D *= -2
    D += C_snorms + X_chunk_snorms
    
    best_c = np.argmin(D, axis=1)
    
    pops = np.bincount(best_c)
    
    for s in range(n_samples_chunk):
        sums[best_c[s]] += X_chunk[s]

In [79]:
%%timeit -n 100
kmeans_chunk_py(data_chunk, centroids, centroids_sum, centroids_pop)
# python + float64

6.32 ms ± 113 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
