<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Asymmetric-Distance-computation" data-toc-modified-id="Asymmetric-Distance-computation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Asymmetric Distance computation</a></span><ul class="toc-item"><li><span><a href="#cython-version" data-toc-modified-id="cython-version-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>cython version</a></span></li></ul></li><li><span><a href="#Speed-up-precompute_adc-function" data-toc-modified-id="Speed-up-precompute_adc-function-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Speed up <code>precompute_adc</code> function</a></span></li></ul></div>

In [1]:
%load_ext cython
%timeit

import Cython

In [2]:
Cython.__version__

'0.29.24'

In [3]:
import numpy as np

## Asymmetric Distance computation

Currently the code performs

```
dists = np.sum(self.dtable[range(M), codes], axis=1)
```
which is equivalent to 

```python
dists = np.zeros((N, )).astype(np.float32)
for n in range(N):
    for m in range(M):
        dists[n] += self.dtable[m][codes[n][m]]

```

In [4]:
M = 32
np.random.seed(123)

n_cluster = 256
dtable = np.array(np.random.random((M, n_cluster)), 'float32')

np.random.seed(123)
pq_codes_batch = np.array([np.random.randint([M]*M)])
N, M = pq_codes_batch.shape


In [5]:
pq_codes_batch

array([[30, 13, 30,  2, 28,  2,  6, 17, 19, 10, 27, 25, 22,  1,  0, 17,
        30, 15,  9,  0, 14,  0, 15, 25, 19, 14, 29,  4,  0, 16,  4, 17]])

In [6]:
dtable[range(M),pq_codes_batch].sum(axis=1)

array([17.402649], dtype=float32)

In [7]:
def distances_loop_py(N,M, dtable):
    dists = np.zeros((N, )).astype(np.float32)
    for n in range(N):
        for m in range(M):
            dists[n] += dtable[m, pq_codes_batch[n,m]]
    return dists

In [10]:
distances_loop_py(1,M,dtable)

array([17.402647], dtype=float32)

In [11]:
dtable.shape

(32, 256)

In [12]:
%timeit distances_loop_py(1,M,dtable)

12.5 µs ± 184 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [52]:
%timeit dtable[range(M),pq_codes_batch].sum(axis=1)

6.81 µs ± 28.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


### cython version

In [21]:
%%cython
cimport numpy as cnp
cimport cython
             
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef dist_pqcode_to_codebook(long M,float[:,:] dtable,long[:] pq_code):
    cdef float dist = 0
    cdef int m
    
    for m in range(M):
        dist += dtable[m, pq_code[m]]

    return dist

In file included from /Users/davidbuchaca1/.ipython/cython/_cython_magic_8006e507a74b416c5d19d9789a3b7999.c:649:
In file included from /Users/davidbuchaca1/opt/anaconda3/lib/python3.8/site-packages/numpy/core/include/numpy/arrayobject.h:4:
In file included from /Users/davidbuchaca1/opt/anaconda3/lib/python3.8/site-packages/numpy/core/include/numpy/ndarrayobject.h:12:
In file included from /Users/davidbuchaca1/opt/anaconda3/lib/python3.8/site-packages/numpy/core/include/numpy/ndarraytypes.h:1944:
 ^
  0, /*tp_print*/
  ^
/Users/davidbuchaca1/opt/anaconda3/include/python3.8/cpython/object.h:260:5: note: 'tp_print' has been explicitly marked deprecated here
    Py_DEPRECATED(3.8) int (*tp_print)(PyObject *, FILE *, int);
    ^
/Users/davidbuchaca1/opt/anaconda3/include/python3.8/pyport.h:515:54: note: expanded from macro 'Py_DEPRECATED'
#define Py_DEPRECATED(VERSION_UNUSED) __attribute__((__deprecated__))
                                                     ^
  0, /*tp_print*/
  ^
/Users/

In [22]:
pq_code = pq_codes_batch.flatten()

In [23]:
%timeit dist_pqcode_to_codebook(M, dtable, pq_code)

483 ns ± 3.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


We can make the method work for generic types but this will have a penalty when called from python 

In [39]:
%%cython
cimport numpy as cnp
cimport cython
from cython cimport integral, floating
             
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef floating distances_loop_cy2(integral M,
                                  floating[:,:] dtable,
                                  integral[:] pq_code):
    cdef floating dist = 0
    cdef integral m 
    
    for m in range(M):
        dist += dtable[m, pq_code[m]]

    return dist

In [36]:
%timeit distances_loop_cy2(M, dtable, pq_code)

1.84 µs ± 5.78 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


##### Batched version of dist_pqcodes_to_codebooks

In [90]:
%%cython 

# distutils: language = c++

from libcpp.vector cimport vector

cimport numpy as np
cimport cython
import numpy as np

from cpython.array cimport array, clone

@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline float dist_pqcode_to_codebook(long M, float[:,:] dtable,long[:] pq_code):
    cdef float dist = 0
    cdef int m
    
    for m in range(M):
        dist += dtable[m, pq_code[m]]

    return dist

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef dist_pqcodes_to_codebooks(long M, float[:,:] dtable, long[:,:] pq_codes):
    cdef:
        int m, loops
        int N = pq_codes.shape[0] 
        float[:] dists = np.empty(N, dtype=np.float32)
    
    for n in range(N):
        dists[n] = dist_pqcode_to_codebook(M, dtable, pq_codes[n,:])

    return np.array(dists) 
             

In [91]:
dist_pqcodes_to_codebooks(M, dtable, pq_codes_batch)

array([17.402647], dtype=float32)

In [92]:
%timeit dist_pqcodes_to_codebooks(M, dtable, pq_codes_batch)

2.33 µs ± 15.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Iin the batched problem we have the problem that we have to create a numpy array at runtime to store the distances. oreover we need to slice `pq_codes[n,:]` to call `dist_pqcode_to_codebook`.

We can improve this by instead of defining a numpy vector at runtime we can use a C++ vector and append the results.

In [108]:
%%cython -a

# distutils: language = c++

from libcpp.vector cimport vector

cimport numpy as np
cimport cython
import numpy as np

from cpython.array cimport array, clone

@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline float dist_pqcode_to_codebook(long M, float[:,:] dtable,long[:] pq_code):
    cdef float dist = 0
    cdef int m
    
    for m in range(M):
        dist += dtable[m, pq_code[m]]

    return dist

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef dist_pqcodes_to_codebooks(long M, float[:,:] dtable, long[:,:] pq_codes):
    cdef:
        int m, loops
        int N = pq_codes.shape[0] 
        #float[:] dists = np.empty(N, dtype=np.float32)
        vector[float] dists
    
    for n in range(N):
        dists.push_back(dist_pqcode_to_codebook(M, dtable, pq_codes[n,:]))

    return np.asarray(dists)
             

In file included from /Users/davidbuchaca1/.ipython/cython/_cython_magic_3b75e4a9e53a02e8c58a6c123a0a9526.cpp:703:
In file included from /Users/davidbuchaca1/opt/anaconda3/lib/python3.8/site-packages/numpy/core/include/numpy/arrayobject.h:4:
In file included from /Users/davidbuchaca1/opt/anaconda3/lib/python3.8/site-packages/numpy/core/include/numpy/ndarrayobject.h:12:
In file included from /Users/davidbuchaca1/opt/anaconda3/lib/python3.8/site-packages/numpy/core/include/numpy/ndarraytypes.h:1944:
 ^
  0, /*tp_print*/
  ^
/Users/davidbuchaca1/opt/anaconda3/include/python3.8/cpython/object.h:260:5: note: 'tp_print' has been explicitly marked deprecated here
    Py_DEPRECATED(3.8) int (*tp_print)(PyObject *, FILE *, int);
    ^
/Users/davidbuchaca1/opt/anaconda3/include/python3.8/pyport.h:515:54: note: expanded from macro 'Py_DEPRECATED'
#define Py_DEPRECATED(VERSION_UNUSED) __attribute__((__deprecated__))
                                                     ^
  0, /*tp_print*/
  ^
/User

In [109]:
dist_pqcodes_to_codebooks(M, dtable, pq_codes_batch)

array([17.40264702])

In [110]:
%timeit dist_pqcodes_to_codebooks(M, dtable, pq_codes_batch)

1.85 µs ± 19.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [115]:
pq_codes_batch_50 = np.vstack([pq_codes_batch for i in range(50)])

In [116]:
%timeit dist_pqcodes_to_codebooks(M, dtable, pq_codes_batch_50)

7.06 µs ± 102 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [117]:
%timeit dtable[range(M),pq_codes_batch_50].sum(axis=1)

14.6 µs ± 315 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


If we don't need an output as a numpy array (a list is fine) we can go with the following implementation which is 2x faster

In [120]:
%%cython

# distutils: language = c++

from libcpp.vector cimport vector

cimport numpy as np
cimport cython
import numpy as np

from cpython.array cimport array, clone

@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline float dist_pqcode_to_codebook(long M, float[:,:] dtable,long[:] pq_code):
    cdef float dist = 0
    cdef int m
    
    for m in range(M):
        dist += dtable[m, pq_code[m]]

    return dist

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef dist_pqcodes_to_codebooks(long M, float[:,:] dtable, long[:,:] pq_codes):
    cdef:
        int m, loops
        int N = pq_codes.shape[0] 
        #float[:] dists = np.empty(N, dtype=np.float32)
        vector[float] dists
    
    for n in range(N):
        dists.push_back(dist_pqcode_to_codebook(M, dtable, pq_codes[n,:]))

    return dists
             

In [122]:
%timeit dist_pqcodes_to_codebooks(M, dtable, pq_codes_batch)

653 ns ± 2.08 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [121]:
%timeit dist_pqcodes_to_codebooks(M, dtable, pq_codes_batch_50)

3.54 µs ± 61.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Speed up `precompute_adc` function

In [87]:

import sklearn
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

from pqlite.core.codec.pq import PQCodec

D = 128
n_clusters = 256
M = 32

D = 128 # dimentionality / number of features
top_k = 100
n_cells = 18
Nt = 5000
d_subvector = int(D/M)
n_subvectors = M


np.random.seed(123)
Xtr, Xte =train_test_split(make_blobs(n_samples = Nt, n_features = D)[0].astype(np.float32), test_size=20)

pq = PQCodec(D, M, n_clusters)

In [88]:
D, M, d_subvector, n_subvectors

(128, 32, 4, 32)

In [89]:
pq.fit(Xtr)

In [90]:
codebooks = pq.codebooks
codebooks.shape

(32, 256, 4)

In [91]:
codebooks[0].shape

(256, 4)

In [77]:
%timeit pq.precompute_adc(Xtr[4,:])

513 µs ± 5.48 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [92]:
query = Xtr[4,:]
#distance_table_from_class = pq.precompute_adc(query)
#distance_table_from_class.dtable.shape

In [93]:
dtable = pq.precompute_adc(Xtr[4,:])
dtable.dtable.shape

(32, 256)

In [179]:

def precompute_adc3(query, 
                    d_subvector,
                    n_clusters,
                    codebooks):
    n_subvectors = int(D/d_subvector)
    dtable = np.empty((int(D/d_subvector), n_clusters), dtype=np.float32)

    for m in range(n_subvectors):
        query_sub = query[m * d_subvector : (m + 1) * d_subvector]
        print(query_sub)
        dtable[m, :] = np.linalg.norm(codebooks[m] - query_sub, axis=1) ** 2

    return dtable

dtable_custom = precompute_adc3(query, d_subvector, n_clusters, codebooks)
dtable_custom.shape

[ 5.7007833 -2.8085563  4.471332  -6.107677 ]
[ 2.1323197  7.2024508 10.226158  -8.758023 ]
[-0.9764044  -5.2295575  -0.61654216  1.1029097 ]
[-6.669144 -4.418286  9.053021  2.149657]
[-0.20022273  6.226501    4.7971654  -9.555226  ]
[ 5.864654   8.026492  -7.0119166  6.246779 ]
[-2.7369807 -5.6644206  3.464692  -9.996476 ]
[ 4.0115952  8.87224    3.0414064 -2.5239882]
[-0.89868146  5.2955227   1.101024   -8.345474  ]
[ 2.6671724  6.4567213 -4.048157   8.562549 ]
[ 4.8343267  2.780991   5.7438736 -7.9671087]
[ 8.198433   6.8236904  8.358163  -8.474051 ]
[-8.6224785 -5.8501544 -2.6554124 -2.8096726]
[-0.15733108 -5.373121   -6.6942797   6.8998747 ]
[ -2.203237     6.036231   -10.329724     0.59619546]
[ 9.968843   0.6003177 -6.3680234  5.591793 ]
[-3.0840025  3.7997324 -9.29037    3.2687526]
[-10.0336895    5.4669385    0.23829271  -7.2484384 ]
[-0.29932633 -9.099288    3.412477   11.089039  ]
[ 6.5156407  1.5724932 -7.9759307  3.3829281]
[ 3.6251895 -7.4156084  3.79495    0.7784265]
[-

(32, 256)

In [115]:
np.testing.asserts_almost_equal(dtable_custom, dtable.dtable)

In [117]:
codebooks.shape

(32, 256, 4)

In [132]:
codebooks[0][]

array([[ 5.5934744 , -3.1530006 ,  5.988966  , -8.190983  ],
       [ 3.3398757 , -2.598072  , -4.629112  ,  1.2890896 ],
       [ 2.0987072 ,  6.911557  , -9.568957  , -3.4636445 ],
       ...,
       [ 7.0042305 , -3.128198  ,  4.700182  , -7.6985955 ],
       [ 2.3924043 , -4.9966908 , -4.1939397 ,  0.57079446],
       [ 4.9000993 , -3.3311853 , -4.8454537 ,  1.0463784 ]],
      dtype=float32)

We can avoid slices to use less memory and improve speed

In [150]:
codebooks[0].shape

(256, 4)

In [154]:
codebooks.shape, M

((32, 256, 4), 32)

In [256]:
def precompute_adc3(query, 
                    d_subvector,
                    n_clusters,
                    codebooks):
    
    n_subvectors = int(D/d_subvector)
    dtable = np.empty((int(D/d_subvector), n_clusters), dtype=np.float32)
    query_subvec = np.empty(d_subvector, dtype=np.float32)
    query_subcodeword = np.empty(d_subvector, dtype=np.float32)
    
    for m in range(n_subvectors):
        
        # load m'th subquery
        i = 0
        for k in range(m * d_subvector, (m + 1) * d_subvector):
            query_subvec[i] = query[k]
            i += 1
            
        for ind_prototype in range(n_clusters):
            
            # load prototype ind_prototype for the m'th subspace
            for i in range(d_subvector):
                query_subcodeword[i] = codebooks[m, ind_prototype, i]
            
            # load prototype ind_prototype for the m'th subspace
            dist_subprototype_to_subquery = 0
            for k in range(d_subvector):
                coord_k = query_subcodeword[k] - query_subvec[k]
                dist_subprototype_to_subquery += coord_k * coord_k
            
            dtable[m, ind_prototype] = dist_subprototype_to_subquery
            
    return dtable

dtable_custom2 = precompute_adc3(query, d_subvector, n_clusters, codebooks)
dtable_custom2.shape

(32, 256)

In [183]:
#np.testing.assert_array_almost_equal(dtable_custom, dtable_custom2)
np.mean(dtable_custom - dtable_custom2)

-1.3184126e-07

We can cythonize the previous function

In [None]:
%%cython -a 
cimport numpy as cnp
import numpy as np
cimport cython
from cython.cimports.libc.stdlib import malloc, free
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free

             
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef float[:,:] precompute_adc_cy(float[:] query, 
                        long d_subvector,
                        long n_clusters,
                        float[::,:,:] codebooks):
            
    cdef int D = len(query)
    cdef int M = int(D/d_subvector) 
    cdef int n_subvectors = int(D/d_subvector)
    cdef int m,i,k,ind_prototype,j
    
    cdef float[::,:] dtable = np.empty((M, n_clusters), dtype=np.float32)
    cdef float[::] query_subvec = np.empty(d_subvector, dtype=np.float32)
    cdef float[::] query_subcodeword = np.empty(d_subvector, dtype=np.float32)
    
    #cdef double* query_subcodeword = <double*> PyMem_Malloc( d_subvector * sizeof(double))
        
    cdef float dist_subprototype_to_subquery, coord_j
    
    for m in range(n_subvectors):
        
        # load m'th subquery
        i = 0
        for k in range(m * d_subvector, (m + 1) * d_subvector):
            query_subvec[i] = query[k]
            i += 1
            
        for ind_prototype in range(n_clusters):
            
            # load prototype ind_prototype for the m'th subspace
            for i in range(d_subvector):
                query_subcodeword[i] = codebooks[m, ind_prototype, i]
            
            # load prototype ind_prototype for the m'th subspace
            dist_subprototype_to_subquery = 0.
            for j in range(d_subvector):
                coord_j = query_subcodeword[j] - query_subvec[j]
                dist_subprototype_to_subquery += coord_j * coord_j
            
            dtable[m, ind_prototype] = dist_subprototype_to_subquery
    
    free(query_subcodeword)
    
    return dtable


In [307]:
%timeit dtable_custom3 = precompute_adc_cy(query, d_subvector, n_clusters, codebooks)

47.8 µs ± 191 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


This is around a factor of 10x over the 'numpy vectorized' version

In [278]:
%timeit pq.precompute_adc(Xtr[4,:])

470 µs ± 7.11 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
