# FAISS Memory Study

This notebook tries to measure the memory consumption and speed of faiss in CPU and GPU


In [1]:
import numpy as np
import numpy.random as np_rand
import faiss

In [2]:
# Multiple GPUs
ngpus = faiss.get_num_gpus()

print("number of GPUs:", ngpus)

number of GPUs: 2


In [3]:
d=128
# N = 100000
N = 10000
emb = np.array(np.random.randint(0,2, size=(N, d)), dtype=np.float32)

In [4]:
emb.shape

(10000, 128)

In [5]:
index_flat_l2 = faiss.IndexFlatL2(d)

In [6]:
index_flat_l2.train(emb)

In [7]:
index_flat_l2.add(emb)

In [8]:
npdec = np.array(np.random.randn(1000,128), dtype=np.float32)

In [9]:
npdec.shape

(1000, 128)

In [10]:
npdec.dtype

dtype('float32')

In [11]:
index_flat_l2.d

128

In [12]:
index_flat_l2.ntotal

10000

In [13]:
%%time
k=2
D,I = index_flat_l2.search(npdec, k)

CPU times: user 1.92 s, sys: 442 ms, total: 2.36 s
Wall time: 341 ms


In [14]:
res = faiss.StandardGpuResources()

In [15]:
res

<faiss.swigfaiss.StandardGpuResources; proxy of <Swig Object of type 'faiss::gpu::StandardGpuResources *' at 0x7f753a909bd0> >

In [16]:
# Using one GPU
# res = faiss.StandardGpuResources()  # use a single GPU
# build a flat (CPU) index
index_flat = faiss.IndexFlatL2(d)

In [17]:
# make it into a gpu index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

In [18]:
res = faiss.StandardGpuResources()  # use a single GPU
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat_l2)

In [19]:
%%time
k=20
D,I = gpu_index_flat.search(npdec, k)

CPU times: user 828 µs, sys: 359 µs, total: 1.19 ms
Wall time: 746 µs


TODO check how to make it more memory efficient using float16 instead

* https://github.com/facebookresearch/faiss/issues/666
* https://github.com/facebookresearch/faiss/blob/master/gpu/utils/Float16.cu
* https://github.com/facebookresearch/faiss/issues/590

Use "lossy" indices (float16, int8, int4), instead of a complete one, this will save in space even if the computations are done in float32

Indices that I should try with the codes:

- faiss.IndexScalarQuantizer with (QT_8bit/QT_6bit/QT_4bit)
- faiss.IndexPQ

the construction can be done with index_factory: 

    index = faiss.index_factory(d, "IVF100,PQ8")
    
    
To analyze a matrix, print

    MatrixStats(my_matrix).comments (Python)

In [20]:
#TODO tests to do and measure
# isq_fp16 = faiss.IndexScalarQuantizer
isq8 = faiss.IndexScalarQuantizer(d, 8)
isq6 = faiss.IndexScalarQuantizer(d, 6)
isq4 = faiss.IndexScalarQuantizer(d, 4)
ipq8 = faiss.IndexPQ(d, 16, 8)  # with different Quantizer configurations
ipq6 = faiss.IndexPQ(d, 16, 6)  # with different Quantizer configurations
ipq4 = faiss.IndexPQ(d, 16, 4)  # with different Quantizer configurations

# these should be tested for the training and see which ones give a better result

In [21]:
iemb = np.array(np.random.randint(0,10, size=(N, d)), dtype=np.float32)
inpdec = np.array(np.random.randint(0,10, size=(N, d)), dtype=np.float32)


In [22]:
# train quantizers indices, checking build time is good, but is not so much of an issue as should be done only once
%time isq8.train(iemb)
%time isq6.train(iemb)
%time isq4.train(iemb)

CPU times: user 13 µs, sys: 5 µs, total: 18 µs
Wall time: 19.8 µs
CPU times: user 1.33 ms, sys: 574 µs, total: 1.91 ms
Wall time: 1.4 ms
CPU times: user 6 µs, sys: 3 µs, total: 9 µs
Wall time: 10 µs


In [23]:
%time ipq8.train(emb)
%time ipq6.train(emb)
%time ipq4.train(emb)

CPU times: user 5min 5s, sys: 1min 30s, total: 6min 35s
Wall time: 52.6 s
CPU times: user 5min 3s, sys: 1min 27s, total: 6min 31s
Wall time: 51.9 s
CPU times: user 2min 52s, sys: 44.4 s, total: 3min 37s
Wall time: 28.8 s


In [24]:
isq8.metric_type

1

In [25]:
# check the search time on each quantizer, this is important, as long as there are (almost) no errors
# %time isq8.search (inpdec, k)
# there is an issue with the search of non-integers?
# I need to understand this
# when using np.int as dtype in iemb it fails to create the index, 
# when using np.int as dtype in the inpdec it fails during search
# when using both np.float32 (but both created as randint [0,...]) it fails during the metric computation -> was an error during initialization of the metric_type
# now it seems to break the python kernel .... It seems that I won't be able to use it  (and I really want to due to the int index properties)

# This happens only with the 8 bit one, not with smaller int representations

In [26]:
%time isq6.search (inpdec, k)

CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 1.62 ms


(array([[3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38],
        [3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38],
        [3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38],
        ...,
        [3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38],
        [3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38],
        [3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38]], dtype=float32),
 array([[-1, -1, -1, ..., -1, -1, -1],
        [-1, -1, -1, ..., -1, -1, -1],
        [-1, -1, -1, ..., -1, -1, -1],
        ...,
        [-1, -1, -1, ..., -1, -1, -1],
        [-1, -1, -1, ..., -1, -1, -1],
        [-1, -1, -1, ..., -1, -1, -1]]))

In [27]:
%time isq4.search (inpdec, k)


CPU times: user 19.3 ms, sys: 5.5 ms, total: 24.8 ms
Wall time: 3.79 ms


(array([[3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38],
        [3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38],
        [3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38],
        ...,
        [3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38],
        [3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38],
        [3.4028235e+38, 3.4028235e+38, 3.4028235e+38, ..., 3.4028235e+38,
         3.4028235e+38, 3.4028235e+38]], dtype=float32),
 array([[-1, -1, -1, ..., -1, -1, -1],
        [-1, -1, -1, ..., -1, -1, -1],
        [-1, -1, -1, ..., -1, -1, -1],
        ...,
        [-1, -1, -1, ..., -1, -1, -1],
        [-1, -1, -1, ..., -1, -1, -1],
        [-1, -1, -1, ..., -1, -1, -1]]))

In [28]:
%time _ = ipq8.search (npdec, k)
%time _ = ipq6.search (npdec, k)
%time _ = ipq4.search (npdec, k)

CPU times: user 9.41 ms, sys: 3.85 ms, total: 13.3 ms
Wall time: 2.92 ms
CPU times: user 291 µs, sys: 83 µs, total: 374 µs
Wall time: 376 µs
CPU times: user 96 µs, sys: 27 µs, total: 123 µs
Wall time: 126 µs


Testing several times the ipq4 is steady under ~300micro seconds, while ipq6 varies between 0.5 to 20 milliseconds and ipq8 seems steady around 20-30ms.  This is just a first idea and non-exhaustive experiment.

In [29]:
%%time
idxpq = faiss.IndexPQ(d, 16, 8)
# before training
# index.do_polysemous_training = True
idxpq.train(emb)

CPU times: user 5min 3s, sys: 1min 27s, total: 6min 31s
Wall time: 51.8 s


In [30]:
%%time
# before searching
# index.search_type = faiss.IndexPQ.ST_polysemous
# index.polysemous_ht = 54    # the Hamming threshold
pqs = idxpq.search (npdec, k)

CPU times: user 4.3 ms, sys: 1.23 ms, total: 5.52 ms
Wall time: 2.86 ms


IndexPQ seems to be at least 5 times faster than the linear kernel

In [31]:
#polysemous training kills the python kernel!!!
# %%time
# index = faiss.IndexPQ (d, 16, 8)
# # before training
# index.do_polysemous_training = True
# index.train(emb)

In [32]:
#polysemous training kills the python kernel!!!
# %%time

# # before searching
# index.search_type = faiss.IndexPQ.ST_polysemous
# index.polysemous_ht = 54    # the Hamming threshold
# index.search (npdec, k)

In [33]:
# d = 128
nlist = 100
m = 8                             # number of subquantizers
k = 4
quantizer = faiss.IndexFlatL2(d)  # this remains the same
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
                                    # 8 specifies that each sub-vector is encoded as 8 bits
index.train(xb)
index.add(xb)
D, I = index.search(xb[:5], k) # sanity check
print(I)
print(D)
index.nprobe = 10              # make comparable with experiment above
D, I = index.search(xq, k)     # search
print(I[-5:])

NameError: name 'xb' is not defined

In [34]:
# finite precision ERROR on BIG numbers

# example from: https://gist.github.com/mdouze/efc94c57e2302469287b9d1a2501d277

import numpy as np

a = np.array([12345], dtype='float32')
b = np.array([12343], dtype='float32')

diff = (a * a).sum() + (b * b).sum() - 2 * np.dot(a, b)

In [35]:
diff

0.0

In [36]:
(a * a).sum(), (b * b).sum(),  2 * np.dot(a, b)

(152399020.0, 152349650.0, 304748672.0)

In [37]:
(a * a).sum() + (b * b).sum()

304748670.0

In [38]:
2 * np.dot(a, b)

304748672.0