<a href="https://colab.research.google.com/github/junxnone/examples/blob/master/algo/faiss_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install faiss numpy

Collecting faiss
[?25l  Downloading https://files.pythonhosted.org/packages/bd/1c/4ae6cb87cf0c09c25561ea48db11e25713b25c580909902a92c090b377c0/faiss-1.5.3-cp36-cp36m-manylinux1_x86_64.whl (4.7MB)
[K     |████████████████████████████████| 4.7MB 9.1MB/s 
Installing collected packages: faiss
Successfully installed faiss-1.5.3


In [2]:
!apt install libomp-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libomp5
Suggested packages:
  libomp-doc
The following NEW packages will be installed:
  libomp-dev libomp5
0 upgraded, 2 newly installed, 0 to remove and 28 not upgraded.
Need to get 239 kB of archives.
After this operation, 804 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp5 amd64 5.0.1-1 [234 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp-dev amd64 5.0.1-1 [5,088 B]
Fetched 239 kB in 0s (2,114 kB/s)
Selecting previously unselected package libomp5:amd64.
(Reading database ... 132681 files and directories currently installed.)
Preparing to unpack .../libomp5_5.0.1-1_amd64.deb ...
Unpacking libomp5:amd64 (5.0.1-1) ...
Selecting previously unselected package libomp-dev.
Preparing to unpack .../libomp-dev_5.0.1-1_amd64.deb ...
Unpacking libomp-dev (5.0.

In [0]:
import faiss
import numpy as np

In [4]:
# Synthetic database: `nb` random vectors of dimension `d`, drawn uniformly
# from [0, 1) and stored as a float32 matrix with one vector per row.
d = 1024     # vector dimensionality
nb = 64      # number of database vectors
np.random.seed(1234)    # fixed seed so the random data is reproducible
xb = np.random.random(size=(nb, d)).astype(np.float32)
print(xb.shape)

(64, 1024)


# FlatL2

In [5]:
# Flat index over d-dimensional vectors, compared by L2 distance.
index = faiss.IndexFlatL2(d)
index.add(xb)              # insert every database vector
print(index.ntotal)        # how many vectors the index now holds

64


In [6]:
# Draw one random query vector and look up its 5 nearest database vectors.
t1 = np.random.random(size=(1, d)).astype(np.float32)
print(t1.shape)
D, I = index.search(t1, 5)    # D: distances, I: ids of the matching database rows
print(I)
print(D)

(1, 1024)
[[48 40 29 55 54]]
[[157.486   159.61536 159.80894 163.25558 164.56345]]


# FlatIP

In [7]:
# Same database as before, this time ranked by inner product.
index_ip = faiss.IndexFlatIP(d)
index_ip.add(xb)               # insert every database vector
print(index_ip.ntotal)

# Reuse the query vector t1 from the FlatL2 example.
print(t1.shape)
D, I = index_ip.search(t1, 5)  # top-5 ids (I) with their inner products (D)
print(I)
print(D)

64
(1, 1024)
[[40 53 54 51 29]]
[[274.67383 274.55933 273.45782 272.37405 271.29324]]


# IndexIVFFlat


In [8]:
nlist = 50    # number of inverted lists passed to the IVF index
k = 5         # neighbours requested per query

# The IVF index wraps a second, flat L2 index that serves as its quantizer.
quantizer = faiss.IndexFlatL2(d)
index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist)

# An IVF index has to be trained before vectors can be added.
assert not index_ivf.is_trained
index_ivf.train(xb)
assert index_ivf.is_trained

index_ivf.add(xb)

# First search uses the default nprobe (1).
D, I = index_ivf.search(t1, k)
print(I)    # ids of the k neighbours found for the single query t1

# Repeat the same query while probing progressively more lists.
for npb in (5, 10, 20):
    index_ivf.nprobe = npb
    D, I = index_ivf.search(t1, k)
    print("nprobe = {}：   I ： {}".format(npb, I))


[[48 59 24 -1 -1]]
nprobe = 5：   I ： [[48 29 54 38 27]]
nprobe = 10：   I ： [[48 40 29 54 38]]
nprobe = 20：   I ： [[48 40 29 55 54]]


> default nprobe = 1，只有第一个结果与 FlatL2 相同  
> nprobe = 10，前三个结果与 FlatL2 相同  
> nprobe = 20，则和 FlatL2 完全相同

# Kmeans

In [9]:
# Cluster the database vectors with faiss' k-means.
d = xb.shape[1]     # dimensionality taken from the data itself
ncentroids = 7      # number of clusters
niter = 200         # k-means iterations
verbose = True
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
kmeans.train(xb)    # last expression: the cell displays train()'s return value

4813.1504

In [10]:
# For every database vector, find its single nearest entry in kmeans.index.
D, I = kmeans.index.search(xb, 1)
# Transpose the (nb, 1) results so they print as one row.
print(I.T)    # matched id per vector
print(D.T)    # distance to that match

[[1 3 5 4 3 5 6 4 2 2 6 0 3 3 6 4 6 4 6 2 4 0 6 6 6 3 4 3 2 6 2 3 6 6 5 3
  4 4 3 6 6 4 3 5 4 3 2 2 1 2 6 5 6 6 3 2 2 4 2 3 4 2 5 3]]
[[39.16687  81.2424   71.83099  74.82605  80.40521  66.74335  77.26056
  78.24255  79.058105 76.436646 79.79193  39.23761  82.380615 80.03906
  78.1051   72.03485  78.605774 75.52905  87.35718  77.765625 79.619385
  39.237793 80.89673  78.6095   77.430176 73.90021  73.64612  83.806854
  80.81787  76.93744  71.69946  76.954895 82.90747  74.43231  71.99042
  80.73645  81.57654  75.703735 72.405396 81.13452  79.72034  74.12305
  81.19641  68.994934 82.51813  81.76337  78.07623  76.30304  39.166687
  79.2229   77.878296 69.590454 77.61139  79.16815  80.833496 77.67084
  76.64105  76.6427   79.579834 74.54474  80.09155  79.50922  72.045715
  79.75543 ]]


In [11]:
# Reverse lookup: for each centroid, find the nearest database vector.
index = faiss.IndexFlatL2(d)
index.add(xb)
print(kmeans.centroids.shape)
D, I = index.search(kmeans.centroids, 1)
print(I)    # one database row id per centroid

(7, 1024)
[[11]
 [ 0]
 [30]
 [38]
 [15]
 [ 5]
 [33]]


# PCA

In [17]:
# Reduce random 40-dimensional vectors to 10 dimensions with a PCA transform.
mt = np.random.rand(1000, 40).astype('float32')
mat = faiss.PCAMatrix(40, 10)
mat.train(mt)
assert mat.is_trained
tr = mat.apply_py(mt)

print(mt.shape)
print(tr.shape)
# Sum of squares per output column; printed to show that the magnitude of
# tr's columns is decreasing.
print((tr ** 2).sum(axis=0))

(1000, 40)
(1000, 10)
[114.77626  112.0039   108.320045 106.406044 103.98856  102.30232
 101.09535   98.141975  95.62195   95.1056  ]


# PQ encoding / decoding
## 1 ProductQuantizer 

In [26]:
d = 32     # data dimension
cs = 4     # code size (bytes)

# Training set.
nt = 10000
xt = np.random.rand(nt, d).astype('float32')

# Vectors to encode (could be the same as the training set).
n = 20000
x = np.random.rand(n, d).astype('float32')

# Product quantizer constructed from (d, cs, 8) and trained on xt.
pq = faiss.ProductQuantizer(d, cs, 8)
pq.train(xt)

codes = pq.compute_codes(x)    # encode
x2 = pq.decode(codes)          # decode back to the original dimension

# Reconstruction error: squared error relative to the squared norm of the data.
avg_relative_error = ((x - x2) ** 2).sum() / (x ** 2).sum()

# Report shapes, then element types, of input / codes / reconstruction.
for arr in (x, codes, x2):
    print(arr.shape)
for arr in (x, codes, x2):
    print(type(arr[0][0]))
print(avg_relative_error)

(20000, 32)
(20000, 4)
(20000, 32)
<class 'numpy.float32'>
<class 'numpy.uint8'>
<class 'numpy.float32'>
0.066108525


## 2 ScalarQuantizer

In [27]:
d = 32     # data dimension

# Training set.
nt = 10000
xt = np.random.rand(nt, d).astype('float32')

# Vectors to encode (could be the same as the training set).
n = 20000
x = np.random.rand(n, d).astype('float32')

# QT_8bit allocates 8 bits per dimension (QT_4bit also works).
sq = faiss.ScalarQuantizer(d, faiss.ScalarQuantizer.QT_8bit)
sq.train(xt)

codes = sq.compute_codes(x)    # encode
x2 = sq.decode(codes)          # decode back to float vectors

# Reconstruction error: squared error relative to the squared norm of the data.
avg_relative_error = ((x - x2) ** 2).sum() / (x ** 2).sum()

# Report shapes, then element types, of input / codes / reconstruction.
for arr in (x, codes, x2):
    print(arr.shape)
for arr in (x, codes, x2):
    print(type(arr[0][0]))
print(avg_relative_error)


(20000, 32)
(20000, 32)
(20000, 32)
<class 'numpy.float32'>
<class 'numpy.uint8'>
<class 'numpy.float32'>
3.8492058e-06
