## FAISS intro 

Reproduction of the code in [FAISS tutorial](https://www.pinecone.io/learn/faiss-tutorial/)

In [1]:
import requests
from io import StringIO
import pandas as pd
import numpy as np

### Take the data

In [2]:
res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
# create dataframe
data = pd.read_csv(StringIO(res.text), sep='\t')
data.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [3]:
# we take all samples from both sentence A and B
sentences = data['sentence_A'].tolist()
sentences[:5]

['A group of kids is playing in a yard and an old man is standing in the background',
 'A group of children is playing in the house and there is no man standing in the background',
 'The young boys are playing outdoors and the man is smiling nearby',
 'The kids are playing outdoors near a man with a smile',
 'The young boys are playing outdoors and the man is smiling nearby']

In [4]:
# we take all samples from both sentence A and B
sentences = data['sentence_A'].tolist()
sentence_b = data['sentence_B'].tolist()
sentences.extend(sentence_b)  # merge them
len(set(sentences))  # together we have ~4.5K unique sentences

4802

In [5]:
# getting more data

urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

In [6]:
# each of these dataset have the same structure, so we loop through each creating our sentences data
for url in urls:
    res = requests.get(url)
    # extract to dataframe
    data_1 = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
    # add to columns 1 and 2 to sentences list
    sentences.extend(data_1[1].tolist())
    sentences.extend(data_1[2].tolist())

In [7]:
len(set(sentences))

14505

In [8]:
# removing nan and duplicated
sentences = pd.Series([word for word in list(set(sentences)) if type(word) is str])
len(sentences)

14504

In [9]:
# Creating a BERT model using sentence transformers

from sentence_transformers import SentenceTransformer
# initialize sentence transformer model
model = SentenceTransformer('bert-base-nli-mean-tokens')
# create sentence embeddings
sentence_embeddings = model.encode(sentences.to_list())
sentence_embeddings.shape

(14504, 768)

## Plan and simple similarity search using IndexFlatL2

In [10]:
import faiss

In [11]:
# get the vector size 
d = sentence_embeddings.shape[1]
d

768

In [12]:
# generating the index
index = faiss.IndexFlatL2(d)

In [13]:
# is the index trained?
index.is_trained

True

In [14]:
# addeding our embeddings (vectorized text)
index.add(sentence_embeddings)
# see the total
index.ntotal

14504

In [15]:
# searching similarities the k nearest neigbors between xq and our data 
k = 4
xq = model.encode(["Someone sprints with a football"])

In [16]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[ 9884   340 12395  5399]]
CPU times: user 17 ms, sys: 28 µs, total: 17 ms
Wall time: 15.4 ms


In [17]:
# verify our text
sentences.iloc[I[0]]

9884     A group of football players is running in the ...
340      A group of people playing football is running ...
12395            Two groups of people are playing football
5399     A person playing football is running past an o...
dtype: object

In [18]:
# extracting the numerical vector from faiss
# we have 4 vectors to return (k) - so we initialize a zero array to hold them
vecs = np.zeros((k, d))
# then iterate through each ID from I and add the reconstructed vector to our zero-array
for i, val in enumerate(I[0].tolist()):
    vecs[i, :] = index.reconstruct(val)

In [19]:
vecs.shape

(4, 768)

In [20]:
vecs[0][:100]

array([ 0.01627034,  0.22325909, -0.15037404, -0.30747247, -0.27122435,
       -0.10593174, -0.06460953,  0.04738246, -0.73349065, -0.37657726,
       -0.76762778,  0.16902889,  0.53107685,  0.51176643,  1.14415836,
       -0.08562846, -0.67240065, -0.96637088,  0.02545462, -0.2155983 ,
       -1.25656593, -0.82982177, -0.09825007, -0.21850885,  0.50610268,
        0.10527948,  0.50396878,  0.65242952, -1.39458692,  0.65847504,
       -0.21525346, -0.22487436,  0.81818366,  0.08464289, -0.76141709,
       -0.28928319, -0.09825802, -0.7304616 ,  0.07855845, -0.84354568,
       -0.59242058,  0.77471358, -1.20920539, -0.22757953, -1.30733609,
       -0.23081474, -1.31322527,  0.01629097, -0.97285479,  0.19308148,
        0.47424552,  1.18920887, -1.96741259, -0.70061088, -0.29638764,
        0.60533708,  0.62407476, -0.70340389, -0.86754155,  0.17673104,
       -0.19170557, -0.02951936,  0.22623543, -0.1669542 , -0.80402559,
       -0.45918953,  0.69675452, -0.24928206, -1.01478684, -0.92

## Partitioning the index (IVFFlat)

In [21]:
nlist = 50  # how many cells
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [22]:
index.is_trained

False

In [23]:
index.train(sentence_embeddings)
index.is_trained  # check if index is now trained

True

In [25]:
index.add(sentence_embeddings)
index.ntotal  # number of embeddings indexed

14504

In [26]:
%%time
D, I = index.search(xq, k)  # search
print(I)
# FlatL2 result [[ 9884   340 12395  5399]]

[[ 9884   340 12395  5399]]
CPU times: user 1.01 ms, sys: 973 µs, total: 1.98 ms
Wall time: 1.09 ms


In [27]:
# verify our text
sentences.iloc[I[0]]

9884     A group of football players is running in the ...
340      A group of people playing football is running ...
12395            Two groups of people are playing football
5399     A person playing football is running past an o...
dtype: object

In [28]:
# increasin the nprobe 
index.nprobe = 10

In [29]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[ 9884   340 12395  5399]]
CPU times: user 3.1 ms, sys: 2.57 ms, total: 5.67 ms
Wall time: 4.6 ms


In [30]:
# for the vector reconstruction we need to add direct map
index.make_direct_map()

In [31]:
index.reconstruct(7460)[:100]

array([-0.28991434,  0.54199064, -0.27298486,  0.10417995, -0.33969614,
       -0.4506219 ,  0.07291928, -0.3549848 , -0.8369879 , -0.32433224,
       -1.2478453 ,  0.10843043,  0.29072765,  0.21705268,  0.60392576,
        0.06538893, -0.6706807 , -0.90537196,  0.02427153, -0.24707308,
       -1.2672976 , -0.97827107,  0.15829204, -0.13239901,  0.7912185 ,
        0.51121765,  0.05511359,  0.7540963 , -1.5350547 ,  0.53369606,
       -0.04551981, -0.18737014,  0.6334436 ,  0.23146886, -1.0518087 ,
       -0.7283868 ,  0.82900846, -0.9575755 ,  0.272913  ,  0.21210498,
       -0.4041685 ,  0.977175  , -1.1903669 , -0.5085015 , -0.8816648 ,
       -0.03428968, -0.8209083 , -0.07884365, -0.637655  ,  0.01346982,
        0.7326747 ,  1.2296643 , -1.6466395 , -1.2337519 , -0.65219116,
        0.15154126,  0.5415216 , -0.52801603, -1.2121469 , -0.17640916,
       -1.4374218 , -0.17282893, -0.01257554, -0.3825809 , -0.3267509 ,
       -0.30086067,  0.15054575, -0.0587858 , -1.2200756 , -1.02

## Quantization

it works in this way
1. We split the original vector into several subvectors.
2. For each set of subvectors, we perform a clustering operation — creating multiple centroids for each sub-vector set.
3. In our vector of sub-vectors, we replace each sub-vector with the ID of it’s nearest set-specific centroid.

In [32]:
# the worflow
m = 8  # number of centroid IDs in final compressed vectors
bits = 8 # number of bits in each centroid

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits)

In [33]:
index.is_trained

False

In [34]:
# training the index
index.train(sentence_embeddings)

In [35]:
# adding our vectors
index.add(sentence_embeddings)

In [36]:
# setting the nbrobe
index.nprobe = 10  # align to previous IndexIVFFlat 

In [38]:
%%time
D, I = index.search(xq, k)
print(I)

# FlatL2 result [[ 9884   340 12395  5399]] (15.4 ms) 
# IVFFlat result nbrobe = 10 [[ 9884   340 12395  5399]] (4.6 ms)

[[ 340 1121 1304  174]]
CPU times: user 1.64 ms, sys: 0 ns, total: 1.64 ms
Wall time: 1.2 ms
