# Vector search with sentence transformers and FAISS
`conda install faiss-gpu sentence-transformers transformers datasets accelerate`

In [None]:
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import torch
import numpy as np
import os
import math

In [None]:
# Download the Tiny Shakespeare corpus once and cache it on disk, then load it into `text`.
# urllib is used (imported locally, as this cell is self-contained) because the
# `requests` name used previously was never imported anywhere in the notebook.
from urllib.request import urlopen

filename = 'shakespeare.txt'
if not os.path.exists(filename):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    # Fetch the full body BEFORE opening the cache file: if the download fails we must not
    # leave an empty file behind, because later runs would see it and skip the download.
    with urlopen(data_url, timeout=60) as response:
        corpus = response.read().decode('utf-8')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(corpus)

# Explicit encoding so the result does not depend on the platform's default locale.
with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# One sample per speech: the corpus is split wherever a blank line occurs.
samples = text.split('\n\n')

# Since all the lines look like "CHARACTERNAME: Their lines", cut off the character name so the
# model doesn't see that part. Split on the FIRST colon only: the speech itself may contain
# colons, and taking element [1] of a full split would drop everything after the second one.
# A sample with no colon at all is kept whole (instead of raising IndexError) so that
# clean_samples stays index-aligned with samples.
clean_samples = []
for sample in samples:
    _, colon, speech = sample.partition(':')
    clean_samples.append(speech.strip() if colon else sample.strip())
num_samples = len(samples)

In [None]:
# Load the pretrained sentence encoder; its embeddings are meant to be compared with a dot product.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Width of the model's embedding vector; the vector index must be created with this dimension.
dmodel = model.get_sentence_embedding_dimension()
print(dmodel)

In [None]:
# Precompute embeddings for the entire dataset.
# Pick the device at runtime so this cell also works on CPU-only machines.
embed_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Encode all samples in a single call so the library can batch them, rather than running the
# model once per sample. The result is a 2-D array with one row per entry of clean_samples;
# len(embeddings) and np.stack(embeddings) behave the same as with a list of 1-D vectors.
embeddings = model.encode(
    clean_samples,
    convert_to_numpy=True,
    device=embed_device,
    show_progress_bar=True,
)

In [None]:
import faiss

# Coarse quantizer that assigns each vector to a partitioning cell. It uses inner product so that
# cell assignment agrees with the METRIC_INNER_PRODUCT passed to the IVF index below.
quantizer = faiss.IndexFlatIP(dmodel)

# Set this to true to use an approximate, quantized index. Practically it's for massive datasets, which this is not.
do_quantized = False

# Number of partitioning cells. sqrt(n), where n is the dataset size, is used here
# (some guides suggest a multiple of it for larger datasets). Never go below one cell.
nlist = max(1, int(math.sqrt(num_samples)))

# Inner product is FAISS's counterpart of a dot-product score. If the embeddings are
# unit-normalized it is also the cosine similarity (TODO: confirm the model normalizes its output).
if do_quantized:
    # This index quantizes to reduce memory usage.
    m = 8  # Number of sub quantizers; dmodel must be divisible by m
    bits_per_ix = 8  # Bits used to encode each sub-vector

    index = faiss.IndexIVFPQ(quantizer, dmodel, nlist, m, bits_per_ix, faiss.METRIC_INNER_PRODUCT)
else:
    # This index uses full vectors and is more accurate and memory intensive.
    index = faiss.IndexIVFFlat(quantizer, dmodel, nlist, faiss.METRIC_INNER_PRODUCT)

# Move the index to GPU 0 when CUDA is available and this faiss build ships GPU support
# (CPU-only builds have no StandardGpuResources).
# This can be slow the first time it runs b/c CUDA kernels need to build.
if torch.cuda.is_available() and hasattr(faiss, "StandardGpuResources"):
    gpu_r = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(gpu_r, 0, index)

# IVF indexes must be trained before any vector can be added; the whole dataset is used
# as the training set.
print("Training index...")
index.train(np.stack(embeddings))
print("Index is trained?:", index.is_trained)

In [None]:
# add_with_ids lets you store an arbitrary integer id with each vector for retrieval.
# Useful if other metadata associated with the vectors must be retrieved from elsewhere.
# In this case we're just pairing each vector with its position in the samples list so we can print it.
# FAISS ids are 64-bit integers; request int64 explicitly because np.arange's default
# integer type is platform dependent.
sample_ids = np.arange(len(embeddings), dtype=np.int64)
index.add_with_ids(np.stack(embeddings), sample_ids)
print("Added", index.ntotal, "items")

In [None]:
# Run one example query against the index and print the ten best matching speeches.
index.nprobe = 16 # Probe this many partitioning cells
query = "Look, he's winding up the watch of his wit"
# Encode as a batch of one so the result is 2-D, which is what index.search expects.
query_emb = model.encode([query], convert_to_tensor=True)
dists, idxs = index.search(query_emb.cpu().numpy(), 10)
for d, x in zip(dists[0], idxs[0]):
    # FAISS fills unused result slots with id -1 when the probed cells hold fewer than k
    # vectors; skip them, otherwise samples[-1] would be printed as if it were a real hit.
    if x < 0:
        continue
    print(f"Score: {d}")
    # Print the original sample (speaker name included), not the cleaned text that was embedded.
    print(samples[x])
    print("=========================")