In [3]:
# Install the notebook's dependencies into the running kernel's environment (-q keeps pip quiet).
# pip may ask for a kernel restart afterwards so the updated packages are picked up.
# NOTE(review): versions are unpinned — consider pinning for reproducible runs.
%pip install sentence-transformers jupyter ipywidgets faiss-cpu -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


Generate embeddings

In [20]:
import sentence_transformers

# Load the 'all-MiniLM-L6-v2' sentence-embedding model by name; it is reused
# below for both the corpus sentences and the search query.
model = sentence_transformers.SentenceTransformer('all-MiniLM-L6-v2')

# Small demo corpus: ten unrelated one-sentence facts.
texts = [
    'The cat sat on the windowsill and watched the birds fly by.',
    'Quantum computers have the potential to revolutionize cryptography.',
    'Bananas are an excellent source of potassium and dietary fiber.',
    'Mount Everest is the highest mountain above sea level on Earth.',
    'The Mona Lisa is one of the most famous paintings in the world.',
    'A solar eclipse occurs when the Moon passes between the Earth and the Sun.',
    'Venus is the hottest planet in our solar system.',
    'The stock market fluctuates based on investor sentiment and economic data.',
    'Octopuses have three hearts and blue blood.',
    'The Great Wall of China was built to protect against invasions.',
]

# One external ID per sentence, in the same order as `texts`:
# a contiguous run 4637834 .. 4637843.
ids = list(range(4637834, 4637834 + len(texts)))

# Encode all sentences in one batch; the result has one row per input text.
embeddings = model.encode(texts)
print("Embeddings shape:", embeddings.shape)  # (10, 384) here: 10 texts x 384 dims for all-MiniLM-L6-v2

# The indexing step pairs row n of `embeddings` with ids[n], so fail early
# if the three collections ever drift out of step.
assert embeddings.shape[0] == len(texts) == len(ids), "texts, ids and embeddings must align"

for i, embedding in enumerate(embeddings):
    print(f"Sentence {i}: {embedding[:5]}...")  # Only the first 5 dimensions, to keep the output short

Embeddings shape: (10, 384)
Sentence 0: [ 0.12968221  0.01845985 -0.00748047  0.06867585 -0.02418645]...
Sentence 1: [-0.07364395  0.00870077 -0.04692024  0.03435629 -0.11529234]...
Sentence 2: [-0.03886006 -0.03654213  0.04637926  0.05234018 -0.02747391]...
Sentence 3: [-0.0600652   0.04444657 -0.02500929 -0.05286716 -0.06148164]...
Sentence 4: [ 0.00511048 -0.02563424  0.0117868   0.04150654 -0.02125267]...
Sentence 5: [ 0.00020727  0.04129593  0.0580074   0.03317517 -0.03564569]...
Sentence 6: [ 0.00910144 -0.06488722 -0.03345207  0.01376     0.07430761]...
Sentence 7: [ 0.06743481 -0.0599446   0.03064492  0.06001861 -0.01442595]...
Sentence 8: [ 0.01835446  0.01416127 -0.01720778  0.04222289 -0.07529112]...
Sentence 9: [-0.02330099  0.13033783  0.05522298  0.00189558  0.05454559]...


Store them

In [21]:
import faiss
import numpy as np

# Exact L2 index sized to the embedding width (last axis of the matrix).
dim = embeddings.shape[-1]
index = faiss.IndexFlatL2(dim)

# Wrap the flat index so every vector is stored under our own ID.
index_with_ids = faiss.IndexIDMap(index)

# IDs are passed as an int64 array, one entry per embedding row.
ids_np = np.asarray(ids, dtype=np.int64)
index_with_ids.add_with_ids(embeddings, ids_np)


In [23]:
# Nearest-neighbour lookup for a single free-text query.
k = 2  # how many neighbours to retrieve
query = "eiffiel"

# encode() is given a one-element batch, so the result is a single-row matrix.
query_embedding = model.encode([query])

# search() returns two arrays with one row per query: the distances and the
# matching entries' stored IDs (the values passed to add_with_ids).
dist, i = index_with_ids.search(query_embedding, k)

print("Distances:", dist)  # one distance per returned neighbour
print("Indices:", i)  # the neighbours' stored IDs, not row positions

Distances: [[1.6708156 1.8310041]]
Indices: [[4637838 4637842]]


In [15]:
# Append new items to the index
new_texts = [
    'The Eiffel Tower is located in Paris, France.',
    'Artificial intelligence is transforming various industries.'
]

new_embeddings = model.encode(new_texts)

# Continue the ID sequence right after the largest ID assigned so far.
# NOTE: re-running this cell adds the same vectors again under the same IDs.
next_id = max(ids) + 1
new_ids_np = np.arange(next_id, next_id + len(new_texts), dtype=np.int64)

# Add through the ID-mapping wrapper, not the inner flat `index`: adding to
# `index` directly would store the vectors without registering any ID for
# them, leaving the wrapper's ID table out of step with the stored vectors.
index_with_ids.add_with_ids(new_embeddings, new_ids_np)