In [1]:
import numpy as np
from fastembed import TextEmbedding

## 1. Embedding the Query

Embed the query: 'I just discovered the course. Can I join now?'. Use the 'jinaai/jina-embeddings-v2-small-en' model.

You should get a numpy array of size 512.

What's the minimal value in this array?

In [4]:
dir(TextEmbedding)

['EMBEDDINGS_REGISTRY',
 'METADATA_FILE',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__firstlineno__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__static_attributes__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_get_model_description',
 '_list_supported_models',
 'add_custom_model',
 'decompress_to_cache',
 'download_file_from_gcs',
 'download_files_from_huggingface',
 'download_model',
 'embed',
 'embedding_size',
 'get_embedding_size',
 'list_supported_models',
 'passage_embed',
 'query_embed',
 'retrieve_model_gcs']

In [3]:
 TextEmbedding.list_supported_models()

[{'model': 'BAAI/bge-base-en',
  'sources': {'hf': 'Qdrant/fast-bge-base-en',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.42,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model': 'BAAI/bge-base-en-v1.5',
  'sources': {'hf': 'qdrant/bge-base-en-v1.5-onnx-q',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.21,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model':

In [13]:
MODEL = 'jinaai/jina-embeddings-v2-small-en'

embedding_generator = TextEmbedding(MODEL)

dir(embedding_generator)

['EMBEDDINGS_REGISTRY',
 'METADATA_FILE',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__firstlineno__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__static_attributes__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_embedding_size',
 '_get_model_description',
 '_list_supported_models',
 '_local_files_only',
 'add_custom_model',
 'cache_dir',
 'decompress_to_cache',
 'download_file_from_gcs',
 'download_files_from_huggingface',
 'download_model',
 'embed',
 'embedding_size',
 'get_embedding_size',
 'list_supported_models',
 'model',
 'model_name',
 'passage_embed',
 'query_embed',
 'retrieve_model_gcs',
 'threads']

In [10]:
QUERY_1 = 'I just discovered the course. Can I join now?'

EMBEDDINGS_1 = list(embedding_generator.embed([QUERY_1]))[0]

[np.float64(-0.11726373551188797), np.float64(-0.11390704900216968), np.float64(-0.10993315241766108), np.float64(-0.10002002566126772), np.float64(-0.0990923940127248), np.float64(-0.09791743140235433), np.float64(-0.09760669466036077), np.float64(-0.09484544406877056), np.float64(-0.0900752289323632), np.float64(-0.08873296941672207), np.float64(-0.08835649651724337), np.float64(-0.08776025423533446), np.float64(-0.08585818588643464), np.float64(-0.08533977218377005), np.float64(-0.08251141106770632), np.float64(-0.0803499849369055), np.float64(-0.07802898795843768), np.float64(-0.07704189620264709), np.float64(-0.07648079261156313), np.float64(-0.07639463728739372), np.float64(-0.07638033962177995), np.float64(-0.07548960693705196), np.float64(-0.07305555140645403), np.float64(-0.07302608177518038), np.float64(-0.07302301320380362), np.float64(-0.07283665725790175), np.float64(-0.07282576410966378), np.float64(-0.07180559953250298), np.float64(-0.0714069569985842), np.float64(-0.070

In [16]:
# Check the type of EMBEDDINGS_1
print(f"Type of EMBEDDINGS_1: {type(EMBEDDINGS_1)}")

# If it's a numpy array, sorted() converts it to a list first
# Let's see the first few and last few values when sorted
sorted_embeddings = sorted(EMBEDDINGS_1)
print(f"\nFirst 5 smallest values: {sorted_embeddings[:5]}")
print(f"Last 5 largest values: {sorted_embeddings[-5:]}")

# The minimum value (first element after sorting)
print(f"\nMinimum value using sorted()[0]: {sorted_embeddings[0]}")

# More efficient way using numpy
if isinstance(EMBEDDINGS_1, np.ndarray):
    print(f"Minimum value using np.min(): {np.min(EMBEDDINGS_1)}")
else:
    # Convert to numpy array first
    arr = np.array(EMBEDDINGS_1)
    print(f"Minimum value using np.min(): {np.min(arr)}")

Type of EMBEDDINGS_1: <class 'numpy.ndarray'>

First 5 smallest values: [np.float64(-0.11726373551188797), np.float64(-0.11390704900216968), np.float64(-0.10993315241766108), np.float64(-0.10002002566126772), np.float64(-0.0990923940127248)]
Last 5 largest values: [np.float64(0.10176959273633132), np.float64(0.1055643948512088), np.float64(0.11117786806311172), np.float64(0.12317111034564432), np.float64(0.13307955253468784)]

Minimum value using sorted()[0]: -0.11726373551188797
Minimum value using np.min(): -0.11726373551188797


# Cosine similarity

The vectors that our embedding model returns are already normalized: their length is 1.0.

You can check that by using the norm function:

import numpy as np
np.linalg.norm(q)

Which means that we can simply compute the dot product between two vectors to learn the cosine similarity between them.

For example, if you compute the cosine of the query vector with itself, the result will be 1.0:

q.dot(q)

## Q2. Cosine similarity with another vector

Now let's embed this document:

doc = 'Can I still join the course after the start date?'

What's the cosine similarity between the vector for the query and the vector for the document?

In [18]:
QUERY_2 = 'Can I still join the course after the start date?'
Q2_DOCS = [QUERY_2]
EMBEDDINGS_2 = list(embedding_generator.embed(Q2_DOCS))[0]

print(EMBEDDINGS_1.dot(EMBEDDINGS_2))

0.9008528856818037


## Q3. Ranking by cosine

For Q3 and Q4 we will use these documents:

In [19]:
documents = [{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'}]

Compute the embeddings for the text field, and compute the cosine between the query vector and all the documents.

What's the document index with the highest similarity? (Indexing starts from 0):

In [23]:
Q3_EMBEDDINGS = list(embedding_generator.embed([d['text'] for d in documents]))

similarities = [EMBEDDINGS_1.dot(e) for e in Q3_EMBEDDINGS]

print(similarities)

[np.float64(0.7629684493123693), np.float64(0.8182378361919107), np.float64(0.8085397290762828), np.float64(0.7133078539597724), np.float64(0.7304499528359614)]


Looks like Document 1 is the winner.

### Alternative elegant solution using numpy and batch embedding:

In [24]:
# More elegant solution leveraging numpy and fastembed features

# Extract all document texts efficiently
doc_texts = [doc['text'] for doc in documents]

# Embed all documents in one batch - fastembed returns a generator
doc_embeddings = np.array(list(embedding_generator.embed(doc_texts)))

# Compute all cosine similarities at once using matrix multiplication
# Since embeddings are normalized, dot product = cosine similarity
cosine_similarities = doc_embeddings @ EMBEDDINGS_1

# Find the index with highest similarity
best_match_idx = np.argmax(cosine_similarities)

# Display results in a clean format
print(f"Query: '{QUERY_1}'")
print(f"\nCosine similarities:")
for idx, (sim, doc) in enumerate(zip(cosine_similarities, documents)):
    print(f"[{idx}] {sim:.4f} - {doc['question']}")
    
print(f"\nBest match: Document {best_match_idx}")
print(f"Question: {documents[best_match_idx]['question']}")
print(f"Similarity: {cosine_similarities[best_match_idx]:.4f}")

# Verify it matches your result
assert best_match_idx == np.argmax(similarities), "Results should match!"
print("\n✓ Results match your implementation!")

Query: 'I just discovered the course. Can I join now?'

Cosine similarities:
[0] 0.7630 - Course - Can I still join the course after the start date?
[1] 0.8182 - Course - Can I follow the course after it finishes?
[2] 0.8085 - Course - When will the course start?
[3] 0.7133 - Course - What can I do before the course starts?
[4] 0.7304 - How can we contribute to the course?

Best match: Document 1
Question: Course - Can I follow the course after it finishes?
Similarity: 0.8182

✓ Results match your implementation!


## Q4. Ranking by cosine, version two

Now let's calculate a new field, which is a concatenation of question and text:

full_text = doc['question'] + ' ' + doc['text']

Embed this field and compute the cosine between it and the query vector. What's the highest scoring document?

In [30]:
# Extract all document texts efficiently
doc_texts2 = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Embed all documents in one batch - fastembed returns a generator
doc_embeddings2 = np.array(list(embedding_generator.embed(doc_texts2)))

# Compute all cosine similarities at once using matrix multiplication
# Since embeddings are normalized, dot product = cosine similarity
cosine_similarities = doc_embeddings2 @ EMBEDDINGS_1

# Find the index with highest similarity
best_match_idx = np.argmax(cosine_similarities)

# Display results in a clean format
print(f"Query: '{QUERY_1}'")
print(f"\nCosine similarities:")
for idx, (sim, doc) in enumerate(zip(cosine_similarities, documents)):
    print(f"[{idx}] {sim:.4f} - {doc['question']}")
    
print(f"\nBest match: Document {best_match_idx}")
print(f"Question: {documents[best_match_idx]['question']}")
print(f"Similarity: {cosine_similarities[best_match_idx]:.4f}")

# Verify it matches your result
# assert best_match_idx == np.argmax(similarities), "Results should match!"
print("\n✓ Results match your implementation!")

Query: 'I just discovered the course. Can I join now?'

Cosine similarities:
[0] 0.8515 - Course - Can I still join the course after the start date?
[1] 0.8437 - Course - Can I follow the course after it finishes?
[2] 0.8408 - Course - When will the course start?
[3] 0.7755 - Course - What can I do before the course starts?
[4] 0.8086 - How can we contribute to the course?

Best match: Document 0
Question: Course - Can I still join the course after the start date?
Similarity: 0.8515

✓ Results match your implementation!


## Q5. Selecting the embedding model

Now let's select a smaller embedding model. What's the smallest dimensionality for models in fastembed?

One of these models is BAAI/bge-small-en. Let's use it.

In [32]:
print(sorted([m['dim'] for m in  TextEmbedding.list_supported_models()]))

[384, 384, 384, 384, 384, 384, 512, 512, 512, 768, 768, 768, 768, 768, 768, 768, 768, 768, 768, 768, 768, 768, 768, 768, 1024, 1024, 1024, 1024, 1024, 1024]


In [37]:
print([m for m in TextEmbedding.list_supported_models() if m['model'] == "BAAI/bge-small-en"])

[{'model': 'BAAI/bge-small-en', 'sources': {'hf': 'Qdrant/bge-small-en', 'url': 'https://storage.googleapis.com/qdrant-fastembed/BAAI-bge-small-en.tar.gz', '_deprecated_tar_struct': True}, 'model_file': 'model_optimized.onnx', 'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.', 'license': 'mit', 'size_in_GB': 0.13, 'additional_files': [], 'dim': 384, 'tasks': {}}]


## Q6. Indexing with qdrant (2 points)

For the last question, we will use more documents.

We will select only FAQ records from our ml zoomcamp:


In [38]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()


documents = []

for course in documents_raw:
    course_name = course['course']
    if course_name != 'machine-learning-zoomcamp':
        continue

    for doc in course['documents']:
        doc['course'] = course_name
        doc['qa_text'] = doc['question'] + ' ' + doc['text']
        documents.append(doc)


In [42]:
documents[1]

{'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
 'section': 'General course-related questions',
 'question': 'Is it going to be live? When?',
 'course': 'machine-learning-zoomcamp'}

-----
Add them to qdrant using the model form Q5.

When adding the data, use both question and answer fields:

text = doc['question'] + ' ' + doc['text']

After the data is inserted, use the question from Q1 for querying the collection.

What's the highest score in the results? (The score for the first returned record):

In [39]:
from qdrant_client import QdrantClient, models

In [40]:
client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance

In [43]:
collection_name = "mlz-rag"

# Create the collection with specified vector parameters
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=384,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

True

In [None]:
points = []
id = 0

for course in documents:
    point = models.PointStruct(
        id=id,
        vector=models.Document(text=doc['qa_text'], model=model_handle),
        payload={
            "text": doc['qa_text'],
        } #save all needed metadata fields
    )
    points.append(point)

    id += 1
