In [1]:
!uv pip install minsearch "qdrant-client[fastembed]>=1.14.2"

[2mAudited [1m2 packages[0m [2min 29ms[0m[0m


In [2]:
import minsearch

minsearch.__version__

'0.0.4'

In [3]:
import requests
import pandas as pd

url_prefix = (
    "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/"
)
docs_url = url_prefix + "search_evaluation/documents-with-ids.json"
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + "search_evaluation/ground-truth-data.csv"
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient="records")

In [48]:
ground_truth[0:3]

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'}]

In [51]:
set([q["course"] for q in ground_truth])

{'data-engineering-zoomcamp', 'machine-learning-zoomcamp', 'mlops-zoomcamp'}

In [55]:
[q["document"] for q in ground_truth][0:3]

['c02e79ef', 'c02e79ef', 'c02e79ef']

In [4]:
from tqdm.auto import tqdm


def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank]:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q["document"]
        results = search_function(q)
        relevance = [d["id"] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

## Built-in `minsearch` search

In [5]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "section", "text"], keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x136f0c560>

In [6]:
def minsearch_search(query, course):
    boost = {"question": 1.5, "section": 0.1}

    results = index.search(
        query=query, filter_dict={"course": course}, boost_dict=boost, num_results=5
    )

    return results

In [7]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q["document"]
    results = minsearch_search(query=q["question"], course=q["course"])
    relevance = [d["id"] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [8]:
hit_rate(relevance_total), mrr(relevance_total)

(0.848714069591528, 0.7288235717887772)

## Vector search from `minsearch`

In [9]:
from minsearch import VectorSearch

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [11]:
texts = []

for doc in documents:
    t = doc["question"]
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3), TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [12]:
df_docs = pd.DataFrame(X).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
0,0.20,0.27,0.25,0.33,0.25,0.23,0.25,0.22,0.30,0.23,...,0.10,0.11,0.09,0.28,0.25,0.14,0.19,0.22,0.01,0.20
1,-0.19,-0.34,-0.24,-0.21,-0.33,-0.14,-0.17,-0.24,-0.27,-0.08,...,0.00,0.06,0.08,0.10,0.08,0.12,0.45,0.29,0.01,-0.04
2,-0.10,-0.14,-0.11,-0.05,-0.05,0.14,-0.06,-0.08,-0.09,0.10,...,0.05,-0.08,-0.05,-0.27,-0.24,-0.08,0.10,0.13,-0.02,0.29
3,0.16,0.27,0.24,0.13,0.28,-0.01,0.13,0.22,0.18,0.11,...,-0.09,-0.09,-0.15,-0.22,-0.21,-0.14,0.34,0.23,-0.03,-0.09
4,-0.14,-0.17,-0.17,-0.02,-0.24,-0.18,0.05,-0.16,-0.24,0.03,...,-0.18,-0.09,-0.19,-0.05,-0.04,-0.03,-0.16,0.05,-0.02,0.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,-0.05,0.01,0.01,-0.04,0.01,-0.01,-0.01,-0.01,0.01,-0.04,...,-0.09,-0.04,-0.02,0.01,-0.01,0.00,-0.01,-0.09,-0.06,0.02
124,0.05,0.01,-0.01,-0.02,-0.04,0.01,0.08,-0.02,-0.01,-0.01,...,0.03,0.01,-0.01,-0.01,-0.01,-0.04,-0.11,0.02,0.01,0.05
125,0.04,-0.05,0.03,-0.06,-0.04,0.00,0.04,-0.04,-0.03,-0.04,...,-0.08,-0.02,-0.00,-0.01,-0.02,0.02,0.12,0.04,-0.03,0.12
126,0.03,0.01,-0.02,0.02,-0.01,-0.03,0.02,-0.04,-0.06,-0.04,...,0.06,-0.03,-0.02,-0.03,-0.02,-0.04,-0.08,-0.03,-0.02,0.04


In [13]:
vindex = VectorSearch(keyword_fields={"course"})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x13799a6f0>

In [26]:
def minsearch_vector_search(vector, course):
    return vindex.search(vector, filter_dict={"course": course}, num_results=5)


def question_text_vector(q):
    question = q["question"]
    course = q["course"]

    v_q = pipeline.transform([question])

    return minsearch_vector_search(v_q, course)

In [27]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q["document"]
    results = question_text_vector(q)
    relevance = [d["id"] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [16]:
hit_rate(relevance_total), mrr(relevance_total)

(0.48173762697212014, 0.3572833369353793)

In [17]:
texts = []

for doc in documents:
    t = doc["question"] + " " + doc["text"]
    texts.append(t)

X = pipeline.fit_transform(texts)

In [18]:
vindex = VectorSearch(keyword_fields={"course"})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x13819ac00>

In [19]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q["document"]
    results = question_text_vector(q)
    relevance = [d["id"] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [20]:
hit_rate(relevance_total), mrr(relevance_total)

(0.8210503566025502, 0.6717347453353508)

## `qdrant` search
First start up Docker of Qdrant server:
```bash
$ docker pull qdrant/qdrant

$ docker run -p 6333:6333 -p 6334:6334 \
   -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant
```

In [22]:
from qdrant_client import QdrantClient, models

client = QdrantClient("http://localhost:6333")  # connecting to local Qdrant instance

In [42]:
from fastembed import TextEmbedding
# TextEmbedding.list_supported_models()

In [29]:
import json

EMBEDDING_DIMENSIONALITY = 512

for model in TextEmbedding.list_supported_models():
    if model["dim"] == EMBEDDING_DIMENSIONALITY:
        print(json.dumps(model, indent=2))

{
  "model": "BAAI/bge-small-zh-v1.5",
  "sources": {
    "hf": "Qdrant/bge-small-zh-v1.5",
    "url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz",
    "_deprecated_tar_struct": true
  },
  "model_file": "model_optimized.onnx",
  "description": "Text embeddings, Unimodal (text), Chinese, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.",
  "license": "mit",
  "size_in_GB": 0.09,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}
{
  "model": "Qdrant/clip-ViT-B-32-text",
  "sources": {
    "hf": "Qdrant/clip-ViT-B-32-text",
    "url": null,
    "_deprecated_tar_struct": false
  },
  "model_file": "model.onnx",
  "description": "Text embeddings, Multimodal (text&image), English, 77 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year",
  "license": "mit",
  "size_in_GB": 0.25,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}
{
  "model": "jinaai/jina-embeddings-v2-small-e

In [30]:
model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5

In [36]:
texts = []

for doc in documents:
    t = doc["question"] + " " + doc["text"]
    texts.append(t)

In [31]:
# Define the collection name
collection_name = "zoomcamp-rag"

# Create the collection with specified vector parameters
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE,  # Distance metric for similarity search
    ),
)

True

In [41]:
# texts

In [40]:
points = []
id = 0

for doc in texts:
    point = models.PointStruct(
        id=id,
        vector=models.Document(
            text=doc, model=model_handle
        ),  # embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
        payload={
            "text": doc,
        },  # save all needed metadata fields
    )
    points.append(point)

    id += 1

In [56]:
client.upsert(collection_name=collection_name, points=points)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [58]:
def search(query, limit=5):
    results = client.query_points(
        collection_name=collection_name,
        query=models.Document(  # embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=query, model=model_handle
        ),
        limit=limit,  # top closest matches
        with_payload=True,  # to get metadata in the results
    )

    return results

In [82]:
search(ground_truth[0]["question"])

QueryResponse(points=[ScoredPoint(id=450, version=0, score=0.88004726, payload={'text': 'When does the next iteration start? The course is available in the self-paced mode too, so you can go through the materials at any time. But if you want to do it as a cohort with other students, the next iterations will happen in September 2023, September 2024 (and potentially other Septembers as well).'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=0, version=0, score=0.8777163, payload={'text': "Course - When will the course start? The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join th

In [80]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q["document"]
    results = search(q["question"], limit=limit).points
    relevance = [d.id == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [81]:
hit_rate(relevance_total), mrr(relevance_total)

(0.0, 0.0)