In [1]:
import sys
# Put the parent directory at the front of the module search path.
# NOTE(review): presumably so the local `docs` module imported further down resolves — confirm.
sys.path.insert(0, '..')

In [None]:
import pandas as pd

# Load the ground-truth CSV; the path is relative to the current working directory.
df_ground_truth = pd.read_csv('./ground_truth_evidently.csv')

# One dict per CSV row, keyed by column name. Later cells read the
# 'question' and 'filename' keys of each record.
ground_truth = df_ground_truth.to_dict(orient='records')

In [5]:
import docs

# Three-step pipeline from the local `docs` module: read -> parse -> chunk.
# NOTE(review): the helpers are defined outside this notebook; going by the
# names they fetch repository data, parse it into documents and split those
# into chunks — confirm against docs.py.
raw_documents = docs.read_github_data()
documents = docs.parse_data(raw_documents)
chunks = docs.chunk_documents(documents)

In [6]:

from minsearch import Index
from typing import Any, Dict, List, TypedDict

# Text index over the chunks; these four keys are registered as its text fields.
index = Index(
    text_fields=["content", "filename", "title", "description"],
)

# Left as the cell's last expression, so the value returned by fit() is
# what the notebook displays as this cell's output.
index.fit(chunks)

<minsearch.minsearch.Index at 0x78b1d0f79400>

In [7]:

class SearchResult(TypedDict):
    """Typed shape of a single search result entry (a plain dict at runtime)."""
    # Starting position or offset of the excerpt within the source file.
    start: int
    # Text excerpt or snippet of the matched document.
    content: str
    # Title of the matched document.
    title: str
    # Short description of the matched document.
    description: str
    # Path or name of the source file; used as the document id during evaluation.
    filename: str


def search(query: str, num_results: int = 5) -> List[SearchResult]:
    """
    Search the text index for documents matching the given query.

    Args:
        query (str): The search query string.
        num_results (int): Maximum number of results to request from the index.
            Defaults to 5, the value that used to be hard-coded, so existing
            single-argument calls behave exactly as before.

    Returns:
        List[SearchResult]: A list of search results. Each result dictionary contains:
            - start (int): The starting position or offset within the source file.
            - content (str): A text excerpt or snippet containing the match.
            - title (str): The title of the matched document.
            - description (str): A short description of the document.
            - filename (str): The path or name of the source file.
    """
    return index.search(
        query=query,
        num_results=num_results,
    )

In [8]:
# For every ground-truth record, run the text search on its question and flag,
# result by result, whether the hit comes from the expected file.
all_relevancies = []

for record in ground_truth:
    expected_filename = record['filename']
    hits = search(record['question'])
    all_relevancies.append(
        [hit['filename'] == expected_filename for hit in hits]
    )

In [9]:
# Hand-rolled hit rate and mean reciprocal rank over `all_relevancies`.
count = 0
# Deliberately not called `mrr`: a function with that name is defined later in
# the notebook, and re-running this cell would otherwise rebind it to a float
# and break every subsequent `evaluate(...)` call.
reciprocal_rank_sum = 0

for relevancy in all_relevancies:
    for rank, r in enumerate(relevancy):
        if r:
            # Only the first relevant result of a query counts.
            count = count + 1
            reciprocal_rank_sum = reciprocal_rank_sum + 1 / (rank + 1)
            break

# (hit rate, MRR) — displayed as the cell output.
count / len(all_relevancies), reciprocal_rank_sum / len(all_relevancies)

(0.4053452115812918, 0.3303266518188565)

In [10]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: One sequence of booleans per query; an entry is True
            when the result at that position is the expected document.

    Returns:
        float: Number of queries with a relevant result divided by the number
        of queries, or 0.0 when `relevance_total` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / total


def mrr(relevance_total):
    """Return the mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One sequence of booleans per query, ordered by rank;
            an entry is True when the result at that rank is the expected
            document.

    Returns:
        float: Average over all queries of 1 / rank of the first relevant
        result (ranks start at 1). Queries without a relevant result
        contribute 0. Returns 0.0 when `relevance_total` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                # Only the best-ranked relevant result counts for a query.
                total_score = total_score + 1 / rank
                break

    return total_score / total

In [11]:
from tqdm.auto import tqdm 

def evaluate(
        ground_truth,
        search_function,
        question_column='question',
        id_column='filename'
):
    """Score a search function against ground-truth records.

    Args:
        ground_truth: Iterable of records; each must provide `question_column`
            and `id_column`.
        search_function: Callable taking the question and returning a list of
            results, each of which provides `id_column`.
        question_column: Key holding the question text in a record.
        id_column: Key identifying the expected document, read from both the
            record and every result.

    Returns:
        dict: `{'hit_rate': ..., 'mrr': ...}` computed over all records.
    """
    def _relevance_flags(record):
        # True at every position whose result carries the expected id.
        expected_id = record[id_column]
        found = search_function(record[question_column])
        return [item[id_column] == expected_id for item in found]

    relevance_total = [_relevance_flags(record) for record in tqdm(ground_truth)]

    metrics = {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }
    return metrics

In [12]:
# Baseline: score the text search (`search`) against the ground truth.
evaluate(ground_truth, search)

  0%|          | 0/449 [00:00<?, ?it/s]

{'hit_rate': 0.4053452115812918, 'mrr': 0.3303266518188565}

## Vector search

In [13]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the notebook uses it to encode both chunks and queries.
# NOTE(review): loading by model name may download weights on first use — confirm.
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [14]:
import numpy as np
from tqdm.auto import tqdm

# Encode every chunk into a vector, one chunk per `encode` call.
chunk_vectors = []

for chunk in tqdm(chunks):
    # Title, description and content are joined with single spaces; a missing
    # key counts as an empty string, and outer whitespace is trimmed.
    parts = (
        chunk.get('title', ''),
        chunk.get('description', ''),
        chunk.get('content', ''),
    )
    chunk_vectors.append(embedding_model.encode(' '.join(parts).strip()))

# Stack the per-chunk vectors into one array, in the same order as `chunks`.
embeddings = np.array(chunk_vectors)

  0%|          | 0/575 [00:00<?, ?it/s]

In [15]:
from minsearch import VectorSearch

In [16]:
# Vector index created with default settings; it is given its data by a later `fit` call.
vindex = VectorSearch()

In [17]:
# Hand the index the vectors together with their chunks; embeddings[i] was
# computed from chunks[i], so the two are aligned by position.
vindex.fit(embeddings, chunks)

<minsearch.vector.VectorSearch at 0x78b0af4e58b0>

In [18]:
def vsearch(query: str, num_results: int = 5) -> List[SearchResult]:
    """
    Search the vector index for chunks closest to the embedded query.

    Args:
        query (str): The search query string; it is encoded with
            `embedding_model` before the lookup.
        num_results (int): Maximum number of results to request from the index.
            Defaults to 5, the value that used to be hard-coded, so existing
            single-argument calls behave exactly as before.

    Returns:
        List[SearchResult]: The results returned by `vindex.search`.
    """
    q = embedding_model.encode(query)
    return vindex.search(
        query_vector=q,
        num_results=num_results,
    )

In [20]:
# Score the vector search (`vsearch`) on the same ground truth as the text baseline.
evaluate(ground_truth, vsearch)

  0%|          | 0/449 [00:00<?, ?it/s]

{'hit_rate': 0.7371937639198218, 'mrr': 0.5797327394209352}

## Hybrid search

In [21]:
def hsearch(query: str) -> List[SearchResult]:
    """
    Combine vector and text search results for one query.

    The vector results come first, followed by the text results; the two
    lists are simply concatenated.

    NOTE(review): nothing removes entries found by both searches, and the
    combined list is longer than either individual result list, so metrics
    computed on it are not directly comparable with those of `search` or
    `vsearch` alone — confirm this is intended.
    """
    vector_hits = vsearch(query)
    text_hits = search(query)
    return vector_hits + text_hits

In [22]:
# Score the hybrid search (`hsearch`) on the same ground truth.
evaluate(ground_truth, hsearch)

  0%|          | 0/449 [00:00<?, ?it/s]

{'hit_rate': 0.7817371937639198, 'mrr': 0.5864690493866439}