Evaluating retrieval performance:
- Ground truth: article id(s) per query
- Retrieval: article ids of closest chunks (e.g., top 5)


In [None]:
import sys

sys.path.append("../../.")

from qdrant_client import QdrantClient
# from evidently.ui.workspace import RemoteWorkspace
import requests
from src.utils.logger import get_logger
import pandas as pd
import mlflow


logger = get_logger(__name__)


TOP_K = 5
QDRANT_URL = "http://localhost:6333"
EMBEDDING_SVC_URL = "http://127.0.0.1:5002/invocations" # mlflow serving container endoint
# EMBEDDING_SVC_URL = "http://127.0.0.1:5002/invocations"
# EVIDENTLY_SVC_URL = "http://localhost:8080"
COLLECTION_NAME = "wixqa_corpus"
EVIDENTLY_PROJECT_NAME = "wixqa"
EXPERIMENT_NAME = "wixqa_expertwritten_retrieval_evaluation"
mlflow.set_tracking_uri("http://localhost:5000")


qdrant_client = QdrantClient(url=QDRANT_URL)
# ws = RemoteWorkspace(EVIDENTLY_SVC_URL)

# has_project = any(p.name == EVIDENTLY_PROJECT_NAME for p in ws.list_projects())
# if not has_project:
#     logger.info(f"Project {EVIDENTLY_PROJECT_NAME} does not exist. Creating...")
#     project = ws.create_project(name=EVIDENTLY_PROJECT_NAME)
#     project.save()
#     logger.info(f"Project '{project.name}' created with ID: {project.id}")
#     project = ws.get_project(project.id)
#     logger.info(f"Connected to project.")

# else:
#     logger.info(f"Found existing project with name: {EVIDENTLY_PROJECT_NAME}. Connecting...")
#     project_id = ws.search_project(EVIDENTLY_PROJECT_NAME)[0].id
#     project = ws.get_project(project_id)
#     logger.info(f"Connected to project.")


In [66]:
test_payload = {"inputs": ["test question"]}
response = requests.post("http://127.0.0.1:5002/invocations", json=test_payload)

print("Status Code:", response.status_code)
print("Response Headers:", dict(response.headers))
print("Response JSON:", response.json())

Status Code: 200
Response Headers: {'Server': 'nginx/1.18.0 (Ubuntu)', 'Date': 'Tue, 18 Nov 2025 15:38:17 GMT', 'Content-Type': 'application/json', 'Content-Length': '8435', 'Connection': 'keep-alive'}
Response JSON: {'predictions': [[0.07691620290279388, 0.07863236963748932, 0.002304134890437126, -0.0027673717122524977, -0.028416574001312256, 0.0070444317534565926, 0.02709631621837616, -0.030327336862683296, -0.08358553797006607, -0.020160071551799774, 0.004390736576169729, 0.005814699921756983, 0.02132382057607174, -0.02795727364718914, -0.04505575820803642, -0.03480257838964462, 0.05129023641347885, -0.0650331974029541, -0.052468959242105484, 0.04503953829407692, 0.002085263142362237, -0.08273077011108398, 0.011817128397524357, 0.04277825728058815, -0.027271779254078865, -0.0242447666823864, 0.030407987534999847, -0.07369127124547958, 0.020599601790308952, -0.11890428513288498, -0.016544774174690247, 0.0281376950442791, 0.044500116258859634, -0.0004500160866882652, 0.075122632086277

## 1. Load Evaluation Dataset
The dataset (csv) was created in `00_create_eval_dataset_wixqa.ipynb`

In [56]:
dataset_df = pd.read_csv("wixqa/wixqa_expertwritten_eval_dataset.csv")
logger.info(f"Loaded evaluation dataset with {len(dataset_df)} records.")

INFO:__main__:Loaded evaluation dataset with 200 records.


In [67]:
dataset_df.head(2)

Unnamed: 0,inputs,outputs,expectations,tags,source_type,source_id,created_time,dataset_record_id
0,{'question': 'I want to completely remove the ...,{},"{'expected_response': "" To completely remove ...",{'mlflow.user': 'joshuale'},HUMAN,,1763054984266,dr-005306a1cff042748027fe8b59e188be
1,{'question': 'Im having trouble changing the b...,{},{'expected_response': 'To change the browser t...,{'mlflow.user': 'joshuale'},HUMAN,,1763054984266,dr-0167e516c5b94df1acc467ca65d579d1


## 2. Prepare Retrieval Result

In [None]:
# process the dataset_df as needed for evaluation
# 1. Extract the list of questions:
questions = dataset_df["inputs"].apply(lambda x: eval(x)["question"]).tolist()

# 2. Extract the ground truth doc IDs:
ground_truth_doc_ids = dataset_df["expectations"].apply(lambda x: eval(x)["ground_truth_doc_ids"]).tolist()

In [80]:
# a simple wrapper function to get tretrieved doc IDs for a question:
def get_retrieved_doc_ids(
    question: str,
    top_k: int,
    embedding_svc_url: str,
    qdrant_client: QdrantClient,
    collection_name: str,
) -> list[str]:
    """ 
    Given a question, retrieve the top-k relevant document IDs from Qdrant.

    Parameters:
        question (str): The input question.
        top_k (int): The number of top relevant documents to retrieve.
        embedding_svc_url (str): The URL of the embedding service.
        qdrant_client (QdrantClient): The Qdrant client instance.
        collection_name (str): The name of the Qdrant collection.

    Returns:
        list[str]: A list of retrieved document IDs.
    """
    # Get the embedding vector for the question
    query_payload = {"inputs": [question]}
    resp = requests.post(embedding_svc_url, json=query_payload)
    resp.raise_for_status()
    query_vector = resp.json()["predictions"][0]

    # Perform the search in Qdrant to retrieve the top-k points
    # retrieved_points = qdrant_client.search(
    #     collection_name=collection_name, query_vector=query_vector, limit=top_k
    # )
    retrieved_points = qdrant_client.query_points(
        collection_name=collection_name, query=query_vector, limit=top_k
    )
    # Extract the document IDs from the retrieved points
    retrieved_doc_ids = [point.payload["id"] for point in retrieved_points.points]
    return retrieved_doc_ids

In [None]:
# test retrieval for 1 question
query_text = questions[0]

get_retrieved_doc_ids(
    question=query_text,
    top_k=TOP_K,
    embedding_svc_url=EMBEDDING_SVC_URL,
    qdrant_client=qdrant_client,
    collection_name=COLLECTION_NAME,
)

['1b3576560308047b4da6d139c7a2b7fae34e1e33a4b4f76f3d27d9f755a8bbfd',
 '6e858d81fc430adaf5f4fab45bad597e1503298103a5a7c39d641a1f0514e0ff',
 '3e848d96437f61a4a4f5e792e83445ce65e6ae2656531f9fe18103a4aaa8ec5d',
 '40993784887839063874e292a2260f936c4b8b2a7388ee817976a69c4e09f9e4',
 '3e848d96437f61a4a4f5e792e83445ce65e6ae2656531f9fe18103a4aaa8ec5d']

In [None]:
# retrieve for all questions in the dataset
retrieved_doc_ids = [
    get_retrieved_doc_ids(
        question=query,
        top_k=TOP_K,
        embedding_svc_url=EMBEDDING_SVC_URL,
        qdrant_client=qdrant_client,
        collection_name=COLLECTION_NAME,
    )
    for query in questions
]

  retrieved_points = qdrant_client.search(


In [60]:
# create a dataframe to hold the evaluation results
eval_results_df = pd.DataFrame({
    "question": questions,
    "ground_truth_doc_ids": ground_truth_doc_ids,
    "retrieved_doc_ids": retrieved_doc_ids
})

In [61]:
eval_results_df

Unnamed: 0,question,ground_truth_doc_ids,retrieved_doc_ids
0,I want to completely remove the login bar and ...,[21b52231b7ae64ad316d4558d1972af4e12c9986a8ba0...,[1b3576560308047b4da6d139c7a2b7fae34e1e33a4b4f...
1,Im having trouble changing the browser tab tit...,[61aecb220fb63759dba4322f38aee10901bd2b2e1634d...,[61aecb220fb63759dba4322f38aee10901bd2b2e1634d...
2,My automated emails triggered by form submissi...,[91eeefd48d7dcfda197a6162b7259e942579b5e6a53f5...,[91eeefd48d7dcfda197a6162b7259e942579b5e6a53f5...
3,I would like to change my billing cycle from 2...,[c5d82d4a0741f072da98db371148bfb349e37561fd94b...,[e4ca27f7596f17c593c3b87e29d09b1bd68b614c03ea7...
4,I need help setting up an email for website me...,[63feacd4199e7305e277fb785d3ee7a86252a753036d6...,[0aa8796a12152b1024403536cd9a9f6de6d4e578b0c6f...
...,...,...,...
195,Im trying to connect my business Instagram acc...,[73962a6c69a291d8a5d108b0a2116dd29341275b37c15...,[e794c331975103a7ef80b7c31f7a0cf21c29503579b88...
196,How does the pricing for Google Ads work?,[7e10270b7ecd05d2573050a9ca7541e71edcb7b2b8222...,[9f92f6e032be8a52696829d20938ea84ccc56996acf98...
197,I am trying to cancel my Premium subscription ...,[ac52db95f0cfec00e71b43d37c7428fb694f9bc262cd7...,[7976ad058cde513007f8a45a06bef2f6532cde4ad8dfd...
198,I am inquiring about the steps to publish my w...,[2a73ce56cfb6d899bd155abfa319d863360de062a21d8...,[2a73ce56cfb6d899bd155abfa319d863360de062a21d8...


## 3. Evaluate retrieval performance with mlflow.models.evaluate()

In [64]:
mlflow.set_experiment(EXPERIMENT_NAME)
with mlflow.start_run() as run:
    evaluate_results = mlflow.models.evaluate(
        data=eval_results_df,
        model_type="retriever",
        targets="ground_truth_doc_ids",
        predictions="retrieved_doc_ids",
        evaluators="default",
    )

2025/11/18 23:29:53 INFO mlflow.tracking.fluent: Experiment with name 'wixqa_expertwritten_retrieval_evaluation' does not exist. Creating a new experiment.
  token_count(),
  toxicity(),
  flesch_kincaid_grade_level(),
  ari_grade_level(),
  precision_at_k(retriever_k),
  recall_at_k(retriever_k),
  ndcg_at_k(retriever_k),
2025/11/18 23:29:54 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


üèÉ View run fortunate-steed-170 at: http://localhost:5000/#/experiments/2/runs/8a6ee1e871374fcb9f19fb8bf653b849
üß™ View experiment at: http://localhost:5000/#/experiments/2
