In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before any
# API client is constructed.
# NOTE(review): presumably this is where OPENAI_API_KEY comes from — confirm; no
# key is set explicitly in the visible cells.
load_dotenv()

In [2]:
import json

# Source JSON file, resolved relative to the notebook's working directory.
fileName = "data/20250612.json"

# Read the file as UTF-8 text and parse it; the cells below iterate over `data`
# record by record.
with open(fileName, mode="r", encoding="utf-8") as file:
    data = json.loads(file.read())

In [3]:
from llama_index.core import Document

# One Document per record: the record's label is the document text, its id is
# the document id, and the whole record is attached as metadata.
documents = [
    Document(text=record["label"], doc_id=record["id"], metadata=record)
    for record in data
]
    

In [4]:
### Retriever

# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel runs its own loop
# and later cells call async APIs (e.g. `aevaluate_dataset`) — confirm.
import nest_asyncio
nest_asyncio.apply()

In [5]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

In [6]:
# Split the documents into nodes using a chunk size of 512.
# NOTE(review): the sample texts shown in the retrieval output are short facility
# names, so most documents probably produce exactly one node — confirm.
node_parser = SentenceSplitter(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)

In [7]:
# Node ids default to random UUIDs. Overwrite them with position-based ids so
# the same input produces the same ids on every run.
for idx in range(len(nodes)):
    nodes[idx].id_ = f"node_{idx}"

In [20]:
# OpenAI Embedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Chat model; passed to generate_question_context_pairs further down to write
# the evaluation questions.
llm = OpenAI(model="gpt-4o-mini")

# Embedding model used to build the vector index; embed_batch_size is set to 10.
embed_model = OpenAIEmbedding(embed_batch_size=10, model="text-embedding-3-small")
Settings.embed_model = embed_model  # TODO: pass embed_model explicitly instead of relying on the global Settings


In [15]:
# NOTE: OpenAIEmbedding is already imported in the embedding setup cell; this
# repeated import is redundant but harmless.
from llama_index.embeddings.openai import OpenAIEmbedding

# Build a vector index over the nodes with the explicitly supplied embedding
# model, then expose it as a retriever returning the 5 most similar nodes.
vector_index = VectorStoreIndex(nodes, embed_model=embed_model)
retriever = vector_index.as_retriever(similarity_top_k=5)

In [16]:
from llama_index.core.response.notebook_utils import display_source_node

# Smoke-test the retriever with a known facility name and render each hit
# (id, similarity score, and up to 1000 characters of text).
retrieved_nodes = retriever.retrieve("Nha khoa Parkway")
for node in retrieved_nodes:
    display_source_node(node, source_length=1000)

**Node ID:** node_1038<br>**Similarity:** 0.7513302517515471<br>**Text:** Nha khoa Parkway<br>

**Node ID:** node_1055<br>**Similarity:** 0.4605765087929772<br>**Text:** Nha khoa Việt Pháp<br>

**Node ID:** node_733<br>**Similarity:** 0.4573456210590678<br>**Text:** Nha khoa Kim - Cơ sở Quận 2<br>

**Node ID:** node_347<br>**Similarity:** 0.4570525916335276<br>**Text:** Nha khoa NNS<br>

**Node ID:** node_760<br>**Similarity:** 0.45505509986383663<br>**Text:** Nha khoa Minh Châu (Nam Từ Liêm, Hà Nội)<br>

In [17]:
### Build an evaluation dataset of (query, context) pairs

from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)

In [21]:
from pathlib import Path

# Question generation calls the LLM for every node; the recorded run took 2h18m
# for 3105 nodes. Reuse the saved dataset when it exists and only generate when
# it does not, so "Restart & Run All" stays feasible.
# Delete the file to force regeneration, e.g. after the source data or chunking
# changes: node ids are positional, so a stale cache would point at the wrong
# nodes.
EVAL_DATASET_PATH = Path("eval_dataset.json")

if EVAL_DATASET_PATH.exists():
    qa_dataset = EmbeddingQAFinetuneDataset.from_json(str(EVAL_DATASET_PATH))
else:
    qa_dataset = generate_question_context_pairs(
        nodes, llm=llm, num_questions_per_chunk=2
    )
    # Save immediately so an interrupted kernel cannot lose the result.
    qa_dataset.save_json(str(EVAL_DATASET_PATH))

100%|██████████| 3105/3105 [2:18:01<00:00,  2.67s/it]     


In [28]:
from itertools import islice

# The dataset holds thousands of generated questions (2 per node); printing all
# of them floods the cell output, so show only a small preview.
PREVIEW_COUNT = 10

queries = qa_dataset.queries.values()
for q in islice(queries, PREVIEW_COUNT):
    print(q + '\n')

What services or treatments might be offered at Nha khoa Kim - CS Gò Vấp, considering it is a dental clinic?

In what ways could the location of Nha khoa Kim - CS Gò Vấp impact its accessibility for patients in the surrounding area?

What is the full name of the hospital mentioned in the context information?

In what type of healthcare facility is the "Bệnh viện Đa khoa Thủ Đô" categorized?

What is the name of the dental clinic mentioned in the context information, and in which city is it located?

Identify the specific branch of Nha khoa Hải Âu Sài Gòn that is referenced in the context.

What services or treatments might be offered at Nha khoa Thẩm mỹ Hoàng Gia, considering its focus on aesthetics in dentistry?

How does the branding of Nha khoa Thẩm mỹ Hoàng Gia reflect its commitment to cosmetic dental care?

What type of medical facility is Phòng khám Đa khoa Đại Đông, and what services might it typically offer to patients?

In what ways might Phòng khám Đa khoa Đại Đông contribut

In [29]:
### save json (optional)
# Persist the generated dataset so the slow generation step (2h18m in the
# recorded run) does not have to be repeated.
qa_dataset.save_json("eval_dataset.json")


In [31]:
# [optional] load
# Restore a previously saved dataset from disk instead of regenerating it.
qa_dataset = EmbeddingQAFinetuneDataset.from_json("eval_dataset.json")

In [None]:
### RetrieverEvaluator for Retrieval Evaluation

# Feature flag: when True, the cohere package is installed here and the
# "cohere_rerank_relevancy" metric is added to the metric list in the next cell.
include_cohere_rerank = False

if include_cohere_rerank:
    # Shell escape (IPython only); installs into the environment managed by uv.
    !uv pip install cohere -q

In [None]:
from llama_index.core.evaluation import RetrieverEvaluator

# Names of the retrieval metrics to compute for every query.
metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]
if include_cohere_rerank:
    # requires COHERE_API_KEY environment variable to be set
    metrics += ["cohere_rerank_relevancy"]

# Evaluator that runs `retriever` on a query and scores the returned ids.
retriever_evaluator = RetrieverEvaluator.from_metric_names(metrics, retriever=retriever)

In [None]:
# try it out on a sample query
# Take the first (query id, query text) pair; relevant_docs is looked up by the
# same query id to get the ids the retriever is expected to return.
sample_id, sample_query = list(qa_dataset.queries.items())[0]
sample_expected = qa_dataset.relevant_docs[sample_id]

# Run the retriever on the query and score its results against the expected ids.
eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
print(eval_result)

In [None]:
# try it out on an entire dataset
# Top-level `await` is valid in a notebook cell (not in a plain .py script).
# NOTE(review): the result is iterated by display_results, which reads
# `metric_vals_dict` from each element — presumably one element per query.
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)
print(eval_results)

In [None]:
### Summarize evaluation results as a DataFrame
import pandas as pd


def display_results(name, eval_results, metric_names=None):
    """Summarize per-query retrieval results as a one-row DataFrame of mean scores.

    Args:
        name: Label for this run; stored in the "retrievers" column.
        eval_results: Iterable of result objects, each exposing a
            ``metric_vals_dict`` mapping of metric name -> score.
        metric_names: Optional list of metric names to report. Defaults to
            every metric present in the results, in the order they appear.

    Returns:
        pandas.DataFrame with a single row: "retrievers" followed by one column
        per metric holding its mean over all results. When ``eval_results`` is
        empty, only the "retrievers" column is returned.

    Raises:
        KeyError: if an explicitly requested metric name is absent from the
            results.
    """
    full_df = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    # Read the metric names from the results themselves rather than from
    # notebook-level globals, so the function does not depend on which cells
    # were run before it. Optional metrics such as "cohere_rerank_relevancy"
    # are picked up automatically when they were evaluated.
    if metric_names is None:
        metric_names = list(full_df.columns)

    columns = {"retrievers": [name]}
    for metric_name in metric_names:
        columns[metric_name] = [full_df[metric_name].mean()]

    return pd.DataFrame(columns)

In [None]:
# The retriever is built with similarity_top_k=5, so label the run accordingly.
display_results("top-5 eval", eval_results)

## Metrics Interpretation
Good reference: https://www.pinecone.io/learn/offline-evaluation/

### Core Metrics Explained

#### **Precision & Recall** 
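Remember the Vietnamese mnemonic "recall thừa precision đủ" (roughly: recall tolerates extra results as long as nothing relevant is missed; precision wants only the relevant ones) :D 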
- **Recall**: How many relevant results are returned out of all relevant documents that exist
- **Precision**: How many returned results are actually relevant

**Example**: Query "How many Hieus are on this team?" 
- Returns 5 results: [Hieu1, Hieu2, Hoai, Hieu3, Dinh Anh]
- Total Hieus in database: 3
- **Recall = 3/3 = 1.0** (100% - found all Hieus)
- **Precision = 3/5 = 0.6** (60% - 3 out of 5 results are relevant)

#### **Hit Rate (Hit Ratio)**
Measures whether at least one relevant document appears in the top-k results.
- **Formula**: Number of queries with ≥1 relevant result / Total queries
- **Range**: 0.0 to 1.0 (higher is better)
- **Use case**: Good for understanding if your system finds ANY relevant results

#### **MRR (Mean Reciprocal Rank)**
Measures how high the first relevant result appears in the ranking.
- **Formula**: Average of (1/rank_of_first_relevant_result) across all queries
- **Range**: 0.0 to 1.0 (higher is better)
- **Example**: 
  - Query 1: First relevant at rank 2 → RR = 1/2 = 0.5
  - Query 2: First relevant at rank 1 → RR = 1/1 = 1.0
  - **MRR = (0.5 + 1.0) / 2 = 0.75**

#### **AP (Average Precision)**
Considers both precision and the ranking position of relevant documents.
- **Formula**: Sum of precision@k over every rank k that holds a relevant document, divided by the total number of relevant documents
- **Range**: 0.0 to 1.0 (higher is better)
- **Use case**: Rewards systems that rank relevant docs higher

#### **NDCG (Normalized Discounted Cumulative Gain)**
Most sophisticated metric - considers relevance scores and ranking positions.
- **Formula**: DCG@k / IDCG@k (normalized against ideal ranking)
- **Range**: 0.0 to 1.0 (higher is better)
- **Use case**: Best for graded relevance (not just binary relevant/irrelevant)

### Applying to Our Healthcare Dataset

**Dataset Context**: Vietnamese healthcare facility names and addresses
- **Documents**: Structured data with facility names, types, and locations
- **Queries**: Generated questions about specific facilities
- **Challenge**: Vietnamese text, abbreviations (BV vs Bệnh Viện), case sensitivity

**Expected Performance Patterns**:
1. **High Hit Rate** (>0.8): Most queries should find at least one relevant facility
2. **Moderate MRR** (0.5-0.7): Exact matches might not always rank #1 due to text variations
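3. **Variable Precision**: Depends on query specificity and facility name uniqueness. Note that each generated query has exactly one relevant node, so with top-k = 5 precision is capped at 1/5 = 0.2; compare runs at the same k rather than reading the absolute value
4. **Good Recall**: Vector embeddings should capture semantic similarity. With a single relevant node per query, recall is 1 when that node is retrieved and 0 otherwise, so it equals the hit rate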

**Optimization Strategies**:
- **Low Hit Rate**: Improve text preprocessing, consider hybrid search (BM25 + Vector)
- **Low MRR**: Fine-tune embedding model, adjust similarity thresholds
- **Low Precision**: Increase similarity threshold, reduce top-k results
- **Low Recall**: Decrease similarity threshold, increase top-k results