In [30]:
%load_ext autoreload
%autoreload 2

In [43]:
import os
import pandas as pd
import nest_asyncio
from dotenv import load_dotenv
from datetime import datetime

import qdrant_client
import gutenbergpy.textget
import opik

from charles_dicken_qa_chatbot.constants import *

# from IPython.display import Markdown, display
from llama_index.core import Settings, Document, global_handler, set_global_handler
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

In [44]:
nest_asyncio.apply()

today = datetime.now().strftime("%Y-%m-%d")

load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["OPIK_PROJECT_NAME"] = os.getenv(
    "OPIK_PROJECT_NAME", f"charles-dicken-qa-{today}"
)

opik.configure(api_key=os.getenv("OPIK_API_KEY"))
set_global_handler("opik")
opik_callback_handler = global_handler

opik_client = opik.Opik()

OPIK: Opik is already configured. You can check the settings by viewing the config file at /Users/hmnguyen1067/.opik.config


In [45]:
# embed_model = FastEmbedEmbedding(model_name=EMBED_MODEL)
embed_model = OpenAIEmbedding()
llm = OpenAI(model=LLM_MODEL)

Settings.embed_model = embed_model
Settings.llm = llm

In [46]:
client = qdrant_client.QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=120)
aclient = qdrant_client.AsyncQdrantClient(
    host=QDRANT_HOST, port=QDRANT_PORT, timeout=120
)

vector_store = QdrantVectorStore(
    client=client,
    aclient=aclient,
    enable_hybrid=True,
    fastembed_sparse_model=SPARSE_EMBEDDING_MODEL,
    collection_name=COLLECTION_NAME,
)

In [47]:
path = "../data/test.csv"
df = pd.read_csv(path)
df

Unnamed: 0,Gutenberg ID,Title
0,46,A Christmas Carol


In [48]:
from llama_index.readers.wikipedia import WikipediaReader

reader = WikipediaReader()

docs = []

for _, row in df.iterrows():
    book_id = row["Gutenberg ID"]
    book_title = row["Title"]
    book_text = (
        gutenbergpy.textget.get_text_by_id(book_id)
        .decode("utf-8")
        .replace("\r\n", "\n")
    )
    wiki_doc = reader.load_data(pages=[book_title])
    docs.extend(
        [
            Document(text=book_text, metadata={"title": book_title, "source": "book"}),
            Document(
                text=wiki_doc[0].text,
                metadata={"title": book_title, "source": "wikipedia"},
            ),
        ]
    )

In [49]:
from llama_index.core.extractors import (
    KeywordExtractor,
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)
from llama_index.core.node_parser import (
    TokenTextSplitter,
    SentenceSplitter,
    SemanticSplitterNodeParser,
)

#
# splitter = TokenTextSplitter(
#     chunk_size=512,
#     chunk_overlap=128,
#     separator=" ",
# )

# Semantic splitter
# splitter = SemanticSplitterNodeParser(
#     buffer_size=1,
#     breakpoint_percentile_threshold=75,
#     embed_model=embed_model,
# )

# Sentence splitter
splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=64,
    include_metadata=True,
)

# Late chunking
# JinaAI

# QuestionsAnsweredExtractor(questions=3)
# summary_extractor = SummaryExtractor(summaries=["prev", "self"])
# keyword_extractor = KeywordExtractor(keywords=10)

In [50]:
from llama_index.core.ingestion import (
    IngestionPipeline,
    IngestionCache,
    DocstoreStrategy,
)
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.storage.index_store.redis import RedisIndexStore


redis_docstore = RedisDocumentStore.from_host_and_port(
    host=REDIS_HOST, port=REDIS_PORT, namespace=COLLECTION_NAME
)
redis_indexstore = RedisIndexStore.from_host_and_port(
    host=REDIS_HOST, port=REDIS_PORT, namespace=COLLECTION_NAME
)
redis_cache = IngestionCache(
    cache=RedisCache.from_host_and_port(host=REDIS_HOST, port=REDIS_PORT),
    collection=COLLECTION_NAME,
)

In [51]:
pipeline = IngestionPipeline(
    transformations=[splitter, embed_model],
    vector_store=vector_store,
    docstore=redis_docstore,
    cache=redis_cache,
    docstore_strategy=DocstoreStrategy.UPSERTS,
)

nodes = await pipeline.arun(documents=docs, in_place=True, show_progress=True)

In [52]:
from llama_index.core import StorageContext, VectorStoreIndex

storage_context = StorageContext.from_defaults(
    docstore=redis_docstore, vector_store=vector_store, index_store=redis_indexstore
)

In [53]:
index = VectorStoreIndex(
    nodes=nodes,
    use_async=True,
    storage_context=storage_context,
    embed_model=embed_model,
)

In [None]:
# from llama_index.core import load_indices_from_storage

# re_index = load_indices_from_storage(storage_context=storage_context)

## Evaluation dataset generation

In [81]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator

eval_llm = OpenAI(model=LLM_MODEL, temperature=0.1)

dataset_generator = RagDatasetGenerator(
    nodes,
    llm=eval_llm,
    show_progress=True,
    num_questions_per_chunk=2,
)

In [82]:
rag_dataset = await dataset_generator.agenerate_dataset_from_nodes()

  0%|          | 0/122 [00:00<?, ?it/s]OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: 1ada7f16-9e59-48f1-9c3c-05ecf9b6d9f8, event_type: CBEventType.RETRIEVE, event_id: 6b5f627d-481e-4570-b7be-8bbfd2764b3f.
OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: a901054d-fc51-4643-9c75-73d4b769218b, event_type: CBEventType.CHUNKING, event_id: 807b252b-a308-47bc-be4c-13059e57dc7c.
OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: a901054d-fc51-4643-9c75-73d4b769218b, event_type: CBEventType.CHUNKING, event_id: 8b317c50-19ae-4063-af58-a5ae1bdb61dc.
OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: c8d81606-27f3-4c0e-9c40-8463b760e0f3, eve

In [None]:
query_engine = index.as_query_engine(
    similarity_top_k=2,
    sparse_top_k=8,
    vector_store_query_mode="hybrid",
    hybrid_top_k=3,
    llm=llm,
)

# Opik

In [None]:
from opik.evaluation.metrics import (
    Hallucination,
    Usefulness,
    AnswerRelevance,
    ContextPrecision,
    ContextRecall,
)
from opik.evaluation import evaluate

In [83]:
opik_eval_name = (
    f"{COLLECTION_NAME}-eval-{len(rag_dataset.examples)}-{LLM_MODEL}-{today}"
)
dataset = opik_client.get_or_create_dataset(name=opik_eval_name)

OPIK: Created a "charles_dickens-eval-244-gpt-5-nano-2025-09-25" dataset at https://www.comet.com/opik/api/v1/session/redirect/datasets/?dataset_id=01998176-3cbd-70b1-85dc-eb00f97c4793&path=aHR0cHM6Ly93d3cuY29tZXQuY29tL29waWsvYXBpLw==.


In [84]:
dataset.insert_from_pandas(rag_dataset.to_pandas())

In [86]:
dataset.to_pandas()

Unnamed: 0,reference_answer,query,query_by,reference_contexts,reference_answer_by,id
0,LibriVox,"According to the External links section, name ...",ai (gpt-5-nano),[== See also ==\n\nChristmas horror\nDickens C...,ai (gpt-5-nano),01998176-8c08-7d06-8c42-bf5fe6b07b05
1,Dickens Christmas fair; List of Christmas-them...,Name two items listed in the See also section ...,ai (gpt-5-nano),[== See also ==\n\nChristmas horror\nDickens C...,ai (gpt-5-nano),01998176-8c07-7866-b27b-96e752e9b82f
2,- Shift 1: Early 20th century transformation i...,- Question 2 (analytical): Explain two major s...,ai (gpt-5-nano),"[Ruth Glancy, the professor of English literat...",ai (gpt-5-nano),01998176-8c06-7539-9119-08626e069bc0
3,- Robert Louis Stevenson\n- An American busine...,- Question 1 (factual): Name two individuals o...,ai (gpt-5-nano),"[Ruth Glancy, the professor of English literat...",ai (gpt-5-nano),01998176-8c05-7d7f-a5b5-9c4193870c31
4,- William Dean Howells\n - View: Howells anal...,Analytical: Compare the critiques of A Christm...,ai (gpt-5-nano),"[== Legacy ==\n\nThe phrase ""Merry Christmas"" ...",ai (gpt-5-nano),01998176-8c04-7a8e-a3e1-0e7c1ef735ca
...,...,...,...,...,...,...
239,"- ""tight-fisted hand at the grind-stone"" \n ...",Identify and quote at least five descriptive p...,ai (gpt-5-nano),[The firm was known as\nScrooge and Marley. So...,ai (gpt-5-nano),01998176-8b19-7db0-8858-45df3941a7c3
240,Dickens uses humor and literary allusion in th...,The narrator uses humorous asides and allusion...,ai (gpt-5-nano),[Old Marley was as dead as a\ndoor-nail.\n\nMi...,ai (gpt-5-nano),01998176-8b18-706a-af3e-ddf46c102ea3
241,They are depicted as lifelong business partner...,Provide a concise description of Scrooge and M...,ai (gpt-5-nano),[Old Marley was as dead as a\ndoor-nail.\n\nMi...,ai (gpt-5-nano),01998176-8b17-75c3-8346-b277343cdb69
242,"""MARLEY was dead: to begin with.""",Multiple choice: Stave I opens by stating Marl...,ai (gpt-5-nano),[The Project Gutenberg eBook of A Christmas Ca...,ai (gpt-5-nano),01998176-8b16-713f-89ce-ed9a638bf8e9


In [None]:
hallucination_metric = Hallucination(model=LLM_MODEL)
usefulness_metric = Usefulness(model=LLM_MODEL)
answer_relevance_metric = AnswerRelevance(model=LLM_MODEL)
context_precision_metric = ContextPrecision(model=LLM_MODEL)
context_recall_metric = ContextRecall(model=LLM_MODEL)


@opik.track
def query_vector(query, query_engine):
    response = query_engine.query(query)
    return response


def evaluation_task(x, query_engine):
    return {
        "output": query_vector(x["query"], query_engine),
        "context": x["reference_contexts"],
        "expected_output": x["reference_answer"],
    }


def make_task(query_engine):
    def _task(x):
        return evaluation_task(x, query_engine)

    return _task


task = make_task(query_engine)

evaluation = evaluate(
    dataset=dataset,
    task=task,
    scoring_metrics=[
        hallucination_metric,
        usefulness_metric,
        answer_relevance_metric,
        context_precision_metric,
        context_recall_metric,
    ],
    scoring_key_mapping={"input": "query"},
    experiment_config={"rag": "base"},
)

# Retriever Evaluator

In [12]:
dataset = opik_client.get_or_create_dataset(name=f"eval-dataset-{today}-v2")

In [None]:
dataset.insert_from_pandas(rag_dataset.to_pandas())

In [12]:
from charles_dicken_qa_chatbot.utils import *
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

In [13]:
qa_dataset = create_eval_dataset(nodes, llm=llm)

100%|██████████| 122/122 [23:08<00:00, 11.38s/it]


In [14]:
qa_dataset.save_json("this.json")

In [13]:
qa_dataset = EmbeddingQAFinetuneDataset.from_json("this.json")

In [None]:
# nodes = set_node_ids(nodes)

In [54]:
similarity_top_k = 3
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L4-v2", top_n=similarity_top_k
)

In [55]:
embedding_retriever = create_embedding_retriever(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model,
    similarity_top_k=similarity_top_k,
)
embedding_retriever_results = await retrieval_results(embedding_retriever, qa_dataset)

In [56]:
display_results("Embedding Retriever", embedding_retriever_results)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Embedding Retriever,0.721311,0.61612,0.240437,0.721311,0.61612,0.643244


In [57]:
bm25_retriever = create_bm25_retriever(
    nodes,
    similarity_top_k=similarity_top_k,
)
bm25_retriever_results = await retrieval_results(bm25_retriever, qa_dataset)

OPIK: Started logging traces to the "charles-dicken-qa-2025-09-25" project at https://www.comet.com/opik/api/v1/session/redirect/projects/?trace_id=019980ca-cc30-7f90-973c-693ab5503ee5&path=aHR0cHM6Ly93d3cuY29tZXQuY29tL29waWsvYXBpLw==.


In [58]:
display_results("BM25 Retriever", bm25_retriever_results)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,BM25 Retriever,0.737705,0.669399,0.245902,0.737705,0.669399,0.686961


In [59]:
embedding_bm25_retriever_rerank = EmbeddingBM25RerankerRetriever(
    embedding_retriever, bm25_retriever, reranker=reranker
)
embedding_bm25_retriever_rerank_results = await retrieval_results(
    embedding_bm25_retriever_rerank, qa_dataset
)

OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: 5067e34c-7272-4f13-b9e8-d08cc030e76a, event_type: CBEventType.RETRIEVE, event_id: 55d2c221-f8c3-46ab-8f7b-03222e6e7096.
OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: 55d2c221-f8c3-46ab-8f7b-03222e6e7096, event_type: CBEventType.EMBEDDING, event_id: 6e79e3fc-5157-4ec9-af2f-dd9adc1ceaec.
OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: a909bbb1-175e-47f9-ab07-a7dc329ec543, event_type: CBEventType.RETRIEVE, event_id: b6a94a5e-b359-472b-8a50-62ed2fea6d49.
OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: b6a94a5e-b359-472b-8a50-62ed2fea6d49, event_type: CBEventType.EMBEDDING, event_

In [60]:
display_results(
    "Embedding + BM25 Retriever + Reranker",
    embedding_bm25_retriever_rerank_results,
)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Embedding + BM25 Retriever + Reranker,0.778689,0.670082,0.358607,0.778689,0.670082,0.69823


In [61]:
query_engine = index.as_query_engine(
    similarity_top_k=2,
    sparse_top_k=8,
    vector_store_query_mode="hybrid",
    hybrid_top_k=3,
    llm=llm,
)

In [62]:
bm42_retriever = query_engine.retriever
bm42_retriever_results = await retrieval_results(bm42_retriever, qa_dataset)

In [63]:
display_results(
    "Embedding + BM42 Retriever",
    bm42_retriever_results,
)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Embedding + BM42 Retriever,0.643443,0.583333,0.214481,0.643443,0.583333,0.598944


In [67]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

In [70]:
from llama_index.core.response_synthesizers import ResponseMode

In [None]:
response_synthesizer = get_response_synthesizer()
ensemble_query_engine = RetrieverQueryEngine(
    retriever=embedding_bm25_retriever_rerank,
    response_synthesizer=response_synthesizer,
)

In [71]:
response = ensemble_query_engine.query("What is 'A Christmas Carol' main theme?")

OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: 45223ffd-975c-427c-baa2-df78a29d7382, event_type: CBEventType.RETRIEVE, event_id: 1727689d-6e8b-4759-b26a-13bf2572dc65.
OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: 1727689d-6e8b-4759-b26a-13bf2572dc65, event_type: CBEventType.RETRIEVE, event_id: 07ec0a1c-9291-4ae8-8333-6a7c1d7d15a0.
OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: 1727689d-6e8b-4759-b26a-13bf2572dc65, event_type: CBEventType.RERANKING, event_id: ea26ef41-6004-4937-9ffd-ad41148e1f36.
OPIK: No trace data found in context for event start. This is likely due to the fact that the trace is not started properly. The parent_id: c3881a6f-b83c-4047-a290-3a4d2bed0716, event_type: CBEventType.CHUNKING, event_i