In [1]:
import os
import pandas as pd
import nest_asyncio
from dotenv import load_dotenv

import qdrant_client
import gutenbergpy.textget

from charles_dicken_qa_chatbot.constants import *

# from IPython.display import Markdown, display
from llama_index.core import Settings, Document
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.openai import OpenAI

nest_asyncio.apply()

load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from phoenix.otel import register

os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "http://localhost:6006"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
embed_model = FastEmbedEmbedding(model_name=EMBED_MODEL)
Settings.embed_model = embed_model

llm = OpenAI(model=LLM_MODEL)

In [3]:
client = qdrant_client.QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
aclient = qdrant_client.AsyncQdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

vector_store = QdrantVectorStore(
    client=client,
    aclient=aclient,
    enable_hybrid=True,
    fastembed_sparse_model="Qdrant/bm25",
    collection_name=COLLECTION_NAME,
)

In [4]:
path = "../data/test.csv"
df = pd.read_csv(path)
df

Unnamed: 0,Gutenberg ID,Title
0,46,A Christmas Carol


In [6]:
from llama_index.readers.wikipedia import WikipediaReader

reader = WikipediaReader()

docs = []

for _, row in df.iterrows():
    book_id = row["Gutenberg ID"]
    book_title = row["Title"]
    book_text = (
        gutenbergpy.textget.get_text_by_id(book_id)
        .decode("utf-8")
        .replace("\r\n", "\n")
    )
    wiki_doc = reader.load_data(pages=[book_title])
    docs.extend(
        [
            Document(text=book_text, metadata={"title": book_title, "source": "book"}),
            Document(
                text=wiki_doc[0].text,
                metadata={"title": book_title, "source": "wikipedia"},
            ),
        ]
    )

In [7]:
from llama_index.core.extractors import KeywordExtractor, SummaryExtractor
from llama_index.core.node_parser import TokenTextSplitter

text_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
    separator=" ",
)

summary_extractor = SummaryExtractor(summaries=["prev", "self"])
keyword_extractor = KeywordExtractor(keywords=10)

In [8]:
from llama_index.core.ingestion import (
    IngestionPipeline,
    IngestionCache,
    DocstoreStrategy,
)
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.storage.docstore.redis import RedisDocumentStore

redis_docstore = RedisDocumentStore.from_host_and_port(
    host=REDIS_HOST, port=REDIS_PORT, namespace=COLLECTION_NAME
)
redis_cache = IngestionCache(
    cache=RedisCache.from_host_and_port(host=REDIS_HOST, port=REDIS_PORT),
    collection=COLLECTION_NAME,
)

pipeline = IngestionPipeline(
    transformations=[text_splitter, keyword_extractor, summary_extractor, embed_model],
    vector_store=vector_store,
    docstore=redis_docstore,
    cache=redis_cache,
    docstore_strategy=DocstoreStrategy.UPSERTS,
)

In [9]:
nodes = pipeline.run(
    documents=docs,
    in_place=True,
    show_progress=True,
)

100%|██████████| 137/137 [00:27<00:00,  5.01it/s]
100%|██████████| 137/137 [00:39<00:00,  3.43it/s]
Generating embeddings: 100%|██████████| 137/137 [00:18<00:00,  7.51it/s]


## Evaluation dataset generation

In [10]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator

eval_llm = OpenAI(model="gpt-4.1-mini", temperature=0.1)

dataset_generator = RagDatasetGenerator(
    nodes[:20],
    llm=eval_llm,
    show_progress=True,
    num_questions_per_chunk=2,
)

In [11]:
rag_dataset = dataset_generator.generate_dataset_from_nodes()

100%|██████████| 20/20 [00:08<00:00,  2.42it/s]
100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
100%|██████████| 2/2 [00:04<00:00,  2.31s/it]
100%|██████████| 2/2 [00:03<00:00,  1.52s/it]
100%|██████████| 2/2 [00:03<00:00,  1.69s/it]
100%|██████████| 2/2 [00:05<00:00,  2.57s/it]
100%|██████████| 2/2 [00:03<00:00,  1.92s/it]
100%|██████████| 2/2 [00:03<00:00,  1.97s/it]
100%|██████████| 2/2 [00:03<00:00,  1.88s/it]
100%|██████████| 2/2 [00:03<00:00,  1.90s/it]
100%|██████████| 2/2 [00:04<00:00,  2.07s/it]
100%|██████████| 2/2 [00:06<00:00,  3.24s/it]
100%|██████████| 2/2 [00:03<00:00,  2.00s/it]
100%|██████████| 2/2 [00:02<00:00,  1.35s/it]
100%|██████████| 2/2 [00:06<00:00,  3.08s/it]
100%|██████████| 2/2 [00:03<00:00,  1.92s/it]
100%|██████████| 2/2 [00:05<00:00,  2.68s/it]
100%|██████████| 2/2 [00:04<00:00,  2.26s/it]
100%|██████████| 2/2 [00:03<00:00,  1.85s/it]
100%|██████████| 2/2 [00:05<00:00,  2.82s/it]
100%|██████████| 2/2 [00:04<00:00,  2.24s/it]


In [12]:
df = rag_dataset.to_pandas()

In [13]:
df.columns

Index(['query', 'reference_contexts', 'reference_answer',
       'reference_answer_by', 'query_by'],
      dtype='object')

In [None]:
from phoenix.client import Client
from datetime import datetime

formatted_datetime = datetime.now().strftime("%m-%d/%H:%M")

px_client = Client()
dataset = px_client.datasets.create_dataset(
    dataframe=df,
    name=f"charles-dicken-qa-eval-{formatted_datetime}",
    input_keys=["query", "reference_contexts"],
    output_keys=["reference_answer"],
)

In [39]:
latest_px_dataset_name = px_client.datasets.list()[0]["name"]
latest_dataset = px_client.datasets.get_dataset(dataset=latest_px_dataset_name)

In [43]:
latest_dataset.to_dataframe()

Unnamed: 0_level_0,input,output,metadata
example_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RGF0YXNldEV4YW1wbGU6NDE=,{'query': 'What is the primary intention Charl...,"{'reference_answer': 'In the preface of ""A Chr...",{}
RGF0YXNldEV4YW1wbGU6NDI=,"{'query': 'According to the excerpt from ""A Ch...",{'reference_answer': 'According to the excerpt...,{}
RGF0YXNldEV4YW1wbGU6NDM=,{'query': 'Explain the significance of Marley'...,{'reference_answer': 'In the opening stave of ...,{}
RGF0YXNldEV4YW1wbGU6NDQ=,{'query': 'Describe Scrooge's relationship to ...,{'reference_answer': 'In the excerpt from *A C...,{}
RGF0YXNldEV4YW1wbGU6NDU=,{'query': 'How does the description of Scrooge...,{'reference_answer': 'The description of Scroo...,{}
RGF0YXNldEV4YW1wbGU6NDY=,{'query': 'What role does the confirmation of ...,{'reference_answer': 'The confirmation of Marl...,{}
RGF0YXNldEV4YW1wbGU6NDc=,{'query': 'How does the author use weather ima...,{'reference_answer': 'The author uses weather ...,{}
RGF0YXNldEV4YW1wbGU6NDg=,{'query': 'Describe the social interactions (o...,"{'reference_answer': 'In the passage, Scrooge ...",{}
RGF0YXNldEV4YW1wbGU6NDk=,{'query': 'How does the description of the wea...,{'reference_answer': 'The description of the w...,{}
RGF0YXNldEV4YW1wbGU6NTA=,{'query': 'Contrast the attitudes of Scrooge a...,{'reference_answer': 'In the excerpt from *A C...,{}
